FrancescoConte committed
Commit ddc2737 · 1 Parent(s): 3cfefb3

removed the training from the web app

Files changed (2)
  1. requirements.txt +0 -3
  2. sentiment_huggingface.py +6 -122
requirements.txt CHANGED
@@ -1,8 +1,5 @@
 streamlit
 transformers
 torch
-huggingface_hub
-datasets
 evaluate
 numpy
-argparse
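The dropped packages were only needed by the training path this commit deletes: huggingface_hub and datasets served the fine-tuning code removed below, and argparse is part of the Python standard library, so it never needed to be listed at all. Note that evaluate stays in requirements.txt even though its import is also removed, so it could likely be trimmed in a follow-up. A minimal smoke test for the slimmed inference-only stack (a sketch; the model name is taken from sentiment_huggingface.py below):

# Sketch: confirm the remaining dependencies can load the model and classify text.
from transformers import pipeline
import torch

device = 0 if torch.cuda.is_available() else -1  # first CUDA device if available, else CPU
clf = pipeline(
    task="sentiment-analysis",
    model="FrancescoConte/FC_finetuning-sentiment-model-3000-samples",
    device=device,
)
print(clf("This movie was surprisingly good."))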
 
sentiment_huggingface.py CHANGED
@@ -1,132 +1,19 @@
 # Imports
 import numpy as np
-from datasets import load_dataset
-import evaluate
-from transformers import AutoTokenizer
-from transformers import DataCollatorWithPadding
-from transformers import AutoModelForSequenceClassification
-from transformers import TrainingArguments, Trainer
 from transformers import pipeline
-from huggingface_hub import notebook_login
 import streamlit as st
 import torch
-import argparse
-
-### Parse arguments --------------------------
-parser = argparse.ArgumentParser(description="Sentiment analysis with Hugging Face")
-parser.add_argument(
-    "--model",
-    type=str,
-    default="distilbert-base-uncased-finetuned-sst-2-english",
-    help="Pre-trained model name or path",
-)
-parser.add_argument(
-    "--dataset",
-    type=str,
-    default="imdb",
-    help="Dataset name or path",
-)
-parser.add_argument(
-    "--train",
-    action="store_true",
-    help="Train the model",
-)
-parser.add_argument(
-    "--seed",
-    type=int,
-    default=42,
-    help="Random seed",
-)
-args = parser.parse_args()
-train=args.train
-tokenizer_model=args.model
-dataset=args.dataset
-seed=args.seed
 
 # Detect device automatically
 device = 0 if torch.cuda.is_available() else -1
 
-if train:
-    print(f"Training model {tokenizer_model} on dataset {dataset}")
-    ### Pre-process data ---------------------------------
-    # Load the IMDB dataset
-    imdb = load_dataset(dataset)
-
-    # Create smaller datasets
-    small_train_dataset = imdb["train"].shuffle(seed=seed).select([i for i in list(range(3000))])
-    small_test_dataset = imdb["test"].shuffle(seed=seed).select([i for i in list(range(3000))])
-
-    # Use a tokenizer
-    #tokenizer_model="distilbert-base-uncased-finetuned-sst-2-english"
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)
-
-    # Prepare the text inputs by mapping
-    def preprocess_function(sample):
-        return tokenizer(sample["text"], truncation=True)
-    tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
-    tokenized_test = small_test_dataset.map(preprocess_function, batched=True)
-
-    # Convert training samples to PyTorch tensors, concatenate them with padding -- faster!
-    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
-
-    ### Train the model ---------------------------------
-    # Define the base model
-    model = AutoModelForSequenceClassification.from_pretrained(tokenizer_model, num_labels=2)
-
-    # Define the evaluation metrics
-    def compute_metrics(eval_pred):
-        load_accuracy = evaluate.load("accuracy")
-        load_f1 = evaluate.load("f1")
-        logits, labels = eval_pred
-        predictions = np.argmax(logits, axis=-1)
-        accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
-        f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
-        return {"accuracy": accuracy, "f1": f1}
-
-    # Connect to Hugging Face Hub
-    notebook_login()
+tokenizer_model="distilbert-base-uncased-finetuned-sst-2-english"
+dataset="imdb"
+seed = 42
 
-    # Define a trainer
-    repo_name = "FC_finetuning-sentiment-model-3000-samples"
-
-    training_args = TrainingArguments(
-        output_dir=repo_name,
-        learning_rate=2e-5,
-        per_device_train_batch_size=16,
-        per_device_eval_batch_size=16,
-        num_train_epochs=2,
-        weight_decay=0.01,
-        save_strategy="epoch",
-        push_to_hub=True,
-    )
-
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=tokenized_train,
-        eval_dataset=tokenized_test,
-        tokenizer=tokenizer,
-        data_collator=data_collator,
-        compute_metrics=compute_metrics,
-    )
-
-    trainer.train()
-    trainer.push_to_hub(commit_message="Training complete")
-
-    ### Evaluate the model -----------------------------
-    trainer.evaluate()
-    model_name="FrancescoConte/"+repo_name
-else:
-    ### Load the model -------------------------------
-    print(f"Using {tokenizer_model} model previously trained on dataset {dataset}")
-    model_name = "FrancescoConte/FC_finetuning-sentiment-model-3000-samples"
-
-    # Load tokenizer and model directly from the Hub
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForSequenceClassification.from_pretrained(model_name)
-
-
-### Analyze data with the model -------------------
+### Load the model -------------------------------
+print(f"Using {tokenizer_model} model previously trained on dataset {dataset}")
+model_name = "FrancescoConte/FC_finetuning-sentiment-model-3000-samples"
 # Define the final sentiment analysis model
 sentiment_model = pipeline(model=model_name, task="sentiment-analysis")
 
@@ -135,10 +22,7 @@ sentiment_model = pipeline(model=model_name, task="sentiment-analysis")
 #print(sentiment_model(try_text))
 
 ### Put it on streamlit -------------------
-
-# streamlit app
 st.title("Sentiment Analysis App")
-
 text = st.text_area("Enter text for sentiment analysis:")
 
 if st.button("Analyze"):
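One loose end the slimmed script leaves open: device is still computed but never handed to pipeline(), so inference runs on the transformers default (CPU) even when CUDA is available, and tokenizer_model, dataset, and seed now only feed the print statement. A minimal sketch of wiring the device through, assuming no other changes to the file:

# Sketch: pass the detected device so a GPU is used when present.
sentiment_model = pipeline(
    model=model_name,
    task="sentiment-analysis",
    device=device,  # 0 = first CUDA device, -1 = CPU
)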
 