FrancescoConte committed on
Commit
fdaba9b
·
verified ·
1 Parent(s): 4126de0

First setup

Browse files
Files changed (3) hide show
  1. DOCKERFILE +21 -0
  2. requirements.txt +8 -3
  3. sentiment_huggingface.py +148 -0
DOCKERFILE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Use an official Python image
FROM python:3.9-slim

# Set working directory
WORKDIR /app

# Install dependencies first so this layer stays cached until
# requirements.txt itself changes (source edits no longer trigger a reinstall)
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the app files
COPY . /app

# Create a .streamlit directory under /tmp to avoid permission issues
# NOTE(review): confirm that Streamlit actually honours STREAMLIT_HOME and that
# the runtime user can write to this directory.
RUN mkdir -p /tmp/.streamlit
ENV STREAMLIT_HOME=/tmp/.streamlit

# Expose the port Hugging Face Spaces expects (7860 is the default app_port for Docker Spaces)
EXPOSE 7860

# Start the Streamlit app
# NOTE(review): this commit adds sentiment_huggingface.py, not app.py; confirm
# that app.py exists in the repository or point the command at the new script.
CMD ["streamlit", "run", "app.py", "--server.headless=true", "--server.port=7860", "--server.address=0.0.0.0"]
requirements.txt CHANGED
@@ -1,3 +1,8 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
# argparse is part of the Python standard library, so it is not listed here
# (the PyPI package of that name is an outdated backport).
# NOTE(review): the --train path probably also needs scikit-learn (for the
# accuracy / f1 metrics) and accelerate (for Trainer); confirm before training.
streamlit
transformers
torch
huggingface_hub
datasets
evaluate
numpy
sentiment_huggingface.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Imports
2
+ import numpy as np
3
+ from datasets import load_dataset
4
+ import evaluate
5
+ from transformers import AutoTokenizer
6
+ from transformers import DataCollatorWithPadding
7
+ from transformers import AutoModelForSequenceClassification
8
+ from transformers import TrainingArguments, Trainer
9
+ from transformers import pipeline
10
+ from huggingface_hub import notebook_login
11
+ import streamlit as st
12
+ import torch
13
+ import argparse
14
+
### Parse arguments --------------------------
# Command-line options:
#   --model    pre-trained checkpoint name or path used as tokenizer/base model
#   --dataset  dataset name or path passed to load_dataset
#   --train    fine-tune before serving instead of loading the published model
#   --seed     random seed used when shuffling the dataset splits
parser = argparse.ArgumentParser(description="Sentiment analysis with Hugging Face")
parser.add_argument(
    "--model",
    type=str,
    default="distilbert-base-uncased-finetuned-sst-2-english",
    help="Pre-trained model name or path",
)
parser.add_argument(
    "--dataset",
    type=str,
    default="imdb",
    help="Dataset name or path",
)
parser.add_argument(
    "--train",
    action="store_true",
    help="Train the model",
)
parser.add_argument(
    "--seed",
    type=int,
    default=42,
    help="Random seed",
)

# parse_known_args() tolerates flags this script does not define (for example
# arguments forwarded by whatever launcher runs the script); parse_args()
# would terminate the whole process on the first unknown flag.
args, unknown_args = parser.parse_known_args()
if unknown_args:
    print(f"Ignoring unrecognized arguments: {unknown_args}")

train = args.train
tokenizer_model = args.model
dataset = args.dataset
seed = args.seed
45
+
# Choose the device index automatically: 0 when CUDA is available, -1 otherwise
if torch.cuda.is_available():
    device = 0
else:
    device = -1
48
+
# Hub repository that holds the fine-tuned model; defined once so the training
# and loading branches cannot drift apart.
repo_name = "FC_finetuning-sentiment-model-3000-samples"
model_name = "FrancescoConte/" + repo_name

if train:
    print(f"Training model {tokenizer_model} on dataset {dataset}")
    ### Pre-process data ---------------------------------
    # Load the requested dataset (IMDB by default)
    imdb = load_dataset(dataset)

    def _take_subset(split, limit=3000):
        """Return at most *limit* examples of *split*, shuffled with the run seed."""
        shuffled = split.shuffle(seed=seed)
        # Cap at the split size so datasets smaller than *limit* do not raise
        return shuffled.select(range(min(limit, len(shuffled))))

    # Create smaller datasets
    small_train_dataset = _take_subset(imdb["train"])
    small_test_dataset = _take_subset(imdb["test"])

    # Use a tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)

    # Prepare the text inputs by mapping
    def preprocess_function(sample):
        """Tokenize the "text" field of a batch, truncating over-long inputs."""
        return tokenizer(sample["text"], truncation=True)

    tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
    tokenized_test = small_test_dataset.map(preprocess_function, batched=True)

    # Convert training samples to PyTorch tensors, concatenate them with padding -- faster!
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    ### Train the model ---------------------------------
    # Define the base model
    model = AutoModelForSequenceClassification.from_pretrained(tokenizer_model, num_labels=2)

    # Define the evaluation metrics; load them once rather than on every
    # evaluation call.
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")

    def compute_metrics(eval_pred):
        """Return accuracy and F1 computed from a ``(logits, labels)`` pair."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
        f1 = f1_metric.compute(predictions=predictions, references=labels)["f1"]
        return {"accuracy": accuracy, "f1": f1}

    # Connect to Hugging Face Hub. login() works from a plain script/terminal as
    # well as from a notebook, whereas notebook_login() requires a notebook widget.
    from huggingface_hub import login

    login()

    # Define a trainer
    training_args = TrainingArguments(
        output_dir=repo_name,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        save_strategy="epoch",
        push_to_hub=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    trainer.push_to_hub(commit_message="Training complete")

    ### Evaluate the model -----------------------------
    # Report the metrics instead of silently discarding them
    eval_metrics = trainer.evaluate()
    print(f"Evaluation metrics: {eval_metrics}")
else:
    ### Load the model -------------------------------
    print(f"Using {tokenizer_model} model previously trained on dataset {dataset}")

    # Load tokenizer and model directly from the Hub
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
127
+
128
+
### Analyze data with the model -------------------
@st.cache_resource
def _load_sentiment_pipeline(name, device_index):
    """Build the sentiment-analysis pipeline for *name* on *device_index*.

    Streamlit re-executes the script on every widget interaction; caching the
    pipeline as a resource means the model weights are loaded only once per
    (model, device) combination instead of on every rerun.
    """
    return pipeline(model=name, task="sentiment-analysis", device=device_index)


# Define the final sentiment analysis model, placed on the detected device
sentiment_model = _load_sentiment_pipeline(model_name, device)
132
+
133
+ ### Use the model --------------------
134
+ #try_text=["I love this movie", "This movie sucks!"]
135
+ #print(sentiment_model(try_text))
136
+
### Put it on streamlit -------------------

# streamlit app: a title, a free-text box and a button that triggers inference
st.title("Sentiment Analysis App")

text = st.text_area("Enter text for sentiment analysis:")

if st.button("Analyze"):
    if not text.strip():
        # Nothing meaningful to classify; ask for input instead of scoring ""
        st.warning("Please enter some text to analyze.")
    else:
        with st.spinner("Running inference..."):
            # truncation=True keeps long inputs within the model's maximum
            # sequence length, matching the tokenization used for training
            result = sentiment_model(text, truncation=True)[0]
        st.success(f"**Label:** {result['label']}\n\n**Confidence:** {result['score']:.5f}")