Spaces:

FrancescoConte
/

Sentiment_analysis_app

Sleeping

App Files Files Community

FrancescoConte committed on May 18

Commit

fdaba9b

verified ·

1 Parent(s): 4126de0

First setup

Browse files

Files changed (3) hide show

DOCKERFILE +21 -0
requirements.txt +8 -3
sentiment_huggingface.py +148 -0

DOCKERFILE ADDED Viewed

	@@ -0,0 +1,21 @@

+# Use an official slim Python image
+FROM python:3.9-slim
+# Set working directory
+WORKDIR /app
+# Copy only the dependency manifest first so the slow install layer below is
+# reused from the build cache whenever just the application code changes
+COPY requirements.txt /app/requirements.txt
+# Install dependencies (--no-cache-dir keeps pip's download cache out of the image)
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the rest of your app files
+COPY . /app
+# Create a writable .streamlit directory to avoid permission issues
+RUN mkdir -p /tmp/.streamlit
+ENV STREAMLIT_HOME=/tmp/.streamlit
+# Expose the port Hugging Face Docker Spaces route traffic to (7860 by default;
+# Streamlit's own default is 8501, hence the explicit --server.port below)
+EXPOSE 7860
+# Start the Streamlit app
+# NOTE(review): this launches app.py, while this commit adds sentiment_huggingface.py -- confirm the intended entry point
+CMD ["streamlit", "run", "app.py", "--server.headless=true", "--server.port=7860", "--server.address=0.0.0.0"]

requirements.txt CHANGED Viewed

@@ -1,3 +1,8 @@
-altair
-pandas
-streamlit

+# argparse is intentionally not listed: it ships with the Python standard library
+streamlit
+transformers
+torch
+huggingface_hub
+datasets
+evaluate
+numpy

sentiment_huggingface.py ADDED Viewed

	@@ -0,0 +1,148 @@

+# Imports
+import numpy as np
+from datasets import load_dataset
+import evaluate
+from transformers import AutoTokenizer
+from transformers import DataCollatorWithPadding
+from transformers import AutoModelForSequenceClassification
+from transformers import TrainingArguments, Trainer
+from transformers import pipeline
+from huggingface_hub import notebook_login
+import streamlit as st
+import torch
+import argparse
+### Command-line options --------------------------
+parser = argparse.ArgumentParser(description="Sentiment analysis with Hugging Face")
+# Checkpoint name or local path, later handed to the tokenizer and model loaders
+parser.add_argument("--model", type=str, help="Pre-trained model name or path",
+                    default="distilbert-base-uncased-finetuned-sst-2-english")
+# Dataset identifier passed to load_dataset when training
+parser.add_argument("--dataset", type=str, help="Dataset name or path", default="imdb")
+# Flag: when present the script fine-tunes; otherwise it loads the published model
+parser.add_argument("--train", action="store_true", help="Train the model")
+# Seed used when shuffling the dataset splits
+parser.add_argument("--seed", type=int, help="Random seed", default=42)
+args = parser.parse_args()
+# Unpack the parsed options into the module-level names used further down
+train, tokenizer_model, dataset, seed = args.train, args.model, args.dataset, args.seed
+# Device index: 0 when CUDA is available, -1 otherwise
+device = -1 if not torch.cuda.is_available() else 0
+# Either fine-tune a model and publish it to the Hub (--train), or load the
+# previously published fine-tuned model. Both branches define `tokenizer`,
+# `model` and `model_name` for the code that follows.
+if train:
+    print(f"Training model {tokenizer_model} on dataset {dataset}")
+    ### Pre-process data ---------------------------------
+    # Load the requested dataset (IMDB by default); the code below reads its
+    # "train" and "test" splits and their "text" column
+    imdb = load_dataset(dataset)
+    # Create smaller datasets: a shuffled subset of at most 3000 examples per
+    # split, capped at the split size so a smaller --dataset does not fail
+    subset_size = 3000
+    shuffled_train = imdb["train"].shuffle(seed=seed)
+    shuffled_test = imdb["test"].shuffle(seed=seed)
+    small_train_dataset = shuffled_train.select(range(min(subset_size, len(shuffled_train))))
+    small_test_dataset = shuffled_test.select(range(min(subset_size, len(shuffled_test))))
+    # Use the tokenizer that matches the chosen checkpoint
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)
+    # Prepare the text inputs by mapping
+    def preprocess_function(sample):
+        """Tokenize the "text" field of a batch, truncating over-long inputs."""
+        return tokenizer(sample["text"], truncation=True)
+    tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
+    tokenized_test = small_test_dataset.map(preprocess_function, batched=True)
+    # Pad each batch dynamically to its longest sample instead of a global maximum
+    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+    ### Train the model ---------------------------------
+    # Define the base model (binary classification head)
+    model = AutoModelForSequenceClassification.from_pretrained(tokenizer_model, num_labels=2)
+    # Load the evaluation metrics once, not on every evaluation call
+    accuracy_metric = evaluate.load("accuracy")
+    f1_metric = evaluate.load("f1")
+    def compute_metrics(eval_pred):
+        """Return accuracy and F1 computed from a (logits, labels) pair."""
+        logits, labels = eval_pred
+        predictions = np.argmax(logits, axis=-1)
+        accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
+        f1 = f1_metric.compute(predictions=predictions, references=labels)["f1"]
+        return {"accuracy": accuracy, "f1": f1}
+    # Connect to Hugging Face Hub. login() selects the notebook widget or the
+    # terminal prompt as appropriate; notebook_login() only works inside a
+    # notebook, and this file runs as a script.
+    from huggingface_hub import login
+    login()
+    # Define a trainer
+    repo_name = "FC_finetuning-sentiment-model-3000-samples"
+    training_args = TrainingArguments(
+        output_dir=repo_name,
+        learning_rate=2e-5,
+        per_device_train_batch_size=16,
+        per_device_eval_batch_size=16,
+        num_train_epochs=2,
+        weight_decay=0.01,
+        save_strategy="epoch",
+        push_to_hub=True,
+    )
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=tokenized_train,
+        eval_dataset=tokenized_test,
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+        compute_metrics=compute_metrics,
+    )
+    trainer.train()
+    trainer.push_to_hub(commit_message="Training complete")
+    ### Evaluate the model -----------------------------
+    # Report the metrics instead of silently discarding them
+    eval_metrics = trainer.evaluate()
+    print(f"Evaluation metrics: {eval_metrics}")
+    model_name = "FrancescoConte/" + repo_name
+else:
+    ### Load the model -------------------------------
+    print(f"Using {tokenizer_model} model previously trained on dataset {dataset}")
+    model_name = "FrancescoConte/FC_finetuning-sentiment-model-3000-samples"
+    # Load tokenizer and model directly from the Hub
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForSequenceClassification.from_pretrained(model_name)
+### Analyze data with the model -------------------
+@st.cache_resource
+def load_sentiment_pipeline(name, device_index):
+    """Build the sentiment-analysis pipeline for `name` on `device_index`.
+
+    Streamlit re-executes the script on every interaction; caching the
+    pipeline as a resource avoids reloading the model each time.
+    """
+    return pipeline(task="sentiment-analysis", model=name, device=device_index)
+# Define the final sentiment analysis model, on the GPU when one was detected
+sentiment_model = load_sentiment_pipeline(model_name, device)
+### Use the model --------------------
+#try_text=["I love this move", "This movie sucks!"]
+#print(sentiment_model(try_text))
+### Put it on streamlit -------------------
+# streamlit app
+st.title("Sentiment Analysis App")
+text = st.text_area("Enter text for sentiment analysis:")
+if st.button("Analyze"):
+    if not text.strip():
+        # Nothing meaningful to classify; prompt the user instead of scoring an empty string
+        st.warning("Please enter some text to analyze.")
+    else:
+        with st.spinner("Running inference..."):
+            # truncation=True keeps long inputs within the model's maximum sequence length
+            result = sentiment_model(text, truncation=True)[0]
+        st.success(f"**Label:** {result['label']}\n\n**Confidence:** {result['score']:.5f}")