Spaces:

FrancescoConte
/

Sentiment_analysis_app

Sleeping

App Files Files Community

FrancescoConte committed on May 18

Commit

ddc2737

1 Parent(s): 3cfefb3

removed the training from the web app

Browse files

Files changed (2) hide show

requirements.txt +0 -3
sentiment_huggingface.py +6 -122

requirements.txt CHANGED Viewed

@@ -1,8 +1,5 @@
 streamlit
 transformers
 torch
-huggingface_hub
-datasets
 evaluate
 numpy
-argparse

 streamlit
 transformers
 torch
 evaluate
 numpy

sentiment_huggingface.py CHANGED Viewed

@@ -1,132 +1,19 @@
 # Imports
 import numpy as np
-from datasets import load_dataset
-import evaluate
-from transformers import AutoTokenizer
-from transformers import DataCollatorWithPadding
-from transformers import AutoModelForSequenceClassification
-from transformers import TrainingArguments, Trainer
 from transformers import pipeline
-from huggingface_hub import notebook_login
 import streamlit as st
 import torch
-import argparse
-### Parse arguments --------------------------
-parser = argparse.ArgumentParser(description="Sentiment analysis with Hugging Face")
-parser.add_argument(
-    "--model",
-    type=str,
-    default="distilbert-base-uncased-finetuned-sst-2-english",
-    help="Pre-trained model name or path",
-)
-parser.add_argument(
-    "--dataset",
-    type=str,
-    default="imdb",
-    help="Dataset name or path",
-)
-parser.add_argument(
-    "--train",
-    action="store_true",
-    help="Train the model",
-)
-parser.add_argument(
-    "--seed",
-    type=int,
-    default=42,
-    help="Random seed",
-)
-args = parser.parse_args()
-train=args.train
-tokenizer_model=args.model
-dataset=args.dataset
-seed=args.seed
 # Detect device automatically
 device = 0 if torch.cuda.is_available() else -1
-if train:
-    print(f"Training model {tokenizer_model} on dataset {dataset}")
-    ### Pre-process data ---------------------------------
-    # Load the IMDB dataset
-    imdb = load_dataset(dataset)
-    # Create smaller datasets
-    small_train_dataset = imdb["train"].shuffle(seed=seed).select([i for i in list(range(3000))])
-    small_test_dataset = imdb["test"].shuffle(seed=seed).select([i for i in list(range(3000))])
-    # Use a tokenizer
-    #tokenizer_model="distilbert-base-uncased-finetuned-sst-2-english"
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)
-    # Prepare the text inputs by mapping
-    def preprocess_function(sample):
-        return tokenizer(sample["text"], truncation=True)
-    tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
-    tokenized_test = small_test_dataset.map(preprocess_function, batched=True)
-    # Convert training samples to PyTorch tensors, concatenate them with padding -- faster!
-    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
-    ### Train the model ---------------------------------
-    # Define the base model
-    model = AutoModelForSequenceClassification.from_pretrained(tokenizer_model, num_labels=2)
-    # Define the evaluation metrics
-    def compute_metrics(eval_pred):
-        load_accuracy = evaluate.load("accuracy")
-        load_f1 = evaluate.load("f1")
-        logits, labels = eval_pred
-        predictions = np.argmax(logits, axis=-1)
-        accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
-        f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
-        return {"accuracy": accuracy, "f1": f1}
-    # Connect to Hugging Face Hub
-    notebook_login()
-    # Define a trainer
-    repo_name = "FC_finetuning-sentiment-model-3000-samples"
-    training_args = TrainingArguments(
-    output_dir=repo_name,
-    learning_rate=2e-5,
-    per_device_train_batch_size=16,
-    per_device_eval_batch_size=16,
-    num_train_epochs=2,
-    weight_decay=0.01,
-    save_strategy="epoch",
-    push_to_hub=True,
-    )
-    trainer = Trainer(
-    model=model,
-    args=training_args,
-    train_dataset=tokenized_train,
-    eval_dataset=tokenized_test,
-    tokenizer=tokenizer,
-    data_collator=data_collator,
-    compute_metrics=compute_metrics,
-    )
-    trainer.train()
-    trainer.push_to_hub(commit_message="Training complete")
-    ### Evaluate the model -----------------------------
-    trainer.evaluate()
-    model_name="FrancescoConte/"+repo_name
-else:
-    ### Load the model -------------------------------
-    print(f"Using {tokenizer_model} model previously trained on dataset {dataset}")
-    model_name = "FrancescoConte/FC_finetuning-sentiment-model-3000-samples"
-    # Load tokenizer and model directly from the Hub
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForSequenceClassification.from_pretrained(model_name)
-### Analyze data with the model -------------------
 # Define the final sentiment analysis model
 sentiment_model = pipeline(model=model_name, task="sentiment-analysis")
@@ -135,10 +22,7 @@ sentiment_model = pipeline(model=model_name, task="sentiment-analysis")
 #print(sentiment_model(try_text))
 ### Put it on streamlit -------------------
-# streamlit app
 st.title("Sentiment Analysis App")
 text = st.text_area("Enter text for sentiment analysis:")
 if st.button("Analyze"):

 # Imports
 import numpy as np
 from transformers import pipeline
 import streamlit as st
 import torch
 # Detect device automatically
 device = 0 if torch.cuda.is_available() else -1
+tokenizer_model="distilbert-base-uncased-finetuned-sst-2-english"
+dataset="imdb"
+seed = 42
+### Load the model -------------------------------
+print(f"Using {tokenizer_model} model previously trained on dataset {dataset}")
+model_name = "FrancescoConte/FC_finetuning-sentiment-model-3000-samples"
 # Define the final sentiment analysis model
 sentiment_model = pipeline(model=model_name, task="sentiment-analysis")
 #print(sentiment_model(try_text))
 ### Put it on streamlit -------------------
 st.title("Sentiment Analysis App")
 text = st.text_area("Enter text for sentiment analysis:")
 if st.button("Analyze"):