"""Streamlit sentiment-analysis app with optional fine-tuning.

Run with ``streamlit run <this file> -- [--train] [--model ...] [--dataset ...]
[--seed ...]``.  With ``--train`` the base model is fine-tuned on a small
subset of the dataset and pushed to the Hugging Face Hub; otherwise the
previously fine-tuned model is downloaded from the Hub.  In both cases the
resulting model is served through a small Streamlit text box.
"""

# Imports
import argparse

import evaluate
import numpy as np
import streamlit as st
import torch
from datasets import load_dataset
from huggingface_hub import notebook_login
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from transformers import pipeline

### Parse arguments --------------------------
parser = argparse.ArgumentParser(description="Sentiment analysis with Hugging Face")
parser.add_argument(
    "--model",
    type=str,
    default="distilbert-base-uncased-finetuned-sst-2-english",
    help="Pre-trained model name or path",
)
parser.add_argument(
    "--dataset",
    type=str,
    default="imdb",
    help="Dataset name or path",
)
parser.add_argument(
    "--train",
    action="store_true",
    help="Train the model",
)
parser.add_argument(
    "--seed",
    type=int,
    default=42,
    help="Random seed",
)
args = parser.parse_args()
train = args.train
tokenizer_model = args.model
dataset = args.dataset
seed = args.seed

# Detect device automatically: GPU index 0 when CUDA is available, else -1
# (the value the transformers pipeline interprets as "CPU").
device = 0 if torch.cuda.is_available() else -1

# Hub repository the fine-tuned model is pushed to / loaded from.
repo_name = "FC_finetuning-sentiment-model-3000-samples"
model_name = "FrancescoConte/" + repo_name


def _fine_tune(base_model, dataset_name, shuffle_seed, n_samples=3000):
    """Fine-tune ``base_model`` on a subset of ``dataset_name`` and push it to the Hub.

    Args:
        base_model: Name or path of the pre-trained checkpoint (also used
            for the tokenizer).
        dataset_name: Dataset name or path passed to ``load_dataset``; it
            must provide ``"train"`` and ``"test"`` splits with a ``"text"``
            column.
        shuffle_seed: Seed used when shuffling both splits.
        n_samples: Maximum number of rows taken from each split.

    Returns:
        A ``(model, tokenizer)`` tuple holding the fine-tuned model.
    """
    print(f"Training model {base_model} on dataset {dataset_name}")

    ### Pre-process data ---------------------------------
    # Load the dataset
    splits = load_dataset(dataset_name)

    # Create smaller datasets; cap at the split size so that datasets with
    # fewer than ``n_samples`` rows do not raise on out-of-range indices.
    def _subset(split):
        shuffled = splits[split].shuffle(seed=shuffle_seed)
        return shuffled.select(range(min(n_samples, len(shuffled))))

    small_train_dataset = _subset("train")
    small_test_dataset = _subset("test")

    # Use a tokenizer
    tokenizer = AutoTokenizer.from_pretrained(base_model)

    # Prepare the text inputs by mapping
    def preprocess_function(sample):
        return tokenizer(sample["text"], truncation=True)

    tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
    tokenized_test = small_test_dataset.map(preprocess_function, batched=True)

    # Collate samples into padded batches at batch-creation time rather
    # than padding the whole dataset up front -- faster!
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    ### Train the model ---------------------------------
    # Define the base model
    model = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=2)

    # Define the evaluation metrics. The metric objects are loaded once
    # here instead of on every call to compute_metrics.
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
        f1 = f1_metric.compute(predictions=predictions, references=labels)["f1"]
        return {"accuracy": accuracy, "f1": f1}

    # Connect to Hugging Face Hub
    # NOTE(review): notebook_login() is intended for notebook environments;
    # when running as a script, confirm that authentication works here (or
    # log in beforehand with the Hugging Face CLI).
    notebook_login()

    # Define a trainer
    training_args = TrainingArguments(
        output_dir=repo_name,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        save_strategy="epoch",
        push_to_hub=True,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    trainer.push_to_hub(commit_message="Training complete")

    ### Evaluate the model -----------------------------
    # Report the metrics instead of silently discarding them.
    print(trainer.evaluate())

    return model, tokenizer


@st.cache_resource
def _load_sentiment_model(do_train, base_model, dataset_name, shuffle_seed, device_index):
    """Build the sentiment-analysis pipeline, training first when requested.

    Streamlit re-executes the script on every widget interaction; caching
    this function keeps the (expensive) training / model download from being
    repeated on each click.

    Args:
        do_train: When true, fine-tune ``base_model`` before serving it.
        base_model: Pre-trained model name or path given on the command line.
        dataset_name: Dataset name or path given on the command line.
        shuffle_seed: Random seed used for dataset shuffling.
        device_index: Device passed to ``pipeline`` (``-1`` for CPU).

    Returns:
        A ``sentiment-analysis`` pipeline wrapping the model and tokenizer.
    """
    if do_train:
        model, tokenizer = _fine_tune(base_model, dataset_name, shuffle_seed)
    else:
        ### Load the model -------------------------------
        print(f"Using {base_model} model previously trained on dataset {dataset_name}")
        # Load tokenizer and model directly from the Hub
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # Make sure dropout etc. are disabled for inference.
    model.eval()

    ### Analyze data with the model -------------------
    # Define the final sentiment analysis model. Reuse the objects already in
    # memory rather than fetching the checkpoint from the Hub a second time,
    # and place the model on the detected device.
    return pipeline(
        task="sentiment-analysis",
        model=model,
        tokenizer=tokenizer,
        device=device_index,
    )


sentiment_model = _load_sentiment_model(train, tokenizer_model, dataset, seed, device)

### Use the model --------------------
# Example:
#   sentiment_model(["I love this movie", "This movie sucks!"])

### Put it on streamlit -------------------
# streamlit app
st.title("Sentiment Analysis App")
text = st.text_area("Enter text for sentiment analysis:")
if st.button("Analyze"):
    if not text.strip():
        # Nothing to classify; avoid sending an empty string to the model.
        st.warning("Please enter some text to analyze.")
    else:
        with st.spinner("Running inference..."):
            # Truncate so inputs longer than the model's maximum sequence
            # length do not make the pipeline fail.
            result = sentiment_model(text, truncation=True)[0]
        st.success(f"**Label:** {result['label']}\n\n**Confidence:** {result['score']:.5f}")