import argparse

import numpy as np
import streamlit as st
import torch

import evaluate
from datasets import load_dataset
from huggingface_hub import login
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    pipeline,
)
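
# Command-line options. When the app is launched through Streamlit, script
# arguments go after a "--" separator, e.g.: streamlit run app.py -- --train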
parser = argparse.ArgumentParser(description="Sentiment analysis with Hugging Face")
parser.add_argument(
    "--model",
    type=str,
    default="distilbert-base-uncased-finetuned-sst-2-english",
    help="Pre-trained model name or path",
)
parser.add_argument(
    "--dataset",
    type=str,
    default="imdb",
    help="Dataset name or path",
)
parser.add_argument(
    "--train",
    action="store_true",
    help="Train the model",
)
parser.add_argument(
    "--seed",
    type=int,
    default=42,
    help="Random seed",
)
args = parser.parse_args()

train = args.train
tokenizer_model = args.model
dataset = args.dataset
seed = args.seed
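
# Device index for the transformers pipeline: first GPU if available, else CPU.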
device = 0 if torch.cuda.is_available() else -1

if train:
    print(f"Training model {tokenizer_model} on dataset {dataset}")

    imdb = load_dataset(dataset)
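
    # Subsample 3,000 examples per split to keep fine-tuning fast.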
    small_train_dataset = imdb["train"].shuffle(seed=seed).select(range(3000))
    small_test_dataset = imdb["test"].shuffle(seed=seed).select(range(3000))

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)
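
    # Tokenize with truncation; padding is deferred to the data collator below.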
    def preprocess_function(sample):
        return tokenizer(sample["text"], truncation=True)

    tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
    tokenized_test = small_test_dataset.map(preprocess_function, batched=True)

    # Pad each batch dynamically to its longest sequence rather than a global max.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(tokenizer_model, num_labels=2)

    # Load the metrics once instead of on every evaluation call.
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
        f1 = f1_metric.compute(predictions=predictions, references=labels)["f1"]
        return {"accuracy": accuracy, "f1": f1}

    # Authenticate with the Hugging Face Hub so checkpoints can be pushed.
    # login() works in plain scripts; notebook_login() only works in notebooks.
    login()

    repo_name = "FC_finetuning-sentiment-model-3000-samples"
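
    # Hyperparameters for a short fine-tuning run; a checkpoint is saved and
    # pushed to the Hub at the end of each epoch.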
    training_args = TrainingArguments(
        output_dir=repo_name,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        save_strategy="epoch",
        push_to_hub=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    trainer.push_to_hub(commit_message="Training complete")

    # Report accuracy and F1 on the held-out split.
    metrics = trainer.evaluate()
    print(metrics)

    model_name = "FrancescoConte/" + repo_name
else:
    print(f"Using {tokenizer_model} model previously trained on dataset {dataset}")
    model_name = "FrancescoConte/FC_finetuning-sentiment-model-3000-samples"

    # Load the fine-tuned checkpoint from the Hub for local inference.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Build the pipeline from the in-memory model and tokenizer (avoids re-downloading
# from the Hub) and place it on the GPU when one is available.
sentiment_model = pipeline(
    task="sentiment-analysis", model=model, tokenizer=tokenizer, device=device
)
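
# Streamlit front end: a text box plus a button that runs the pipeline.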
st.title("Sentiment Analysis App")

text = st.text_area("Enter text for sentiment analysis:")

if st.button("Analyze"):
    if not text.strip():
        st.warning("Please enter some text to analyze.")
    else:
        with st.spinner("Running inference..."):
            result = sentiment_model(text)[0]
        st.success(f"**Label:** {result['label']}\n\n**Confidence:** {result['score']:.5f}")