# Imports
import numpy as np
from datasets import load_dataset
import evaluate
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import pipeline
from huggingface_hub import login
import streamlit as st
import torch
import argparse
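### Usage (script name assumed, e.g. app.py) --------
#   streamlit run app.py -- --train   # fine-tune, push to the Hub, then serve the app
#   streamlit run app.py              # serve the previously pushed model
# Streamlit forwards everything after "--" to the script's own argparse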
### Parse arguments --------------------------
parser = argparse.ArgumentParser(description="Sentiment analysis with Hugging Face")
parser.add_argument(
    "--model",
    type=str,
    default="distilbert-base-uncased-finetuned-sst-2-english",
    help="Pre-trained model name or path",
)
parser.add_argument(
    "--dataset",
    type=str,
    default="imdb",
    help="Dataset name or path",
)
parser.add_argument(
    "--train",
    action="store_true",
    help="Train the model",
)
parser.add_argument(
    "--seed",
    type=int,
    default=42,
    help="Random seed",
)
args = parser.parse_args()
train = args.train
tokenizer_model = args.model
dataset = args.dataset
seed = args.seed
# Detect device automatically (transformers pipeline convention: 0 = first CUDA GPU, -1 = CPU)
device = 0 if torch.cuda.is_available() else -1
if train:
print(f"Training model {tokenizer_model} on dataset {dataset}")
### Pre-process data ---------------------------------
# Load the IMDB dataset
imdb = load_dataset(dataset)
# Create smaller datasets
small_train_dataset = imdb["train"].shuffle(seed=seed).select([i for i in list(range(3000))])
small_test_dataset = imdb["test"].shuffle(seed=seed).select([i for i in list(range(3000))])
    # Load the tokenizer that matches the base model
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)
    # Prepare the text inputs by mapping the tokenizer over both subsets
    def preprocess_function(sample):
        # Truncate to the model's max sequence length; padding is deferred to the data collator
        return tokenizer(sample["text"], truncation=True)
    tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
    tokenized_test = small_test_dataset.map(preprocess_function, batched=True)
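    # map() stores the tokenizer outputs (input_ids, attention_mask) as new columns next to text/label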
    # Convert samples to PyTorch tensors and pad dynamically per batch -- faster than padding everything to the global max length
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    ### Train the model ---------------------------------
    # Define the base model with a 2-label classification head
    model = AutoModelForSequenceClassification.from_pretrained(tokenizer_model, num_labels=2)
    # Define the evaluation metrics
    def compute_metrics(eval_pred):
        load_accuracy = evaluate.load("accuracy")
        load_f1 = evaluate.load("f1")
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
        f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
        return {"accuracy": accuracy, "f1": f1}
    # Authenticate with the Hugging Face Hub (required for push_to_hub); prompts for a token if none is cached
    login()
    # Define a trainer
    repo_name = "FC_finetuning-sentiment-model-3000-samples"
    training_args = TrainingArguments(
        output_dir=repo_name,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        save_strategy="epoch",
        push_to_hub=True,
    )
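    # With 3000 samples and per-device batch size 16, one epoch is ceil(3000/16) = 188 optimizer steps, so 2 epochs is about 376 steps on a single device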
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    trainer.push_to_hub(commit_message="Training complete")
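    # push_to_hub uploads the fine-tuned weights and tokenizer to <username>/<repo_name> on the Hub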
    ### Evaluate the model -----------------------------
    metrics = trainer.evaluate()
    print(metrics)
    model_name = "FrancescoConte/" + repo_name
else:
    ### Load the model -------------------------------
    model_name = "FrancescoConte/FC_finetuning-sentiment-model-3000-samples"
    print(f"Using model {model_name}, previously fine-tuned on dataset {dataset}")
    # Load tokenizer and model directly from the Hub
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
### Analyze data with the model -------------------
# Build the final sentiment-analysis pipeline on the detected device
sentiment_model = pipeline(model=model_name, task="sentiment-analysis", device=device)
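# Note: Streamlit re-runs the whole script on every interaction; a production app might
# wrap the pipeline construction in a function decorated with @st.cache_resource so the
# model loads only once per session.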
### Use the model --------------------
# Quick smoke test (uncomment to try):
# try_text = ["I love this movie", "This movie sucks!"]
# print(sentiment_model(try_text))
### Put it on Streamlit -------------------
# Streamlit app
st.title("Sentiment Analysis App")
text = st.text_area("Enter text for sentiment analysis:")
if st.button("Analyze"):
    if not text.strip():
        st.warning("Please enter some text first.")
    else:
        with st.spinner("Running inference..."):
            result = sentiment_model(text)[0]
            st.success(f"**Label:** {result['label']}\n\n**Confidence:** {result['score']:.5f}")