# Imports
import numpy as np
from datasets import load_dataset
import evaluate
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import pipeline
from huggingface_hub import login
import streamlit as st
import torch
import argparse

### Parse arguments --------------------------
parser = argparse.ArgumentParser(description="Sentiment analysis with Hugging Face")
parser.add_argument(
    "--model",
    type=str,
    default="distilbert-base-uncased-finetuned-sst-2-english",
    help="Pre-trained model name or path",
)
parser.add_argument(
    "--dataset",
    type=str,
    default="imdb",
    help="Dataset name or path",
)
parser.add_argument(
    "--train",
    action="store_true",
    help="Train the model",
)
parser.add_argument(
    "--seed",
    type=int,
    default=42,
    help="Random seed",
)
args = parser.parse_args()
train = args.train
tokenizer_model = args.model
dataset = args.dataset
seed = args.seed
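# Example invocations (script name assumed; with "streamlit run", script flags go after "--"):
#   streamlit run sentiment_app.py                  # inference with the previously pushed model
#   streamlit run sentiment_app.py -- --train       # fine-tune first, then launch the app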

# Detect device automatically: 0 = first GPU, -1 = CPU (the integer convention used by pipeline())
device = 0 if torch.cuda.is_available() else -1

if train:
    print(f"Training model {tokenizer_model} on dataset {dataset}")
    ### Pre-process data ---------------------------------
    # Load the dataset (IMDB by default)
    raw_datasets = load_dataset(dataset)

    # Work with a 3,000-example subset of each split to keep fine-tuning fast
    small_train_dataset = raw_datasets["train"].shuffle(seed=seed).select(range(3000))
    small_test_dataset = raw_datasets["test"].shuffle(seed=seed).select(range(3000))

    # Load the tokenizer that matches the chosen model
    #tokenizer_model="distilbert-base-uncased-finetuned-sst-2-english"
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)

    # Prepare the text inputs by mapping
    def preprocess_function(sample):
        return tokenizer(sample["text"], truncation=True)
    tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
    tokenized_test = small_test_dataset.map(preprocess_function, batched=True)
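    # After mapping, each example carries "input_ids" and "attention_mask"
    # in addition to the original "text" and "label" columns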

    # Dynamically pad each batch to its longest sequence (faster than padding everything to the model max length)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    ### Train the model ---------------------------------
    # Define the base model
    model = AutoModelForSequenceClassification.from_pretrained(tokenizer_model, num_labels=2)
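    # num_labels=2 configures a two-class (negative/positive) classification head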

    # Define the evaluation metrics
    def compute_metrics(eval_pred):
        load_accuracy = evaluate.load("accuracy")
        load_f1 = evaluate.load("f1")
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
        f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
        return {"accuracy": accuracy, "f1": f1}
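    # compute_metrics returns a dict like {"accuracy": ..., "f1": ...},
    # which the Trainer reports whenever evaluate() is run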

    # Authenticate with the Hugging Face Hub (required because push_to_hub=True below)
    login()

    # Define a trainer
    repo_name = "FC_finetuning-sentiment-model-3000-samples"
    
    training_args = TrainingArguments(
        output_dir=repo_name,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        save_strategy="epoch",
        push_to_hub=True,
    )
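    # push_to_hub=True lets the Trainer upload the fine-tuned model to the Hub
    # under the account authenticated with login() above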

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    trainer.push_to_hub(commit_message="Training complete")

    ### Evaluate the model -----------------------------
    eval_metrics = trainer.evaluate()
    print(eval_metrics)
    model_name = "FrancescoConte/" + repo_name  # Hub repo id of the pushed model
else:
    ### Load the model -------------------------------
    model_name = "FrancescoConte/FC_finetuning-sentiment-model-3000-samples"
    print(f"Using the previously fine-tuned model {model_name} (trained on {dataset})")

    # Load the tokenizer and model directly from the Hub; they are passed to the pipeline below
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)


### Analyze data with the model -------------------
# Build the sentiment-analysis pipeline from the model and tokenizer already in memory,
# placing it on GPU when one is available
sentiment_model = pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer, device=device)
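# The pipeline returns a list of dicts, e.g. [{"label": ..., "score": ...}] for each input text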

### Use the model --------------------
#try_text = ["I love this movie", "This movie sucks!"]
#print(sentiment_model(try_text))

### Put it on streamlit -------------------

# streamlit app
st.title("Sentiment Analysis App")

text = st.text_area("Enter text for sentiment analysis:")

if st.button("Analyze"):
    if not text.strip():
        st.warning("Please enter some text to analyze.")
    else:
        with st.spinner("Running inference..."):
            result = sentiment_model(text)[0]
            st.success(f"**Label:** {result['label']}\n\n**Confidence:** {result['score']:.5f}")