teguhsuby committed
Commit 2fc050b · 1 Parent(s): eb90797
app.py ADDED
@@ -0,0 +1,41 @@
+ import gradio as gr
+ import torch
+ import joblib
+ from transformers import AutoTokenizer
+ from dinstilBert import MultiTaskBERT
+
+ model = MultiTaskBERT()
+ model.load_state_dict(torch.load("model.pt", map_location="cpu"))
+ model.eval()
+
+ tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
+ le = joblib.load("label_encoder.pkl")
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+
+ def predict(text):
+     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
+     with torch.no_grad():
+         sentiment_logits, lang_logits = model(inputs["input_ids"], inputs["attention_mask"])
+     pred_sentiment = sentiment_logits.argmax(dim=1).item()
+     pred_lang = lang_logits.argmax(dim=1).item()
+
+     sentiment_label = "positive" if pred_sentiment == 1 else "negative"
+     lang_label = le.inverse_transform([pred_lang])[0]
+
+     return sentiment_label, lang_label
+
+
+
+ interface = gr.Interface(
+     fn=predict,
+     inputs=gr.Textbox(label="Enter text in a supported language (English/Dutch/Spanish/French)"),
+     outputs=[
+         gr.Textbox(label="Predicted Sentiment"),
+         gr.Textbox(label="Predicted Language")
+     ],
+     title="Multitask DistilBERT: Sentiment + Language",
+     description="Predicts the sentiment and language of a text using a multitask DistilBERT model."
+ )
+
+ interface.launch()
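One detail worth noting: interface.launch() runs at module import time, so importing predict from app.py (for example, in a test) would also start the Gradio server. A common guard, suggested here as a sketch rather than part of the commit, is:

    if __name__ == "__main__":
        # Start the Gradio server only when app.py is run directly,
        # not when predict() is imported from another module.
        interface.launch()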
dinstilBert.py ADDED
@@ -0,0 +1,19 @@
+ from transformers import AutoModel, AutoTokenizer
+ import torch.nn as nn
+
+ class MultiTaskBERT(nn.Module):
+     def __init__(self, num_lang_classes=4, num_sentiment_classes=2):
+
+         super().__init__()
+         self.bert = AutoModel.from_pretrained("distilbert-base-multilingual-cased")
+         self.dropout = nn.Dropout(0.3)
+         self.sentiment_head = nn.Linear(768, num_sentiment_classes)
+         self.lang_head = nn.Linear(768, num_lang_classes)
+
+     def forward(self, input_ids, attention_mask):
+         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+         pooled_output = outputs.last_hidden_state[:, 0]
+         pooled_output = self.dropout(pooled_output)
+         sentiment_logits = self.sentiment_head(pooled_output)
+         lang_logits = self.lang_head(pooled_output)
+         return sentiment_logits, lang_logits
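For context, MultiTaskBERT hard-shares one DistilBERT encoder between two linear heads on the [CLS] position and returns a pair of logit tensors per forward pass. A minimal standalone usage sketch (the Spanish example sentence is illustrative only):

    import torch
    from transformers import AutoTokenizer
    from dinstilBert import MultiTaskBERT

    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
    model = MultiTaskBERT()  # heads are randomly initialized until model.pt is loaded
    model.eval()

    enc = tokenizer("Una película fantástica.", return_tensors="pt")
    with torch.no_grad():
        sentiment_logits, lang_logits = model(enc["input_ids"], enc["attention_mask"])
    print(sentiment_logits.shape, lang_logits.shape)  # torch.Size([1, 2]) torch.Size([1, 4])

Sharing the encoder keeps the cost of the second task to a single extra 768-wide linear layer, which is the usual motivation for this hard-parameter-sharing setup.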
label_encoder.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5d807aacf466425ba71f7fd36b79cca5de98feef504ff31f280a9a293c94ee71
+ size 493
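app.py loads this pickle with joblib and calls le.inverse_transform on the language class index, so it is presumably a scikit-learn LabelEncoder over four language labels. A sketch of how such an encoder is typically produced; the four label strings below are assumptions, not values read from the pickle:

    import joblib
    from sklearn.preprocessing import LabelEncoder

    # Hypothetical label set, chosen to match num_lang_classes=4.
    le = LabelEncoder().fit(["english", "dutch", "spanish", "french"])
    joblib.dump(le, "label_encoder.pkl")
    print(le.inverse_transform([0]))  # class index back to its label string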
model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:04990bd653e7fbefb47284f5cb46939dccc0922afef52d601ce879a202b8c745
+ size 538989962
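model.pt is a ~539 MB Git LFS object. Given the model.load_state_dict(torch.load(...)) call in app.py, it presumably holds only the state dict, saved along these lines (a sketch; the training code is not part of this commit):

    import torch
    from dinstilBert import MultiTaskBERT

    model = MultiTaskBERT()
    # ... fine-tune on the sentiment and language tasks ...
    torch.save(model.state_dict(), "model.pt")  # weights only, not the pickled module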
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ transformers
+ torch
+ gradio
+ scikit-learn
+ sentencepiece
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "token": "[REDACTED]",
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "DistilBertTokenizer",
+   "unk_token": "[UNK]"
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff
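Since the tokenizer files (tokenizer.json, tokenizer_config.json, special_tokens_map.json, vocab.txt) are committed alongside the app, the tokenizer could also be loaded from the repo directory instead of being re-downloaded from the Hub; a sketch (app.py itself pulls from the Hub):

    from transformers import AutoTokenizer

    # Load the committed tokenizer files from the current directory.
    tokenizer = AutoTokenizer.from_pretrained(".")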