naa142 committed
Commit 5ae0fb5 · verified · 1 Parent(s): f250f13

Update app.py

Files changed (1):
  1. app.py +36 -57
app.py CHANGED
@@ -1,39 +1,24 @@
-# app.py
 import streamlit as st
 import torch
 import torch.nn as nn
-from transformers import AutoTokenizer, DebertaV3Model
+from transformers import AutoTokenizer, AutoModel
 
-# ✅ Set device - force CPU to avoid meta tensor issues
-device = torch.device("cpu")
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-# ✅ Load tokenizer directly
-try:
-    tokenizer = AutoTokenizer.from_pretrained(".", local_files_only=True)
-    st.sidebar.success("✅ Tokenizer loaded successfully")
-except Exception as e:
-    st.error(f"Failed to load tokenizer: {e}")
-    st.stop()
+# ✅ 1. Load tokenizer from current directory
+tokenizer = AutoTokenizer.from_pretrained(".", local_files_only=True)
 
-# ✅ Define model with explicit DebertaV3 base
+# ✅ 2. Define the model
 class ScoringModel(nn.Module):
-    def __init__(self, dropout_rate=0.242):
+    def __init__(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
         super().__init__()
-
-        # Use the specific model class instead of AutoModel
-        try:
-            self.base = DebertaV3Model.from_pretrained("microsoft/deberta-v3-small")
-            st.sidebar.success("✅ Base model loaded successfully")
-        except Exception as e:
-            st.error(f"Failed to load base model: {e}")
-            st.stop()
-
+        self.base = AutoModel.from_pretrained(base_model_name)
         self.base.gradient_checkpointing_enable()
         self.dropout1 = nn.Dropout(dropout_rate)
         self.dropout2 = nn.Dropout(dropout_rate)
         self.dropout3 = nn.Dropout(dropout_rate)
         self.classifier = nn.Linear(self.base.config.hidden_size, 1)
-
+
     def forward(self, input_ids, attention_mask):
         hidden = self.base(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0]
         logits = (self.classifier(self.dropout1(hidden)) +
@@ -41,20 +26,15 @@ class ScoringModel(nn.Module):
                   self.classifier(self.dropout3(hidden))) / 3
         return logits
 
-# ✅ Initialize and load weights
-try:
-    model = ScoringModel()
-    model = model.to(device)
-    model.load_state_dict(torch.load("scoring_model.pt", map_location=device))
-    model.eval()
-    st.sidebar.success("✅ Model weights loaded successfully")
-except Exception as e:
-    st.error(f"Failed to load model weights: {e}")
-    st.write("Check if your 'scoring_model.pt' file is properly uploaded.")
-    st.stop()
-
-# ✅ Setup Streamlit page
+# ✅ 3. Initialize and load weights
+model = ScoringModel()
+model.load_state_dict(torch.load("scoring_model.pt", map_location=device))
+model = model.to(device)
+model.eval()
+
+# ✅ 4. Setup Streamlit page
 st.set_page_config(page_title="🧠 LLM Response Evaluator", page_icon="📝", layout="wide")
+
 st.markdown("<h1 style='text-align: center;'>🧠 LLM Response Evaluator</h1>", unsafe_allow_html=True)
 st.markdown("---")
 
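The switch from `DebertaV3Model` to `AutoModel` also removes an import error: transformers implements DeBERTa-v3 checkpoints with its DeBERTa-v2 model classes and exposes no `DebertaV3Model`, so letting `AutoModel.from_pretrained("microsoft/deberta-v3-small")` resolve the class is the supported route. The three parallel dropout branches are multi-sample dropout: one shared `classifier` is applied under three independent dropout masks and the logits averaged, which regularizes the head during training; in `eval()` mode dropout is the identity, so all three terms coincide. A minimal standalone sketch of the pattern (the class and argument names below are illustrative, not part of this commit):

```python
import torch
import torch.nn as nn

class MultiSampleDropoutHead(nn.Module):
    """One shared linear head averaged over several dropout masks."""
    def __init__(self, hidden_size: int, num_samples: int = 3, p: float = 0.242):
        super().__init__()
        self.dropouts = nn.ModuleList([nn.Dropout(p) for _ in range(num_samples)])
        self.classifier = nn.Linear(hidden_size, 1)

    def forward(self, hidden: torch.Tensor) -> torch.Tensor:
        # In eval mode every nn.Dropout is a no-op, so this reduces to one plain pass.
        return torch.stack([self.classifier(d(hidden)) for d in self.dropouts]).mean(dim=0)
```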
@@ -62,22 +42,25 @@ st.markdown("---")
 with st.sidebar:
     st.header("ℹ️ About")
     st.markdown("""
-    This app evaluates **which AI response is better** given a prompt.
+    This app evaluates *which AI response is better* given a prompt.
 
-    **How it works:**
-    - You enter a **prompt** and two **responses**.
-    - The model predicts **which response** is of **higher quality**.
+    *How it works:*
+    - You enter a *prompt* and two *responses*.
+    - The model predicts *which response* is of *higher quality*.
 
-    Powered by a **fine-tuned DeBERTa-v3-small** model 🚀
+    Powered by a *fine-tuned DeBERTa-v3-small* model 🚀
     """)
 
 # ✅ Main input section
 col1, col2 = st.columns(2)
+
 with col1:
     prompt = st.text_area("📝 Enter the Prompt", height=150)
+
 with col2:
     st.markdown("<br>", unsafe_allow_html=True)
     st.markdown("👉 Provide two possible responses below:")
+
     response_a = st.text_area("✍️ Response A", height=100)
     response_b = st.text_area("✍️ Response B", height=100)
 
@@ -86,41 +69,37 @@ if st.button("🔍 Evaluate Responses"):
     if prompt and response_a and response_b:
         text_a = f"Prompt: {prompt} [SEP] {response_a}"
         text_b = f"Prompt: {prompt} [SEP] {response_b}"
-
+
         encoded_a = tokenizer(text_a, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
         encoded_b = tokenizer(text_b, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
-
+
         encoded_a = {k: v.to(device) for k, v in encoded_a.items() if k in ["input_ids", "attention_mask"]}
         encoded_b = {k: v.to(device) for k, v in encoded_b.items() if k in ["input_ids", "attention_mask"]}
-
+
         with torch.no_grad():
             score_a = model(**encoded_a).squeeze()
             score_b = model(**encoded_b).squeeze()
-
+
         prob_a = torch.sigmoid(score_a).item()
         prob_b = torch.sigmoid(score_b).item()
-
+
         # ✅ Nice result display
         st.subheader("🔮 Prediction Result")
+
         if prob_b > prob_a:
-            st.success(f"✅ **Response B is better!** (Confidence: {prob_b:.4f})")
+            st.success(f"✅ *Response B is better!* (Confidence: {prob_b:.4f})")
         else:
-            st.success(f"✅ **Response A is better!** (Confidence: {prob_a:.4f})")
-
+            st.success(f"✅ *Response A is better!* (Confidence: {prob_a:.4f})")
+
         # ✅ Probability metrics in 2 columns
         mcol1, mcol2 = st.columns(2)
         mcol1.metric(label="Confidence A", value=f"{prob_a:.4f}")
         mcol2.metric(label="Confidence B", value=f"{prob_b:.4f}")
-
+
         # ✅ Bar chart comparison
         st.markdown("---")
         st.subheader("📊 Confidence Comparison")
+
         st.bar_chart({"Confidence": [prob_a, prob_b]})
     else:
-        st.warning("⚠️ Please fill in **all fields** before evaluating!")
-
-# Make app accessible externally when deployed on Hugging Face Spaces
-if __name__ == "__main__":
-    import os
-    # Get port from environment variable or use default
-    port = int(os.environ.get("PORT", 8501))
+        st.warning("⚠️ Please fill in *all fields* before evaluating!")
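For reference, the evaluation path after this commit reduces to scoring each (prompt, response) pair independently and comparing the two sigmoid outputs. A condensed sketch of that flow, reusing the app's `tokenizer`, `model`, and `device` (the helper name is illustrative):

```python
def response_confidence(prompt: str, response: str) -> float:
    """Sigmoid confidence of the fine-tuned scorer for one (prompt, response) pair."""
    text = f"Prompt: {prompt} [SEP] {response}"
    encoded = tokenizer(text, return_tensors="pt", padding="max_length",
                        truncation=True, max_length=186)
    encoded = {k: v.to(device) for k, v in encoded.items()
               if k in ("input_ids", "attention_mask")}
    with torch.no_grad():
        score = model(**encoded).squeeze()
    return torch.sigmoid(score).item()
```

Because the two responses are scored independently, the reported confidences need not sum to 1; the app simply declares whichever response scores higher the winner.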
 
 
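Two closing notes on the new version. The deleted `__main__` block was dead code: the computed `port` was never used, and a Streamlit script does not bind its own port; on Hugging Face Spaces the Streamlit SDK launches the app and handles the port itself. Separately, since `scoring_model.pt` holds a plain state dict, the load could opt into PyTorch's weights-only unpickler as a hardening step, a sketch assuming PyTorch 1.13+ (this became the default in 2.6):

```python
# Safer load for a plain state dict: refuses to unpickle arbitrary Python objects.
state_dict = torch.load("scoring_model.pt", map_location=device, weights_only=True)
model.load_state_dict(state_dict)
```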