naa142 committed on
Commit
9399894
·
verified ·
1 Parent(s): bb5fd62

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -66
app.py CHANGED
@@ -1,29 +1,16 @@
 
 
1
  import streamlit as st
2
  import torch
3
  import torch.nn as nn
4
  from transformers import AutoTokenizer, AutoModel
5
 
6
- # 📜 Setup page
7
- st.set_page_config(page_title="LLM Response Evaluator", page_icon="🤖", layout="centered")
8
-
9
- # 🚀 Sidebar with Instructions
10
- with st.sidebar:
11
- st.title("ℹ️ How to Use")
12
- st.markdown("""
13
- 1. Enter a **prompt** you want responses to.
14
- 2. Fill in **Response A** and **Response B**.
15
- 3. Click **Evaluate** to see which response is better!
16
- """)
17
- st.markdown("---")
18
- st.info("This app uses a fine-tuned DeBERTa model to judge the better response!")
19
-
20
- # 🔥 Device setup
21
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
22
 
23
- # ✅ Load tokenizer
24
- tokenizer = AutoTokenizer.from_pretrained("./")
25
 
26
- # ✅ Define ScoringModel class
27
  class ScoringModel(nn.Module):
28
  def __init__(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
29
  super().__init__()
@@ -41,61 +28,83 @@ class ScoringModel(nn.Module):
41
  self.classifier(self.dropout3(hidden))) / 3
42
  return logits
43
 
44
- # ✅ Load fine-tuned model
45
- model = ScoringModel().to(device)
46
  model.load_state_dict(torch.load("scoring_model.pt", map_location=device))
 
47
  model.eval()
48
 
49
- # 🎯 App main title
50
- st.title("🚀 LLM Fine-Tuned Response Evaluator")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
- # โœ๏ธ Input fields
53
- prompt = st.text_area("🧠 Enter the prompt", placeholder="e.g., What is a dataset?")
54
- response_a = st.text_area("🔵 Response A", placeholder="e.g., A dataset is a collection of data.")
55
- response_b = st.text_area("🟠 Response B", placeholder="e.g., It is nothing important.")
56
 
57
- # 🚀 Centered Evaluate Button
58
- col1, col2, col3 = st.columns([1,2,1])
59
  with col2:
60
- if st.button("✨ Evaluate Responses", use_container_width=True):
61
- if prompt.strip() == "" or response_a.strip() == "" or response_b.strip() == "":
62
- st.error("❌ Please fill all fields (prompt, response A, response B) before evaluating!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  else:
64
- # Tokenize
65
- with torch.no_grad():
66
- text_a = prompt + " [SEP] " + response_a
67
- text_b = prompt + " [SEP] " + response_b
68
-
69
- encoded_a = tokenizer(text_a, truncation=True, padding="max_length", max_length=186, return_tensors="pt")
70
- encoded_b = tokenizer(text_b, truncation=True, padding="max_length", max_length=186, return_tensors="pt")
71
-
72
- input_ids_a = encoded_a["input_ids"].to(device)
73
- attention_mask_a = encoded_a["attention_mask"].to(device)
74
- input_ids_b = encoded_b["input_ids"].to(device)
75
- attention_mask_b = encoded_b["attention_mask"].to(device)
76
-
77
- score_a = model(input_ids_a, attention_mask_a).squeeze()
78
- score_b = model(input_ids_b, attention_mask_b).squeeze()
79
-
80
- prob_a = torch.sigmoid(score_a).item()
81
- prob_b = torch.sigmoid(score_b).item()
82
-
83
- # 📊 Show results
84
- st.subheader("🔎 Prediction Results")
85
- st.metric(label="Confidence for Response A 🔵", value=f"{prob_a:.4f}")
86
- st.metric(label="Confidence for Response B 🟠", value=f"{prob_b:.4f}")
87
-
88
- st.subheader("📊 Confidence Comparison")
89
- st.bar_chart({
90
- "Confidence": [prob_a, prob_b],
91
- })
92
-
93
- if prob_a > prob_b:
94
- st.success(f"✅ Response A is better! (Score: {prob_a:.4f})")
95
- st.markdown(f"**🔵 Response A:** {response_a}")
96
- else:
97
- st.success(f"✅ Response B is better! (Score: {prob_b:.4f})")
98
- st.markdown(f"**🟠 Response B:** {response_b}")
99
 
100
 
101
 
 
1
+ # app.py
2
+
3
  import streamlit as st
4
  import torch
5
  import torch.nn as nn
6
  from transformers import AutoTokenizer, AutoModel
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
9
 
10
+ # ✅ 1. Load tokenizer from current directory
11
+ tokenizer = AutoTokenizer.from_pretrained(".", local_files_only=True)
12
 
13
+ # ✅ 2. Define the model
14
  class ScoringModel(nn.Module):
15
  def __init__(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
16
  super().__init__()
 
28
  self.classifier(self.dropout3(hidden))) / 3
29
  return logits
30
 
31
+ # ✅ 3. Initialize and load weights
32
+ model = ScoringModel()
33
  model.load_state_dict(torch.load("scoring_model.pt", map_location=device))
34
+ model = model.to(device)
35
  model.eval()
36
 
37
+ # ✅ 4. Setup Streamlit page
38
+ st.set_page_config(page_title="🧠 LLM Response Evaluator", page_icon="📝", layout="wide")
39
+
40
+ st.markdown("<h1 style='text-align: center;'>🧠 LLM Response Evaluator</h1>", unsafe_allow_html=True)
41
+ st.markdown("---")
42
+
43
+ # ✅ Sidebar Info
44
+ with st.sidebar:
45
+ st.header("ℹ️ About")
46
+ st.markdown("""
47
+ This app evaluates **which AI response is better** given a prompt.
48
+
49
+ **How it works:**
50
+ - You enter a **prompt** and two **responses**.
51
+ - The model predicts **which response** is of **higher quality**.
52
+
53
+ Powered by a **fine-tuned DeBERTa-v3-small** model 🚀
54
+ """)
55
+
56
+ # ✅ Main input section
57
+ col1, col2 = st.columns(2)
58
 
59
+ with col1:
60
+ prompt = st.text_area("📝 Enter the Prompt", height=150)
 
 
61
 
 
 
62
  with col2:
63
+ st.markdown("<br>", unsafe_allow_html=True)
64
+ st.markdown("👉 Provide two possible responses below:")
65
+
66
+ response_a = st.text_area("โœ๏ธ Response A", height=100)
67
+ response_b = st.text_area("โœ๏ธ Response B", height=100)
68
+
69
+ # ✅ Evaluation
70
+ if st.button("🔍 Evaluate Responses"):
71
+ if prompt and response_a and response_b:
72
+ text_a = f"Prompt: {prompt} [SEP] {response_a}"
73
+ text_b = f"Prompt: {prompt} [SEP] {response_b}"
74
+
75
+ encoded_a = tokenizer(text_a, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
76
+ encoded_b = tokenizer(text_b, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
77
+
78
+ encoded_a = {k: v.to(device) for k, v in encoded_a.items() if k in ["input_ids", "attention_mask"]}
79
+ encoded_b = {k: v.to(device) for k, v in encoded_b.items() if k in ["input_ids", "attention_mask"]}
80
+
81
+ with torch.no_grad():
82
+ score_a = model(**encoded_a).squeeze()
83
+ score_b = model(**encoded_b).squeeze()
84
+
85
+ prob_a = torch.sigmoid(score_a).item()
86
+ prob_b = torch.sigmoid(score_b).item()
87
+
88
+ # ✅ Nice result display
89
+ st.subheader("🔮 Prediction Result")
90
+
91
+ if prob_b > prob_a:
92
+ st.success(f"✅ **Response B is better!** (Confidence: {prob_b:.4f})")
93
  else:
94
+ st.success(f"✅ **Response A is better!** (Confidence: {prob_a:.4f})")
95
+
96
+ # ✅ Probability metrics in 2 columns
97
+ mcol1, mcol2 = st.columns(2)
98
+ mcol1.metric(label="Confidence A", value=f"{prob_a:.4f}")
99
+ mcol2.metric(label="Confidence B", value=f"{prob_b:.4f}")
100
+
101
+ # ✅ Bar chart comparison
102
+ st.markdown("---")
103
+ st.subheader("📊 Confidence Comparison")
104
+
105
+ st.bar_chart({"Confidence": [prob_a, prob_b]})
106
+ else:
107
+ st.warning("⚠️ Please fill in **all fields** before evaluating!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
 
110