naa142 committed
Commit bb5fd62 · verified · 1 Parent(s): 90b9df2

Update app.py

Files changed (1)
  1. app.py +66 -38
app.py CHANGED
@@ -1,16 +1,29 @@
- # app.py
-
import streamlit as st
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel

+ # 📜 Setup page
+ st.set_page_config(page_title="LLM Response Evaluator", page_icon="🤖", layout="centered")
+
+ # 🚀 Sidebar with Instructions
+ with st.sidebar:
+     st.title("ℹ️ How to Use")
+     st.markdown("""
+     1. Enter a **prompt** you want responses to.
+     2. Fill in **Response A** and **Response B**.
+     3. Click **Evaluate** to see which response is better!
+     """)
+     st.markdown("---")
+     st.info("This app uses a fine-tuned DeBERTa model to judge the better response!")
+
+ # 🔥 Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

- # ✅ 1. Load the tokenizer from current directory
- tokenizer = AutoTokenizer.from_pretrained(".", local_files_only=True)
+ # ✅ Load tokenizer
+ tokenizer = AutoTokenizer.from_pretrained("./")

- # ✅ 2. Define your ScoringModel
+ # ✅ Define ScoringModel class
class ScoringModel(nn.Module):
    def __init__(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
        super().__init__()
@@ -28,46 +41,61 @@ class ScoringModel(nn.Module):
                  self.classifier(self.dropout3(hidden))) / 3
        return logits

- # ✅ 3. Instantiate model and load weights
+ # ✅ Load fine-tuned model
model = ScoringModel().to(device)
model.load_state_dict(torch.load("scoring_model.pt", map_location=device))
model.eval()

- # ✅ 4. Streamlit App
- st.set_page_config(page_title="🧠 Response Evaluator", page_icon="🚀", layout="centered")
- st.title("🚀 Response Quality Predictor")
-
- prompt = st.text_area("Enter the prompt", height=150)
- response_a = st.text_area("Response A", height=100)
- response_b = st.text_area("Response B", height=100)
-
- if st.button("Evaluate Responses"):
-     if prompt and response_a and response_b:
-         text_a = f"Prompt: {prompt} [SEP] {response_a}"
-         text_b = f"Prompt: {prompt} [SEP] {response_b}"
-
-         encoded_a = tokenizer(text_a, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
-         encoded_b = tokenizer(text_b, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
+ # 🎯 App main title
+ st.title("🚀 LLM Fine-Tuned Response Evaluator")

-         encoded_a = {k: v.to(device) for k, v in encoded_a.items() if k in ["input_ids", "attention_mask"]}
-         encoded_b = {k: v.to(device) for k, v in encoded_b.items() if k in ["input_ids", "attention_mask"]}
+ # ✏️ Input fields
+ prompt = st.text_area("🧠 Enter the prompt", placeholder="e.g., What is a dataset?")
+ response_a = st.text_area("🔵 Response A", placeholder="e.g., A dataset is a collection of data.")
+ response_b = st.text_area("🟠 Response B", placeholder="e.g., It is nothing important.")

-         with torch.no_grad():
-             score_a = model(**encoded_a).squeeze()
-             score_b = model(**encoded_b).squeeze()
-
-         prob_a = torch.sigmoid(score_a).item()
-         prob_b = torch.sigmoid(score_b).item()
-
-         if prob_b > prob_a:
-             st.success(f"✅ Model predicts: **Response B** is better! (Confidence: {prob_b:.4f})")
+ # 🚀 Centered Evaluate Button
+ col1, col2, col3 = st.columns([1,2,1])
+ with col2:
+     if st.button("✨ Evaluate Responses", use_container_width=True):
+         if prompt.strip() == "" or response_a.strip() == "" or response_b.strip() == "":
+             st.error("❌ Please fill all fields (prompt, response A, response B) before evaluating!")
        else:
-             st.success(f"✅ Model predicts: **Response A** is better! (Confidence: {prob_a:.4f})")
-
-         st.metric("Probability A", f"{prob_a:.4f}")
-         st.metric("Probability B", f"{prob_b:.4f}")
-     else:
-         st.warning("⚠️ Please fill in all the fields first!")
+             # Tokenize
+             with torch.no_grad():
+                 text_a = prompt + " [SEP] " + response_a
+                 text_b = prompt + " [SEP] " + response_b
+
+                 encoded_a = tokenizer(text_a, truncation=True, padding="max_length", max_length=186, return_tensors="pt")
+                 encoded_b = tokenizer(text_b, truncation=True, padding="max_length", max_length=186, return_tensors="pt")
+
+                 input_ids_a = encoded_a["input_ids"].to(device)
+                 attention_mask_a = encoded_a["attention_mask"].to(device)
+                 input_ids_b = encoded_b["input_ids"].to(device)
+                 attention_mask_b = encoded_b["attention_mask"].to(device)
+
+                 score_a = model(input_ids_a, attention_mask_a).squeeze()
+                 score_b = model(input_ids_b, attention_mask_b).squeeze()
+
+                 prob_a = torch.sigmoid(score_a).item()
+                 prob_b = torch.sigmoid(score_b).item()
+
+                 # 📊 Show results
+                 st.subheader("🔎 Prediction Results")
+                 st.metric(label="Confidence for Response A 🔵", value=f"{prob_a:.4f}")
+                 st.metric(label="Confidence for Response B 🟠", value=f"{prob_b:.4f}")
+
+                 st.subheader("📊 Confidence Comparison")
+                 st.bar_chart({
+                     "Confidence": [prob_a, prob_b],
+                 })
+
+                 if prob_a > prob_b:
+                     st.success(f"✅ Response A is better! (Score: {prob_a:.4f})")
+                     st.markdown(f"**🔵 Response A:** {response_a}")
+                 else:
+                     st.success(f"✅ Response B is better! (Score: {prob_b:.4f})")
+                     st.markdown(f"**🟠 Response B:** {response_b}")
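
Note: the two hunks skip the middle of ScoringModel (old lines 17-27 / new lines 30-40), so the layer definitions themselves are not shown in this diff. For orientation only, here is a minimal sketch of what the class plausibly contains, reconstructed from the visible __init__ signature and the averaged three-dropout classifier call; the attribute name self.base, the hidden-size lookup, and the [CLS]-token pooling are assumptions, not taken from the commit.

import torch.nn as nn
from transformers import AutoModel

class ScoringModel(nn.Module):
    # Hypothetical reconstruction -- only the __init__ signature and the
    # averaged classifier call appear in the diff; the rest is assumed.
    def __init__(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
        super().__init__()
        self.base = AutoModel.from_pretrained(base_model_name)  # assumed attribute name
        hidden_size = self.base.config.hidden_size
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.dropout3 = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(hidden_size, 1)  # one scalar score per input

    def forward(self, input_ids, attention_mask):
        outputs = self.base(input_ids=input_ids, attention_mask=attention_mask)
        hidden = outputs.last_hidden_state[:, 0]  # assumed [CLS] pooling
        # Average three dropout passes through the same classifier head,
        # matching the context line visible in the second hunk.
        logits = (self.classifier(self.dropout1(hidden)) +
                  self.classifier(self.dropout2(hidden)) +
                  self.classifier(self.dropout3(hidden))) / 3
        return logits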
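
The tokenizer line changes from AutoTokenizer.from_pretrained(".", local_files_only=True) to AutoTokenizer.from_pretrained("./"); both variants read the tokenizer files committed next to app.py, so those files must still be present in the Space root. If a fallback were ever wanted when they are missing, a defensive sketch could look like the snippet below; the fallback target microsoft/deberta-v3-small is an assumption based on the model's default base_model_name and is not something this commit adds.

from transformers import AutoTokenizer

try:
    # Prefer the tokenizer files shipped in the repository root.
    tokenizer = AutoTokenizer.from_pretrained("./", local_files_only=True)
except OSError:
    # Assumed fallback: the base checkpoint the scoring head was built on.
    tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")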
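
One detail worth keeping in mind when reading the results section: score_a and score_b are passed through sigmoid independently, so the two displayed "confidences" are separate probabilities rather than a distribution over the pair, and they generally do not sum to 1. A tiny illustration with made-up logits:

import torch

score_a, score_b = torch.tensor(1.2), torch.tensor(0.4)  # hypothetical logits
prob_a = torch.sigmoid(score_a).item()  # ~0.769
prob_b = torch.sigmoid(score_b).item()  # ~0.599
print(prob_a + prob_b)  # ~1.37 -- the two values are not complementary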