Spaces:

naa142
/

llmfinetune

Sleeping

App Files Community

naa142 committed on Apr 27

Commit

bb5fd62

verified ·

1 Parent(s): 90b9df2

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -38

app.py CHANGED Viewed

@@ -1,16 +1,29 @@
-# app.py
 import streamlit as st
 import torch
 import torch.nn as nn
 from transformers import AutoTokenizer, AutoModel
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# ✅ 1. Load the tokenizer from current directory
-tokenizer = AutoTokenizer.from_pretrained(".", local_files_only=True)
-# ✅ 2. Define your ScoringModel
 class ScoringModel(nn.Module):
     def __init__(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
         super().__init__()
@@ -28,46 +41,61 @@ class ScoringModel(nn.Module):
                   self.classifier(self.dropout3(hidden))) / 3
         return logits
-# ✅ 3. Instantiate model and load weights
 model = ScoringModel().to(device)
 model.load_state_dict(torch.load("scoring_model.pt", map_location=device))
 model.eval()
-# ✅ 4. Streamlit App
-st.set_page_config(page_title="🧠 Response Evaluator", page_icon="🚀", layout="centered")
-st.title("🚀 Response Quality Predictor")
-prompt = st.text_area("Enter the prompt", height=150)
-response_a = st.text_area("Response A", height=100)
-response_b = st.text_area("Response B", height=100)
-if st.button("Evaluate Responses"):
-    if prompt and response_a and response_b:
-        text_a = f"Prompt: {prompt} [SEP] {response_a}"
-        text_b = f"Prompt: {prompt} [SEP] {response_b}"
-        encoded_a = tokenizer(text_a, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
-        encoded_b = tokenizer(text_b, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
-        encoded_a = {k: v.to(device) for k, v in encoded_a.items() if k in ["input_ids", "attention_mask"]}
-        encoded_b = {k: v.to(device) for k, v in encoded_b.items() if k in ["input_ids", "attention_mask"]}
-        with torch.no_grad():
-            score_a = model(**encoded_a).squeeze()
-            score_b = model(**encoded_b).squeeze()
-        prob_a = torch.sigmoid(score_a).item()
-        prob_b = torch.sigmoid(score_b).item()
-        if prob_b > prob_a:
-            st.success(f"✅ Model predicts: **Response B** is better! (Confidence: {prob_b:.4f})")
         else:
-            st.success(f"✅ Model predicts: **Response A** is better! (Confidence: {prob_a:.4f})")
-        st.metric("Probability A", f"{prob_a:.4f}")
-        st.metric("Probability B", f"{prob_b:.4f}")
-    else:
-        st.warning("⚠️ Please fill in all the fields first!")

 import streamlit as st
 import torch
 import torch.nn as nn
 from transformers import AutoTokenizer, AutoModel
+# 📜 Setup page
+st.set_page_config(page_title="LLM Response Evaluator", page_icon="🤖", layout="centered")
+# 🚀 Sidebar with Instructions
+with st.sidebar:
+    st.title("ℹ️ How to Use")
+    st.markdown("""
+    1. Enter a **prompt** you want responses to.
+    2. Fill in **Response A** and **Response B**.
+    3. Click **Evaluate** to see which response is better!
+    """)
+    st.markdown("---")
+    st.info("This app uses a fine-tuned DeBERTa model to judge the better response!")
+# 🔥 Device setup
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# ✅ Load tokenizer
+tokenizer = AutoTokenizer.from_pretrained("./")
+# ✅ Define ScoringModel class
 class ScoringModel(nn.Module):
     def __init__(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
         super().__init__()
                   self.classifier(self.dropout3(hidden))) / 3
         return logits
+# ✅ Load fine-tuned model
 model = ScoringModel().to(device)
 model.load_state_dict(torch.load("scoring_model.pt", map_location=device))
 model.eval()
+# 🎯 App main title
+st.title("🚀 LLM Fine-Tuned Response Evaluator")
+# ✏️ Input fields
+prompt = st.text_area("🧠 Enter the prompt", placeholder="e.g., What is a dataset?")
+response_a = st.text_area("🔵 Response A", placeholder="e.g., A dataset is a collection of data.")
+response_b = st.text_area("🟠 Response B", placeholder="e.g., It is nothing important.")
+# 🚀 Centered Evaluate Button
+col1, col2, col3 = st.columns([1,2,1])
+with col2:
+    if st.button("✨ Evaluate Responses", use_container_width=True):
+        if prompt.strip() == "" or response_a.strip() == "" or response_b.strip() == "":
+            st.error("❌ Please fill all fields (prompt, response A, response B) before evaluating!")
         else:
+            # Tokenize
+            with torch.no_grad():
+                text_a = prompt + " [SEP] " + response_a
+                text_b = prompt + " [SEP] " + response_b
+                encoded_a = tokenizer(text_a, truncation=True, padding="max_length", max_length=186, return_tensors="pt")
+                encoded_b = tokenizer(text_b, truncation=True, padding="max_length", max_length=186, return_tensors="pt")
+                input_ids_a = encoded_a["input_ids"].to(device)
+                attention_mask_a = encoded_a["attention_mask"].to(device)
+                input_ids_b = encoded_b["input_ids"].to(device)
+                attention_mask_b = encoded_b["attention_mask"].to(device)
+                score_a = model(input_ids_a, attention_mask_a).squeeze()
+                score_b = model(input_ids_b, attention_mask_b).squeeze()
+                prob_a = torch.sigmoid(score_a).item()
+                prob_b = torch.sigmoid(score_b).item()
+            # 📊 Show results
+            st.subheader("🔎 Prediction Results")
+            st.metric(label="Confidence for Response A 🔵", value=f"{prob_a:.4f}")
+            st.metric(label="Confidence for Response B 🟠", value=f"{prob_b:.4f}")
+            st.subheader("📊 Confidence Comparison")
+            st.bar_chart({
+                "Confidence": [prob_a, prob_b],
+            })
+            if prob_a > prob_b:
+                st.success(f"✅ Response A is better! (Score: {prob_a:.4f})")
+                st.markdown(f"**🔵 Response A:** {response_a}")
+            else:
+                st.success(f"✅ Response B is better! (Score: {prob_b:.4f})")
+                st.markdown(f"**🟠 Response B:** {response_b}")