Spaces:

naa142
/

llmfinetune

Sleeping

App Files Community

naa142 committed on Apr 27

Commit

9399894

verified ·

1 Parent(s): bb5fd62

Update app.py

Browse files

Files changed (1) hide show

app.py +75 -66

app.py CHANGED Viewed

@@ -1,29 +1,16 @@
 import streamlit as st
 import torch
 import torch.nn as nn
 from transformers import AutoTokenizer, AutoModel
-# 📜 Setup page
-st.set_page_config(page_title="LLM Response Evaluator", page_icon="🤖", layout="centered")
-# 🚀 Sidebar with Instructions
-with st.sidebar:
-    st.title("ℹ️ How to Use")
-    st.markdown("""
-    1. Enter a **prompt** you want responses to.
-    2. Fill in **Response A** and **Response B**.
-    3. Click **Evaluate** to see which response is better!
-    """)
-    st.markdown("---")
-    st.info("This app uses a fine-tuned DeBERTa model to judge the better response!")
-# 🔥 Device setup
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# ✅ Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained("./")
-# ✅ Define ScoringModel class
 class ScoringModel(nn.Module):
     def __init__(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
         super().__init__()
@@ -41,61 +28,83 @@ class ScoringModel(nn.Module):
                   self.classifier(self.dropout3(hidden))) / 3
         return logits
-# ✅ Load fine-tuned model
-model = ScoringModel().to(device)
 model.load_state_dict(torch.load("scoring_model.pt", map_location=device))
 model.eval()
-# 🎯 App main title
-st.title("🚀 LLM Fine-Tuned Response Evaluator")
-# ✏️ Input fields
-prompt = st.text_area("🧠 Enter the prompt", placeholder="e.g., What is a dataset?")
-response_a = st.text_area("🔵 Response A", placeholder="e.g., A dataset is a collection of data.")
-response_b = st.text_area("🟠 Response B", placeholder="e.g., It is nothing important.")
-# 🚀 Centered Evaluate Button
-col1, col2, col3 = st.columns([1,2,1])
 with col2:
-    if st.button("✨ Evaluate Responses", use_container_width=True):
-        if prompt.strip() == "" or response_a.strip() == "" or response_b.strip() == "":
-            st.error("❌ Please fill all fields (prompt, response A, response B) before evaluating!")
         else:
-            # Tokenize
-            with torch.no_grad():
-                text_a = prompt + " [SEP] " + response_a
-                text_b = prompt + " [SEP] " + response_b
-                encoded_a = tokenizer(text_a, truncation=True, padding="max_length", max_length=186, return_tensors="pt")
-                encoded_b = tokenizer(text_b, truncation=True, padding="max_length", max_length=186, return_tensors="pt")
-                input_ids_a = encoded_a["input_ids"].to(device)
-                attention_mask_a = encoded_a["attention_mask"].to(device)
-                input_ids_b = encoded_b["input_ids"].to(device)
-                attention_mask_b = encoded_b["attention_mask"].to(device)
-                score_a = model(input_ids_a, attention_mask_a).squeeze()
-                score_b = model(input_ids_b, attention_mask_b).squeeze()
-                prob_a = torch.sigmoid(score_a).item()
-                prob_b = torch.sigmoid(score_b).item()
-            # 📊 Show results
-            st.subheader("🔎 Prediction Results")
-            st.metric(label="Confidence for Response A 🔵", value=f"{prob_a:.4f}")
-            st.metric(label="Confidence for Response B 🟠", value=f"{prob_b:.4f}")
-            st.subheader("📊 Confidence Comparison")
-            st.bar_chart({
-                "Confidence": [prob_a, prob_b],
-            })
-            if prob_a > prob_b:
-                st.success(f"✅ Response A is better! (Score: {prob_a:.4f})")
-                st.markdown(f"**🔵 Response A:** {response_a}")
-            else:
-                st.success(f"✅ Response B is better! (Score: {prob_b:.4f})")
-                st.markdown(f"**🟠 Response B:** {response_b}")

+# app.py
 import streamlit as st
 import torch
 import torch.nn as nn
 from transformers import AutoTokenizer, AutoModel
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# ✅ 1. Load tokenizer from current directory
+tokenizer = AutoTokenizer.from_pretrained(".", local_files_only=True)
+# ✅ 2. Define the model
 class ScoringModel(nn.Module):
     def __init__(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
         super().__init__()
                   self.classifier(self.dropout3(hidden))) / 3
         return logits
+# ✅ 3. Initialize and load weights
+model = ScoringModel()
 model.load_state_dict(torch.load("scoring_model.pt", map_location=device))
+model = model.to(device)
 model.eval()
+# ✅ 4. Setup Streamlit page
+st.set_page_config(page_title="🧠 LLM Response Evaluator", page_icon="📝", layout="wide")
+st.markdown("<h1 style='text-align: center;'>🧠 LLM Response Evaluator</h1>", unsafe_allow_html=True)
+st.markdown("---")
+# ✅ Sidebar Info
+with st.sidebar:
+    st.header("ℹ️ About")
+    st.markdown("""
+    This app evaluates **which AI response is better** given a prompt.
+    **How it works:**
+    - You enter a **prompt** and two **responses**.
+    - The model predicts **which response** is of **higher quality**.
+    Powered by a **fine-tuned DeBERTa-v3-small** model 🚀
+    """)
+# ✅ Main input section
+col1, col2 = st.columns(2)
+with col1:
+    prompt = st.text_area("📝 Enter the Prompt", height=150)
 with col2:
+    st.markdown("<br>", unsafe_allow_html=True)
+    st.markdown("👉 Provide two possible responses below:")
+response_a = st.text_area("✏️ Response A", height=100)
+response_b = st.text_area("✏️ Response B", height=100)
+# ✅ Evaluation
+if st.button("🔍 Evaluate Responses"):
+    if prompt and response_a and response_b:
+        text_a = f"Prompt: {prompt} [SEP] {response_a}"
+        text_b = f"Prompt: {prompt} [SEP] {response_b}"
+        encoded_a = tokenizer(text_a, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
+        encoded_b = tokenizer(text_b, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
+        encoded_a = {k: v.to(device) for k, v in encoded_a.items() if k in ["input_ids", "attention_mask"]}
+        encoded_b = {k: v.to(device) for k, v in encoded_b.items() if k in ["input_ids", "attention_mask"]}
+        with torch.no_grad():
+            score_a = model(**encoded_a).squeeze()
+            score_b = model(**encoded_b).squeeze()
+        prob_a = torch.sigmoid(score_a).item()
+        prob_b = torch.sigmoid(score_b).item()
+        # ✅ Nice result display
+        st.subheader("🔮 Prediction Result")
+        if prob_b > prob_a:
+            st.success(f"✅ **Response B is better!** (Confidence: {prob_b:.4f})")
         else:
+            st.success(f"✅ **Response A is better!** (Confidence: {prob_a:.4f})")
+        # ✅ Probability metrics in 2 columns
+        mcol1, mcol2 = st.columns(2)
+        mcol1.metric(label="Confidence A", value=f"{prob_a:.4f}")
+        mcol2.metric(label="Confidence B", value=f"{prob_b:.4f}")
+        # ✅ Bar chart comparison
+        st.markdown("---")
+        st.subheader("📊 Confidence Comparison")
+        st.bar_chart({"Confidence": [prob_a, prob_b]})
+    else:
+        st.warning("⚠️ Please fill in **all fields** before evaluating!")