Spaces:

naa142
/

llmfinetune

Sleeping

App Files Community

naa142 committed on Apr 30

Commit

5ae0fb5

verified ·

1 Parent(s): f250f13

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -57

app.py CHANGED Viewed

@@ -1,39 +1,24 @@
-# app.py
 import streamlit as st
 import torch
 import torch.nn as nn
-from transformers import AutoTokenizer, DebertaV3Model
-# ✅ Set device - force CPU to avoid meta tensor issues
-device = torch.device("cpu")
-# ✅ Load tokenizer directly
-try:
-    tokenizer = AutoTokenizer.from_pretrained(".", local_files_only=True)
-    st.sidebar.success("✅ Tokenizer loaded successfully")
-except Exception as e:
-    st.error(f"Failed to load tokenizer: {e}")
-    st.stop()
-# ✅ Define model with explicit DebertaV3 base
 class ScoringModel(nn.Module):
-    def __init__(self, dropout_rate=0.242):
         super().__init__()
-        # Use the specific model class instead of AutoModel
-        try:
-            self.base = DebertaV3Model.from_pretrained("microsoft/deberta-v3-small")
-            st.sidebar.success("✅ Base model loaded successfully")
-        except Exception as e:
-            st.error(f"Failed to load base model: {e}")
-            st.stop()
         self.base.gradient_checkpointing_enable()
         self.dropout1 = nn.Dropout(dropout_rate)
         self.dropout2 = nn.Dropout(dropout_rate)
         self.dropout3 = nn.Dropout(dropout_rate)
         self.classifier = nn.Linear(self.base.config.hidden_size, 1)
     def forward(self, input_ids, attention_mask):
         hidden = self.base(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0]
         logits = (self.classifier(self.dropout1(hidden)) +
@@ -41,20 +26,15 @@ class ScoringModel(nn.Module):
                   self.classifier(self.dropout3(hidden))) / 3
         return logits
-# ✅ Initialize and load weights
-try:
-    model = ScoringModel()
-    model = model.to(device)
-    model.load_state_dict(torch.load("scoring_model.pt", map_location=device))
-    model.eval()
-    st.sidebar.success("✅ Model weights loaded successfully")
-except Exception as e:
-    st.error(f"Failed to load model weights: {e}")
-    st.write("Check if your 'scoring_model.pt' file is properly uploaded.")
-    st.stop()
-# ✅ Setup Streamlit page
 st.set_page_config(page_title="🧠 LLM Response Evaluator", page_icon="📝", layout="wide")
 st.markdown("<h1 style='text-align: center;'>🧠 LLM Response Evaluator</h1>", unsafe_allow_html=True)
 st.markdown("---")
@@ -62,22 +42,25 @@ st.markdown("---")
 with st.sidebar:
     st.header("ℹ️ About")
     st.markdown("""
-    This app evaluates **which AI response is better** given a prompt.
-    **How it works:**
-    - You enter a **prompt** and two **responses**.
-    - The model predicts **which response** is of **higher quality**.
-    Powered by a **fine-tuned DeBERTa-v3-small** model 🚀
     """)
 # ✅ Main input section
 col1, col2 = st.columns(2)
 with col1:
     prompt = st.text_area("📝 Enter the Prompt", height=150)
 with col2:
     st.markdown("<br>", unsafe_allow_html=True)
     st.markdown("👉 Provide two possible responses below:")
 response_a = st.text_area("✏️ Response A", height=100)
 response_b = st.text_area("✏️ Response B", height=100)
@@ -86,41 +69,37 @@ if st.button("🔍 Evaluate Responses"):
     if prompt and response_a and response_b:
         text_a = f"Prompt: {prompt} [SEP] {response_a}"
         text_b = f"Prompt: {prompt} [SEP] {response_b}"
         encoded_a = tokenizer(text_a, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
         encoded_b = tokenizer(text_b, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
         encoded_a = {k: v.to(device) for k, v in encoded_a.items() if k in ["input_ids", "attention_mask"]}
         encoded_b = {k: v.to(device) for k, v in encoded_b.items() if k in ["input_ids", "attention_mask"]}
         with torch.no_grad():
             score_a = model(**encoded_a).squeeze()
             score_b = model(**encoded_b).squeeze()
         prob_a = torch.sigmoid(score_a).item()
         prob_b = torch.sigmoid(score_b).item()
         # ✅ Nice result display
         st.subheader("🔮 Prediction Result")
         if prob_b > prob_a:
-            st.success(f"✅ **Response B is better!** (Confidence: {prob_b:.4f})")
         else:
-            st.success(f"✅ **Response A is better!** (Confidence: {prob_a:.4f})")
         # ✅ Probability metrics in 2 columns
         mcol1, mcol2 = st.columns(2)
         mcol1.metric(label="Confidence A", value=f"{prob_a:.4f}")
         mcol2.metric(label="Confidence B", value=f"{prob_b:.4f}")
         # ✅ Bar chart comparison
         st.markdown("---")
         st.subheader("📊 Confidence Comparison")
         st.bar_chart({"Confidence": [prob_a, prob_b]})
     else:
-        st.warning("⚠️ Please fill in **all fields** before evaluating!")
-# Make app accessible externally when deployed on Hugging Face Spaces
-if __name__ == "__main__":
-    import os
-    # Get port from environment variable or use default
-    port = int(os.environ.get("PORT", 8501))

 import streamlit as st
 import torch
 import torch.nn as nn
+from transformers import AutoTokenizer, AutoModel
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# ✅ 1. Load tokenizer from current directory
+tokenizer = AutoTokenizer.from_pretrained(".", local_files_only=True)
+# ✅ 2. Define the model
 class ScoringModel(nn.Module):
+    def __init__(self, base_model_name="microsoft/deberta-v3-small", dropout_rate=0.242):
         super().__init__()
+        self.base = AutoModel.from_pretrained(base_model_name)
         self.base.gradient_checkpointing_enable()
         self.dropout1 = nn.Dropout(dropout_rate)
         self.dropout2 = nn.Dropout(dropout_rate)
         self.dropout3 = nn.Dropout(dropout_rate)
         self.classifier = nn.Linear(self.base.config.hidden_size, 1)
     def forward(self, input_ids, attention_mask):
         hidden = self.base(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0]
         logits = (self.classifier(self.dropout1(hidden)) +
                   self.classifier(self.dropout3(hidden))) / 3
         return logits
+# ✅ 3. Initialize and load weights
+model = ScoringModel()
+model.load_state_dict(torch.load("scoring_model.pt", map_location=device))
+model = model.to(device)
+model.eval()
+# ✅ 4. Setup Streamlit page
 st.set_page_config(page_title="🧠 LLM Response Evaluator", page_icon="📝", layout="wide")
 st.markdown("<h1 style='text-align: center;'>🧠 LLM Response Evaluator</h1>", unsafe_allow_html=True)
 st.markdown("---")
 with st.sidebar:
     st.header("ℹ️ About")
     st.markdown("""
+    This app evaluates *which AI response is better* given a prompt.
+    *How it works:*
+    - You enter a *prompt* and two *responses*.
+    - The model predicts *which response* is of *higher quality*.
+    Powered by a *fine-tuned DeBERTa-v3-small* model 🚀
     """)
 # ✅ Main input section
 col1, col2 = st.columns(2)
 with col1:
     prompt = st.text_area("📝 Enter the Prompt", height=150)
 with col2:
     st.markdown("<br>", unsafe_allow_html=True)
     st.markdown("👉 Provide two possible responses below:")
 response_a = st.text_area("✏️ Response A", height=100)
 response_b = st.text_area("✏️ Response B", height=100)
     if prompt and response_a and response_b:
         text_a = f"Prompt: {prompt} [SEP] {response_a}"
         text_b = f"Prompt: {prompt} [SEP] {response_b}"
         encoded_a = tokenizer(text_a, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
         encoded_b = tokenizer(text_b, return_tensors='pt', padding='max_length', truncation=True, max_length=186)
         encoded_a = {k: v.to(device) for k, v in encoded_a.items() if k in ["input_ids", "attention_mask"]}
         encoded_b = {k: v.to(device) for k, v in encoded_b.items() if k in ["input_ids", "attention_mask"]}
         with torch.no_grad():
             score_a = model(**encoded_a).squeeze()
             score_b = model(**encoded_b).squeeze()
         prob_a = torch.sigmoid(score_a).item()
         prob_b = torch.sigmoid(score_b).item()
         # ✅ Nice result display
         st.subheader("🔮 Prediction Result")
         if prob_b > prob_a:
+            st.success(f"✅ *Response B is better!* (Confidence: {prob_b:.4f})")
         else:
+            st.success(f"✅ *Response A is better!* (Confidence: {prob_a:.4f})")
         # ✅ Probability metrics in 2 columns
         mcol1, mcol2 = st.columns(2)
         mcol1.metric(label="Confidence A", value=f"{prob_a:.4f}")
         mcol2.metric(label="Confidence B", value=f"{prob_b:.4f}")
         # ✅ Bar chart comparison
         st.markdown("---")
         st.subheader("📊 Confidence Comparison")
         st.bar_chart({"Confidence": [prob_a, prob_b]})
     else:
+        st.warning("⚠️ Please fill in *all fields* before evaluating!")