Update app.py

app.py CHANGED
@@ -79,10 +79,10 @@ def create_lexical_bow_mask(input_ids_batch, vocab_size, tokenizer):
 
 
 # --- Core Representation Functions (Return Formatted Strings - for Explorer Tab) ---
-# These functions
+# These functions now return a tuple: (main_representation_str, info_str)
 def get_splade_cocondenser_representation(text):
     if tokenizer_splade is None or model_splade is None:
-        return "SPLADE-cocondenser-distil model is not loaded. Please check the console for loading errors."
+        return "SPLADE-cocondenser-distil model is not loaded. Please check the console for loading errors.", ""
 
     inputs = tokenizer_splade(text, return_tensors="pt", padding=True, truncation=True)
     inputs = {k: v.to(model_splade.device) for k, v in inputs.items()}
@@ -96,7 +96,7 @@ def get_splade_cocondenser_representation(text):
             dim=1
         )[0].squeeze() # Squeeze is fine here as it's a single input
     else:
-        return "Model output structure not as expected for SPLADE-cocondenser-distil. 'logits' not found."
+        return "Model output structure not as expected for SPLADE-cocondenser-distil. 'logits' not found.", ""
 
     indices = torch.nonzero(splade_vector).squeeze().cpu().tolist()
     if not isinstance(indices, list):
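The expression guarded by this hunk, `torch.max(torch.log(1 + torch.relu(output.logits)) * attention_mask, dim=1)`, is the standard SPLADE pooling: ReLU keeps positive term activations, `log(1 + x)` saturates dominant terms, and the max over the sequence dimension keeps each vocabulary term's strongest activation. A minimal sketch of that computation in isolation (the helper name `splade_activation` is ours, not from app.py):

```python
import torch

def splade_activation(logits: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Collapse MLM logits of shape (batch, seq_len, vocab_size) into one
    sparse vector per input via SPLADE's log-saturated max pooling."""
    # Zero out padding positions so they cannot win the max.
    weights = torch.log1p(torch.relu(logits)) * attention_mask.unsqueeze(-1)
    return torch.max(weights, dim=1).values  # shape: (batch, vocab_size)

# Toy check: 1 sequence, 2 tokens, 5-term vocabulary.
logits = torch.randn(1, 2, 5)
mask = torch.ones(1, 2)
print(splade_activation(logits, mask).shape)  # torch.Size([1, 5])
```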
@@ -120,16 +120,16 @@ def get_splade_cocondenser_representation(text):
     for term, weight in sorted_representation:
         formatted_output += f"- **{term}**: {weight:.4f}\n"
 
-
-
-
+    info_output = f"--- Sparse Vector Info ---\n"
+    info_output += f"Total non-zero terms in vector: {len(indices)}\n"
+    info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade.vocab_size):.2%}\n"
 
-    return formatted_output
+    return formatted_output, info_output
 
 
 def get_splade_lexical_representation(text):
     if tokenizer_splade_lexical is None or model_splade_lexical is None:
-        return "SPLADE-v3-Lexical model is not loaded. Please check the console for loading errors."
+        return "SPLADE-v3-Lexical model is not loaded. Please check the console for loading errors.", ""
 
     inputs = tokenizer_splade_lexical(text, return_tensors="pt", padding=True, truncation=True)
     inputs = {k: v.to(model_splade_lexical.device) for k, v in inputs.items()}
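The new `info_output` lines derive vector statistics from `indices`, the non-zero dimensions found via `torch.nonzero` above; sparsity is simply the fraction of the vocabulary left untouched. A standalone sketch of the same arithmetic:

```python
import torch

def sparse_vector_stats(splade_vector: torch.Tensor, vocab_size: int) -> str:
    """Mirror of the diff's info_output: active-term count and sparsity."""
    nonzero = int(torch.count_nonzero(splade_vector).item())
    return (f"Total non-zero terms in vector: {nonzero}\n"
            f"Sparsity: {1 - (nonzero / vocab_size):.2%}")

vec = torch.zeros(30_522)   # BERT-sized vocabulary
vec[:120] = 1.0             # pretend 120 terms are active
print(sparse_vector_stats(vec, 30_522))  # Sparsity: 99.61%
```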
@@ -143,7 +143,7 @@ def get_splade_lexical_representation(text):
             dim=1
         )[0].squeeze() # Squeeze is fine here
     else:
-        return "Model output structure not as expected for SPLADE-v3-Lexical. 'logits' not found."
+        return "Model output structure not as expected for SPLADE-v3-Lexical. 'logits' not found.", ""
 
     # Always apply lexical mask for this model's specific behavior
     vocab_size = tokenizer_splade_lexical.vocab_size
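The "lexical mask" applied here comes from the `create_lexical_bow_mask(input_ids_batch, vocab_size, tokenizer)` helper named in the first hunk header; its body is not visible in this diff. A plausible sketch of what such a mask does, with the signature and details being our assumption rather than the app's actual code:

```python
import torch

def create_lexical_bow_mask_sketch(input_ids: torch.Tensor, vocab_size: int,
                                   special_ids: set) -> torch.Tensor:
    """Binary (1, vocab_size) mask: 1 only for terms present in the input.

    Multiplying a SPLADE vector by this mask discards every expansion term,
    which is the point of the 'lexical' variant: weights survive only on
    words the text actually contains."""
    mask = torch.zeros(1, vocab_size)
    for tid in input_ids.flatten().tolist():
        if tid not in special_ids:  # skip [CLS], [SEP], [PAD], ...
            mask[0, tid] = 1.0
    return mask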
@@ -175,16 +175,16 @@ def get_splade_lexical_representation(text):
     for term, weight in sorted_representation:
         formatted_output += f"- **{term}**: {weight:.4f}\n"
 
-
-
-
+    info_output = f"--- Raw Sparse Vector Info ---\n"
+    info_output += f"Total non-zero terms in vector: {len(indices)}\n"
+    info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade_lexical.vocab_size):.2%}\n"
 
-    return formatted_output
+    return formatted_output, info_output
 
 
 def get_splade_doc_representation(text):
     if tokenizer_splade_doc is None: # No longer need model_splade_doc to be loaded for 'logits'
-        return "SPLADE-v3-Doc tokenizer is not loaded. Please check the console for loading errors."
+        return "SPLADE-v3-Doc tokenizer is not loaded. Please check the console for loading errors.", ""
 
     inputs = tokenizer_splade_doc(text, return_tensors="pt", padding=True, truncation=True)
     inputs = {k: v.to(torch.device("cpu")) for k, v in inputs.items()} # Ensure on CPU for direct mask creation
@@ -220,11 +220,11 @@ def get_splade_doc_representation(text):
             break
         formatted_output += f"- **{term}**\n"
 
-
-
-
+    info_output = f"--- Raw Binary Bag-of-Words Vector Info ---\n" # Changed title
+    info_output += f"Total activated terms: {len(indices)}\n"
+    info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade_doc.vocab_size):.2%}\n"
 
-    return formatted_output
+    return formatted_output, info_output
 
 
 # --- Unified Prediction Function for the Explorer Tab ---
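For the Binary Bag-of-Words path the model is bypassed entirely: every non-special input token becomes a weight-1 dimension, so the info block reports "activated terms" rather than weights. A self-contained sketch of that representation using BERT's tokenizer, which SPLADE models share; the model id below is a stand-in, not necessarily the checkpoint the app loads:

```python
import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # 30,522-term vocab

text = "why is padua the nicest city in italy"
ids = tokenizer(text, add_special_tokens=False)["input_ids"]

vec = torch.zeros(tokenizer.vocab_size)
vec[torch.tensor(ids, dtype=torch.long)] = 1.0  # binary: presence only, no weights

indices = torch.nonzero(vec).squeeze(-1).tolist()
print(f"Total activated terms: {len(indices)}")
print(tokenizer.convert_ids_to_tokens(indices))  # the bare term list the tab renders
```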
@@ -236,7 +236,7 @@ def predict_representation_explorer(model_choice, text):
     elif model_choice == "Binary Bag-of-Words": # Changed name
         return get_splade_doc_representation(text)
     else:
-        return "Please select a model."
+        return "Please select a model.", "" # Return two empty strings for consistency
 
 # --- Core Representation Functions (Return RAW TENSORS - for Dot Product Tab) ---
 # These functions remain unchanged from the previous iteration, as they return the raw tensors.
@@ -339,10 +339,10 @@ def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
         else:
             formatted_output += f"- **{term}**: {weight:.4f}\n"
 
-
-
+    info_output = f"\nTotal non-zero terms: {len(indices)}\n"
+    info_output += f"Sparsity: {1 - (len(indices) / tokenizer.vocab_size):.2%}\n"
 
-    return formatted_output
+    return formatted_output, info_output # Now returns two strings
 
 
 # --- NEW/MODIFIED: Helper to get the correct vector function, tokenizer, and binary flag ---
@@ -376,11 +376,16 @@ def calculate_dot_product_and_representations_independent(query_model_choice, do
     dot_product = float(torch.dot(query_vector.cpu(), doc_vector.cpu()).item())
 
     # Format representations
+    # These functions now return two strings (main_output, info_output)
+    query_main_rep_str, query_info_str = format_sparse_vector_output(query_vector, query_tokenizer, query_is_binary)
+    doc_main_rep_str, doc_info_str = format_sparse_vector_output(doc_vector, doc_tokenizer, doc_is_binary)
+
+
     query_rep_str = f"Query Representation ({query_model_name_display}):\n"
-    query_rep_str +=
+    query_rep_str += query_main_rep_str + "\n" + query_info_str
 
     doc_rep_str = f"Document Representation ({doc_model_name_display}):\n"
-    doc_rep_str +=
+    doc_rep_str += doc_main_rep_str + "\n" + doc_info_str
 
     # Combine output
     full_output = f"### Dot Product Score: {dot_product:.6f}\n\n"
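Because every encoder in the demo emits a vector over the same vocabulary space, cross-model relevance reduces to the single `torch.dot` call above, and only dimensions active on both sides contribute to the score. A toy illustration (the ids and weights are made up):

```python
import torch

VOCAB = 30_522
query_vec, doc_vec = torch.zeros(VOCAB), torch.zeros(VOCAB)
query_vec[[2103, 3000]] = torch.tensor([1.2, 0.7])  # hypothetical activations
doc_vec[[2103, 5000]] = torch.tensor([0.9, 2.0])

# Same call as the diff: move both to CPU, take the scalar product.
score = float(torch.dot(query_vec.cpu(), doc_vec.cpu()).item())
print(f"Dot Product Score: {score:.6f}")  # 1.080000, only shared id 2103 contributes
```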
@@ -397,30 +402,50 @@ with gr.Blocks(title="SPLADE Demos") as demo:
 
     with gr.Tabs():
         with gr.TabItem("Sparse Representation"):
-            gr.Markdown("### Produce a Sparse Representation of
-            gr.
-
-
-            gr.Radio(
+            gr.Markdown("### Produce a Sparse Representation of an Input Text")
+            with gr.Row():
+                with gr.Column(scale=1): # Left column for inputs and info
+                    model_radio = gr.Radio(
                         [
                             "MLM encoder (SPLADE-cocondenser-distil)",
                             "MLP encoder (SPLADE-v3-lexical)",
-                            "Binary Bag-of-Words"
+                            "Binary Bag-of-Words"
                         ],
                         label="Choose Sparse Encoder",
                         value="MLM encoder (SPLADE-cocondenser-distil)"
-            )
-            gr.Textbox(
+                    )
+                    input_text = gr.Textbox(
                         lines=5,
                         label="Enter your query or document text here:",
                         placeholder="e.g., Why is Padua the nicest city in Italy?"
                     )
-
-
-
-
+                    # New Markdown component for the info output
+                    info_output_display = gr.Markdown(
+                        value="",
+                        label="Vector Information",
+                        elem_id="info_output_display" # Add an ID for potential CSS if needed
+                    )
+                with gr.Column(scale=2): # Right column for the main representation output
+                    main_representation_output = gr.Markdown()
+
+            # Connect the interface elements
+            model_radio.change(
+                fn=predict_representation_explorer,
+                inputs=[model_radio, input_text],
+                outputs=[main_representation_output, info_output_display]
             )
-
+            input_text.change(
+                fn=predict_representation_explorer,
+                inputs=[model_radio, input_text],
+                outputs=[main_representation_output, info_output_display]
+            )
+
+            # Initial call to populate on load (optional, but good for demo)
+            demo.load(
+                fn=lambda: predict_representation_explorer(model_radio.value, input_text.value),
+                outputs=[main_representation_output, info_output_display]
+            )
+
         with gr.TabItem("Compare Encoders"): # NEW TAB
             gr.Markdown("### Calculate Dot Product Similarity between Query and Document")
             gr.Markdown("Select **independent** SPLADE models to encode your query and document, then see their sparse representations and their similarity score.")
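This hunk replaces the old auto-generated layout with explicit Blocks wiring: one handler returning a `(main, info)` tuple is bound to both the radio's and the textbox's `.change` events, and the tuple is mapped positionally onto the two Markdown outputs. A minimal runnable sketch of that pattern (component and function names here are generic, not the app's):

```python
import gradio as gr

def describe(choice: str, text: str) -> tuple[str, str]:
    # Two outputs are wired below, so the handler returns a 2-tuple.
    return f"**{choice}** applied to: {text}", f"_Info: {len(text)} characters_"

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=1):
            choice = gr.Radio(["A", "B"], label="Encoder", value="A")
            text = gr.Textbox(lines=2, label="Text")
            info = gr.Markdown()
        with gr.Column(scale=2):
            main = gr.Markdown()
    # Both triggers share one handler; outputs map positionally to the tuple.
    for comp in (choice, text):
        comp.change(fn=describe, inputs=[choice, text], outputs=[main, info])

demo.launch()
```

One design note on the diff's `demo.load(...)` call: the lambda reads `model_radio.value` and `input_text.value`, which in Gradio hold the components' construction-time defaults, so this renders the initial selection once when the page opens rather than reading live user input.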
@@ -429,7 +454,7 @@ with gr.Blocks(title="SPLADE Demos") as demo:
             model_choices = [
                 "MLM encoder (SPLADE-cocondenser-distil)",
                 "MLP encoder (SPLADE-v3-lexical)",
-                "Binary Bag-of-Words"
+                "Binary Bag-of-Words"
             ]
 
             gr.Interface(