SiddharthAK committed on
Commit ab88097 · verified · 1 Parent(s): c1ea33e

removed UNICOIL, added SPLADE-v3-lexical

Files changed (1): app.py (+53, -66)
app.py CHANGED
@@ -5,39 +5,36 @@ import torch
 # --- Model Loading ---
 tokenizer_splade = None
 model_splade = None
-tokenizer_unicoil = None
-model_unicoil = None
+tokenizer_splade_lexical = None
+model_splade_lexical = None
 
-# Load SPLADE v3 model
+# Load SPLADE v3 model (original)
 try:
     tokenizer_splade = AutoTokenizer.from_pretrained("naver/splade-cocondenser-selfdistil")
     model_splade = AutoModelForMaskedLM.from_pretrained("naver/splade-cocondenser-selfdistil")
     model_splade.eval()  # Set to evaluation mode for inference
-    print("SPLADE v3 model loaded successfully!")
+    print("SPLADE v3 (cocondenser) model loaded successfully!")
 except Exception as e:
-    print(f"Error loading SPLADE model: {e}")
+    print(f"Error loading SPLADE (cocondenser) model: {e}")
     print("Please ensure you have accepted any user access agreements on the Hugging Face Hub page for 'naver/splade-cocondenser-selfdistil'.")
 
-# Load UNICOIL model for binary sparse encoding
-# Load UNICOIL model for binary sparse encoding
+# Load SPLADE v3 Lexical model
 try:
-    unicoil_model_name = "castorini/unicoil-msmarco-passage"
-    tokenizer_unicoil = AutoTokenizer.from_pretrained(unicoil_model_name)
-    # --- FIX IS HERE ---
-    model_unicoil = AutoModelForMaskedLM.from_pretrained(unicoil_model_name)
-    # -------------------
-    model_unicoil.eval()  # Set to evaluation mode for inference
-    print(f"UNICOIL model '{unicoil_model_name}' loaded successfully!")
+    splade_lexical_model_name = "naver/splade-v3-lexical"
+    tokenizer_splade_lexical = AutoTokenizer.from_pretrained(splade_lexical_model_name)
+    model_splade_lexical = AutoModelForMaskedLM.from_pretrained(splade_lexical_model_name)
+    model_splade_lexical.eval()  # Set to evaluation mode for inference
+    print(f"SPLADE v3 Lexical model '{splade_lexical_model_name}' loaded successfully!")
 except Exception as e:
-    print(f"Error loading UNICOIL model: {e}")
-    print(f"Please ensure '{unicoil_model_name}' is accessible (check Hugging Face Hub for potential agreements).")
+    print(f"Error loading SPLADE v3 Lexical model: {e}")
+    print(f"Please ensure '{splade_lexical_model_name}' is accessible (check Hugging Face Hub for potential agreements).")
 
 
 # --- Core Representation Functions ---
 
 def get_splade_representation(text):
     if tokenizer_splade is None or model_splade is None:
-        return "SPLADE model is not loaded. Please check the console for loading errors."
+        return "SPLADE (cocondenser) model is not loaded. Please check the console for loading errors."
 
     inputs = tokenizer_splade(text, return_tensors="pt", padding=True, truncation=True)
     inputs = {k: v.to(model_splade.device) for k, v in inputs.items()}
@@ -51,7 +48,7 @@ def get_splade_representation(text):
             dim=1
         )[0].squeeze()
     else:
-        return "Model output structure not as expected for SPLADE. 'logits' not found."
+        return "Model output structure not as expected for SPLADE (cocondenser). 'logits' not found."
 
     indices = torch.nonzero(splade_vector).squeeze().cpu().tolist()
     if not isinstance(indices, list):
@@ -68,7 +65,7 @@ def get_splade_representation(text):
 
     sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
 
-    formatted_output = "SPLADE Representation (All Non-Zero Terms):\n"
+    formatted_output = "SPLADE (cocondenser) Representation (All Non-Zero Terms):\n"
     if not sorted_representation:
         formatted_output += "No significant terms found for this input.\n"
     else:
@@ -82,69 +79,59 @@ def get_splade_representation(text):
     return formatted_output
 
 
-
-
-def get_unicoil_binary_representation(text):
-    if tokenizer_unicoil is None or model_unicoil is None:
-        return "UNICOIL model is not loaded. Please check the console for loading errors."
-
-    inputs = tokenizer_unicoil(text, return_tensors="pt", padding=True, truncation=True)
-    input_ids = inputs["input_ids"]
-    attention_mask = inputs["attention_mask"]
-    inputs = {k: v.to(model_unicoil.device) for k, v in inputs.items()}
+def get_splade_lexical_representation(text):
+    if tokenizer_splade_lexical is None or model_splade_lexical is None:
+        return "SPLADE v3 Lexical model is not loaded. Please check the console for loading errors."
+
+    inputs = tokenizer_splade_lexical(text, return_tensors="pt", padding=True, truncation=True)
+    inputs = {k: v.to(model_splade_lexical.device) for k, v in inputs.items()}
 
     with torch.no_grad():
-        output = model_unicoil(**inputs)
+        output = model_splade_lexical(**inputs)
 
-    if not hasattr(output, "logits"):
-        return "UNICOIL model output structure not as expected. 'logits' not found."
-
-    logits = output.logits.squeeze(0)    # [seq_len, vocab_size]
-    token_ids = input_ids.squeeze(0)     # [seq_len]
-    mask = attention_mask.squeeze(0)     # [seq_len]
+    if hasattr(output, 'logits'):
+        splade_vector = torch.max(
+            torch.log(1 + torch.relu(output.logits)) * inputs['attention_mask'].unsqueeze(-1),
+            dim=1
+        )[0].squeeze()
+    else:
+        return "Model output structure not as expected for SPLADE v3 Lexical. 'logits' not found."
 
-    transformed_scores = torch.log(1 + torch.exp(logits))  # softplus
-    token_scores = transformed_scores[range(len(token_ids)), token_ids]  # only scores for input tokens
-    token_scores = token_scores * mask   # mask out padding
+    indices = torch.nonzero(splade_vector).squeeze().cpu().tolist()
+    if not isinstance(indices, list):
+        indices = [indices]
 
-    # Binarize: threshold scores > 0.5 (tune as needed)
-    binary_mask = (token_scores > 0.5)
-    activated_token_ids = token_ids[binary_mask].cpu().tolist()
+    values = splade_vector[indices].cpu().tolist()
+    token_weights = dict(zip(indices, values))
 
-    # Map token ids to strings
-    binary_terms = {}
-    for token_id in activated_token_ids:
-        decoded_token = tokenizer_unicoil.decode([token_id])
+    meaningful_tokens = {}
+    for token_id, weight in token_weights.items():
+        decoded_token = tokenizer_splade_lexical.decode([token_id])
         if decoded_token not in ["[CLS]", "[SEP]", "[PAD]", "[UNK]"] and len(decoded_token.strip()) > 0:
-            binary_terms[decoded_token] = 1
+            meaningful_tokens[decoded_token] = weight
 
-    sorted_binary_terms = sorted(binary_terms.items(), key=lambda item: item[0])
+    sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
 
-    formatted_output = "UNICOIL Binary Sparse Representation (Activated Terms):\n"
-    if not sorted_binary_terms:
-        formatted_output += "No significant terms activated for this input.\n"
+    formatted_output = "SPLADE v3 Lexical Representation (All Non-Zero Terms):\n"
+    if not sorted_representation:
+        formatted_output += "No significant terms found for this input.\n"
     else:
-        for i, (term, _) in enumerate(sorted_binary_terms):
-            if i >= 50:
-                formatted_output += f"...and {len(sorted_binary_terms) - 50} more terms.\n"
-                break
-            formatted_output += f"- **{term}**\n"
+        for term, weight in sorted_representation:
+            formatted_output += f"- **{term}**: {weight:.4f}\n"
 
-    formatted_output += "\n--- Raw Binary Sparse Vector Info ---\n"
-    formatted_output += f"Total activated terms: {len(sorted_binary_terms)}\n"
-    formatted_output += f"Sparsity: {1 - (len(sorted_binary_terms) / tokenizer_unicoil.vocab_size):.2%}\n"
+    formatted_output += "\n--- Raw SPLADE Vector Info ---\n"
+    formatted_output += f"Total non-zero terms in vector: {len(indices)}\n"
+    formatted_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade_lexical.vocab_size):.2%}\n"
 
     return formatted_output
 
 
-
-
 # --- Unified Prediction Function for Gradio ---
 def predict_representation(model_choice, text):
-    if model_choice == "SPLADE":
+    if model_choice == "SPLADE (cocondenser)":
         return get_splade_representation(text)
-    elif model_choice == "UNICOIL (Binary Sparse)":
-        return get_unicoil_binary_representation(text)
+    elif model_choice == "SPLADE-v3-Lexical":
+        return get_splade_lexical_representation(text)
     else:
         return "Please select a model."
 
@@ -153,9 +140,9 @@ demo = gr.Interface(
     fn=predict_representation,
     inputs=[
         gr.Radio(
-            ["SPLADE", "UNICOIL"],  # Added UNICOIL option
+            ["SPLADE (cocondenser)", "SPLADE-v3-Lexical"],  # Updated options
             label="Choose Representation Model",
-            value="SPLADE"  # Default selection
+            value="SPLADE (cocondenser)"  # Default selection
         ),
         gr.Textbox(
             lines=5,
@@ -165,7 +152,7 @@ demo = gr.Interface(
     ],
     outputs=gr.Markdown(),
     title="🌌 Sparse and Binary Sparse Representation Generator",
-    description="Enter any text to see its SPLADE sparse vector or UNICOIL binary sparse representation.",
+    description="Enter any text to see its SPLADE sparse vector or SPLADE-v3-Lexical representation.",
     allow_flagging="never"
 )
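For completeness, a minimal standalone sketch (not part of the commit) for smoke-testing the newly added model outside the Gradio UI. The model name and pooling mirror the diff; the query string and `k=10` are arbitrary illustration choices:

```python
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

model_name = "naver/splade-v3-lexical"  # the model added in this commit
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)
model.eval()

inputs = tokenizer("sparse retrieval with learned term weights",
                   return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    logits = model(**inputs).logits

# Same pooling as get_splade_lexical_representation in the diff.
vec = torch.max(
    torch.log(1 + torch.relu(logits)) * inputs["attention_mask"].unsqueeze(-1),
    dim=1,
)[0].squeeze()

# Print the ten highest-weighted vocabulary terms.
top = torch.topk(vec, k=10)
for weight, token_id in zip(top.values.tolist(), top.indices.tolist()):
    print(f"{tokenizer.decode([token_id]):<15} {weight:.4f}")
```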