SiddharthAK committed on
Commit 4e0cddb · verified · 1 Parent(s): b0796be

Update app.py

Files changed (1):
  1. app.py +7 -351
app.py CHANGED
@@ -2,12 +2,10 @@ import gradio as gr
  from transformers import AutoTokenizer, AutoModelForMaskedLM
  import torch
  import numpy as np
- from tqdm.auto import tqdm
- import os
- import ir_datasets
- import random # Added for random selection
+ from tqdm.auto import tqdm # Still useful for model loading progress if desired, but not strictly necessary for this simplified version
+ import os # Still useful for general purpose, but not explicitly used in this simplified version
 
- # --- Model Loading (Keep as is) ---
+ # --- Model Loading ---
  tokenizer_splade = None
  model_splade = None
  tokenizer_splade_lexical = None
@@ -48,44 +46,7 @@ except Exception as e:
  print(f"Please ensure '{splade_doc_model_name}' is accessible (check Hugging Face Hub for potential agreements).")
 
 
- # --- Global Variables for Document Index and Qrels ---
- document_representations = {} # Stores {doc_id: sparse_vector}
- document_texts = {} # Stores {doc_id: doc_text}
- queries_texts = {} # Stores {query_id: query_text}
- qrels_data = {} # Stores {query_id: [{doc_id: str, relevance: int}, ...]}
- initial_doc_model_for_indexing = "SPLADE-cocondenser-distil" # Fixed for initial demo index
-
-
- # --- Load Cranfield Corpus, Queries, and Qrels using ir_datasets ---
- def load_cranfield_corpus_ir_datasets():
-     global document_texts, queries_texts, qrels_data
-     print("Loading Cranfield corpus, queries, and qrels using ir_datasets...")
-     try:
-         dataset = ir_datasets.load("cranfield")
-
-         # Load documents
-         for doc in tqdm(dataset.docs_iter(), desc="Loading Cranfield documents"):
-             document_texts[doc.doc_id] = doc.text.strip()
-         print(f"Loaded {len(document_texts)} documents from Cranfield corpus.")
-
-         # Load queries
-         for query in tqdm(dataset.queries_iter(), desc="Loading Cranfield queries"):
-             queries_texts[query.query_id] = query.text.strip()
-         print(f"Loaded {len(queries_texts)} queries from Cranfield corpus.")
-
-         # Load qrels
-         for qrel in tqdm(dataset.qrels_iter(), desc="Loading Cranfield qrels"):
-             if qrel.query_id not in qrels_data:
-                 qrels_data[qrel.query_id] = []
-             qrels_data[qrel.query_id].append({"doc_id": qrel.doc_id, "relevance": qrel.relevance})
-         print(f"Loaded qrels for {len(qrels_data)} queries.")
-
-     except Exception as e:
-         print(f"Error loading Cranfield corpus with ir_datasets: {e}")
-         print("Please ensure 'ir_datasets' is installed and your internet connection is stable.")
-
-
- # --- Helper function for lexical mask (now handles batches) ---
+ # --- Helper function for lexical mask (now handles batches, but used for single input here) ---
  def create_lexical_bow_mask(input_ids_batch, vocab_size, tokenizer):
      """
      Creates a batch of lexical BOW masks.
@@ -118,7 +79,7 @@ def create_lexical_bow_mask(input_ids_batch, vocab_size, tokenizer):
 
 
  # --- Core Representation Functions (Return Formatted Strings - for Explorer Tab) ---
- # These functions still take single text input for the Explorer tab
+ # These functions take single text input for the Explorer tab
  def get_splade_cocondenser_representation(text):
      if tokenizer_splade is None or model_splade is None:
          return "SPLADE-cocondenser-distil model is not loaded. Please check the console for loading errors."
@@ -284,270 +245,10 @@ def predict_representation_explorer(model_choice, text):
      return "Please select a model."
 
 
- # --- Internal Core Representation Functions (now handle batches) ---
- def get_splade_cocondenser_representation_internal(texts, tokenizer, model):
-     """
-     Generates SPLADE representations for a batch of texts.
-     texts: list of strings
-     tokenizer: the tokenizer object
-     model: the SPLADE model
-     Returns: torch.Tensor of shape (batch_size, vocab_size) or None
-     """
-     if tokenizer is None or model is None: return None
-     inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
-     inputs = {k: v.to(model.device) for k, v in inputs.items()}
-
-     with torch.no_grad():
-         output = model(**inputs)
-
-     if hasattr(output, 'logits'):
-         # torch.max(..., dim=1)[0] reduces along the sequence_length dimension,
-         # resulting in (batch_size, vocab_size)
-         splade_vectors = torch.max(
-             torch.log(1 + torch.relu(output.logits)) * inputs['attention_mask'].unsqueeze(-1),
-             dim=1
-         )[0]
-         return splade_vectors
-     else:
-         print("Model output structure not as expected for SPLADE-cocondenser-distil. 'logits' not found.")
-         return None
-
- def get_splade_lexical_representation_internal(texts, tokenizer, model):
-     """
-     Generates SPLADE-Lexical representations for a batch of texts.
-     texts: list of strings
-     tokenizer: the tokenizer object
-     model: the SPLADE-Lexical model
-     Returns: torch.Tensor of shape (batch_size, vocab_size) or None
-     """
-     if tokenizer is None or model is None: return None
-     inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
-     inputs = {k: v.to(model.device) for k, v in inputs.items()}
-     with torch.no_grad(): output = model(**inputs)
-     if hasattr(output, 'logits'):
-         splade_vectors = torch.max(torch.log(1 + torch.relu(output.logits)) * inputs['attention_mask'].unsqueeze(-1), dim=1)[0]
-         vocab_size = tokenizer.vocab_size
-         # create_lexical_bow_mask now returns (batch_size, vocab_size)
-         bow_masks = create_lexical_bow_mask(inputs['input_ids'], vocab_size, tokenizer)
-         splade_vectors = splade_vectors * bow_masks # Element-wise multiplication, shapes (batch_size, vocab_size)
-         return splade_vectors
-     else:
-         print("Model output structure not as expected for SPLADE-v3-Lexical. 'logits' not found.")
-         return None
-
- def get_splade_doc_representation_internal(texts, tokenizer, model):
-     """
-     Generates SPLADE-Doc (binary) representations for a batch of texts.
-     texts: list of strings
-     tokenizer: the tokenizer object
-     model: the SPLADE-Doc model (not directly used for logits, but for device)
-     Returns: torch.Tensor of shape (batch_size, vocab_size) or None
-     """
-     if tokenizer is None or model is None: return None
-     inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
-     inputs = {k: v.to(model.device) for k, v in inputs.items()} # Ensure inputs are on the correct device
-     vocab_size = tokenizer.vocab_size
-     # create_lexical_bow_mask now returns (batch_size, vocab_size)
-     binary_splade_vectors = create_lexical_bow_mask(inputs['input_ids'], vocab_size, tokenizer)
-     return binary_splade_vectors
-
-
- # --- Document Indexing Function (now uses batching) ---
- def index_documents(doc_model_choice):
-     global document_representations
-     if document_representations:
-         print("Documents already indexed. Skipping re-indexing.")
-         return True
-
-     tokenizer_to_use = None
-     model_to_use = None
-     representation_func_to_use = None
-
-     if doc_model_choice == "SPLADE-cocondenser-distil":
-         if tokenizer_splade is None or model_splade is None:
-             print("SPLADE-cocondenser-distil model not loaded for indexing.")
-             return False
-         tokenizer_to_use = tokenizer_splade
-         model_to_use = model_splade
-         representation_func_to_use = get_splade_cocondenser_representation_internal
-     elif doc_model_choice == "SPLADE-v3-Lexical":
-         if tokenizer_splade_lexical is None or model_splade_lexical is None:
-             print("SPLADE-v3-Lexical model not loaded for indexing.")
-             return False
-         tokenizer_to_use = tokenizer_splade_lexical
-         model_to_use = model_splade_lexical
-         representation_func_to_use = get_splade_lexical_representation_internal
-     elif doc_model_choice == "SPLADE-v3-Doc":
-         if tokenizer_splade_doc is None or model_splade_doc is None:
-             print("SPLADE-v3-Doc model not loaded for indexing.")
-             return False
-         tokenizer_to_use = tokenizer_splade_doc
-         model_to_use = model_splade_doc
-         representation_func_to_use = get_splade_doc_representation_internal
-     else:
-         print(f"Invalid model choice for document indexing: {doc_model_choice}")
-         return False
-
-     print(f"Indexing documents using {doc_model_choice}...")
-
-     doc_ids_list = list(document_texts.keys())
-     doc_texts_list = list(document_texts.values())
-
-     # --- BATCH SIZE FOR INDEXING ---
-     batch_size = 32 # You can adjust this value based on memory and performance
-
-     document_representations = {} # Ensure it's clear we're (re)building the index
-
-     # Iterate through documents in batches
-     for i in tqdm(range(0, len(doc_ids_list), batch_size), desc="Indexing Documents in Batches"):
-         batch_doc_ids = doc_ids_list[i:i + batch_size]
-         batch_doc_texts = doc_texts_list[i:i + batch_size]
-
-         sparse_vectors_batch = representation_func_to_use(batch_doc_texts, tokenizer_to_use, model_to_use)
-
-         if sparse_vectors_batch is not None:
-             # sparse_vectors_batch will have shape (batch_size, vocab_size)
-             for j, doc_id in enumerate(batch_doc_ids):
-                 # Store each document's vector
-                 document_representations[doc_id] = sparse_vectors_batch[j].cpu()
-         else:
-             print(f"Warning: Failed to get representation for a batch starting with doc_id {batch_doc_ids[0]}")
-
-     print(f"Finished indexing {len(document_representations)} documents.")
-     return True
-
- # --- Retrieval Function (for Retrieval Tab) ---
- def retrieve_documents(query_text, query_model_choice, indexed_doc_model_name, top_k=5):
-     if not document_representations:
-         return "Document index is not loaded or empty. Please ensure documents are indexed.", []
-
-     query_vector = None
-     query_tokenizer = None
-     query_model = None
-
-     # These internal calls still use single text input for the query
-     if query_model_choice == "SPLADE-cocondenser-distil (weighting and expansion)":
-         query_tokenizer = tokenizer_splade
-         query_model = model_splade
-         query_vector = get_splade_cocondenser_representation_internal([query_text], query_tokenizer, query_model)
-     elif query_model_choice == "SPLADE-v3-Lexical (weighting)":
-         query_tokenizer = tokenizer_splade_lexical
-         query_model = model_splade_lexical
-         query_vector = get_splade_lexical_representation_internal([query_text], query_tokenizer, query_model)
-     elif query_model_choice == "SPLADE-v3-Doc (binary)":
-         query_tokenizer = tokenizer_splade_doc
-         query_model = model_splade_doc
-         query_vector = get_splade_doc_representation_internal([query_text], query_tokenizer, query_model)
-     else:
-         return "Invalid query model choice.", []
-
-     if query_vector is None:
-         return "Failed to get query representation. Check console for model loading errors.", []
-
-     # Since internal functions now return batches, take the first (and only) item for single query
-     query_vector = query_vector.squeeze(0).cpu()
-
-     scores = {}
-     for doc_id, doc_vec in document_representations.items():
-         score = torch.dot(query_vector, doc_vec).item()
-         scores[doc_id] = score
-
-     sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)
-     top_results = sorted_scores[:top_k]
-
-     formatted_output = f"Retrieval Results for Query: '{query_text}'\n"
-     formatted_output += f"Using Query Model: **{query_model_choice}**\n"
-     formatted_output += f"Documents Indexed with: **{indexed_doc_model_name}**\n\n"
-
-     if not top_results:
-         formatted_output += "No documents found or scored.\n"
-     else:
-         for i, (doc_id, score) in enumerate(top_results):
-             doc_text = document_texts.get(doc_id, "Document text not available.")
-             formatted_output += f"**{i+1}. Document ID: {doc_id}** (Score: {score:.4f})\n"
-             formatted_output += f"> {doc_text[:300]}...\n\n"
-
-     return formatted_output, top_results
-
- # --- Unified Prediction Function for Gradio (for Retrieval Tab) ---
- def predict_retrieval_gradio(query_text, query_model_choice, selected_doc_model_display_only):
-     formatted_output, _ = retrieve_documents(query_text, query_model_choice, initial_doc_model_for_indexing, top_k=5)
-     return formatted_output
-
- # --- New function to get specific retrieval examples ---
- def get_specific_retrieval_examples():
-     if not queries_texts or not qrels_data or not document_texts:
-         return "Queries, qrels, or documents not loaded. Please check initial loading."
-
-     high_qrel_threshold = 3 # Relevance score of 3 or 4 for Cranfield is generally considered high
-     low_qrel_threshold = 1 # Relevance score of 0 or 1 for Cranfield is generally considered low
-
-     eligible_query_ids = []
-     for qid, qrels in qrels_data.items():
-         has_high_qrel = any(item['relevance'] >= high_qrel_threshold for item in qrels)
-         has_low_qrel = any(item['relevance'] <= low_qrel_threshold for item in qrels)
-         if has_high_qrel and has_low_qrel:
-             eligible_query_ids.append(qid)
-
-     if not eligible_query_ids:
-         return "Could not find a query with both high and low relevance documents in the loaded qrels."
-
-     # Pick a random eligible query
-     random_query_id = random.choice(eligible_query_ids)
-     full_query_text = queries_texts.get(random_query_id, "Query text not found.")
-     query_snippet = full_query_text[:300] + "..." if len(full_query_text) > 300 else full_query_text
-
-     qrels_for_query = qrels_data[random_query_id]
-
-     high_qrel_docs = [item for item in qrels_for_query if item['relevance'] >= high_qrel_threshold]
-     low_qrel_docs = [item for item in qrels_for_query if item['relevance'] <= low_qrel_threshold]
-
-     selected_high_doc_id = random.choice(high_qrel_docs)['doc_id'] if high_qrel_docs else None
-     selected_low_doc_id = random.choice(low_qrel_docs)['doc_id'] if low_qrel_docs else None
-
-     output_str = f"### Random Query Example\n\n"
-     output_str += f"**Query ID:** {random_query_id}\n"
-     output_str += f"**Query Snippet:** {query_snippet}\n\n" # Changed to snippet
-
-     if selected_high_doc_id:
-         full_doc_text = document_texts.get(selected_high_doc_id, "Document text not available.")
-         doc_snippet = full_doc_text[:500] + "..." if len(full_doc_text) > 500 else full_doc_text
-         output_str += f"### Highly Relevant Document (Qrel >= {high_qrel_threshold})\n"
-         output_str += f"**Document ID:** {selected_high_doc_id}\n"
-         output_str += f"**Document Snippet:** {doc_snippet}\n\n" # Changed to snippet
-     else:
-         output_str += "No highly relevant document found for this query.\n\n"
-
-     if selected_low_doc_id:
-         full_doc_text = document_texts.get(selected_low_doc_id, "Document text not available.")
-         doc_snippet = full_doc_text[:500] + "..." if len(full_doc_text) > 500 else full_doc_text
-         output_str += f"### Lowly Relevant Document (Qrel <= {low_qrel_threshold})\n"
-         output_str += f"**Document ID:** {selected_low_doc_id}\n"
-         output_str += f"**Document Snippet:** {doc_snippet}\n\n" # Changed to snippet
-     else:
-         output_str += "No lowly relevant document found for this query.\n\n"
-
-     return output_str
-
-
- # --- Initial Load and Indexing Calls ---
- # This part runs once when the app starts.
- load_cranfield_corpus_ir_datasets()
-
- if initial_doc_model_for_indexing == "SPLADE-cocondenser-distil" and model_splade is not None:
-     index_documents(initial_doc_model_for_indexing)
- elif initial_doc_model_for_indexing == "SPLADE-v3-Lexical" and model_splade_lexical is not None:
-     index_documents(initial_doc_model_for_indexing)
- elif initial_doc_model_for_indexing == "SPLADE-v3-Doc" and model_splade_doc is not None:
-     index_documents(initial_doc_model_for_indexing)
- else:
-     print(f"Skipping document indexing: Model '{initial_doc_model_for_indexing}' failed to load or is not a valid choice for indexing.")
-
-
  # --- Gradio Interface Setup with Tabs ---
  with gr.Blocks(title="SPLADE Demos") as demo:
-     gr.Markdown("# 🌌 SPLADE Demos: Sparse Representation Explorer & Document Retrieval")
-     gr.Markdown("Explore different SPLADE models and their sparse representation types, or perform document retrieval on a test collection.")
+     gr.Markdown("# 🌌 SPLADE Demos: Sparse Representation Explorer") # Updated title
+     gr.Markdown("Explore different SPLADE models and their sparse representation types.") # Updated description
 
      with gr.Tabs():
          with gr.TabItem("Sparse Representation Explorer"):
@@ -575,49 +276,4 @@ with gr.Blocks(title="SPLADE Demos") as demo:
              # live=True # Setting live=True might be slow for complex models on every keystroke
          )
 
-         with gr.TabItem("Document Retrieval Demo"):
-             gr.Markdown("### Retrieve Documents from Cranfield Collection")
-             gr.Interface(
-                 fn=predict_retrieval_gradio,
-                 inputs=[
-                     gr.Textbox(
-                         lines=3,
-                         label="Enter your query text here:",
-                         placeholder="e.g., Does high-dose vitamin C cure cancer?"
-                     ),
-                     gr.Radio(
-                         [
-                             "SPLADE-cocondenser-distil (weighting and expansion)",
-                             "SPLADE-v3-Lexical (weighting)",
-                             "SPLADE-v3-Doc (binary)"
-                         ],
-                         label="Choose Query Representation Model",
-                         value="SPLADE-cocondenser-distil (weighting and expansion)"
-                     ),
-                     gr.Radio(
-                         [
-                             "SPLADE-cocondenser-distil",
-                             "SPLADE-v3-Lexical",
-                             "SPLADE-v3-Doc"
-                         ],
-                         label=f"Document Index Model (Pre-indexed with: {initial_doc_model_for_indexing})",
-                         value=initial_doc_model_for_indexing,
-                         interactive=False # This radio is fixed for simplicity
-                     )
-                 ],
-                 outputs=gr.Markdown(),
-                 allow_flagging="never",
-                 # live=True # retrieval is too heavy for live
-             )
-
-             gr.Markdown("---") # Separator
-             gr.Markdown("### Get Specific Retrieval Examples")
-             specific_example_output = gr.Markdown()
-             specific_example_button = gr.Button("Get Random Query with High/Low Qrel Docs")
-             specific_example_button.click(
-                 fn=get_specific_retrieval_examples,
-                 inputs=[],
-                 outputs=specific_example_output
-             )
-
  demo.launch()
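
A few notes on the code touched by this commit. The log-saturation and max-pooling step shared by the removed batch functions (and the surviving Explorer-tab functions) is easy to reproduce standalone. A minimal sketch, using the public naver/splade-cocondenser-ensembledistil checkpoint as an assumed stand-in, since the Space's exact model names are defined outside the hunks shown here:

```python
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Assumed checkpoint: any SPLADE-style MLM head behaves the same way here.
model_name = "naver/splade-cocondenser-ensembledistil"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

texts = ["sparse retrieval with learned term weighting"]
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    logits = model(**inputs).logits                         # (batch, seq_len, vocab)

weights = torch.log1p(torch.relu(logits))                   # log(1 + ReLU(logits))
weights = weights * inputs["attention_mask"].unsqueeze(-1)  # zero out padding positions
splade_vec = weights.max(dim=1).values                      # max-pool over seq_len -> (batch, vocab)

# Show the strongest terms in the (very sparse) vector.
vals = splade_vec[0]
nonzero = vals.nonzero().squeeze(-1)
top = nonzero[vals[nonzero].argsort(descending=True)][:10]
print([(tokenizer.convert_ids_to_tokens([i.item()])[0], round(vals[i].item(), 3)) for i in top])
```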
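The create_lexical_bow_mask helper is kept by this commit, but its body is truncated in the hunks above. The sketch below is one plausible implementation of what its docstring and call sites describe, a binary (batch_size, vocab_size) bag-of-words mask with special tokens zeroed, and is an assumption rather than the file's actual code:

```python
import torch

def bow_mask_sketch(input_ids_batch: torch.Tensor, vocab_size: int, tokenizer) -> torch.Tensor:
    # Hypothetical equivalent of create_lexical_bow_mask: mark every token id
    # that appears in each row, then drop [CLS]/[SEP]/[PAD]/etc.
    mask = torch.zeros(input_ids_batch.size(0), vocab_size, device=input_ids_batch.device)
    mask.scatter_(1, input_ids_batch, 1.0)           # set mask[b, id] = 1 for each id in row b
    for special_id in tokenizer.all_special_ids:     # special tokens carry no lexical signal
        if 0 <= special_id < vocab_size:
            mask[:, special_id] = 0.0
    return mask
```

Multiplying the SPLADE activations by such a mask, as the removed get_splade_lexical_representation_internal did, keeps term weighting but disables expansion to terms absent from the input.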
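The removed retrieve_documents ranked by exhaustive sparse dot products, one torch.dot call per document in a Python loop. The same ranking can be computed in a single matrix-vector product; a hedged sketch, where doc_ids and doc_matrix are illustrative names built from the removed document_representations dict:

```python
import torch

def rank_documents(query_vec: torch.Tensor, doc_ids: list, doc_matrix: torch.Tensor, top_k: int = 5):
    # doc_matrix: (num_docs, vocab_size), rows stacked in the same order as doc_ids.
    scores = doc_matrix @ query_vec                      # one dot product per row
    top = torch.topk(scores, k=min(top_k, len(doc_ids)))
    return [(doc_ids[i], v.item()) for i, v in zip(top.indices.tolist(), top.values)]

# Usage sketch, assuming an index built as in the removed index_documents():
# doc_ids = list(document_representations.keys())
# doc_matrix = torch.stack([document_representations[d] for d in doc_ids])
# results = rank_documents(query_vector.squeeze(0).cpu(), doc_ids, doc_matrix)
```

For a corpus as small as Cranfield the per-document loop is acceptable; the stacked form mainly matters for larger collections or repeated queries.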
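Lastly, the retrieval tab's button wiring removed above follows the standard Gradio Blocks pattern. A self-contained skeleton with a hypothetical stand-in handler in place of the removed get_specific_retrieval_examples:

```python
import gradio as gr

def show_example() -> str:
    # Stand-in for the removed get_specific_retrieval_examples handler.
    return "### Random Query Example\n(placeholder output)"

with gr.Blocks(title="SPLADE Demos") as demo:
    with gr.Tabs():
        with gr.TabItem("Document Retrieval Demo"):
            example_output = gr.Markdown()
            example_button = gr.Button("Get Random Query with High/Low Qrel Docs")
            # click() routes the button press to the handler: no inputs, Markdown out.
            example_button.click(fn=show_example, inputs=[], outputs=example_output)

demo.launch()
```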
 