Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -113,7 +113,7 @@ def get_splade_cocondenser_representation(text):
|
|
113 |
|
114 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
115 |
|
116 |
-
formatted_output = "MLM encoder (SPLADE-cocondenser-distil):\n\n"
|
117 |
if not sorted_representation:
|
118 |
formatted_output += "No significant terms found for this input.\n"
|
119 |
else:
|
@@ -121,10 +121,10 @@ def get_splade_cocondenser_representation(text):
|
|
121 |
terms_list = []
|
122 |
for term, weight in sorted_representation:
|
123 |
terms_list.append(f"**{term}**: {weight:.4f}")
|
124 |
-
formatted_output += ", ".join(terms_list) + "."
|
125 |
|
126 |
-
info_output = f"--- Sparse Vector Info ---\n"
|
127 |
-
info_output += f"Total non-zero terms in vector: {len(indices)}\n"
|
128 |
info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade.vocab_size):.2%}\n"
|
129 |
|
130 |
return formatted_output, info_output
|
@@ -171,7 +171,7 @@ def get_splade_lexical_representation(text):
|
|
171 |
|
172 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
173 |
|
174 |
-
formatted_output = "SPLADE-v3-Lexical Representation (Weighting):\n\n"
|
175 |
if not sorted_representation:
|
176 |
formatted_output += "No significant terms found for this input.\n"
|
177 |
else:
|
@@ -179,10 +179,10 @@ def get_splade_lexical_representation(text):
|
|
179 |
terms_list = []
|
180 |
for term, weight in sorted_representation:
|
181 |
terms_list.append(f"**{term}**: {weight:.4f}")
|
182 |
-
formatted_output += ", ".join(terms_list) + "."
|
183 |
|
184 |
-
info_output = f"--- Raw Sparse Vector Info ---\n"
|
185 |
-
info_output += f"Total non-zero terms in vector: {len(indices)}\n"
|
186 |
info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade_lexical.vocab_size):.2%}\n"
|
187 |
|
188 |
return formatted_output, info_output
|
@@ -216,7 +216,7 @@ def get_splade_doc_representation(text):
|
|
216 |
|
217 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[0]) # Sort alphabetically for clarity
|
218 |
|
219 |
-
formatted_output = "Binary Bag-of-Words Representation:\n\n"
|
220 |
if not sorted_representation:
|
221 |
formatted_output += "No significant terms found for this input.\n"
|
222 |
else:
|
@@ -224,11 +224,10 @@ def get_splade_doc_representation(text):
|
|
224 |
terms_list = []
|
225 |
for term, _ in sorted_representation: # For binary, weight is always 1, so no need to display
|
226 |
terms_list.append(f"**{term}**")
|
227 |
-
formatted_output += ", ".join(terms_list) + "."
|
228 |
|
229 |
-
info_output = f"--- Raw Binary Bag-of-Words Vector Info ---\n" #
|
230 |
-
info_output += f"Total activated terms: {len(indices)}\n"
|
231 |
-
info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade_doc.vocab_size):.2%}\n"
|
232 |
|
233 |
return formatted_output, info_output
|
234 |
|
@@ -332,13 +331,12 @@ def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
|
|
332 |
else:
|
333 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
334 |
|
335 |
-
formatted_output = ""
|
336 |
if not sorted_representation:
|
337 |
formatted_output += "No significant terms found.\n"
|
338 |
else:
|
339 |
terms_list = []
|
340 |
for i, (term, weight) in enumerate(sorted_representation):
|
341 |
-
# Limit display for very long lists, but ensure it's still a paragraph if cut
|
342 |
if i >= 50:
|
343 |
terms_list.append(f"...and {len(sorted_representation) - 50} more terms.")
|
344 |
break
|
@@ -346,12 +344,13 @@ def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
|
|
346 |
terms_list.append(f"**{term}**")
|
347 |
else:
|
348 |
terms_list.append(f"**{term}**: {weight:.4f}")
|
349 |
-
formatted_output += ", ".join(terms_list) + "."
|
350 |
|
351 |
-
|
352 |
-
info_output
|
|
|
353 |
|
354 |
-
return formatted_output, info_output
|
355 |
|
356 |
|
357 |
# --- NEW/MODIFIED: Helper to get the correct vector function, tokenizer, and binary flag ---
|
@@ -360,8 +359,8 @@ def get_model_assets(model_choice_str):
|
|
360 |
return get_splade_cocondenser_vector, tokenizer_splade, False, "MLM encoder (SPLADE-cocondenser-distil)"
|
361 |
elif model_choice_str == "MLP encoder (SPLADE-v3-lexical)":
|
362 |
return get_splade_lexical_vector, tokenizer_splade_lexical, False, "MLP encoder (SPLADE-v3-lexical)"
|
363 |
-
elif model_choice_str == "Binary Bag-of-Words":
|
364 |
-
return get_splade_doc_vector, tokenizer_splade_doc, True, "Binary Bag-of-Words"
|
365 |
else:
|
366 |
return None, None, False, "Unknown Model"
|
367 |
|
|
|
113 |
|
114 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
115 |
|
116 |
+
formatted_output = "MLM encoder (SPLADE-cocondenser-distil):\n\n"
|
117 |
if not sorted_representation:
|
118 |
formatted_output += "No significant terms found for this input.\n"
|
119 |
else:
|
|
|
121 |
terms_list = []
|
122 |
for term, weight in sorted_representation:
|
123 |
terms_list.append(f"**{term}**: {weight:.4f}")
|
124 |
+
formatted_output += ", ".join(terms_list) + "."
|
125 |
|
126 |
+
info_output = f"--- Sparse Vector Info ---\n" # Line 1
|
127 |
+
info_output += f"Total non-zero terms in vector: {len(indices)}\n" # Line 2 (and onwards for sparsity)
|
128 |
info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade.vocab_size):.2%}\n"
|
129 |
|
130 |
return formatted_output, info_output
|
|
|
171 |
|
172 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
173 |
|
174 |
+
formatted_output = "SPLADE-v3-Lexical Representation (Weighting):\n\n"
|
175 |
if not sorted_representation:
|
176 |
formatted_output += "No significant terms found for this input.\n"
|
177 |
else:
|
|
|
179 |
terms_list = []
|
180 |
for term, weight in sorted_representation:
|
181 |
terms_list.append(f"**{term}**: {weight:.4f}")
|
182 |
+
formatted_output += ", ".join(terms_list) + "."
|
183 |
|
184 |
+
info_output = f"--- Raw Sparse Vector Info ---\n" # Line 1
|
185 |
+
info_output += f"Total non-zero terms in vector: {len(indices)}\n" # Line 2 (and onwards for sparsity)
|
186 |
info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade_lexical.vocab_size):.2%}\n"
|
187 |
|
188 |
return formatted_output, info_output
|
|
|
216 |
|
217 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[0]) # Sort alphabetically for clarity
|
218 |
|
219 |
+
formatted_output = "Binary Bag-of-Words Representation:\n\n"
|
220 |
if not sorted_representation:
|
221 |
formatted_output += "No significant terms found for this input.\n"
|
222 |
else:
|
|
|
224 |
terms_list = []
|
225 |
for term, _ in sorted_representation: # For binary, weight is always 1, so no need to display
|
226 |
terms_list.append(f"**{term}**")
|
227 |
+
formatted_output += ", ".join(terms_list) + "."
|
228 |
|
229 |
+
info_output = f"--- Raw Binary Bag-of-Words Vector Info ---\n" # Line 1
|
230 |
+
info_output += f"Total activated terms: {len(indices)} Sparsity: {1 - (len(indices) / tokenizer_splade_doc.vocab_size):.2%}\n" # Line 2
|
|
|
231 |
|
232 |
return formatted_output, info_output
|
233 |
|
|
|
331 |
else:
|
332 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
333 |
|
334 |
+
formatted_output = ""
|
335 |
if not sorted_representation:
|
336 |
formatted_output += "No significant terms found.\n"
|
337 |
else:
|
338 |
terms_list = []
|
339 |
for i, (term, weight) in enumerate(sorted_representation):
|
|
|
340 |
if i >= 50:
|
341 |
terms_list.append(f"...and {len(sorted_representation) - 50} more terms.")
|
342 |
break
|
|
|
344 |
terms_list.append(f"**{term}**")
|
345 |
else:
|
346 |
terms_list.append(f"**{term}**: {weight:.4f}")
|
347 |
+
formatted_output += ", ".join(terms_list) + "."
|
348 |
|
349 |
+
# This is the line that will now always be split into two
|
350 |
+
info_output = f"Total non-zero terms: {len(indices)}\n" # Line 1
|
351 |
+
info_output += f"Sparsity: {1 - (len(indices) / tokenizer.vocab_size):.2%}\n" # Line 2
|
352 |
|
353 |
+
return formatted_output, info_output
|
354 |
|
355 |
|
356 |
# --- NEW/MODIFIED: Helper to get the correct vector function, tokenizer, and binary flag ---
|
|
|
359 |
return get_splade_cocondenser_vector, tokenizer_splade, False, "MLM encoder (SPLADE-cocondenser-distil)"
|
360 |
elif model_choice_str == "MLP encoder (SPLADE-v3-lexical)":
|
361 |
return get_splade_lexical_vector, tokenizer_splade_lexical, False, "MLP encoder (SPLADE-v3-lexical)"
|
362 |
+
elif model_choice_str == "Binary Bag-of-Words":
|
363 |
+
return get_splade_doc_vector, tokenizer_splade_doc, True, "Binary Bag-of-Words"
|
364 |
else:
|
365 |
return None, None, False, "Unknown Model"
|
366 |
|