SiddharthAK committed on
Commit
ea39258
·
verified ·
1 Parent(s): da0c779

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -21
app.py CHANGED
@@ -113,7 +113,7 @@ def get_splade_cocondenser_representation(text):
113
 
114
  sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
115
 
116
- formatted_output = "MLM encoder (SPLADE-cocondenser-distil):\n\n" # Added newline
117
  if not sorted_representation:
118
  formatted_output += "No significant terms found for this input.\n"
119
  else:
@@ -121,10 +121,10 @@ def get_splade_cocondenser_representation(text):
121
  terms_list = []
122
  for term, weight in sorted_representation:
123
  terms_list.append(f"**{term}**: {weight:.4f}")
124
- formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
125
 
126
- info_output = f"--- Sparse Vector Info ---\n"
127
- info_output += f"Total non-zero terms in vector: {len(indices)}\n"
128
  info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade.vocab_size):.2%}\n"
129
 
130
  return formatted_output, info_output
@@ -171,7 +171,7 @@ def get_splade_lexical_representation(text):
171
 
172
  sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
173
 
174
- formatted_output = "SPLADE-v3-Lexical Representation (Weighting):\n\n" # Added newline
175
  if not sorted_representation:
176
  formatted_output += "No significant terms found for this input.\n"
177
  else:
@@ -179,10 +179,10 @@ def get_splade_lexical_representation(text):
179
  terms_list = []
180
  for term, weight in sorted_representation:
181
  terms_list.append(f"**{term}**: {weight:.4f}")
182
- formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
183
 
184
- info_output = f"--- Raw Sparse Vector Info ---\n"
185
- info_output += f"Total non-zero terms in vector: {len(indices)}\n"
186
  info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade_lexical.vocab_size):.2%}\n"
187
 
188
  return formatted_output, info_output
@@ -216,7 +216,7 @@ def get_splade_doc_representation(text):
216
 
217
  sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[0]) # Sort alphabetically for clarity
218
 
219
- formatted_output = "Binary Bag-of-Words Representation:\n\n" # Changed title, added newline
220
  if not sorted_representation:
221
  formatted_output += "No significant terms found for this input.\n"
222
  else:
@@ -224,11 +224,10 @@ def get_splade_doc_representation(text):
224
  terms_list = []
225
  for term, _ in sorted_representation: # For binary, weight is always 1, so no need to display
226
  terms_list.append(f"**{term}**")
227
- formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
228
 
229
- info_output = f"--- Raw Binary Bag-of-Words Vector Info ---\n" # Changed title
230
- info_output += f"Total activated terms: {len(indices)}\n"
231
- info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade_doc.vocab_size):.2%}\n"
232
 
233
  return formatted_output, info_output
234
 
@@ -332,13 +331,12 @@ def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
332
  else:
333
  sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
334
 
335
- formatted_output = "" # Removed initial newline to allow control outside
336
  if not sorted_representation:
337
  formatted_output += "No significant terms found.\n"
338
  else:
339
  terms_list = []
340
  for i, (term, weight) in enumerate(sorted_representation):
341
- # Limit display for very long lists, but ensure it's still a paragraph if cut
342
  if i >= 50:
343
  terms_list.append(f"...and {len(sorted_representation) - 50} more terms.")
344
  break
@@ -346,12 +344,13 @@ def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
346
  terms_list.append(f"**{term}**")
347
  else:
348
  terms_list.append(f"**{term}**: {weight:.4f}")
349
- formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
350
 
351
- info_output = f"Total non-zero terms: {len(indices)}\n"
352
- info_output += f"Sparsity: {1 - (len(indices) / tokenizer.vocab_size):.2%}\n"
 
353
 
354
- return formatted_output, info_output # Now returns two strings
355
 
356
 
357
  # --- NEW/MODIFIED: Helper to get the correct vector function, tokenizer, binary flag, and model name ---
@@ -360,8 +359,8 @@ def get_model_assets(model_choice_str):
360
  return get_splade_cocondenser_vector, tokenizer_splade, False, "MLM encoder (SPLADE-cocondenser-distil)"
361
  elif model_choice_str == "MLP encoder (SPLADE-v3-lexical)":
362
  return get_splade_lexical_vector, tokenizer_splade_lexical, False, "MLP encoder (SPLADE-v3-lexical)"
363
- elif model_choice_str == "Binary Bag-of-Words": # Changed name
364
- return get_splade_doc_vector, tokenizer_splade_doc, True, "Binary Bag-of-Words" # Changed name
365
  else:
366
  return None, None, False, "Unknown Model"
367
 
 
113
 
114
  sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
115
 
116
+ formatted_output = "MLM encoder (SPLADE-cocondenser-distil):\n\n"
117
  if not sorted_representation:
118
  formatted_output += "No significant terms found for this input.\n"
119
  else:
 
121
  terms_list = []
122
  for term, weight in sorted_representation:
123
  terms_list.append(f"**{term}**: {weight:.4f}")
124
+ formatted_output += ", ".join(terms_list) + "."
125
 
126
+ info_output = f"--- Sparse Vector Info ---\n" # Line 1
127
+ info_output += f"Total non-zero terms in vector: {len(indices)}\n" # Line 2 (and onwards for sparsity)
128
  info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade.vocab_size):.2%}\n"
129
 
130
  return formatted_output, info_output
 
171
 
172
  sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
173
 
174
+ formatted_output = "SPLADE-v3-Lexical Representation (Weighting):\n\n"
175
  if not sorted_representation:
176
  formatted_output += "No significant terms found for this input.\n"
177
  else:
 
179
  terms_list = []
180
  for term, weight in sorted_representation:
181
  terms_list.append(f"**{term}**: {weight:.4f}")
182
+ formatted_output += ", ".join(terms_list) + "."
183
 
184
+ info_output = f"--- Raw Sparse Vector Info ---\n" # Line 1
185
+ info_output += f"Total non-zero terms in vector: {len(indices)}\n" # Line 2 (and onwards for sparsity)
186
  info_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade_lexical.vocab_size):.2%}\n"
187
 
188
  return formatted_output, info_output
 
216
 
217
  sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[0]) # Sort alphabetically for clarity
218
 
219
+ formatted_output = "Binary Bag-of-Words Representation:\n\n"
220
  if not sorted_representation:
221
  formatted_output += "No significant terms found for this input.\n"
222
  else:
 
224
  terms_list = []
225
  for term, _ in sorted_representation: # For binary, weight is always 1, so no need to display
226
  terms_list.append(f"**{term}**")
227
+ formatted_output += ", ".join(terms_list) + "."
228
 
229
+ info_output = f"--- Raw Binary Bag-of-Words Vector Info ---\n" # Line 1
230
+ info_output += f"Total activated terms: {len(indices)} Sparsity: {1 - (len(indices) / tokenizer_splade_doc.vocab_size):.2%}\n" # Line 2
 
231
 
232
  return formatted_output, info_output
233
 
 
331
  else:
332
  sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
333
 
334
+ formatted_output = ""
335
  if not sorted_representation:
336
  formatted_output += "No significant terms found.\n"
337
  else:
338
  terms_list = []
339
  for i, (term, weight) in enumerate(sorted_representation):
 
340
  if i >= 50:
341
  terms_list.append(f"...and {len(sorted_representation) - 50} more terms.")
342
  break
 
344
  terms_list.append(f"**{term}**")
345
  else:
346
  terms_list.append(f"**{term}**: {weight:.4f}")
347
+ formatted_output += ", ".join(terms_list) + "."
348
 
349
+ # This is the line that will now always be split into two
350
+ info_output = f"Total non-zero terms: {len(indices)}\n" # Line 1
351
+ info_output += f"Sparsity: {1 - (len(indices) / tokenizer.vocab_size):.2%}\n" # Line 2
352
 
353
+ return formatted_output, info_output
354
 
355
 
356
  # --- NEW/MODIFIED: Helper to get the correct vector function, tokenizer, binary flag, and model name ---
 
359
  return get_splade_cocondenser_vector, tokenizer_splade, False, "MLM encoder (SPLADE-cocondenser-distil)"
360
  elif model_choice_str == "MLP encoder (SPLADE-v3-lexical)":
361
  return get_splade_lexical_vector, tokenizer_splade_lexical, False, "MLP encoder (SPLADE-v3-lexical)"
362
+ elif model_choice_str == "Binary Bag-of-Words":
363
+ return get_splade_doc_vector, tokenizer_splade_doc, True, "Binary Bag-of-Words"
364
  else:
365
  return None, None, False, "Unknown Model"
366