seanpedrickcase committed on
Commit
4be845d
·
1 Parent(s): cced757

Validation processes can now be run independently of available logging data

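In practice, validation no longer depends on the logged LLM responses from the original extraction run: when logged_content is None, the previous batch tables are rebuilt from the saved reference table via the new reconstruct_markdown_table_from_reference_df helper added in tools/llm_api_call.py. A minimal sketch of that fallback, assuming the repository's tools package is importable; the sample rows below are illustrative only:

import pandas as pd
from tools.llm_api_call import reconstruct_markdown_table_from_reference_df

# Illustrative reference table with the columns the helper expects
reference_df = pd.DataFrame({
    "General topic": ["Transport", "Transport"],
    "Subtopic": ["Bus frequency", "Bus frequency"],
    "Sentiment": ["Negative", "Negative"],
    "Response References": ["3", "7"],
    "Summary": ["Rows 1 to 50: Buses are too infrequent."] * 2,
})

# Rebuild the previous markdown table for the first batch (rows 1 to 50)
# without any logged LLM output being available
previous_table_md, previous_topic_df = reconstruct_markdown_table_from_reference_df(
    reference_df, start_row=0, end_row=49
)
print(previous_table_md)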
app.py CHANGED
@@ -233,16 +233,16 @@ with app:
233
 
234
  with gr.Accordion("1. Extract topics - go to first tab for file upload, model choice, and other settings before clicking this button", open = True):
235
  context_textbox.render()
236
- if SHOW_ADDITIONAL_INSTRUCTION_TEXTBOXES == "True":
237
  additional_summary_instructions_textbox = gr.Textbox(value="", visible=True, label="Additional summary instructions")
238
  else:
239
  additional_summary_instructions_textbox = gr.Textbox(value="", visible=False, label="Additional summary instructions")
240
 
241
  extract_topics_btn = gr.Button("1. Extract topics", variant="secondary")
242
- topic_extraction_output_files = gr.File(label="Extract topics output files", scale=1, interactive=False, height=FILE_INPUT_HEIGHT)
243
 
244
- with gr.Accordion("1b. Validate topics - run validation on previously extracted topics", open = False):
245
- if SHOW_ADDITIONAL_INSTRUCTION_TEXTBOXES == "True":
246
  additional_validation_issues_textbox = gr.Textbox(value="", visible=True, label="Additional validation issues for the model to consider (bullet-point list)")
247
  else:
248
  additional_validation_issues_textbox = gr.Textbox(value="", visible=False, label="Additional validation issues for the model to consider (bullet-point list)")
@@ -467,11 +467,11 @@ with app:
467
  validate_topics_btn.click(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
468
  success(load_in_data_file,
469
  inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, working_data_file_name_textbox, total_number_of_batches]).\
 
470
  success(fn=validate_topics_wrapper,
471
  inputs=[file_data_state,
472
  master_reference_df_state,
473
  master_unique_topics_df_state,
474
- logged_content_df,
475
  working_data_file_name_textbox,
476
  in_colnames,
477
  batch_size_number,
@@ -493,7 +493,9 @@ with app:
493
  original_data_file_name_textbox,
494
  additional_validation_issues_textbox,
495
  max_time_for_loop_num,
496
- sentiment_checkbox],
497
  outputs=[display_topic_table_markdown,
498
  master_topic_df_state,
499
  master_unique_topics_df_state,
 
233
 
234
  with gr.Accordion("1. Extract topics - go to first tab for file upload, model choice, and other settings before clicking this button", open = True):
235
  context_textbox.render()
236
+ if SHOW_ADDITIONAL_INSTRUCTION_TEXTBOXES == "True":
237
  additional_summary_instructions_textbox = gr.Textbox(value="", visible=True, label="Additional summary instructions")
238
  else:
239
  additional_summary_instructions_textbox = gr.Textbox(value="", visible=False, label="Additional summary instructions")
240
 
241
  extract_topics_btn = gr.Button("1. Extract topics", variant="secondary")
242
+ topic_extraction_output_files = gr.File(label="Extract topics output files", scale=1, interactive=True, height=FILE_INPUT_HEIGHT, file_count="multiple")
243
 
244
+ with gr.Accordion("1b. Validate topics - validate previous results with an LLM", open = False):
245
+ if SHOW_ADDITIONAL_INSTRUCTION_TEXTBOXES == "True":
246
  additional_validation_issues_textbox = gr.Textbox(value="", visible=True, label="Additional validation issues for the model to consider (bullet-point list)")
247
  else:
248
  additional_validation_issues_textbox = gr.Textbox(value="", visible=False, label="Additional validation issues for the model to consider (bullet-point list)")
 
467
  validate_topics_btn.click(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
468
  success(load_in_data_file,
469
  inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, working_data_file_name_textbox, total_number_of_batches]).\
470
+ success(load_in_previous_data_files, inputs=[topic_extraction_output_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, working_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
471
  success(fn=validate_topics_wrapper,
472
  inputs=[file_data_state,
473
  master_reference_df_state,
474
  master_unique_topics_df_state,
 
475
  working_data_file_name_textbox,
476
  in_colnames,
477
  batch_size_number,
 
493
  original_data_file_name_textbox,
494
  additional_validation_issues_textbox,
495
  max_time_for_loop_num,
496
+ in_data_files,
497
+ sentiment_checkbox,
498
+ logged_content_df],
499
  outputs=[display_topic_table_markdown,
500
  master_topic_df_state,
501
  master_unique_topics_df_state,
tools/helper_functions.py CHANGED
@@ -374,6 +374,10 @@ def create_topic_summary_df_from_reference_table(reference_df:pd.DataFrame):
374
  if "Group" not in reference_df.columns:
375
  reference_df["Group"] = "All"
376
 
377
  out_topic_summary_df = (reference_df.groupby(["General topic", "Subtopic", "Sentiment", "Group"])
378
  .agg({
379
  'Response References': 'size', # Count the number of references
 
374
  if "Group" not in reference_df.columns:
375
  reference_df["Group"] = "All"
376
 
377
+ # Ensure 'Start row of group' column is numeric to avoid comparison errors
378
+ if 'Start row of group' in reference_df.columns:
379
+ reference_df['Start row of group'] = pd.to_numeric(reference_df['Start row of group'], errors='coerce')
380
+
381
  out_topic_summary_df = (reference_df.groupby(["General topic", "Subtopic", "Sentiment", "Group"])
382
  .agg({
383
  'Response References': 'size', # Count the number of references
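For reference, pd.to_numeric with errors='coerce' converts unparseable entries to NaN instead of raising, which is what keeps the later groupby comparisons on 'Start row of group' from failing on mixed-type data. A short illustrative snippet (the column values are made up):

import pandas as pd

df = pd.DataFrame({"Start row of group": ["0", "50", None, "oops"]})
# Unparseable values become NaN rather than raising during later numeric comparisons
df["Start row of group"] = pd.to_numeric(df["Start row of group"], errors="coerce")
print(df["Start row of group"].tolist())  # [0.0, 50.0, nan, nan]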
tools/llm_api_call.py CHANGED
@@ -11,10 +11,10 @@ from tqdm import tqdm
11
  from gradio import Progress
12
  from typing import List, Tuple, Any
13
  from io import StringIO
14
- from tools.prompts import initial_table_prompt, initial_table_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, force_existing_topics_prompt, allow_new_topics_prompt, force_single_topic_prompt, add_existing_topics_assistant_prefill, initial_table_assistant_prefill, structured_summary_prompt, default_response_reference_format, negative_neutral_positive_sentiment_prompt, negative_or_positive_sentiment_prompt, default_sentiment_prompt, validation_prompt_prefix_default, previous_table_introduction_default, validation_prompt_suffix_default
15
  from tools.helper_functions import read_file, put_columns_in_df, wrap_text, load_in_data_file, create_topic_summary_df_from_reference_table, convert_reference_table_to_pivot_table, get_basic_response_data, clean_column_name, load_in_previous_data_files, create_batch_file_path_details, generate_zero_shot_topics_df, clean_column_name, create_topic_summary_df_from_reference_table
16
  from tools.llm_funcs import construct_gemini_generative_model, call_llm_with_markdown_table_checks, create_missing_references_df, calculate_tokens_from_metadata, construct_azure_client, get_model, get_tokenizer, get_assistant_model
17
- from tools.config import RUN_LOCAL_MODEL, MAX_COMMENT_CHARS, MAX_OUTPUT_VALIDATION_ATTEMPTS, LLM_MAX_NEW_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, OUTPUT_FOLDER, CHOSEN_LOCAL_MODEL_TYPE, LLM_SEED, MAX_GROUPS, REASONING_SUFFIX, MAX_ROWS, MAXIMUM_ZERO_SHOT_TOPICS, MAX_SPACES_GPU_RUN_TIME, OUTPUT_DEBUG_FILES, ENABLE_VALIDATION
18
  from tools.aws_functions import connect_to_bedrock_runtime
19
  from tools.dedup_summaries import sample_reference_table_summaries, summarise_output_topics, deduplicate_topics, overall_summary, process_debug_output_iteration
20
 
@@ -49,11 +49,90 @@ def normalise_string(text:str):
49
  return text
50
 
51
 
52
  def validate_topics(
53
  file_data: pd.DataFrame,
54
  reference_df: pd.DataFrame,
55
  topic_summary_df: pd.DataFrame,
56
- logged_content: list,
57
  file_name: str,
58
  chosen_cols: List[str],
59
  batch_size: int,
@@ -76,6 +155,7 @@ def validate_topics(
76
  additional_validation_issues_provided: str = "",
77
  max_time_for_loop: int = MAX_TIME_FOR_LOOP,
78
  sentiment_checkbox: str = "Negative or Positive",
 
79
  progress = gr.Progress(track_tqdm=True)
80
  ) -> Tuple[pd.DataFrame, pd.DataFrame, list, str, int, int, int]:
81
  """
@@ -86,7 +166,6 @@ def validate_topics(
86
  - file_data (pd.DataFrame): The input data to validate
87
  - reference_df (pd.DataFrame): The reference dataframe from the original run
88
  - topic_summary_df (pd.DataFrame): The topic summary dataframe from the original run
89
- - logged_content (list): The logged content from the original run
90
  - file_name (str): Name of the file being processed
91
  - chosen_cols (List[str]): Columns to process
92
  - batch_size (int): Size of each batch
@@ -108,6 +187,7 @@ def validate_topics(
108
  - original_full_file_name (str): Original file name
109
  - additional_validation_issues_provided (str): Additional validation issues provided
110
  - max_time_for_loop (int): Maximum time for the loop
 
111
  - progress: Progress bar object
112
 
113
  Returns:
@@ -154,6 +234,8 @@ def validate_topics(
154
  validation_all_file_names_content = list()
155
 
156
  # Extract previous summaries from logged content for validation
157
  all_responses_content = [item.get('response', '') for item in logged_content if 'response' in item]
158
 
159
  # Initialize validation dataframes
@@ -165,9 +247,7 @@ def validate_topics(
165
  if sentiment_checkbox == "Negative, Neutral, or Positive": sentiment_prompt = sentiment_prefix + negative_neutral_positive_sentiment_prompt + sentiment_suffix
166
  elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = sentiment_prefix + negative_or_positive_sentiment_prompt + sentiment_suffix
167
  elif sentiment_checkbox == "Do not assess sentiment": sentiment_prompt = "" # Just remove line completely. Previous: sentiment_prefix + do_not_assess_sentiment_prompt + sentiment_suffix
168
- else: sentiment_prompt = sentiment_prefix + default_sentiment_prompt + sentiment_suffix
169
-
170
-
171
 
172
  # Validation loop through all batches
173
  validation_latest_batch_completed = 0
@@ -188,18 +268,28 @@ def validate_topics(
188
  validation_response_reference_format = ""
189
  else:
190
  validation_response_reference_format = "\n" + default_response_reference_format
191
 
192
- # If the validation batch of responses contains at least one instance of text
193
- if not validation_batch_basic_response_df.empty:
194
-
195
  # Get the previous table from all_responses_content for this batch
196
  if validation_latest_batch_completed < len(all_responses_content):
197
  previous_table_content = all_responses_content[validation_latest_batch_completed]
 
198
  else:
199
- previous_table_content = ""
 
200
 
201
  # Always use the consolidated topics from the first run for validation
202
- validation_formatted_system_prompt = add_existing_topics_system_prompt.format(
203
  consultation_context=context_textbox, column_name=chosen_cols
204
  )
205
 
@@ -254,7 +344,7 @@ def validate_topics(
254
  if produce_structured_summary_radio != "Yes":
255
  validation_formatted_summary_prompt = add_existing_topics_prompt.format(
256
  validate_prompt_prefix=validation_prompt_prefix_default,
257
- response_table=validation_normalised_simple_markdown_table,
258
  topics=validation_unique_topics_markdown,
259
  topic_assignment=validation_topic_assignment_prompt,
260
  force_single_topic=validation_force_single_topic_prompt,
@@ -267,7 +357,7 @@ def validate_topics(
267
  )
268
  else:
269
  validation_formatted_summary_prompt = structured_summary_prompt.format(
270
- response_table=validation_normalised_simple_markdown_table,
271
  topics=validation_unique_topics_markdown,
272
  summary_format=additional_instructions_summary_format
273
  )
@@ -310,6 +400,11 @@ def validate_topics(
310
  task_type="Validation",
311
  assistant_prefill=add_existing_topics_assistant_prefill
312
  )
313
 
314
  # Collect conversation metadata from validation batch
315
  if validation_current_metadata_content_logged:
@@ -324,11 +419,49 @@ def validate_topics(
324
  validation_all_validated_content.append("Yes")
325
  validation_all_task_type_content.append("Validation")
326
  validation_all_file_names_content.append(original_full_file_name)
327
 
328
  # Update validation dataframes with validation results
329
- validation_reference_df = validation_new_reference_df.dropna(how='all')
330
- validation_topic_summary_df = validation_new_topic_summary_df.dropna(how='all')
331
- validation_topics_table = validation_new_topic_df.dropna(how='all')
332
 
333
  else:
334
  print("Current validation batch of responses contains no text, moving onto next. Batch number:", str(validation_latest_batch_completed + 1), ". Start row:", validation_start_row, ". End row:", validation_end_row)
@@ -362,6 +495,13 @@ def validate_topics(
362
  # Ensure consistent Topic number assignment by recreating topic_summary_df from reference_df
363
  if not validation_reference_df.empty:
364
  validation_topic_summary_df = create_topic_summary_df_from_reference_table(validation_reference_df)
365
 
366
  print("Validation process completed.")
367
 
@@ -372,7 +512,6 @@ def validate_topics_wrapper(
372
  file_data: pd.DataFrame,
373
  reference_df: pd.DataFrame,
374
  topic_summary_df: pd.DataFrame,
375
- logged_content: List[dict],
376
  file_name: str,
377
  chosen_cols: List[str],
378
  batch_size: int,
@@ -394,7 +533,9 @@ def validate_topics_wrapper(
394
  original_full_file_name: str,
395
  additional_validation_issues_provided: str,
396
  max_time_for_loop: int,
 
397
  sentiment_checkbox: str = "Negative or Positive",
 
398
  progress = gr.Progress(track_tqdm=True)
399
  ) -> Tuple[pd.DataFrame, pd.DataFrame, List[dict], str, int, int, int, List[str]]:
400
  """
@@ -405,7 +546,6 @@ def validate_topics_wrapper(
405
  file_data (pd.DataFrame): The input data to validate.
406
  reference_df (pd.DataFrame): The reference dataframe from the original run.
407
  topic_summary_df (pd.DataFrame): The topic summary dataframe from the original run.
408
- logged_content (List[dict]): The logged content from the original run.
409
  file_name (str): Name of the file being processed.
410
  chosen_cols (List[str]): Columns to process.
411
  batch_size (int): Size of each batch.
@@ -427,6 +567,9 @@ def validate_topics_wrapper(
427
  original_full_file_name (str): Original file name.
428
  additional_validation_issues_provided (str): Additional validation issues provided.
429
  max_time_for_loop (int): Maximum time for the loop.
430
 
431
  Returns:
432
  Tuple[pd.DataFrame, pd.DataFrame, List[dict], str, int, int, int, List[str]]:
@@ -434,6 +577,35 @@ def validate_topics_wrapper(
434
  total_input_tokens, total_output_tokens, total_llm_calls, and a list of output file paths.
435
  """
436
 
437
  # Get unique Group values from the input dataframes
438
  unique_groups = list()
439
  if "Group" in reference_df.columns and not reference_df["Group"].isnull().all():
@@ -500,7 +672,6 @@ def validate_topics_wrapper(
500
  file_data=group_file_data,
501
  reference_df=group_reference_df,
502
  topic_summary_df=group_topic_summary_df,
503
- logged_content=logged_content,
504
  file_name=file_name,
505
  chosen_cols=chosen_cols,
506
  batch_size=batch_size,
@@ -522,7 +693,8 @@ def validate_topics_wrapper(
522
  original_full_file_name=original_full_file_name,
523
  additional_validation_issues_provided=additional_validation_issues_provided,
524
  max_time_for_loop=max_time_for_loop,
525
- sentiment_checkbox=sentiment_checkbox
 
526
  )
527
 
528
  # Accumulate results
@@ -540,9 +712,10 @@ def validate_topics_wrapper(
540
  group_input_tokens, group_output_tokens, group_llm_calls = calculate_tokens_from_metadata(
541
  validation_conversation_metadata_str, model_choice, model_name_map
542
  )
543
- acc_input_tokens += group_input_tokens
544
- acc_output_tokens += group_output_tokens
545
- acc_llm_calls += group_llm_calls
 
546
 
547
  print(f"Group {current_group} validation completed.")
548
 
@@ -572,6 +745,13 @@ def validate_topics_wrapper(
572
  acc_reference_df = acc_reference_df.merge(acc_topic_summary_df[["General topic", "Subtopic", "Sentiment", "Topic number"]], on=["General topic", "Subtopic", "Sentiment"], how="left")
573
  elif "Main heading" in acc_topic_summary_df.columns:
574
  acc_reference_df = acc_reference_df.merge(acc_topic_summary_df[["Main heading", "Subheading", "Topic number"]], on=["Main heading", "Subheading"], how="left")
575
 
576
  # Save consolidated validation dataframes to CSV
577
  if not acc_reference_df.empty:
@@ -652,10 +832,13 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_co
652
  # Simplify table to just responses column and the Response reference number
653
  basic_response_data = get_basic_response_data(file_data, chosen_cols, verify_titles=verify_titles)
654
 
655
- file_len = len(basic_response_data["Reference"])
656
 
657
- # Subset the data for the current batch
658
- start_row = (batch_number * batch_size)
659
  if start_row > file_len + 1:
660
  print("Start row greater than file row length")
661
  return simplified_csv_table_path, normalised_simple_markdown_table, file_name
@@ -663,7 +846,7 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_co
663
  raise Exception("Start row is below 0")
664
 
665
  if ((start_row + batch_size) - 1) <= file_len + 1:
666
- end_row = ((start_row + batch_size) - 1)
667
  else:
668
  end_row = file_len + 1
669
 
@@ -933,7 +1116,7 @@ def write_llm_output_and_logs(response_text: str,
933
  # Convert conversation to string and add to log outputs
934
  whole_conversation_str = '\n'.join(whole_conversation)
935
  all_metadata_content_str = '\n'.join(all_metadata_content)
936
- start_row_reported = start_row + 1
937
 
938
  # Need to reduce output file names as full length files may be too long
939
  model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
@@ -947,6 +1130,30 @@ def write_llm_output_and_logs(response_text: str,
947
  with open(whole_conversation_path_meta, "w", encoding='utf-8-sig', errors='replace') as f: f.write(all_metadata_content_str)
948
  log_files_output_paths.append(whole_conversation_path_meta)
949
 
950
  # Convert response text to a markdown table
951
  try:
952
  topic_with_response_df, is_error = convert_response_text_to_dataframe(response_text)
@@ -1009,25 +1216,28 @@ def write_llm_output_and_logs(response_text: str,
1009
  existing_reference_numbers = False
1010
 
1011
  batch_basic_response_df["Reference"] = batch_basic_response_df["Reference"].astype(str)
 
1012
 
1013
  # Iterate through each row in the original DataFrame
1014
  for index, row in topic_with_response_df.iterrows():
1015
- references = re.findall(r'\d+', str(row.iloc[3])) if pd.notna(row.iloc[3]) else []
 
1016
 
1017
- if batch_size_number == 1: references = "1"
 
1018
 
1019
  # Filter out references that are outside the valid range
1020
  if references:
1021
  try:
1022
- # Convert all references to integers and keep only those within valid range
1023
  ref_numbers = [int(ref) for ref in references]
1024
- references = [str(ref) for ref in ref_numbers if 1 <= ref <= batch_size_number]
1025
  except ValueError:
1026
  # If any reference can't be converted to int, skip this row
1027
  print("Response value could not be converted to number:", references)
1028
  continue
1029
  else:
1030
- references = ""
1031
 
1032
  topic = row.iloc[0] if pd.notna(row.iloc[0]) else ""
1033
  subtopic = row.iloc[1] if pd.notna(row.iloc[1]) else ""
@@ -1056,7 +1266,7 @@ def write_llm_output_and_logs(response_text: str,
1056
  if batch_basic_response_df.empty:
1057
  # --- Scenario 1: The DataFrame is empty, so we calculate the reference ---
1058
  try:
1059
- response_ref_no = str(int(ref) + int(start_row))
1060
  except ValueError:
1061
  print(f"Reference '{ref}' is not a number and was skipped.")
1062
  continue # Skip to the next 'ref' in the loop
@@ -1073,11 +1283,10 @@ def write_llm_output_and_logs(response_text: str,
1073
  print(f"Reference '{ref}' not found in the DataFrame.")
1074
  continue # Skip to the next 'ref' in the loop
1075
 
1076
-
1077
  # This code runs for every *valid* reference that wasn't skipped by 'continue'.
1078
  # It uses the 'response_ref_no' calculated in the if/else block above.
1079
  reference_data.append({
1080
- 'Response References': response_ref_no,
1081
  'General topic': topic,
1082
  'Subtopic': subtopic,
1083
  'Sentiment': sentiment,
@@ -1092,7 +1301,7 @@ def write_llm_output_and_logs(response_text: str,
1092
  response_ref_no = 0 # Default value when no references are provided
1093
 
1094
  reference_data.append({
1095
- 'Response References': response_ref_no,
1096
  'General topic': topic,
1097
  'Subtopic': subtopic,
1098
  'Sentiment': sentiment,
@@ -1118,7 +1327,7 @@ def write_llm_output_and_logs(response_text: str,
1118
  # Try converting response references column to int, keep as string if fails
1119
  if existing_reference_numbers is True:
1120
  try:
1121
- out_reference_df["Response References"] = out_reference_df["Response References"].astype(int)
1122
  except Exception as e:
1123
  print("Could not convert Response References column to integer due to", e)
1124
 
@@ -1276,8 +1485,7 @@ def process_batch_with_llm(
1276
  if "Local" in model_source and reasoning_suffix:
1277
  formatted_system_prompt = formatted_system_prompt + "\n" + reasoning_suffix
1278
 
1279
- # Check if the input text exceeds the LLM context length
1280
- from tools.config import LLM_CONTEXT_LENGTH
1281
 
1282
  # Combine system prompt and user prompt for token counting
1283
  full_input_text = formatted_system_prompt + "\n" + formatted_prompt
@@ -1330,6 +1538,8 @@ def process_batch_with_llm(
1330
  all_metadata_content, log_files_output_paths, task_type=task_type
1331
  )
1332
 
1333
  return (
1334
  new_topic_df, new_reference_df, new_topic_summary_df, is_error,
1335
  current_prompt_content_logged, current_summary_content_logged,
@@ -1528,6 +1738,9 @@ def extract_topics(in_data_file: gr.FileData,
1528
  else:
1529
  progress(0.1, desc="Querying large language model")
1530
 
1531
  if latest_batch_completed < num_batches:
1532
 
1533
  # Load file
@@ -1567,6 +1780,12 @@ def extract_topics(in_data_file: gr.FileData,
1567
  if batch_basic_response_df.shape[0] == 1: response_reference_format = "" # Blank, as the topics will always refer to the single response provided, '1'
1568
  else: response_reference_format = "\n" + default_response_reference_format
1569
 
1570
  # If the latest batch of responses contains at least one instance of text
1571
  if not batch_basic_response_df.empty:
1572
 
@@ -1643,13 +1862,15 @@ def extract_topics(in_data_file: gr.FileData,
1643
  # Should the outputs force only one single topic assignment per response?
1644
  if force_single_topic_radio != "Yes": force_single_topic_prompt = ""
1645
  else:
1646
- topic_assignment_prompt = topic_assignment_prompt.replace("Assign topics", "Assign a topic").replace("assign Subtopics", "assign a Subtopic").replace("Subtopics", "Subtopic").replace("Topics", "Topic").replace("topics", "a topic")
1647
 
1648
  # Format the summary prompt with the response table and topics
1649
  if produce_structured_summary_radio != "Yes":
1650
  formatted_summary_prompt = add_existing_topics_prompt.format(
1651
  validate_prompt_prefix="",
1652
- response_table=normalised_simple_markdown_table,
1653
  topics=unique_topics_markdown,
1654
  topic_assignment=topic_assignment_prompt,
1655
  force_single_topic=force_single_topic_prompt,
@@ -1661,7 +1882,7 @@ def extract_topics(in_data_file: gr.FileData,
1661
  validate_prompt_suffix=""
1662
  )
1663
  else:
1664
- formatted_summary_prompt = structured_summary_prompt.format(response_table=normalised_simple_markdown_table,
1665
  topics=unique_topics_markdown, summary_format=additional_instructions_summary_format)
1666
 
1667
  batch_file_path_details = f"{file_name_clean}_batch_{latest_batch_completed + 1}_size_{batch_size}_col_{in_column_cleaned}"
@@ -1751,7 +1972,7 @@ def extract_topics(in_data_file: gr.FileData,
1751
  if produce_structured_summary_radio != "Yes":
1752
  formatted_initial_table_prompt = initial_table_prompt.format(
1753
  validate_prompt_prefix="",
1754
- response_table=normalised_simple_markdown_table,
1755
  sentiment_choices=sentiment_prompt,
1756
  response_reference_format=response_reference_format,
1757
  add_existing_topics_summary_format=additional_instructions_summary_format,
@@ -1761,7 +1982,7 @@ def extract_topics(in_data_file: gr.FileData,
1761
  )
1762
  else:
1763
  unique_topics_markdown="No suggested headings for this summary"
1764
- formatted_initial_table_prompt = structured_summary_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown)
1765
 
1766
  batch_file_path_details = f"{file_name_clean}_batch_{latest_batch_completed + 1}_size_{batch_size}_col_{in_column_cleaned}"
1767
 
@@ -1836,6 +2057,8 @@ def extract_topics(in_data_file: gr.FileData,
1836
  print("Current batch of responses contains no text, moving onto next. Batch number:", str(latest_batch_completed + 1), ". Start row:", start_row, ". End row:", end_row)
1837
 
1838
  # Increase latest file completed count unless we are over the last batch number, then go back around
1839
  if latest_batch_completed <= num_batches:
1840
  latest_batch_completed += 1
1841
 
@@ -1881,7 +2104,6 @@ def extract_topics(in_data_file: gr.FileData,
1881
  file_data=file_data,
1882
  reference_df=existing_reference_df,
1883
  topic_summary_df=existing_topic_summary_df,
1884
- logged_content=group_combined_logged_content,
1885
  file_name=file_name,
1886
  chosen_cols=chosen_cols,
1887
  batch_size=batch_size,
@@ -1903,7 +2125,8 @@ def extract_topics(in_data_file: gr.FileData,
1903
  output_debug_files=output_debug_files,
1904
  original_full_file_name=original_full_file_name,
1905
  max_time_for_loop=max_time_for_loop,
1906
- sentiment_checkbox=sentiment_checkbox
 
1907
  )
1908
 
1909
  # Add validation conversation metadata to the main conversation metadata
 
11
  from gradio import Progress
12
  from typing import List, Tuple, Any
13
  from io import StringIO
14
+ from tools.prompts import initial_table_prompt, initial_table_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, force_existing_topics_prompt, allow_new_topics_prompt, force_single_topic_prompt, add_existing_topics_assistant_prefill, initial_table_assistant_prefill, structured_summary_prompt, default_response_reference_format, negative_neutral_positive_sentiment_prompt, negative_or_positive_sentiment_prompt, default_sentiment_prompt, validation_prompt_prefix_default, previous_table_introduction_default, validation_prompt_suffix_default, validation_system_prompt
15
  from tools.helper_functions import read_file, put_columns_in_df, wrap_text, load_in_data_file, create_topic_summary_df_from_reference_table, convert_reference_table_to_pivot_table, get_basic_response_data, clean_column_name, load_in_previous_data_files, create_batch_file_path_details, generate_zero_shot_topics_df, clean_column_name, create_topic_summary_df_from_reference_table
16
  from tools.llm_funcs import construct_gemini_generative_model, call_llm_with_markdown_table_checks, create_missing_references_df, calculate_tokens_from_metadata, construct_azure_client, get_model, get_tokenizer, get_assistant_model
17
+ from tools.config import RUN_LOCAL_MODEL, MAX_COMMENT_CHARS, MAX_OUTPUT_VALIDATION_ATTEMPTS, LLM_MAX_NEW_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, OUTPUT_FOLDER, CHOSEN_LOCAL_MODEL_TYPE, LLM_SEED, MAX_GROUPS, REASONING_SUFFIX, MAX_ROWS, MAXIMUM_ZERO_SHOT_TOPICS, MAX_SPACES_GPU_RUN_TIME, OUTPUT_DEBUG_FILES, ENABLE_VALIDATION, LLM_CONTEXT_LENGTH
18
  from tools.aws_functions import connect_to_bedrock_runtime
19
  from tools.dedup_summaries import sample_reference_table_summaries, summarise_output_topics, deduplicate_topics, overall_summary, process_debug_output_iteration
20
 
 
49
  return text
50
 
51
 
52
+ def reconstruct_markdown_table_from_reference_df(reference_df: pd.DataFrame, start_row: int = None, end_row: int = None) -> tuple[str, pd.DataFrame]:
53
+ """
54
+ Reconstructs a markdown table from reference_df data when all_responses_content is missing.
55
+ Filters to only include rows from the current batch if start_row and end_row are provided.
56
+
57
+ Parameters:
58
+ - reference_df (pd.DataFrame): The reference dataframe containing topic analysis data
59
+ - start_row (int, optional): The starting row number for the current batch
60
+ - end_row (int, optional): The ending row number for the current batch
61
+
62
+ Returns:
63
+ - tuple[str, pd.DataFrame]: A tuple containing:
64
+ - str: A markdown table string in the required format
65
+ - pd.DataFrame: A pandas DataFrame with the same data as the markdown table
66
+ """
67
+ if reference_df.empty:
68
+ return "", pd.DataFrame()
69
+
70
+ # Filter reference_df to current batch if start_row and end_row are provided
71
+ filtered_df = reference_df.copy()
72
+ if start_row is not None and end_row is not None:
73
+ # Convert Response References to numeric for filtering
74
+ filtered_df['Response References'] = pd.to_numeric(filtered_df['Response References'], errors='coerce')
75
+ # Filter to only include rows where Response References fall within the current batch range
76
+ filtered_df = filtered_df[
77
+ (filtered_df['Response References'] >= start_row + 1) &
78
+ (filtered_df['Response References'] <= end_row + 1)
79
+ ]
80
+
81
+ if filtered_df.empty:
82
+ return "", pd.DataFrame()
83
+
84
+ # Group by General topic, Subtopic, and Sentiment to aggregate response references
85
+ grouped_df = filtered_df.groupby(['General topic', 'Subtopic', 'Sentiment']).agg({
86
+ 'Response References': lambda x: ', '.join(map(str, sorted(x.unique()))),
87
+ 'Summary': 'first' # Take the first summary for each group
88
+ }).reset_index()
89
+
90
+ # Adjust response references to be relative to the batch (subtract start_row if provided)
91
+ if start_row is not None:
92
+ # Convert response references to relative numbers by subtracting start_row
93
+ def adjust_references(refs_str):
94
+ if not refs_str or refs_str == '':
95
+ return refs_str
96
+ try:
97
+ # Split by comma, convert to int, subtract start_row, convert back to string
98
+ refs = [str(int(ref.strip()) - start_row) for ref in refs_str.split(',') if ref.strip().isdigit()]
99
+ return ', '.join(refs)
100
+ except (ValueError, TypeError):
101
+ return refs_str
102
+
103
+ grouped_df['Response References'] = grouped_df['Response References'].apply(adjust_references)
104
+
105
+ # Clean up the data to handle any NaN values and remove "Rows x to y: " prefix from summary
106
+ cleaned_df = grouped_df.copy()
107
+ for col in ['General topic', 'Subtopic', 'Sentiment', 'Response References', 'Summary']:
108
+ cleaned_df[col] = cleaned_df[col].fillna("").astype(str)
109
+
110
+ # Remove "Rows x to y: " prefix from summary if present
111
+ cleaned_df['Summary'] = cleaned_df['Summary'].apply(
112
+ lambda x: re.sub(r'^Rows\s+\d+\s+to\s+\d+:\s*', '', x) if isinstance(x, str) else x
113
+ )
114
+
115
+ # Create the markdown table
116
+ markdown_table = "| General topic | Subtopic | Sentiment | Response References | Summary |\n"
117
+ markdown_table += "|---|---|---|---|---|\n"
118
+
119
+ for _, row in cleaned_df.iterrows():
120
+ general_topic = row['General topic']
121
+ subtopic = row['Subtopic']
122
+ sentiment = row['Sentiment']
123
+ response_refs = row['Response References']
124
+ summary = row['Summary']
125
+
126
+ # Add row to markdown table
127
+ markdown_table += f"| {general_topic} | {subtopic} | {sentiment} | {response_refs} | {summary} |\n"
128
+
129
+ return markdown_table, cleaned_df
130
+
131
+
132
  def validate_topics(
133
  file_data: pd.DataFrame,
134
  reference_df: pd.DataFrame,
135
  topic_summary_df: pd.DataFrame,
 
136
  file_name: str,
137
  chosen_cols: List[str],
138
  batch_size: int,
 
155
  additional_validation_issues_provided: str = "",
156
  max_time_for_loop: int = MAX_TIME_FOR_LOOP,
157
  sentiment_checkbox: str = "Negative or Positive",
158
+ logged_content: list = None,
159
  progress = gr.Progress(track_tqdm=True)
160
  ) -> Tuple[pd.DataFrame, pd.DataFrame, list, str, int, int, int]:
161
  """
 
166
  - file_data (pd.DataFrame): The input data to validate
167
  - reference_df (pd.DataFrame): The reference dataframe from the original run
168
  - topic_summary_df (pd.DataFrame): The topic summary dataframe from the original run
 
169
  - file_name (str): Name of the file being processed
170
  - chosen_cols (List[str]): Columns to process
171
  - batch_size (int): Size of each batch
 
187
  - original_full_file_name (str): Original file name
188
  - additional_validation_issues_provided (str): Additional validation issues provided
189
  - max_time_for_loop (int): Maximum time for the loop
190
+ - logged_content (list, optional): The logged content from the original run. If None, tables will be reconstructed from reference_df
191
  - progress: Progress bar object
192
 
193
  Returns:
 
234
  validation_all_file_names_content = list()
235
 
236
  # Extract previous summaries from logged content for validation
237
+ if logged_content is None:
238
+ logged_content = list()
239
  all_responses_content = [item.get('response', '') for item in logged_content if 'response' in item]
240
 
241
  # Initialize validation dataframes
 
247
  if sentiment_checkbox == "Negative, Neutral, or Positive": sentiment_prompt = sentiment_prefix + negative_neutral_positive_sentiment_prompt + sentiment_suffix
248
  elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = sentiment_prefix + negative_or_positive_sentiment_prompt + sentiment_suffix
249
  elif sentiment_checkbox == "Do not assess sentiment": sentiment_prompt = "" # Just remove line completely. Previous: sentiment_prefix + do_not_assess_sentiment_prompt + sentiment_suffix
250
+ else: sentiment_prompt = sentiment_prefix + default_sentiment_prompt + sentiment_suffix
251
 
252
  # Validation loop through all batches
253
  validation_latest_batch_completed = 0
 
268
  validation_response_reference_format = ""
269
  else:
270
  validation_response_reference_format = "\n" + default_response_reference_format
271
+
272
+ if validation_normalised_simple_markdown_table:
273
+ validation_response_table_prompt = "Response table:\n" + validation_normalised_simple_markdown_table
274
+ else:
275
+ validation_response_table_prompt = ""
276
 
277
+ # Proceed if the reference dataframe contains data. The function first tries to get the previous table from logged outputs, and reconstructs the table from reference_df data if these are not available.
278
+ if not reference_df.empty:
279
+ validation_latest_batch_completed = int(validation_latest_batch_completed)
280
+ validation_start_row = int(validation_start_row)
281
+ validation_end_row = int(validation_end_row)
282
+
283
  # Get the previous table from all_responses_content for this batch
284
  if validation_latest_batch_completed < len(all_responses_content):
285
  previous_table_content = all_responses_content[validation_latest_batch_completed]
286
+ _, previous_topic_df = reconstruct_markdown_table_from_reference_df(reference_df, validation_start_row, validation_end_row)
287
  else:
288
+ # Try to reconstruct markdown table from reference_df data
289
+ previous_table_content, previous_topic_df = reconstruct_markdown_table_from_reference_df(reference_df, validation_start_row, validation_end_row)
290
 
291
  # Always use the consolidated topics from the first run for validation
292
+ validation_formatted_system_prompt = validation_system_prompt.format(
293
  consultation_context=context_textbox, column_name=chosen_cols
294
  )
295
 
 
344
  if produce_structured_summary_radio != "Yes":
345
  validation_formatted_summary_prompt = add_existing_topics_prompt.format(
346
  validate_prompt_prefix=validation_prompt_prefix_default,
347
+ response_table=validation_response_table_prompt,
348
  topics=validation_unique_topics_markdown,
349
  topic_assignment=validation_topic_assignment_prompt,
350
  force_single_topic=validation_force_single_topic_prompt,
 
357
  )
358
  else:
359
  validation_formatted_summary_prompt = structured_summary_prompt.format(
360
+ response_table=validation_response_table_prompt,
361
  topics=validation_unique_topics_markdown,
362
  summary_format=additional_instructions_summary_format
363
  )
 
400
  task_type="Validation",
401
  assistant_prefill=add_existing_topics_assistant_prefill
402
  )
403
+
404
+ if validation_new_topic_df.empty:
405
+ validation_new_topic_df = previous_topic_df
406
+ # print("Validation new topic df is empty, using previous topic df:", validation_new_topic_df)
407
+ # print("Validation new topic df columns:", validation_new_topic_df.columns)
408
 
409
  # Collect conversation metadata from validation batch
410
  if validation_current_metadata_content_logged:
 
419
  validation_all_validated_content.append("Yes")
420
  validation_all_task_type_content.append("Validation")
421
  validation_all_file_names_content.append(original_full_file_name)
422
+
423
+ print("Appended to logs")
424
 
425
  # Update validation dataframes with validation results
426
+ # For validation, we need to accumulate results from each batch, not overwrite them
427
+ # The validation_new_* dataframes contain the results for the current batch
428
+ # We need to concatenate them with the existing validation dataframes
429
+
430
+ # For reference_df, we need to be careful about duplicates
431
+ if not validation_new_reference_df.empty:
432
+ # Check if the new reference_df is the same as the existing one (indicating "no change" response)
433
+ # This happens when the LLM responds with "no change" and returns the existing data
434
+ if validation_new_reference_df.equals(validation_reference_df):
435
+ print("Validation new reference df is identical to existing df (no change response), skipping concatenation")
436
+ else:
437
+ print("Validation new reference df is not empty, appending new table to validation reference df")
438
+ # Remove any existing entries for this batch range to avoid duplicates
439
+ start_row_reported = int(validation_start_row) + 1
440
+ end_row_reported = int(validation_end_row) + 1
441
+ validation_reference_df["Start row of group"] = validation_reference_df["Start row of group"].astype(int)
442
+
443
+ # Remove existing entries for this batch range from validation_reference_df
444
+ if "Start row of group" in validation_reference_df.columns:
445
+ validation_reference_df = validation_reference_df[
446
+ ~((validation_reference_df["Start row of group"] >= start_row_reported) &
447
+ (validation_reference_df["Start row of group"] <= end_row_reported))
448
+ ]
449
+
450
+ # Concatenate the new results
451
+ validation_reference_df = pd.concat([validation_reference_df, validation_new_reference_df]).dropna(how='all')
452
+
453
+ # For topic summary, we need to merge/concatenate carefully to avoid duplicates
454
+ if not validation_new_topic_summary_df.empty:
455
+ # Check if the new topic_summary_df is the same as the existing one (indicating "no change" response)
456
+ if validation_new_topic_summary_df.equals(validation_topic_summary_df):
457
+ print("Validation new topic summary df is identical to existing df (no change response), skipping concatenation")
458
+ else:
459
+ # Remove duplicates and concatenate
460
+ validation_topic_summary_df = pd.concat([validation_topic_summary_df, validation_new_topic_summary_df]).drop_duplicates(['General topic', 'Subtopic', 'Sentiment']).dropna(how='all')
461
+
462
+ # For topics table, just concatenate
463
+ # if not validation_new_topic_df.empty:
464
+ # validation_topics_table = pd.concat([validation_topics_table, validation_new_topic_df]).dropna(how='all')
465
 
466
  else:
467
  print("Current validation batch of responses contains no text, moving onto next. Batch number:", str(validation_latest_batch_completed + 1), ". Start row:", validation_start_row, ". End row:", validation_end_row)
 
495
  # Ensure consistent Topic number assignment by recreating topic_summary_df from reference_df
496
  if not validation_reference_df.empty:
497
  validation_topic_summary_df = create_topic_summary_df_from_reference_table(validation_reference_df)
498
+
499
+ # Sort output dataframes
500
+ validation_reference_df["Response References"] = validation_reference_df["Response References"].astype(str).astype(int)
501
+ validation_reference_df["Start row of group"] = validation_reference_df["Start row of group"].astype(int)
502
+ validation_reference_df.sort_values(["Group", "Start row of group", "Response References", "General topic", "Subtopic", "Sentiment"], inplace=True)
503
+ validation_topic_summary_df["Number of responses"] = validation_topic_summary_df["Number of responses"].astype(int)
504
+ validation_topic_summary_df.sort_values(["Group","Number of responses", "General topic", "Subtopic", "Sentiment"], ascending=[True, False, True, True, True], inplace=True)
505
 
506
  print("Validation process completed.")
507
 
 
512
  file_data: pd.DataFrame,
513
  reference_df: pd.DataFrame,
514
  topic_summary_df: pd.DataFrame,
 
515
  file_name: str,
516
  chosen_cols: List[str],
517
  batch_size: int,
 
533
  original_full_file_name: str,
534
  additional_validation_issues_provided: str,
535
  max_time_for_loop: int,
536
+ in_data_files: Any = None,
537
  sentiment_checkbox: str = "Negative or Positive",
538
+ logged_content: List[dict] = None,
539
  progress = gr.Progress(track_tqdm=True)
540
  ) -> Tuple[pd.DataFrame, pd.DataFrame, List[dict], str, int, int, int, List[str]]:
541
  """
 
546
  file_data (pd.DataFrame): The input data to validate.
547
  reference_df (pd.DataFrame): The reference dataframe from the original run.
548
  topic_summary_df (pd.DataFrame): The topic summary dataframe from the original run.
 
549
  file_name (str): Name of the file being processed.
550
  chosen_cols (List[str]): Columns to process.
551
  batch_size (int): Size of each batch.
 
567
  original_full_file_name (str): Original file name.
568
  additional_validation_issues_provided (str): Additional validation issues provided.
569
  max_time_for_loop (int): Maximum time for the loop.
570
+ in_data_files (Any, optional): The input data files (e.g., Gradio FileData). If None, file_data must be provided.
571
+ sentiment_checkbox (str): Sentiment analysis option.
572
+ logged_content (List[dict], optional): The logged content from the original run. If None, tables will be reconstructed from reference_df.
573
 
574
  Returns:
575
  Tuple[pd.DataFrame, pd.DataFrame, List[dict], str, int, int, int, List[str]]:
 
577
  total_input_tokens, total_output_tokens, total_llm_calls, and a list of output file paths.
578
  """
579
 
580
+ # Handle None logged_content
581
+ if logged_content is None:
582
+ logged_content = list()
583
+
584
+ # If a file input is provided but the file data has not yet been loaded, load it here.
585
+ if file_data.empty:
586
+ print("No data table found, loading from file")
587
+ try:
588
+ in_colnames_drop, in_excel_sheets, file_name = put_columns_in_df(in_data_files)
589
+ file_data, file_name, num_batches = load_in_data_file(in_data_files, chosen_cols, batch_size_default, in_excel_sheets)
590
+ except:
591
+ # Check if files and text exist
592
+ out_message = "Please enter a data file to process."
593
+ print(out_message)
594
+ raise Exception(out_message)
595
+
596
+ if file_data.shape[0] > max_rows:
597
+ out_message = "Your data has more than " + str(max_rows) + " rows, which has been set as the maximum in the application configuration."
598
+ print(out_message)
599
+ raise Exception(out_message)
600
+
601
+ if group_name is None:
602
+ print("No grouping column found")
603
+ file_data["group_col"] = "All"
604
+ group_name="group_col"
605
+
606
+ if group_name not in file_data.columns:
607
+ raise ValueError(f"Selected column '{group_name}' not found in file_data.")
608
+
609
  # Get unique Group values from the input dataframes
610
  unique_groups = list()
611
  if "Group" in reference_df.columns and not reference_df["Group"].isnull().all():
 
672
  file_data=group_file_data,
673
  reference_df=group_reference_df,
674
  topic_summary_df=group_topic_summary_df,
 
675
  file_name=file_name,
676
  chosen_cols=chosen_cols,
677
  batch_size=batch_size,
 
693
  original_full_file_name=original_full_file_name,
694
  additional_validation_issues_provided=additional_validation_issues_provided,
695
  max_time_for_loop=max_time_for_loop,
696
+ sentiment_checkbox=sentiment_checkbox,
697
+ logged_content=logged_content
698
  )
699
 
700
  # Accumulate results
 
712
  group_input_tokens, group_output_tokens, group_llm_calls = calculate_tokens_from_metadata(
713
  validation_conversation_metadata_str, model_choice, model_name_map
714
  )
715
+
716
+ acc_input_tokens += int(group_input_tokens)
717
+ acc_output_tokens += int(group_output_tokens)
718
+ acc_llm_calls += int(group_llm_calls)
719
 
720
  print(f"Group {current_group} validation completed.")
721
 
 
745
  acc_reference_df = acc_reference_df.merge(acc_topic_summary_df[["General topic", "Subtopic", "Sentiment", "Topic number"]], on=["General topic", "Subtopic", "Sentiment"], how="left")
746
  elif "Main heading" in acc_topic_summary_df.columns:
747
  acc_reference_df = acc_reference_df.merge(acc_topic_summary_df[["Main heading", "Subheading", "Topic number"]], on=["Main heading", "Subheading"], how="left")
748
+
749
+ # Sort output dataframes
750
+ acc_reference_df["Response References"] = acc_reference_df["Response References"].astype(str).astype(int)
751
+ acc_reference_df["Start row of group"] = acc_reference_df["Start row of group"].astype(int)
752
+ acc_reference_df.sort_values(["Group", "Start row of group", "Response References", "General topic", "Subtopic", "Sentiment"], inplace=True)
753
+ acc_topic_summary_df["Number of responses"] = acc_topic_summary_df["Number of responses"].astype(int)
754
+ acc_topic_summary_df.sort_values(["Group","Number of responses", "General topic", "Subtopic", "Sentiment"], ascending=[True, False, True, True, True], inplace=True)
755
 
756
  # Save consolidated validation dataframes to CSV
757
  if not acc_reference_df.empty:
 
832
  # Simplify table to just responses column and the Response reference number
833
  basic_response_data = get_basic_response_data(file_data, chosen_cols, verify_titles=verify_titles)
834
 
835
+ file_len = int(len(basic_response_data["Reference"]))
836
+ batch_size = int(batch_size)
837
+ batch_number = int(batch_number)
838
+
839
+ # Subset the data for the current batch
840
+ start_row = int(batch_number * batch_size)
841
 
842
  if start_row > file_len + 1:
843
  print("Start row greater than file row length")
844
  return simplified_csv_table_path, normalised_simple_markdown_table, file_name
 
846
  raise Exception("Start row is below 0")
847
 
848
  if ((start_row + batch_size) - 1) <= file_len + 1:
849
+ end_row = int((start_row + batch_size) - 1)
850
  else:
851
  end_row = file_len + 1
852
 
 
1116
  # Convert conversation to string and add to log outputs
1117
  whole_conversation_str = '\n'.join(whole_conversation)
1118
  all_metadata_content_str = '\n'.join(all_metadata_content)
1119
+ start_row_reported = int(start_row) + 1
1120
 
1121
  # Need to reduce output file names as full length files may be too long
1122
  model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
 
1130
  with open(whole_conversation_path_meta, "w", encoding='utf-8-sig', errors='replace') as f: f.write(all_metadata_content_str)
1131
  log_files_output_paths.append(whole_conversation_path_meta)
1132
 
1133
+ # Check if response is "No change" - if so, return input dataframes
1134
+ stripped_response = response_text.strip()
1135
+ if stripped_response.lower().startswith("no change"):
1136
+ print("LLM response indicates no changes needed, returning input dataframes")
1137
+
1138
+ # For "No change" responses, we need to return the existing dataframes
1139
+ # but we still need to process them through the same logic as normal processing
1140
+
1141
+ # Create empty topic_with_response_df since no new topics were generated
1142
+ topic_with_response_df = pd.DataFrame(columns=["General topic", "Subtopic", "Sentiment", "Response References", "Summary"])
1143
+
1144
+ # For "No change", we return the existing dataframes as-is (they already contain all the data)
1145
+ # This is equivalent to the normal processing where new_reference_df would be empty
1146
+ out_reference_df = existing_reference_df.copy()
1147
+ out_topic_summary_df = existing_topics_df.copy()
1148
+
1149
+ # Set up output file paths
1150
+ topic_table_out_path = output_folder + batch_file_path_details + "_topic_table_" + model_choice_clean_short + ".csv"
1151
+ reference_table_out_path = output_folder + batch_file_path_details + "_reference_table_" + model_choice_clean_short + ".csv"
1152
+ topic_summary_df_out_path = output_folder + batch_file_path_details + "_unique_topics_" + model_choice_clean_short + ".csv"
1153
+
1154
+ # Return the existing dataframes (no changes needed)
1155
+ return topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_with_response_df, out_reference_df, out_topic_summary_df, batch_file_path_details, is_error
1156
+
1157
  # Convert response text to a markdown table
1158
  try:
1159
  topic_with_response_df, is_error = convert_response_text_to_dataframe(response_text)
 
1216
  existing_reference_numbers = False
1217
 
1218
  batch_basic_response_df["Reference"] = batch_basic_response_df["Reference"].astype(str)
1219
+ batch_size_number = int(batch_size_number)
1220
 
1221
  # Iterate through each row in the original DataFrame
1222
  for index, row in topic_with_response_df.iterrows():
1223
+ references_raw = str(row.iloc[3]) if pd.notna(row.iloc[3]) else ""
1224
+ references = re.findall(r'\d+', references_raw)
1225
 
1226
+ if batch_size_number == 1:
1227
+ references = ["1"]
1228
 
1229
  # Filter out references that are outside the valid range
1230
  if references:
1231
  try:
1232
+ # Convert all references to integers and keep only those within valid range
1233
  ref_numbers = [int(ref) for ref in references]
1234
+ references = [ref for ref in ref_numbers if 1 <= int(ref) <= int(batch_size_number)]
1235
  except ValueError:
1236
  # If any reference can't be converted to int, skip this row
1237
  print("Response value could not be converted to number:", references)
1238
  continue
1239
  else:
1240
+ references = []
1241
 
1242
  topic = row.iloc[0] if pd.notna(row.iloc[0]) else ""
1243
  subtopic = row.iloc[1] if pd.notna(row.iloc[1]) else ""
 
1266
  if batch_basic_response_df.empty:
1267
  # --- Scenario 1: The DataFrame is empty, so we calculate the reference ---
1268
  try:
1269
+ response_ref_no = int(ref) + int(start_row)
1270
  except ValueError:
1271
  print(f"Reference '{ref}' is not a number and was skipped.")
1272
  continue # Skip to the next 'ref' in the loop
 
1283
  print(f"Reference '{ref}' not found in the DataFrame.")
1284
  continue # Skip to the next 'ref' in the loop
1285
 
 
1286
  # This code runs for every *valid* reference that wasn't skipped by 'continue'.
1287
  # It uses the 'response_ref_no' calculated in the if/else block above.
1288
  reference_data.append({
1289
+ 'Response References': str(response_ref_no),
1290
  'General topic': topic,
1291
  'Subtopic': subtopic,
1292
  'Sentiment': sentiment,
 
1301
  response_ref_no = 0 # Default value when no references are provided
1302
 
1303
  reference_data.append({
1304
+ 'Response References': str(response_ref_no),
1305
  'General topic': topic,
1306
  'Subtopic': subtopic,
1307
  'Sentiment': sentiment,
 
1327
  # Try converting response references column to int, keep as string if fails
1328
  if existing_reference_numbers is True:
1329
  try:
1330
+ out_reference_df["Response References"] = out_reference_df["Response References"].astype(str).astype(int)
1331
  except Exception as e:
1332
  print("Could not convert Response References column to integer due to", e)
1333
 
 
1485
  if "Local" in model_source and reasoning_suffix:
1486
  formatted_system_prompt = formatted_system_prompt + "\n" + reasoning_suffix
1487
 
1488
+
 
1489
 
1490
  # Combine system prompt and user prompt for token counting
1491
  full_input_text = formatted_system_prompt + "\n" + formatted_prompt
 
1538
  all_metadata_content, log_files_output_paths, task_type=task_type
1539
  )
1540
 
1541
+ print("Finished processing batch with LLM")
1542
+
1543
  return (
1544
  new_topic_df, new_reference_df, new_topic_summary_df, is_error,
1545
  current_prompt_content_logged, current_summary_content_logged,
 
1738
  else:
1739
  progress(0.1, desc="Querying large language model")
1740
 
1741
+ latest_batch_completed = int(latest_batch_completed)
1742
+ num_batches = int(num_batches)
1743
+
1744
  if latest_batch_completed < num_batches:
1745
 
1746
  # Load file
 
1780
  if batch_basic_response_df.shape[0] == 1: response_reference_format = "" # Blank, as the topics will always refer to the single response provided, '1'
1781
  else: response_reference_format = "\n" + default_response_reference_format
1782
 
1783
+ # If the response table is not empty, add it to the prompt with an intro line
1784
+ if normalised_simple_markdown_table:
1785
+ response_table_prompt = "Response table:\n" + normalised_simple_markdown_table
1786
+ else:
1787
+ response_table_prompt = ""
1788
+
1789
  # If the latest batch of responses contains at least one instance of text
1790
  if not batch_basic_response_df.empty:
1791
 
 
1862
  # Should the outputs force only one single topic assignment per response?
1863
  if force_single_topic_radio != "Yes": force_single_topic_prompt = ""
1864
  else:
1865
+ topic_assignment_prompt = topic_assignment_prompt.replace("Assign topics", "Assign a topic").replace("assign Subtopics", "assign a Subtopic").replace("Subtopics", "Subtopic").replace("Topics", "Topic").replace("topics", "a topic")
1866
+
1867
+
1868
 
1869
  # Format the summary prompt with the response table and topics
1870
  if produce_structured_summary_radio != "Yes":
1871
  formatted_summary_prompt = add_existing_topics_prompt.format(
1872
  validate_prompt_prefix="",
1873
+ response_table=response_table_prompt,
1874
  topics=unique_topics_markdown,
1875
  topic_assignment=topic_assignment_prompt,
1876
  force_single_topic=force_single_topic_prompt,
 
1882
  validate_prompt_suffix=""
1883
  )
1884
  else:
1885
+ formatted_summary_prompt = structured_summary_prompt.format(response_table=response_table_prompt,
1886
  topics=unique_topics_markdown, summary_format=additional_instructions_summary_format)
1887
 
1888
  batch_file_path_details = f"{file_name_clean}_batch_{latest_batch_completed + 1}_size_{batch_size}_col_{in_column_cleaned}"
 
1972
  if produce_structured_summary_radio != "Yes":
1973
  formatted_initial_table_prompt = initial_table_prompt.format(
1974
  validate_prompt_prefix="",
1975
+ response_table=response_table_prompt,
1976
  sentiment_choices=sentiment_prompt,
1977
  response_reference_format=response_reference_format,
1978
  add_existing_topics_summary_format=additional_instructions_summary_format,
 
1982
  )
1983
  else:
1984
  unique_topics_markdown="No suggested headings for this summary"
1985
+ formatted_initial_table_prompt = structured_summary_prompt.format(response_table=response_table_prompt, topics=unique_topics_markdown)
1986
 
1987
  batch_file_path_details = f"{file_name_clean}_batch_{latest_batch_completed + 1}_size_{batch_size}_col_{in_column_cleaned}"
1988
 
 
2057
  print("Current batch of responses contains no text, moving onto next. Batch number:", str(latest_batch_completed + 1), ". Start row:", start_row, ". End row:", end_row)
2058
 
2059
  # Increase latest file completed count unless we are over the last batch number, then go back around
2060
+ num_batches = int(num_batches)
2061
+ latest_batch_completed = int(latest_batch_completed)
2062
  if latest_batch_completed <= num_batches:
2063
  latest_batch_completed += 1
2064
 
 
2104
  file_data=file_data,
2105
  reference_df=existing_reference_df,
2106
  topic_summary_df=existing_topic_summary_df,
 
2107
  file_name=file_name,
2108
  chosen_cols=chosen_cols,
2109
  batch_size=batch_size,
 
2125
  output_debug_files=output_debug_files,
2126
  original_full_file_name=original_full_file_name,
2127
  max_time_for_loop=max_time_for_loop,
2128
+ sentiment_checkbox=sentiment_checkbox,
2129
+ logged_content=group_combined_logged_content
2130
  )
2131
 
2132
  # Add validation conversation metadata to the main conversation metadata
tools/llm_funcs.py CHANGED
@@ -1217,9 +1217,12 @@ def call_llm_with_markdown_table_checks(batch_prompts: List[str],
1217
 
1218
  stripped_response = response_text.strip()
1219
 
1220
- # Check if response meets our criteria (length and contains table)
1221
- if len(stripped_response) > 120 and '|' in stripped_response:
1222
- print(f"Attempt {attempt + 1} produced response with markdown table.")
1223
  break # Success - exit loop
1224
 
1225
  # Increase temperature for next attempt
 
1217
 
1218
  stripped_response = response_text.strip()
1219
 
1220
+ # Check if response meets our criteria (length and contains table) OR is "No change"
1221
+ if (len(stripped_response) > 120 and '|' in stripped_response) or stripped_response.lower().startswith("no change"):
1222
+ if stripped_response.lower().startswith("no change"):
1223
+ print(f"Attempt {attempt + 1} produced 'No change' response.")
1224
+ else:
1225
+ print(f"Attempt {attempt + 1} produced response with markdown table.")
1226
  break # Success - exit loop
1227
 
1228
  # Increase temperature for next attempt
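The retry loop's acceptance test can be read as a single predicate: keep the reply if it contains a reasonably long markdown table, or if it explicitly states "No change". A standalone sketch of that check; the function name is illustrative and not part of the codebase:

def response_is_acceptable(response_text: str) -> bool:
    # Accept a markdown table of reasonable length, or an explicit "No change" reply
    stripped = response_text.strip()
    has_table = len(stripped) > 120 and "|" in stripped
    return has_table or stripped.lower().startswith("no change")

print(response_is_acceptable("No change"))   # True
print(response_is_acceptable("| a | b |"))   # False (too short to be treated as a full table)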
tools/prompts.py CHANGED
@@ -50,10 +50,9 @@ add_existing_topics_prompt = """{validate_prompt_prefix}Your task is to create o
50
  In the final column named 'Summary', write a summary of the Subtopic based on relevant responses - highlight specific issues that appear. {add_existing_topics_summary_format}
51
  Do not add any other columns. Do not add any other text to your response. Only mention topics that are relevant to at least one response.
52
 
53
- Responses are shown in the following Response table:
54
  {response_table}
55
 
56
- Topics known to be relevant to this dataset are shown in the following Topics table:
57
  {topics}
58
 
59
  New table:{previous_table_introduction}{previous_table}{validate_prompt_suffix}"""
@@ -62,6 +61,7 @@ New table:{previous_table_introduction}{previous_table}{validate_prompt_suffix}"
62
  # VALIDATION PROMPTS
63
  ###
64
  # These are prompts used to validate previous LLM outputs, and create corrected versions of the outputs if errors are found.
 
65
 
66
  validation_prompt_prefix_default = """The following instructions were previously provided to create an output table:\n'"""
67
 
@@ -69,16 +69,13 @@ previous_table_introduction_default = """'\n\nThe following output table was cre
69
 
70
  validation_prompt_suffix_default = """\n\nBased on the above information, you need to create a corrected version of the output table. Examples of issues to correct include:
71
 
72
- - Remove rows where topics are not relevant to any responses in the provided response table.
73
- - Assign responses to topics from the provided suggested topics table if relevant (only choose from the provided list, do not create new topics).
74
- - Remove Response References for responses that are incorrectly assigned to topics.
75
- - If a response has no topic assigned, check if there is a relevant topic in the provided suggested topics table. If so, assign the response to the relevant topic.
76
- - Correct incorrect information in the summary column from the response text.{additional_validation_issues}
77
  - Any other obvious errors that you can identify.
78
 
79
- With the above issues in mind, create a new, corrected version of the markdown table below. If there are no issues to correct, return the original table.
80
-
81
- New table:"""
82
 
83
  ###
84
  # SENTIMENT CHOICES
@@ -102,7 +99,6 @@ For each of the responses in the Response table, you will create a row for each
102
 
103
  Do not add any other columns. Do not add any other text to your response.
104
 
105
- Responses are shown in the following Response table:
106
  {response_table}
107
 
108
  Headings to structure the summary are in the following table:
 
50
  In the final column named 'Summary', write a summary of the Subtopic based on relevant responses - highlight specific issues that appear. {add_existing_topics_summary_format}
51
  Do not add any other columns. Do not add any other text to your response. Only mention topics that are relevant to at least one response.
52
 
 
53
  {response_table}
54
 
55
+ Topics that are relevant to this dataset are shown in the following Topics table:
56
  {topics}
57
 
58
  New table:{previous_table_introduction}{previous_table}{validate_prompt_suffix}"""
 
61
  # VALIDATION PROMPTS
62
  ###
63
  # These are prompts used to validate previous LLM outputs, and create corrected versions of the outputs if errors are found.
64
+ validation_system_prompt = system_prompt
65
 
66
  validation_prompt_prefix_default = """The following instructions were previously provided to create an output table:\n'"""
67
 
 
69
 
70
  validation_prompt_suffix_default = """\n\nBased on the above information, you need to create a corrected version of the output table. Examples of issues to correct include:
71
 
72
+ - Remove rows where responses are not relevant to the assigned topic, or where responses are not relevant to any topic.
73
+ - Remove rows where a topic is not assigned to any specific response.
74
+ - If the current topic assignment does not cover all information in a response, assign responses to relevant topics from the suggested topics table, or create a new topic if necessary.
75
+ - Correct incorrect information in the summary column, which is a summary of the relevant response text.{additional_validation_issues}
 
76
  - Any other obvious errors that you can identify.
77
 
78
+ With the above issues in mind, create a new, corrected version of the markdown table below. If there are no issues to correct, write simply "No change"."""
79
 
80
  ###
81
  # SENTIMENT CHOICES
 
99
 
100
  Do not add any other columns. Do not add any other text to your response.
101
 
 
102
  {response_table}
103
 
104
  Headings to structure the summary are in the following table: