seanpedrickcase committed
Commit 70cb346
1 Parent(s): 0d14c19

Minor prompt improvements. The Bedrock client and keys should now be correctly passed to the validation function.

app.py CHANGED
@@ -504,7 +504,9 @@ with app:
                 in_data_files,
                 sentiment_checkbox,
                 logged_content_df,
-                show_previous_table_radio],
+                show_previous_table_radio,
+                aws_access_key_textbox,
+                aws_secret_key_textbox],
         outputs=[display_topic_table_markdown,
                  master_topic_df_state,
                  master_unique_topics_df_state,
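The two key textboxes are appended to the event's inputs list so their values reach the validation callback. Below is a minimal, self-contained sketch of that wiring pattern, not the app's actual code: the `validate` callback is a hypothetical stand-in for validate_topics_wrapper, and only the component names are taken from the diff. It illustrates that Gradio passes `inputs` to the callback positionally, so the new textboxes must line up with matching parameters in the target function.

import gradio as gr

def validate(file_data, sentiment, logged_df, show_previous, access_key, secret_key):
    # Hypothetical stand-in for validate_topics_wrapper: just reports which
    # credential source would be used, based on whether keys were supplied.
    return "Using explicit AWS keys" if (access_key and secret_key) else "Using default credentials"

with gr.Blocks() as demo:
    in_data_files = gr.File()
    sentiment_checkbox = gr.Radio(["Negative or Positive", "Negative, Neutral, or Positive"])
    logged_content_df = gr.Dataframe()
    show_previous_table_radio = gr.Radio(["Yes", "No"], value="Yes")
    aws_access_key_textbox = gr.Textbox(label="AWS access key")
    aws_secret_key_textbox = gr.Textbox(label="AWS secret key")
    out_md = gr.Markdown()
    run_btn = gr.Button("Validate")
    # The inputs list is positional: each component feeds the callback
    # parameter in the same relative slot, which is why the diff appends
    # the key textboxes right after show_previous_table_radio.
    run_btn.click(
        fn=validate,
        inputs=[in_data_files, sentiment_checkbox, logged_content_df,
                show_previous_table_radio, aws_access_key_textbox, aws_secret_key_textbox],
        outputs=[out_md],
    )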
tools/config.py CHANGED
@@ -397,7 +397,7 @@ LLM_TOP_P = float(get_or_create_env_var('LLM_TOP_P', '0.95'))
 LLM_REPETITION_PENALTY = float(get_or_create_env_var('LLM_REPETITION_PENALTY', '1.0'))
 
 LLM_LAST_N_TOKENS = int(get_or_create_env_var('LLM_LAST_N_TOKENS', '512'))
-LLM_MAX_NEW_TOKENS = int(get_or_create_env_var('LLM_MAX_NEW_TOKENS', '8192'))
+LLM_MAX_NEW_TOKENS = int(get_or_create_env_var('LLM_MAX_NEW_TOKENS', '4096'))
 LLM_SEED = int(get_or_create_env_var('LLM_SEED', '42'))
 LLM_RESET = get_or_create_env_var('LLM_RESET', 'False')
 LLM_STREAM = get_or_create_env_var('LLM_STREAM', 'True')
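The default cap on generated tokens drops from 8192 to 4096, but because the value is read through get_or_create_env_var, exporting LLM_MAX_NEW_TOKENS still overrides it at runtime. The helper's implementation is not part of this diff; the sketch below is an assumed, typical version of such a helper, shown only to make the override behaviour concrete.

import os

def get_or_create_env_var(name: str, default: str) -> str:
    # Assumed behaviour: return the environment value if set, otherwise
    # register and return the supplied default so later lookups agree.
    value = os.environ.get(name)
    if value is None:
        os.environ[name] = default
        value = default
    return value

# With the new default, an unset environment yields 4096 new tokens;
# exporting LLM_MAX_NEW_TOKENS=8192 restores the previous ceiling.
LLM_MAX_NEW_TOKENS = int(get_or_create_env_var('LLM_MAX_NEW_TOKENS', '4096'))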
tools/dedup_summaries.py CHANGED
@@ -1141,7 +1141,7 @@ def summarise_output_topics(sampled_reference_table_df:pd.DataFrame,
     assistant_model = get_assistant_model()
 
     summary_loop_description = "Revising topic-level summaries. " + str(latest_summary_completed) + " summaries completed so far."
-    summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Revising topic-level summaries", unit="summaries")
+    summary_loop = progress.tqdm(range(latest_summary_completed, length_all_summaries), desc="Revising topic-level summaries", unit="summaries")
 
     if do_summaries == "Yes":
 
@@ -1543,8 +1543,7 @@ def wrapper_summarise_output_topics_per_group(
             assistant_model=assistant_model,
             summarise_topic_descriptions_prompt=summarise_topic_descriptions_prompt,
             summarise_topic_descriptions_system_prompt=summarise_topic_descriptions_system_prompt,
-            do_summaries=do_summaries,
-            progress=progress
+            do_summaries=do_summaries
         )
 
         # Aggregate results
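The bare tqdm loop becomes progress.tqdm, which reports each iteration to the Gradio front end rather than only to the console, and the explicit progress= keyword is dropped from the inner call because gr.Progress(track_tqdm=True) on the outer function already tracks tqdm-style iterators. A minimal, self-contained illustration of the pattern follows; the function and variable names are invented, not the repo's.

import time
import gradio as gr

def revise_summaries(n_summaries, progress=gr.Progress(track_tqdm=True)):
    # progress.tqdm wraps an iterable exactly like tqdm, but forwards each
    # step to the Gradio progress bar as well.
    for _ in progress.tqdm(range(int(n_summaries)), desc="Revising topic-level summaries", unit="summaries"):
        time.sleep(0.1)  # stand-in for one LLM summarisation call
    return f"Completed {int(n_summaries)} summaries"

# The gr.Progress default is injected by Gradio and does not count as an input component.
demo = gr.Interface(revise_summaries, gr.Number(value=5, precision=0), gr.Textbox())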
tools/llm_api_call.py CHANGED
@@ -162,6 +162,8 @@ def validate_topics(
     sentiment_checkbox: str = "Negative or Positive",
     logged_content: list = None,
     show_previous_table: str = "Yes",
+    aws_access_key_textbox: str = "",
+    aws_secret_key_textbox: str = "",
     progress = gr.Progress(track_tqdm=True)
     ) -> Tuple[pd.DataFrame, pd.DataFrame, list, str, int, int, int]:
     """
@@ -195,6 +197,8 @@ def validate_topics(
     - max_time_for_loop (int): Maximum time for the loop
     - logged_content (list, optional): The logged content from the original run. If None, tables will be reconstructed from reference_df
     - show_previous_table (str): Whether to show the previous table ("Yes" or "No").
+    - aws_access_key_textbox (str): AWS access key.
+    - aws_secret_key_textbox (str): AWS secret key.
     - progress: Progress bar object
 
     Returns:
@@ -222,8 +226,8 @@ def validate_topics(
     tokenizer = get_tokenizer()
 
     # Set up bedrock runtime if needed
-    if model_source == "Bedrock":
-        bedrock_runtime = connect_to_bedrock_runtime(model_name_map, model_choice, "", "")
+    if model_source == "AWS":
+        bedrock_runtime = connect_to_bedrock_runtime(model_name_map, model_choice, aws_access_key_textbox, aws_secret_key_textbox)
 
     # Clean file name for output
     file_name_clean = clean_column_name(file_name, max_length=20, front_characters=False)
@@ -554,6 +558,8 @@ def validate_topics_wrapper(
     sentiment_checkbox: str = "Negative or Positive",
     logged_content: List[dict] = None,
     show_previous_table: str = "Yes",
+    aws_access_key_textbox: str = "",
+    aws_secret_key_textbox: str = "",
     progress = gr.Progress(track_tqdm=True)
     ) -> Tuple[pd.DataFrame, pd.DataFrame, List[dict], str, int, int, int, List[str]]:
     """
@@ -589,6 +595,9 @@ def validate_topics_wrapper(
     sentiment_checkbox (str): Sentiment analysis option.
     logged_content (List[dict], optional): The logged content from the original run. If None, tables will be reconstructed from reference_df.
     show_previous_table (str): Whether to show the previous table ("Yes" or "No").
+    aws_access_key_textbox (str): AWS access key.
+    aws_secret_key_textbox (str): AWS secret key.
+    progress (gr.Progress): Progress bar object.
 
     Returns:
     Tuple[pd.DataFrame, pd.DataFrame, List[dict], str, int, int, int, List[str]]:
@@ -714,7 +723,9 @@ def validate_topics_wrapper(
             max_time_for_loop=max_time_for_loop,
             sentiment_checkbox=sentiment_checkbox,
             logged_content=logged_content,
-            show_previous_table=show_previous_table
+            show_previous_table=show_previous_table,
+            aws_access_key_textbox=aws_access_key_textbox,
+            aws_secret_key_textbox=aws_secret_key_textbox
         )
 
         # Accumulate results
@@ -1574,7 +1585,7 @@ def process_batch_with_llm(
         MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=assistant_prefill, master=not is_first_batch
     )
 
-    print("Response text:", response_text)
+    #print("Response text:", response_text)
 
     # Return output tables
     topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, new_topic_df, new_reference_df, new_topic_summary_df, batch_file_path_details, is_error = write_llm_output_and_logs(
@@ -2220,16 +2231,16 @@ def extract_topics(in_data_file: gr.FileData,
 
     all_groups_logged_content = all_groups_logged_content + group_combined_logged_content
 
-    file_path_details = create_batch_file_path_details(file_name)
+    #file_path_details = create_batch_file_path_details(file_name, in_column=chosen_cols)
 
     # Create a pivoted reference table
     existing_reference_df_pivot = convert_reference_table_to_pivot_table(existing_reference_df)
 
     # Save the new DataFrame to CSV
-    reference_table_out_pivot_path = output_folder + file_path_details + "_final_reference_table_pivot_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
-    reference_table_out_path = output_folder + file_path_details + "_final_reference_table_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
-    topic_summary_df_out_path = output_folder + file_path_details + "_final_unique_topics_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
-    basic_response_data_out_path = output_folder + file_path_details + "_simplified_data_file_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
+    reference_table_out_pivot_path = output_folder + file_name_clean + "_final_reference_table_pivot_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
+    reference_table_out_path = output_folder + file_name_clean + "_final_reference_table_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
+    topic_summary_df_out_path = output_folder + file_name_clean + "_final_unique_topics_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
+    basic_response_data_out_path = output_folder + file_name_clean + "_simplified_data_file_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
 
     ## Reference table mapping response numbers to topics
     existing_reference_df.to_csv(reference_table_out_path, index=None, encoding='utf-8-sig')
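With model_source now matched against "AWS", the access and secret keys typed into the UI reach connect_to_bedrock_runtime instead of empty strings. That helper's implementation is not shown in this commit; the sketch below is only an assumption about how a client built along these lines might behave, preferring explicit keys and otherwise falling back to the default boto3 credential chain. The real signature, region handling, and use of model_name_map may differ.

import boto3

def connect_to_bedrock_runtime(model_name_map: dict, model_choice: str,
                               aws_access_key: str = "", aws_secret_key: str = "",
                               region_name: str = "eu-west-2"):
    """Return a bedrock-runtime client, preferring explicitly supplied keys (illustrative only)."""
    # model_name_map and model_choice are accepted here only to mirror the
    # call site in the diff; this sketch does not use them.
    if aws_access_key and aws_secret_key:
        # Keys supplied in the UI take priority, which is what replacing the
        # hard-coded "" arguments with the textbox values enables.
        return boto3.client(
            "bedrock-runtime",
            region_name=region_name,
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_key,
        )
    # Otherwise rely on the default credential chain (env vars, profile, IAM role).
    return boto3.client("bedrock-runtime", region_name=region_name)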
tools/prompts.py CHANGED
@@ -15,7 +15,7 @@ initial_table_system_prompt = system_prompt + markdown_additional_prompt
 
 initial_table_assistant_prefill = "|"
 
-default_response_reference_format = "In the next column named 'Response References', list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do no write any other text in this column."
+default_response_reference_format = "In the next column named 'Response References', list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column."
 
 initial_table_prompt = """{validate_prompt_prefix}Your task is to create one new markdown table based on open text responses in the reponse table below.
 In the first column named 'General topic', identify general topics relevant to responses. Create as many general topics as you can.
@@ -29,7 +29,6 @@ Response table:
 
 New table:{previous_table_introduction}{previous_table}{validate_prompt_suffix}"""
 
-
 ###
 # Adding existing topics to consultation responses
 ###
@@ -50,11 +49,11 @@ add_existing_topics_prompt = """{validate_prompt_prefix}Your task is to create o
 In the final column named 'Summary', write a summary of the Subtopic based on relevant responses - highlight specific issues that appear. {add_existing_topics_summary_format}
 Do not add any other columns. Do not add any other text to your response. Only mention topics that are relevant to at least one response.
 
-{response_table}
-
-Topics that are relevant to this dataset are shown in the following Topics table:
+Choose from among the following topic names to assign to the responses, only if they are directly relevant to responses from the response table below:
 {topics}
 
+{response_table}
+
 New table:{previous_table_introduction}{previous_table}{validate_prompt_suffix}"""
 
 ###
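The add_existing_topics_prompt now presents the candidate topic list before the response table. Since these prompts are plain str.format templates, the reordering only changes the rendered text the model sees; filling the template works the same way. A small usage sketch follows, with an abridged template and invented sample rows (not the repo's full prompt or data).

# Abridged template in the same shape as the reordered prompt.
add_existing_topics_prompt = """{validate_prompt_prefix}Assign topics to the responses.

Choose from among the following topic names to assign to the responses, only if they are directly relevant to responses from the response table below:
{topics}

{response_table}

New table:{previous_table_introduction}{previous_table}{validate_prompt_suffix}"""

# Every placeholder must be supplied; unused sections can be empty strings.
prompt = add_existing_topics_prompt.format(
    validate_prompt_prefix="",
    topics="| General topic | Subtopic |\n| Transport | Bus frequency |",
    response_table="| Reference | Response |\n| 1 | Buses are too infrequent. |",
    previous_table_introduction="",
    previous_table="",
    validate_prompt_suffix="",
)
print(prompt)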