seanpedrickcase committed
Commit 70cb346
1 Parent(s): 0d14c19

Minor prompt improvements. The Bedrock client and keys should now be correctly passed to the validation function.

app.py CHANGED
@@ -504,7 +504,9 @@ with app:
                 in_data_files,
                 sentiment_checkbox,
                 logged_content_df,
-                show_previous_table_radio],
+                show_previous_table_radio,
+                aws_access_key_textbox,
+                aws_secret_key_textbox],
         outputs=[display_topic_table_markdown,
                  master_topic_df_state,
                  master_unique_topics_df_state,
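The two key textboxes are appended to the event's inputs list so their values reach the validation callback. Below is a minimal, self-contained sketch of that wiring pattern, not the app's actual code: the `validate` callback is a hypothetical stand-in for validate_topics_wrapper, and only the component names are taken from the diff. It illustrates that Gradio passes `inputs` to the callback positionally, so the new textboxes must line up with matching parameters in the target function.

import gradio as gr

def validate(file_data, sentiment, logged_df, show_previous, access_key, secret_key):
    # Hypothetical stand-in for validate_topics_wrapper: just reports which
    # credential source would be used, based on whether keys were supplied.
    return "Using explicit AWS keys" if (access_key and secret_key) else "Using default credentials"

with gr.Blocks() as demo:
    in_data_files = gr.File()
    sentiment_checkbox = gr.Radio(["Negative or Positive", "Negative, Neutral, or Positive"])
    logged_content_df = gr.Dataframe()
    show_previous_table_radio = gr.Radio(["Yes", "No"], value="Yes")
    aws_access_key_textbox = gr.Textbox(label="AWS access key")
    aws_secret_key_textbox = gr.Textbox(label="AWS secret key")
    out_md = gr.Markdown()
    run_btn = gr.Button("Validate")
    # The inputs list is positional: each component feeds the callback
    # parameter in the same relative slot, which is why the diff appends
    # the key textboxes right after show_previous_table_radio.
    run_btn.click(
        fn=validate,
        inputs=[in_data_files, sentiment_checkbox, logged_content_df,
                show_previous_table_radio, aws_access_key_textbox, aws_secret_key_textbox],
        outputs=[out_md],
    )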
tools/config.py CHANGED
@@ -397,7 +397,7 @@ LLM_TOP_P = float(get_or_create_env_var('LLM_TOP_P', '0.95'))
 LLM_REPETITION_PENALTY = float(get_or_create_env_var('LLM_REPETITION_PENALTY', '1.0'))
 
 LLM_LAST_N_TOKENS = int(get_or_create_env_var('LLM_LAST_N_TOKENS', '512'))
-LLM_MAX_NEW_TOKENS = int(get_or_create_env_var('LLM_MAX_NEW_TOKENS', '8192'))
+LLM_MAX_NEW_TOKENS = int(get_or_create_env_var('LLM_MAX_NEW_TOKENS', '4096'))
 LLM_SEED = int(get_or_create_env_var('LLM_SEED', '42'))
 LLM_RESET = get_or_create_env_var('LLM_RESET', 'False')
 LLM_STREAM = get_or_create_env_var('LLM_STREAM', 'True')
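The default cap on generated tokens drops from 8192 to 4096, but because the value is read through get_or_create_env_var, exporting LLM_MAX_NEW_TOKENS still overrides it at runtime. The helper's implementation is not part of this diff; the sketch below is an assumed, typical version of such a helper, shown only to make the override behaviour concrete.

import os

def get_or_create_env_var(name: str, default: str) -> str:
    # Assumed behaviour: return the environment value if set, otherwise
    # register and return the supplied default so later lookups agree.
    value = os.environ.get(name)
    if value is None:
        os.environ[name] = default
        value = default
    return value

# With the new default, an unset environment yields 4096 new tokens;
# exporting LLM_MAX_NEW_TOKENS=8192 restores the previous ceiling.
LLM_MAX_NEW_TOKENS = int(get_or_create_env_var('LLM_MAX_NEW_TOKENS', '4096'))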
tools/dedup_summaries.py CHANGED
@@ -1141,7 +1141,7 @@ def summarise_output_topics(sampled_reference_table_df:pd.DataFrame,
     assistant_model = get_assistant_model()
 
     summary_loop_description = "Revising topic-level summaries. " + str(latest_summary_completed) + " summaries completed so far."
-    summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Revising topic-level summaries", unit="summaries")
+    summary_loop = progress.tqdm(range(latest_summary_completed, length_all_summaries), desc="Revising topic-level summaries", unit="summaries")
 
     if do_summaries == "Yes":
 
@@ -1543,8 +1543,7 @@ def wrapper_summarise_output_topics_per_group(
             assistant_model=assistant_model,
             summarise_topic_descriptions_prompt=summarise_topic_descriptions_prompt,
             summarise_topic_descriptions_system_prompt=summarise_topic_descriptions_system_prompt,
-            do_summaries=do_summaries,
-            progress=progress
+            do_summaries=do_summaries
         )
 
         # Aggregate results
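The bare tqdm loop becomes progress.tqdm, which reports each iteration to the Gradio front end rather than only to the console, and the explicit progress= keyword is dropped from the inner call because gr.Progress(track_tqdm=True) on the outer function already tracks tqdm-style iterators. A minimal, self-contained illustration of the pattern follows; the function and variable names are invented, not the repo's.

import time
import gradio as gr

def revise_summaries(n_summaries, progress=gr.Progress(track_tqdm=True)):
    # progress.tqdm wraps an iterable exactly like tqdm, but forwards each
    # step to the Gradio progress bar as well.
    for _ in progress.tqdm(range(int(n_summaries)), desc="Revising topic-level summaries", unit="summaries"):
        time.sleep(0.1)  # stand-in for one LLM summarisation call
    return f"Completed {int(n_summaries)} summaries"

# The gr.Progress default is injected by Gradio and does not count as an input component.
demo = gr.Interface(revise_summaries, gr.Number(value=5, precision=0), gr.Textbox())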
tools/llm_api_call.py CHANGED
@@ -162,6 +162,8 @@ def validate_topics(
     sentiment_checkbox: str = "Negative or Positive",
     logged_content: list = None,
     show_previous_table: str = "Yes",
+    aws_access_key_textbox: str = "",
+    aws_secret_key_textbox: str = "",
     progress = gr.Progress(track_tqdm=True)
     ) -> Tuple[pd.DataFrame, pd.DataFrame, list, str, int, int, int]:
     """
@@ -195,6 +197,8 @@ def validate_topics(
     - max_time_for_loop (int): Maximum time for the loop
     - logged_content (list, optional): The logged content from the original run. If None, tables will be reconstructed from reference_df
     - show_previous_table (str): Whether to show the previous table ("Yes" or "No").
+    - aws_access_key_textbox (str): AWS access key.
+    - aws_secret_key_textbox (str): AWS secret key.
     - progress: Progress bar object
 
     Returns:
@@ -222,8 +226,8 @@ def validate_topics(
     tokenizer = get_tokenizer()
 
     # Set up bedrock runtime if needed
-    if model_source == "Bedrock":
-        bedrock_runtime = connect_to_bedrock_runtime(model_name_map, model_choice, "", "")
+    if model_source == "AWS":
+        bedrock_runtime = connect_to_bedrock_runtime(model_name_map, model_choice, aws_access_key_textbox, aws_secret_key_textbox)
 
     # Clean file name for output
     file_name_clean = clean_column_name(file_name, max_length=20, front_characters=False)
@@ -554,6 +558,8 @@ def validate_topics_wrapper(
     sentiment_checkbox: str = "Negative or Positive",
     logged_content: List[dict] = None,
     show_previous_table: str = "Yes",
+    aws_access_key_textbox: str = "",
+    aws_secret_key_textbox: str = "",
     progress = gr.Progress(track_tqdm=True)
     ) -> Tuple[pd.DataFrame, pd.DataFrame, List[dict], str, int, int, int, List[str]]:
     """
@@ -589,6 +595,9 @@ def validate_topics_wrapper(
     sentiment_checkbox (str): Sentiment analysis option.
     logged_content (List[dict], optional): The logged content from the original run. If None, tables will be reconstructed from reference_df.
     show_previous_table (str): Whether to show the previous table ("Yes" or "No").
+    aws_access_key_textbox (str): AWS access key.
+    aws_secret_key_textbox (str): AWS secret key.
+    progress (gr.Progress): Progress bar object.
 
     Returns:
     Tuple[pd.DataFrame, pd.DataFrame, List[dict], str, int, int, int, List[str]]:
@@ -714,7 +723,9 @@ def validate_topics_wrapper(
             max_time_for_loop=max_time_for_loop,
             sentiment_checkbox=sentiment_checkbox,
             logged_content=logged_content,
-            show_previous_table=show_previous_table
+            show_previous_table=show_previous_table,
+            aws_access_key_textbox=aws_access_key_textbox,
+            aws_secret_key_textbox=aws_secret_key_textbox
         )
 
         # Accumulate results
@@ -1574,7 +1585,7 @@ def process_batch_with_llm(
         MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=assistant_prefill, master=not is_first_batch
     )
 
-    print("Response text:", response_text)
+    #print("Response text:", response_text)
 
     # Return output tables
     topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, new_topic_df, new_reference_df, new_topic_summary_df, batch_file_path_details, is_error = write_llm_output_and_logs(
@@ -2220,16 +2231,16 @@ def extract_topics(in_data_file: gr.FileData,
 
     all_groups_logged_content = all_groups_logged_content + group_combined_logged_content
 
-    file_path_details = create_batch_file_path_details(file_name)
+    #file_path_details = create_batch_file_path_details(file_name, in_column=chosen_cols)
 
     # Create a pivoted reference table
     existing_reference_df_pivot = convert_reference_table_to_pivot_table(existing_reference_df)
 
     # Save the new DataFrame to CSV
-    reference_table_out_pivot_path = output_folder + file_path_details + "_final_reference_table_pivot_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
-    reference_table_out_path = output_folder + file_path_details + "_final_reference_table_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
-    topic_summary_df_out_path = output_folder + file_path_details + "_final_unique_topics_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
-    basic_response_data_out_path = output_folder + file_path_details + "_simplified_data_file_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
+    reference_table_out_pivot_path = output_folder + file_name_clean + "_final_reference_table_pivot_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
+    reference_table_out_path = output_folder + file_name_clean + "_final_reference_table_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
+    topic_summary_df_out_path = output_folder + file_name_clean + "_final_unique_topics_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
+    basic_response_data_out_path = output_folder + file_name_clean + "_simplified_data_file_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
 
     ## Reference table mapping response numbers to topics
     existing_reference_df.to_csv(reference_table_out_path, index=None, encoding='utf-8-sig')
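With model_source now matched against "AWS", the access and secret keys typed into the UI reach connect_to_bedrock_runtime instead of empty strings. That helper's implementation is not shown in this commit; the sketch below is only an assumption about how a client built along these lines might behave, preferring explicit keys and otherwise falling back to the default boto3 credential chain. The real signature, region handling, and use of model_name_map may differ.

import boto3

def connect_to_bedrock_runtime(model_name_map: dict, model_choice: str,
                               aws_access_key: str = "", aws_secret_key: str = "",
                               region_name: str = "eu-west-2"):
    """Return a bedrock-runtime client, preferring explicitly supplied keys (illustrative only)."""
    # model_name_map and model_choice are accepted here only to mirror the
    # call site in the diff; this sketch does not use them.
    if aws_access_key and aws_secret_key:
        # Keys supplied in the UI take priority, which is what replacing the
        # hard-coded "" arguments with the textbox values enables.
        return boto3.client(
            "bedrock-runtime",
            region_name=region_name,
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_key,
        )
    # Otherwise rely on the default credential chain (env vars, profile, IAM role).
    return boto3.client("bedrock-runtime", region_name=region_name)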
tools/prompts.py CHANGED
@@ -15,7 +15,7 @@ initial_table_system_prompt = system_prompt + markdown_additional_prompt
 
 initial_table_assistant_prefill = "|"
 
-default_response_reference_format = "In the next column named 'Response References', list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do no write any other text in this column."
+default_response_reference_format = "In the next column named 'Response References', list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column."
 
 initial_table_prompt = """{validate_prompt_prefix}Your task is to create one new markdown table based on open text responses in the reponse table below.
 In the first column named 'General topic', identify general topics relevant to responses. Create as many general topics as you can.
@@ -29,7 +29,6 @@ Response table:
 
 New table:{previous_table_introduction}{previous_table}{validate_prompt_suffix}"""
 
-
 ###
 # Adding existing topics to consultation responses
 ###
@@ -50,11 +49,11 @@ add_existing_topics_prompt = """{validate_prompt_prefix}Your task is to create o
 In the final column named 'Summary', write a summary of the Subtopic based on relevant responses - highlight specific issues that appear. {add_existing_topics_summary_format}
 Do not add any other columns. Do not add any other text to your response. Only mention topics that are relevant to at least one response.
 
-{response_table}
-
-Topics that are relevant to this dataset are shown in the following Topics table:
+Choose from among the following topic names to assign to the responses, only if they are directly relevant to responses from the response table below:
 {topics}
 
+{response_table}
+
 New table:{previous_table_introduction}{previous_table}{validate_prompt_suffix}"""
 
 ###
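The add_existing_topics_prompt now presents the candidate topic list before the response table. Since these prompts are plain str.format templates, the reordering only changes the rendered text the model sees; filling the template works the same way. A small usage sketch follows, with an abridged template and invented sample rows (not the repo's full prompt or data).

# Abridged template in the same shape as the reordered prompt.
add_existing_topics_prompt = """{validate_prompt_prefix}Assign topics to the responses.

Choose from among the following topic names to assign to the responses, only if they are directly relevant to responses from the response table below:
{topics}

{response_table}

New table:{previous_table_introduction}{previous_table}{validate_prompt_suffix}"""

# Every placeholder must be supplied; unused sections can be empty strings.
prompt = add_existing_topics_prompt.format(
    validate_prompt_prefix="",
    topics="| General topic | Subtopic |\n| Transport | Bus frequency |",
    response_table="| Reference | Response |\n| 1 | Buses are too infrequent. |",
    previous_table_introduction="",
    previous_table="",
    validate_prompt_suffix="",
)
print(prompt)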