Commit 70cb346
Parent(s): 0d14c19

Minor prompt improvements. Bedrock client and keys should now be correctly passed to validation function
Files changed:
- app.py +3 -1
- tools/config.py +1 -1
- tools/dedup_summaries.py +2 -3
- tools/llm_api_call.py +20 -9
- tools/prompts.py +4 -5
app.py CHANGED

@@ -504,7 +504,9 @@ with app:
         in_data_files,
         sentiment_checkbox,
         logged_content_df,
-        show_previous_table_radio
+        show_previous_table_radio,
+        aws_access_key_textbox,
+        aws_secret_key_textbox],
         outputs=[display_topic_table_markdown,
         master_topic_df_state,
         master_unique_topics_df_state,
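The three new entries extend the event's `inputs` list, and Gradio passes component values to the handler positionally, so they must line up with the parameters added to `validate_topics` and `validate_topics_wrapper` below. A minimal sketch of this wiring pattern, with illustrative component and handler names rather than the app's real ones:

```python
import gradio as gr

# Gradio passes each component's value to the handler positionally, so the
# order of `inputs` must match the order of the handler's parameters. That is
# why the two key textboxes are appended both here and in the function
# signatures changed in tools/llm_api_call.py.
def handler(data_file, sentiment, logged_content, show_previous, access_key, secret_key):
    return f"show_previous={show_previous}, {len(logged_content or [])} logged rows"

with gr.Blocks() as demo:
    data_file = gr.File()
    sentiment = gr.Radio(["Negative or Positive"], value="Negative or Positive")
    logged_content = gr.State([])
    show_previous = gr.Radio(["Yes", "No"], value="Yes")
    access_key = gr.Textbox(label="AWS access key")
    secret_key = gr.Textbox(label="AWS secret key", type="password")
    out = gr.Markdown()
    gr.Button("Validate").click(
        handler,
        inputs=[data_file, sentiment, logged_content, show_previous, access_key, secret_key],
        outputs=[out],
    )
```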
tools/config.py CHANGED

@@ -397,7 +397,7 @@ LLM_TOP_P = float(get_or_create_env_var('LLM_TOP_P', '0.95'))
 LLM_REPETITION_PENALTY = float(get_or_create_env_var('LLM_REPETITION_PENALTY', '1.0'))
 
 LLM_LAST_N_TOKENS = int(get_or_create_env_var('LLM_LAST_N_TOKENS', '512'))
-LLM_MAX_NEW_TOKENS = int(get_or_create_env_var('LLM_MAX_NEW_TOKENS', '
+LLM_MAX_NEW_TOKENS = int(get_or_create_env_var('LLM_MAX_NEW_TOKENS', '4096'))
 LLM_SEED = int(get_or_create_env_var('LLM_SEED', '42'))
 LLM_RESET = get_or_create_env_var('LLM_RESET', 'False')
 LLM_STREAM = get_or_create_env_var('LLM_STREAM', 'True')
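`get_or_create_env_var` itself is not shown in this diff; a plausible sketch of the pattern its name suggests, assuming it reads an environment variable and falls back to (and registers) the supplied default:

```python
import os

def get_or_create_env_var(name: str, default: str) -> str:
    # Hypothetical sketch of the helper used throughout config.py: return the
    # environment variable if it is set, otherwise register and return the
    # default so later lookups see a consistent value.
    value = os.environ.get(name)
    if value is None:
        os.environ[name] = default
        value = default
    return value

# The config lines above then coerce the returned string to the right type:
LLM_MAX_NEW_TOKENS = int(get_or_create_env_var('LLM_MAX_NEW_TOKENS', '4096'))
```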
tools/dedup_summaries.py CHANGED

@@ -1141,7 +1141,7 @@ def summarise_output_topics(sampled_reference_table_df:pd.DataFrame,
     assistant_model = get_assistant_model()
 
     summary_loop_description = "Revising topic-level summaries. " + str(latest_summary_completed) + " summaries completed so far."
-    summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Revising topic-level summaries", unit="summaries")
+    summary_loop = progress.tqdm(range(latest_summary_completed, length_all_summaries), desc="Revising topic-level summaries", unit="summaries")
 
     if do_summaries == "Yes":

@@ -1543,8 +1543,7 @@ def wrapper_summarise_output_topics_per_group(
         assistant_model=assistant_model,
         summarise_topic_descriptions_prompt=summarise_topic_descriptions_prompt,
         summarise_topic_descriptions_system_prompt=summarise_topic_descriptions_system_prompt,
-        do_summaries=do_summaries
-        progress=progress
+        do_summaries=do_summaries
     )
 
     # Aggregate results
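Two fixes here: the loop is now wrapped with `progress.tqdm` so the Gradio UI tracks it (a bare `tqdm` only reports to the server console), and the call site drops a stray `progress=progress` keyword that followed `do_summaries=do_summaries` with no separating comma, which would have raised a `SyntaxError`. A minimal self-contained sketch of the `progress.tqdm` pattern, with an illustrative handler rather than the repo's function:

```python
import time
import gradio as gr

# Wrapping the iterable with gr.Progress().tqdm surfaces per-iteration
# progress in the Gradio UI; a bare tqdm(...) would only print to the console.
def revise_summaries(n, progress=gr.Progress(track_tqdm=True)):
    for _ in progress.tqdm(range(int(n)), desc="Revising topic-level summaries", unit="summaries"):
        time.sleep(0.1)  # stand-in for one LLM summarisation call
    return f"{int(n)} summaries revised"

with gr.Blocks() as demo:
    count = gr.Number(value=10, precision=0, label="Number of summaries")
    status = gr.Textbox(label="Status")
    gr.Button("Run").click(revise_summaries, inputs=[count], outputs=[status])
```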
tools/llm_api_call.py CHANGED

@@ -162,6 +162,8 @@ def validate_topics(
     sentiment_checkbox: str = "Negative or Positive",
     logged_content: list = None,
     show_previous_table: str = "Yes",
+    aws_access_key_textbox: str = "",
+    aws_secret_key_textbox: str = "",
     progress = gr.Progress(track_tqdm=True)
 ) -> Tuple[pd.DataFrame, pd.DataFrame, list, str, int, int, int]:
     """

@@ -195,6 +197,8 @@ def validate_topics(
     - max_time_for_loop (int): Maximum time for the loop
     - logged_content (list, optional): The logged content from the original run. If None, tables will be reconstructed from reference_df
     - show_previous_table (str): Whether to show the previous table ("Yes" or "No").
+    - aws_access_key_textbox (str): AWS access key.
+    - aws_secret_key_textbox (str): AWS secret key.
     - progress: Progress bar object
 
     Returns:

@@ -222,8 +226,8 @@ def validate_topics(
     tokenizer = get_tokenizer()
 
     # Set up bedrock runtime if needed
-    if model_source == "
-    bedrock_runtime = connect_to_bedrock_runtime(model_name_map, model_choice,
+    if model_source == "AWS":
+        bedrock_runtime = connect_to_bedrock_runtime(model_name_map, model_choice, aws_access_key_textbox, aws_secret_key_textbox)
 
     # Clean file name for output
     file_name_clean = clean_column_name(file_name, max_length=20, front_characters=False)

@@ -554,6 +558,8 @@ def validate_topics_wrapper(
     sentiment_checkbox: str = "Negative or Positive",
     logged_content: List[dict] = None,
     show_previous_table: str = "Yes",
+    aws_access_key_textbox: str = "",
+    aws_secret_key_textbox: str = "",
     progress = gr.Progress(track_tqdm=True)
 ) -> Tuple[pd.DataFrame, pd.DataFrame, List[dict], str, int, int, int, List[str]]:
     """

@@ -589,6 +595,9 @@ def validate_topics_wrapper(
     sentiment_checkbox (str): Sentiment analysis option.
     logged_content (List[dict], optional): The logged content from the original run. If None, tables will be reconstructed from reference_df.
     show_previous_table (str): Whether to show the previous table ("Yes" or "No").
+    aws_access_key_textbox (str): AWS access key.
+    aws_secret_key_textbox (str): AWS secret key.
+    progress (gr.Progress): Progress bar object.
 
     Returns:
         Tuple[pd.DataFrame, pd.DataFrame, List[dict], str, int, int, int, List[str]]:

@@ -714,7 +723,9 @@ def validate_topics_wrapper(
         max_time_for_loop=max_time_for_loop,
         sentiment_checkbox=sentiment_checkbox,
         logged_content=logged_content,
-        show_previous_table=show_previous_table
+        show_previous_table=show_previous_table,
+        aws_access_key_textbox=aws_access_key_textbox,
+        aws_secret_key_textbox=aws_secret_key_textbox
     )
 
     # Accumulate results

@@ -1574,7 +1585,7 @@ def process_batch_with_llm(
         MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=assistant_prefill, master=not is_first_batch
     )
 
-    print("Response text:", response_text)
+    #print("Response text:", response_text)
 
     # Return output tables
     topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, new_topic_df, new_reference_df, new_topic_summary_df, batch_file_path_details, is_error = write_llm_output_and_logs(

@@ -2220,16 +2231,16 @@ def extract_topics(in_data_file: gr.FileData,
 
     all_groups_logged_content = all_groups_logged_content + group_combined_logged_content
 
-    file_path_details = create_batch_file_path_details(file_name)
+    #file_path_details = create_batch_file_path_details(file_name, in_column=chosen_cols)
 
     # Create a pivoted reference table
     existing_reference_df_pivot = convert_reference_table_to_pivot_table(existing_reference_df)
 
     # Save the new DataFrame to CSV
-    reference_table_out_pivot_path = output_folder +
-    reference_table_out_path = output_folder +
-    topic_summary_df_out_path = output_folder +
-    basic_response_data_out_path = output_folder +
+    reference_table_out_pivot_path = output_folder + file_name_clean + "_final_reference_table_pivot_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
+    reference_table_out_path = output_folder + file_name_clean + "_final_reference_table_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
+    topic_summary_df_out_path = output_folder + file_name_clean + "_final_unique_topics_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
+    basic_response_data_out_path = output_folder + file_name_clean + "_simplified_data_file_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
 
     ## Reference table mapping response numbers to topics
     existing_reference_df.to_csv(reference_table_out_path, index=None, encoding='utf-8-sig')
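This is the headline fix from the commit message: `validate_topics` now receives the key textbox values and builds the Bedrock runtime client itself when `model_source == "AWS"`. The `connect_to_bedrock_runtime` helper is defined elsewhere in the repo; a hedged sketch of what such a helper typically does with explicit keys, using boto3's real `bedrock-runtime` client (the fall-back-to-credential-chain behaviour is an assumption, not confirmed by this diff):

```python
import boto3

def connect_to_bedrock_runtime(model_name_map: dict, model_choice: str,
                               aws_access_key: str = "", aws_secret_key: str = ""):
    # Hypothetical sketch only; the real helper presumably also uses
    # model_name_map/model_choice to resolve the model's endpoint or region
    # (both are unused here).
    if aws_access_key and aws_secret_key:
        # Keys typed into the UI textboxes take precedence.
        return boto3.client(
            "bedrock-runtime",
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_key,
        )
    # Otherwise fall back to boto3's default credential chain
    # (environment variables, ~/.aws/credentials, instance role).
    return boto3.client("bedrock-runtime")
```

Called under the guard restored in the hunk above, so no client is created for non-AWS model sources.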
tools/prompts.py CHANGED

@@ -15,7 +15,7 @@ initial_table_system_prompt = system_prompt + markdown_additional_prompt
 
 initial_table_assistant_prefill = "|"
 
-default_response_reference_format = "In the next column named 'Response References', list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do
+default_response_reference_format = "In the next column named 'Response References', list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column."
 
 initial_table_prompt = """{validate_prompt_prefix}Your task is to create one new markdown table based on open text responses in the reponse table below.
 In the first column named 'General topic', identify general topics relevant to responses. Create as many general topics as you can.

@@ -29,7 +29,6 @@ Response table:
 
 New table:{previous_table_introduction}{previous_table}{validate_prompt_suffix}"""
 
-
 ###
 # Adding existing topics to consultation responses
 ###

@@ -50,11 +49,11 @@ add_existing_topics_prompt = """{validate_prompt_prefix}Your task is to create o
 In the final column named 'Summary', write a summary of the Subtopic based on relevant responses - highlight specific issues that appear. {add_existing_topics_summary_format}
 Do not add any other columns. Do not add any other text to your response. Only mention topics that are relevant to at least one response.
 
-
-
-Topics that are relevant to this dataset are shown in the following Topics table:
+Choose from among the following topic names to assign to the responses, only if they are directly relevant to responses from the response table below:
 {topics}
 
+{response_table}
+
 New table:{previous_table_introduction}{previous_table}{validate_prompt_suffix}"""
 
 ###
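The restored `{response_table}` placeholder matters because these prompt strings are plain `str.format` templates: a missing placeholder means the model never sees the responses it is asked to label. A reduced, illustrative example of how such a template is rendered (the trimmed template and field values below are made up for demonstration; the real `add_existing_topics_prompt` has more placeholders):

```python
# Reduced template; rendering works the same way for the full prompt.
template = """Choose from among the following topic names to assign to the responses:
{topics}

{response_table}

New table:{previous_table}"""

prompt = template.format(
    topics="| General topic | Subtopic |\n| Transport | Bus frequency |",
    response_table="| Reference | Response |\n| 1 | Buses are too infrequent. |",
    previous_table="",
)
print(prompt)
```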