Commit · 138286c
Parent(s): 5fa40a6

Added framework of support for Azure models (although untested)

Files changed:
- app.py +9 -4
- requirements.txt +7 -1
- requirements_cpu.txt +3 -1
- requirements_gpu.txt +3 -1
- requirements_no_local.txt +3 -1
- tools/aws_functions.py +17 -9
- tools/config.py +34 -3
- tools/dedup_summaries.py +9 -3
- tools/llm_api_call.py +82 -21
- tools/llm_funcs.py +83 -50
app.py CHANGED

@@ -12,7 +12,7 @@ from tools.custom_csvlogger import CSVLogger_custom
 from tools.auth import authenticate_user
 from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, verify_titles_prompt, verify_titles_system_prompt, two_para_summary_format_prompt, single_para_summary_format_prompt
 from tools.verify_titles import verify_titles
-from tools.config import RUN_AWS_FUNCTIONS, HOST_NAME, ACCESS_LOGS_FOLDER, FEEDBACK_LOGS_FOLDER, USAGE_LOGS_FOLDER, RUN_LOCAL_MODEL, FILE_INPUT_HEIGHT, GEMINI_API_KEY, model_full_names, BATCH_SIZE_DEFAULT, CHOSEN_LOCAL_MODEL_TYPE, LLM_SEED, COGNITO_AUTH, MAX_QUEUE_SIZE, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, INPUT_FOLDER, OUTPUT_FOLDER, S3_LOG_BUCKET, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, model_name_map, GET_COST_CODES, ENFORCE_COST_CODES, DEFAULT_COST_CODE, COST_CODES_PATH, S3_COST_CODES_PATH, OUTPUT_COST_CODES_PATH, SHOW_COSTS, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, USAGE_LOG_FILE_NAME, CSV_ACCESS_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, DYNAMODB_ACCESS_LOG_HEADERS, DYNAMODB_FEEDBACK_LOG_HEADERS, DYNAMODB_USAGE_LOG_HEADERS, S3_ACCESS_LOGS_FOLDER, S3_FEEDBACK_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, AWS_ACCESS_KEY, AWS_SECRET_KEY, SHOW_EXAMPLES, HF_TOKEN
+from tools.config import RUN_AWS_FUNCTIONS, HOST_NAME, ACCESS_LOGS_FOLDER, FEEDBACK_LOGS_FOLDER, USAGE_LOGS_FOLDER, RUN_LOCAL_MODEL, FILE_INPUT_HEIGHT, GEMINI_API_KEY, model_full_names, BATCH_SIZE_DEFAULT, CHOSEN_LOCAL_MODEL_TYPE, LLM_SEED, COGNITO_AUTH, MAX_QUEUE_SIZE, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, INPUT_FOLDER, OUTPUT_FOLDER, S3_LOG_BUCKET, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, model_name_map, GET_COST_CODES, ENFORCE_COST_CODES, DEFAULT_COST_CODE, COST_CODES_PATH, S3_COST_CODES_PATH, OUTPUT_COST_CODES_PATH, SHOW_COSTS, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, USAGE_LOG_FILE_NAME, CSV_ACCESS_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, DYNAMODB_ACCESS_LOG_HEADERS, DYNAMODB_FEEDBACK_LOG_HEADERS, DYNAMODB_USAGE_LOG_HEADERS, S3_ACCESS_LOGS_FOLDER, S3_FEEDBACK_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, AWS_ACCESS_KEY, AWS_SECRET_KEY, SHOW_EXAMPLES, HF_TOKEN, AZURE_API_KEY
 
 def ensure_folder_exists(output_folder:str):
     """Checks if the specified folder exists, creates it if not."""
@@ -307,6 +307,9 @@ with app:
        with gr.Accordion("Gemini API keys", open = False):
            google_api_key_textbox = gr.Textbox(value = GEMINI_API_KEY, label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
 
+       with gr.Accordion("Azure AI Inference", open = False):
+           azure_api_key_textbox = gr.Textbox(value = AZURE_API_KEY, label="Enter Azure AI Inference API key (only if using Azure models)", lines=1, type="password")
+
        with gr.Accordion("Hugging Face API keys", open = False):
            hf_api_key_textbox = gr.Textbox(value = HF_TOKEN, label="Enter Hugging Face API key (only if using Hugging Face models)", lines=1, type="password")
 
@@ -369,7 +372,7 @@ with app:
    success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
    success(load_in_data_file,
            inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, working_data_file_name_textbox, total_number_of_batches], api_name="load_data").\
-
+   success(fn=wrapper_extract_topics_per_column_value,
           inputs=[in_group_col,
                   in_data_files,
                   file_data_state,
@@ -405,6 +408,7 @@ with app:
                   aws_access_key_textbox,
                   aws_secret_key_textbox,
                   hf_api_key_textbox,
+                  azure_api_key_textbox,
                   output_folder_state],
          outputs=[display_topic_table_markdown,
                   master_topic_df_state,
@@ -467,10 +471,10 @@ with app:
    success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
    success(load_in_data_file,
            inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, working_data_file_name_textbox, total_number_of_batches], api_name="load_data").\
-
+   success(fn=wrapper_extract_topics_per_column_value,
           inputs=[in_group_col,
                   in_data_files,
-                  file_data_state,
+                  file_data_state,
                   master_topic_df_state,
                   master_reference_df_state,
                   master_unique_topics_df_state,
@@ -503,6 +507,7 @@ with app:
                   aws_access_key_textbox,
                   aws_secret_key_textbox,
                   hf_api_key_textbox,
+                  azure_api_key_textbox,
                   output_folder_state],
          outputs=[display_topic_table_markdown,
                   master_topic_df_state,
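For reference, a minimal sketch (not the app's real layout; handler and variable names here are illustrative) of the UI pattern this hunk adds: a password textbox inside an accordion whose value is passed to an event handler, mirroring how azure_api_key_textbox is threaded into the extraction chain above.

import gradio as gr

def run_extraction(azure_api_key: str) -> str:
    # Placeholder handler; the real app forwards the key to extract_topics via the event inputs.
    return "Azure key provided" if azure_api_key else "No Azure key provided"

with gr.Blocks() as demo:
    with gr.Accordion("Azure AI Inference", open=False):
        azure_api_key_textbox = gr.Textbox(
            label="Enter Azure AI Inference API key (only if using Azure models)",
            lines=1,
            type="password",
        )
    run_btn = gr.Button("Run")
    status = gr.Textbox(label="Status")
    # The textbox is listed in inputs so its current value reaches the handler on click.
    run_btn.click(fn=run_extraction, inputs=[azure_api_key_textbox], outputs=[status])

if __name__ == "__main__":
    demo.launch()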
requirements.txt CHANGED

@@ -9,7 +9,9 @@ openpyxl==3.1.5
 markdown==3.7
 tabulate==0.9.0
 lxml==5.3.0
-google-genai==1.
+google-genai==1.33.0
+azure-ai-inference==1.0.0b9
+azure-core==1.35.0
 html5lib==1.1
 beautifulsoup4==4.12.3
 rapidfuzz==3.13.0
@@ -24,5 +26,9 @@ accelerate==1.10.1
 #torch==2.7.1 --extra-index-url https://download.pytorch.org/whl/cpu
 # For Hugging Face, need a python 3.10 compatible wheel for llama-cpp-python to avoid build timeouts
 #https://github.com/seanpedrick-case/llama-cpp-python-whl-builder/releases/download/v0.1.0/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl
+# CPU only (for e.g. Hugging Face CPU instances)
+#torch==2.7.1 --extra-index-url https://download.pytorch.org/whl/cpu
+# For Hugging Face, need a python 3.10 compatible wheel for llama-cpp-python to avoid build timeouts
+#https://github.com/seanpedrick-case/llama-cpp-python-whl-builder/releases/download/v0.1.0/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl
 
 
requirements_cpu.txt CHANGED

@@ -8,7 +8,9 @@ openpyxl==3.1.5
 markdown==3.7
 tabulate==0.9.0
 lxml==5.3.0
-google-genai==1.
+google-genai==1.33.0
+azure-ai-inference==1.0.0b9
+azure-core==1.35.0
 html5lib==1.1
 beautifulsoup4==4.12.3
 rapidfuzz==3.13.0
requirements_gpu.txt CHANGED

@@ -8,7 +8,9 @@ openpyxl==3.1.5
 markdown==3.7
 tabulate==0.9.0
 lxml==5.3.0
-google-genai==1.
+google-genai==1.33.0
+azure-ai-inference==1.0.0b9
+azure-core==1.35.0
 html5lib==1.1
 beautifulsoup4==4.12.3
 rapidfuzz==3.13.0
requirements_no_local.txt CHANGED

@@ -9,7 +9,9 @@ openpyxl==3.1.5
 markdown==3.7
 tabulate==0.9.0
 lxml==5.3.0
-google-genai==1.
+google-genai==1.33.0
+azure-ai-inference==1.0.0b9
+azure-core==1.35.0
 html5lib==1.1
 beautifulsoup4==4.12.3
 rapidfuzz==3.13.0
tools/aws_functions.py CHANGED

@@ -11,34 +11,42 @@ def connect_to_bedrock_runtime(model_name_map:dict, model_choice:str, aws_access
     # If running an anthropic model, assume that running an AWS Bedrock model, load in Bedrock
     model_source = model_name_map[model_choice]["source"]
 
-    if "AWS" in model_source:
-        if
+    if "AWS" in model_source:
+        if RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
+            print("Connecting to Bedrock via existing SSO connection")
+            bedrock_runtime = boto3.client('bedrock-runtime', region_name=AWS_REGION)
+        elif RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
+            print("Connecting to Bedrock via existing SSO connection")
+            bedrock_runtime = boto3.client('bedrock-runtime', region_name=AWS_REGION)
+        elif aws_access_key_textbox and aws_secret_key_textbox:
            print("Connecting to Bedrock using AWS access key and secret keys from user input.")
            bedrock_runtime = boto3.client('bedrock-runtime',
                            aws_access_key_id=aws_access_key_textbox,
                            aws_secret_access_key=aws_secret_key_textbox, region_name=AWS_REGION)
-        elif RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
-            print("Connecting to Bedrock via existing SSO connection")
-            bedrock_runtime = boto3.client('bedrock-runtime', region_name=AWS_REGION)
        elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
            print("Getting Bedrock credentials from environment variables")
            bedrock_runtime = boto3.client('bedrock-runtime',
                            aws_access_key_id=AWS_ACCESS_KEY,
                            aws_secret_access_key=AWS_SECRET_KEY,
-                            region_name=AWS_REGION)
+                            region_name=AWS_REGION)
+        elif RUN_AWS_FUNCTIONS == "1":
+            print("Connecting to Bedrock via existing SSO connection")
+            bedrock_runtime = boto3.client('bedrock-runtime', region_name=AWS_REGION)
        else:
            bedrock_runtime = ""
            out_message = "Cannot connect to AWS Bedrock service. Please provide access keys under LLM settings, or choose another model type."
            print(out_message)
            raise Exception(out_message)
    else:
-        bedrock_runtime =
+        bedrock_runtime = list()
+
+    print("Bedrock runtime connected:", bedrock_runtime)
 
    return bedrock_runtime
 
 def connect_to_s3_client(aws_access_key_textbox:str="", aws_secret_key_textbox:str=""):
    # If running an anthropic model, assume that running an AWS s3 model, load in s3
-    s3_client =
+    s3_client = list()
 
    if aws_access_key_textbox and aws_secret_key_textbox:
        print("Connecting to s3 using AWS access key and secret keys from user input.")
@@ -148,7 +156,7 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=buck
    """
    if RUN_AWS_FUNCTIONS == "1":
 
-        final_out_message =
+        final_out_message = list()
 
        s3_client = connect_to_s3_client(aws_access_key_textbox, aws_secret_key_textbox)
        #boto3.client('s3')
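A hedged sketch of the credential-selection order the revised connect_to_bedrock_runtime appears to implement: SSO/default session first when configured, then keys typed into the UI, then environment keys, then a plain default-credentials client. Names such as RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS come from tools/config.py; the values below are illustrative only.

import boto3

AWS_REGION = "eu-west-2"                       # illustrative
RUN_AWS_FUNCTIONS = "1"                        # illustrative
PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS = "1"  # illustrative
AWS_ACCESS_KEY = ""                            # illustrative
AWS_SECRET_KEY = ""                            # illustrative

def pick_bedrock_client(user_key: str = "", user_secret: str = ""):
    # 1. Prefer an existing SSO/default session when told to prioritise it
    if RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
        return boto3.client("bedrock-runtime", region_name=AWS_REGION)
    # 2. Otherwise fall back to keys supplied through the UI
    if user_key and user_secret:
        return boto3.client("bedrock-runtime",
                            aws_access_key_id=user_key,
                            aws_secret_access_key=user_secret,
                            region_name=AWS_REGION)
    # 3. Then environment-supplied keys
    if AWS_ACCESS_KEY and AWS_SECRET_KEY:
        return boto3.client("bedrock-runtime",
                            aws_access_key_id=AWS_ACCESS_KEY,
                            aws_secret_access_key=AWS_SECRET_KEY,
                            region_name=AWS_REGION)
    # 4. Finally, a default-credentials client when AWS functions are enabled at all
    if RUN_AWS_FUNCTIONS == "1":
        return boto3.client("bedrock-runtime", region_name=AWS_REGION)
    raise Exception("Cannot connect to AWS Bedrock service. Please provide access keys under LLM settings, or choose another model type.")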
tools/config.py CHANGED

@@ -206,6 +206,33 @@ RUN_GEMINI_MODELS = get_or_create_env_var("RUN_GEMINI_MODELS", "1")
 RUN_AWS_BEDROCK_MODELS = get_or_create_env_var("RUN_AWS_BEDROCK_MODELS", "1")
 GEMINI_API_KEY = get_or_create_env_var('GEMINI_API_KEY', '')
 
+# Build up options for models
+###
+# LLM variables
+###
+
+MAX_TOKENS = int(get_or_create_env_var('MAX_TOKENS', '4096')) # Maximum number of output tokens
+TIMEOUT_WAIT = int(get_or_create_env_var('TIMEOUT_WAIT', '30')) # AWS now seems to have a 60 second minimum wait between API calls
+NUMBER_OF_RETRY_ATTEMPTS = int(get_or_create_env_var('NUMBER_OF_RETRY_ATTEMPTS', '5'))
+# Try up to 3 times to get a valid markdown table response with LLM calls, otherwise retry with temperature changed
+MAX_OUTPUT_VALIDATION_ATTEMPTS = int(get_or_create_env_var('MAX_OUTPUT_VALIDATION_ATTEMPTS', '3'))
+MAX_TIME_FOR_LOOP = int(get_or_create_env_var('MAX_TIME_FOR_LOOP', '99999'))
+BATCH_SIZE_DEFAULT = int(get_or_create_env_var('BATCH_SIZE_DEFAULT', '5'))
+DEDUPLICATION_THRESHOLD = int(get_or_create_env_var('DEDUPLICATION_THRESHOLD', '90'))
+MAX_COMMENT_CHARS = int(get_or_create_env_var('MAX_COMMENT_CHARS', '14000'))
+
+RUN_LOCAL_MODEL = get_or_create_env_var("RUN_LOCAL_MODEL", "1")
+
+RUN_AWS_BEDROCK_MODELS = get_or_create_env_var("RUN_AWS_BEDROCK_MODELS", "1")
+
+RUN_GEMINI_MODELS = get_or_create_env_var("RUN_GEMINI_MODELS", "1")
+GEMINI_API_KEY = get_or_create_env_var('GEMINI_API_KEY', '')
+
+# Azure AI Inference settings
+RUN_AZURE_MODELS = get_or_create_env_var("RUN_AZURE_MODELS", "0")
+AZURE_API_KEY = get_or_create_env_var('AZURE_API_KEY', '')
+AZURE_INFERENCE_ENDPOINT = get_or_create_env_var('AZURE_INFERENCE_ENDPOINT', '')
+
 # Build up options for models
 
 model_full_names = list()
@@ -225,12 +252,16 @@ if RUN_AWS_BEDROCK_MODELS == "1":
     model_source.extend(["AWS", "AWS", "AWS", "AWS", "AWS"])
 
 if RUN_GEMINI_MODELS == "1":
-    model_full_names.extend(["gemini-2.5-flash-lite", "gemini-2.5-flash", "gemini-2.5-pro"])
+    model_full_names.extend(["gemini-2.5-flash-lite", "gemini-2.5-flash", "gemini-2.5-pro"])
     model_short_names.extend(["gemini_flash_lite_2.5", "gemini_flash_2.5", "gemini_pro"])
     model_source.extend(["Gemini", "Gemini", "Gemini"])
 
-#
-
+# Register Azure AI models (model names must match your Azure deployments)
+if RUN_AZURE_MODELS == "1":
+    # Example deployments; adjust to the deployments you actually create in Azure
+    model_full_names.extend(["gpt-5-mini"])
+    model_short_names.extend(["gpt-5-mini"])
+    model_source.extend(["Azure"])
 
 model_name_map = {
     full: {"short_name": short, "source": source}
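A hedged sketch of how the new Azure options could be switched on before tools/config.py is imported. The get_or_create_env_var helper below is a stand-in for the one in tools/config.py (assumed to return the env value or set the supplied default), and the key/endpoint values are placeholders, not real credentials.

import os

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Stand-in for the helper in tools/config.py
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value

# Placeholder values for illustration only
os.environ["RUN_AZURE_MODELS"] = "1"
os.environ["AZURE_API_KEY"] = "<your-azure-key>"
os.environ["AZURE_INFERENCE_ENDPOINT"] = "https://<your-resource>.services.ai.azure.com/models"

RUN_AZURE_MODELS = get_or_create_env_var("RUN_AZURE_MODELS", "0")
AZURE_API_KEY = get_or_create_env_var("AZURE_API_KEY", "")
AZURE_INFERENCE_ENDPOINT = get_or_create_env_var("AZURE_INFERENCE_ENDPOINT", "")

model_full_names, model_short_names, model_source = list(), list(), list()
if RUN_AZURE_MODELS == "1":
    # Deployment names must match what exists in your Azure resource (the diff uses "gpt-5-mini" as an example)
    model_full_names.extend(["gpt-5-mini"])
    model_short_names.extend(["gpt-5-mini"])
    model_source.extend(["Azure"])

model_name_map = {full: {"short_name": short, "source": source}
                  for full, short, source in zip(model_full_names, model_short_names, model_source)}
print(model_name_map)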
tools/dedup_summaries.py CHANGED

@@ -8,12 +8,13 @@ import time
 import markdown
 import boto3
 from tqdm import tqdm
+import os
 
 from tools.prompts import summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, system_prompt, summarise_everything_prompt, comprehensive_summary_format_prompt, summarise_everything_system_prompt, comprehensive_summary_format_prompt_by_group, summary_assistant_prefill
-from tools.llm_funcs import construct_gemini_generative_model, process_requests, ResponseObject, load_model, calculate_tokens_from_metadata
+from tools.llm_funcs import construct_gemini_generative_model, process_requests, ResponseObject, load_model, calculate_tokens_from_metadata, construct_azure_client
 from tools.helper_functions import create_topic_summary_df_from_reference_table, load_in_data_file, get_basic_response_data, convert_reference_table_to_pivot_table, wrap_text, clean_column_name, get_file_name_no_ext, create_batch_file_path_details
-from tools.config import OUTPUT_FOLDER, RUN_LOCAL_MODEL, MAX_COMMENT_CHARS, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, REASONING_SUFFIX
 from tools.aws_functions import connect_to_bedrock_runtime
+from tools.config import OUTPUT_FOLDER, RUN_LOCAL_MODEL, MAX_COMMENT_CHARS, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, REASONING_SUFFIX, AZURE_INFERENCE_ENDPOINT
 
 max_tokens = MAX_TOKENS
 timeout_wait = TIMEOUT_WAIT
@@ -440,10 +441,13 @@ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:
     google_client = list()
     google_config = {}
 
-    # Prepare Gemini models before query
+    # Prepare Gemini models before query
     if "Gemini" in model_source:
         #print("Using Gemini model:", model_choice)
         google_client, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
+    elif "Azure" in model_source:
+        # Azure client (endpoint from env/config)
+        google_client, config = construct_azure_client(in_api_key=os.environ.get("AZURE_INFERENCE_CREDENTIAL", ""), endpoint=AZURE_INFERENCE_ENDPOINT)
     elif "Local" in model_source:
         pass
         #print("Using local model: ", model_choice)
@@ -594,6 +598,8 @@ def summarise_output_topics(sampled_reference_table_df:pd.DataFrame,
     if "Local" in model_source and reasoning_suffix: formatted_summarise_topic_descriptions_system_prompt = formatted_summarise_topic_descriptions_system_prompt + "\n" + reasoning_suffix
 
     try:
+        print("formatted_summarise_topic_descriptions_system_prompt:", formatted_summarise_topic_descriptions_system_prompt)
+
         response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, formatted_summarise_topic_descriptions_system_prompt, model_source, bedrock_runtime, local_model, tokenizer=tokenizer)
         summarised_output = response
         summarised_output = re.sub(r'\n{2,}', '\n', summarised_output) # Replace multiple line breaks with a single line break
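A hedged sketch of the source-based client dispatch that summarise_output_topics_query now performs. The two construct_* functions below are placeholders standing in for the real helpers in tools/llm_funcs.py; only the branching logic is the point.

import os
from typing import Tuple

def construct_gemini_generative_model(in_api_key: str, **kwargs) -> Tuple[object, dict]:
    return None, {}   # placeholder for the real Gemini client builder

def construct_azure_client(in_api_key: str, endpoint: str) -> Tuple[object, dict]:
    return None, {}   # placeholder for the real Azure client builder

def prepare_client(model_source: str, in_api_key: str, azure_endpoint: str):
    client, config = list(), {}
    if "Gemini" in model_source:
        client, config = construct_gemini_generative_model(in_api_key=in_api_key)
    elif "Azure" in model_source:
        # The Azure key is read from the environment if it was not passed through the UI
        client, config = construct_azure_client(
            in_api_key=os.environ.get("AZURE_INFERENCE_CREDENTIAL", ""),
            endpoint=azure_endpoint)
    elif "Local" in model_source:
        pass  # the local llama.cpp / transformers model is loaded elsewhere
    return client, config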
tools/llm_api_call.py CHANGED

@@ -16,8 +16,8 @@ GradioFileData = gr.FileData
 
 from tools.prompts import initial_table_prompt, prompt2, prompt3, initial_table_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, force_existing_topics_prompt, allow_new_topics_prompt, force_single_topic_prompt, add_existing_topics_assistant_prefill, initial_table_assistant_prefill, structured_summary_prompt
 from tools.helper_functions import read_file, put_columns_in_df, wrap_text, initial_clean, load_in_data_file, load_in_file, create_topic_summary_df_from_reference_table, convert_reference_table_to_pivot_table, get_basic_response_data, clean_column_name, load_in_previous_data_files, create_batch_file_path_details
-from tools.llm_funcs import ResponseObject, construct_gemini_generative_model, call_llm_with_markdown_table_checks, create_missing_references_df, calculate_tokens_from_metadata
-from tools.config import RUN_LOCAL_MODEL, AWS_REGION, MAX_COMMENT_CHARS, MAX_OUTPUT_VALIDATION_ATTEMPTS, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, OUTPUT_FOLDER, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED, MAX_GROUPS, REASONING_SUFFIX
+from tools.llm_funcs import ResponseObject, construct_gemini_generative_model, call_llm_with_markdown_table_checks, create_missing_references_df, calculate_tokens_from_metadata, construct_azure_client
+from tools.config import RUN_LOCAL_MODEL, AWS_REGION, MAX_COMMENT_CHARS, MAX_OUTPUT_VALIDATION_ATTEMPTS, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, OUTPUT_FOLDER, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED, MAX_GROUPS, REASONING_SUFFIX, AZURE_INFERENCE_ENDPOINT
 from tools.aws_functions import connect_to_bedrock_runtime
 
 if RUN_LOCAL_MODEL == "1":
@@ -477,8 +477,6 @@ def write_llm_output_and_logs(response_text: str,
         new_reference_df = pd.DataFrame(reference_data)
     else:
         new_reference_df = pd.DataFrame(columns=["Response References", "General topic", "Subtopic", "Sentiment", "Summary", "Start row of group"])
-
-    print("new_reference_df:", new_reference_df)
 
     # Append on old reference data
     if not new_reference_df.empty:
@@ -689,6 +687,7 @@ def extract_topics(in_data_file: GradioFileData,
                    aws_access_key_textbox:str='',
                    aws_secret_key_textbox:str='',
                    hf_api_key_textbox:str='',
+                   azure_api_key_textbox:str='',
                    max_tokens:int=max_tokens,
                    model_name_map:dict=model_name_map,
                    max_time_for_loop:int=max_time_for_loop,
@@ -708,7 +707,7 @@ def extract_topics(in_data_file: GradioFileData,
    - unique_table_df_display_table_markdown (str): Table for display in markdown format.
    - file_name (str): File name of the data file.
    - num_batches (int): Number of batches required to go through all the response rows.
-   - in_api_key (str): The API key for authentication.
+   - in_api_key (str): The API key for authentication (Google Gemini).
    - temperature (float): The temperature parameter for the model.
    - chosen_cols (List[str]): A list of chosen columns to process.
    - candidate_topics (gr.FileData): A Gradio FileData object of existing candidate topics submitted by the user.
@@ -843,7 +842,7 @@ def extract_topics(in_data_file: GradioFileData,
 
     for i in topics_loop:
         reported_batch_no = latest_batch_completed + 1
-        print("Running batch:", reported_batch_no)
+        print("Running response batch:", reported_batch_no)
 
         # Call the function to prepare the input table
         simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_df = data_file_to_markdown_table(file_data, file_name, chosen_cols, latest_batch_completed, batch_size)
@@ -859,10 +858,16 @@ def extract_topics(in_data_file: GradioFileData,
 
             formatted_system_prompt = add_existing_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
 
-            # Prepare
+            # Prepare clients before query
             if "Gemini" in model_source:
                 print("Using Gemini model:", model_choice)
                 google_client, google_config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=formatted_system_prompt, max_tokens=max_tokens)
+            elif "Azure" in model_source:
+                print("Using Azure AI Inference model:", model_choice)
+                # If provided, set env for downstream calls too
+                if azure_api_key_textbox:
+                    os.environ["AZURE_INFERENCE_CREDENTIAL"] = azure_api_key_textbox
+                google_client, google_config = construct_azure_client(in_api_key=azure_api_key_textbox, endpoint=AZURE_INFERENCE_ENDPOINT)
             elif "anthropic.claude" in model_choice:
                 print("Using AWS Bedrock model:", model_choice)
             else:
@@ -1034,6 +1039,11 @@ def extract_topics(in_data_file: GradioFileData,
             if model_source == "Gemini":
                 print("Using Gemini model:", model_choice)
                 google_client, google_config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=formatted_initial_table_system_prompt, max_tokens=max_tokens)
+            elif model_source == "Azure":
+                print("Using Azure AI Inference model:", model_choice)
+                if azure_api_key_textbox:
+                    os.environ["AZURE_INFERENCE_CREDENTIAL"] = azure_api_key_textbox
+                google_client, google_config = construct_azure_client(in_api_key=azure_api_key_textbox, endpoint=AZURE_INFERENCE_ENDPOINT)
             elif model_choice == CHOSEN_LOCAL_MODEL_TYPE:
                 print("Using local model:", model_choice)
             else:
@@ -1118,7 +1128,7 @@ def extract_topics(in_data_file: GradioFileData,
 
         # Increase latest file completed count unless we are over the last batch number
         if latest_batch_completed <= num_batches:
-            print("Completed batch number:", str(reported_batch_no))
+            #print("Completed batch number:", str(reported_batch_no))
             latest_batch_completed += 1
 
         toc = time.perf_counter()
@@ -1242,19 +1252,16 @@ def wrapper_extract_topics_per_column_value(
    initial_existing_topic_summary_df: pd.DataFrame,
    initial_unique_table_df_display_table_markdown: str,
    original_file_name: str, # Original file name, to be modified per segment
-   # Initial state parameters (wrapper will use these for the very first call)
    total_number_of_batches:int,
    in_api_key: str,
    temperature: float,
    chosen_cols: List[str],
    model_choice: str,
    candidate_topics: GradioFileData = None,
-
    initial_first_loop_state: bool = True,
    initial_whole_conversation_metadata_str: str = '',
    initial_latest_batch_completed: int = 0,
    initial_time_taken: float = 0,
-
    initial_table_prompt: str = initial_table_prompt,
    prompt2: str = prompt2,
    prompt3: str = prompt3,
@@ -1273,14 +1280,67 @@ def wrapper_extract_topics_per_column_value(
    aws_access_key_textbox:str="",
    aws_secret_key_textbox:str="",
    hf_api_key_textbox:str="",
+   azure_api_key_textbox:str="",
    output_folder: str = OUTPUT_FOLDER,
    force_single_topic_prompt: str = force_single_topic_prompt,
    max_tokens: int = max_tokens,
    model_name_map: dict = model_name_map,
    max_time_for_loop: int = max_time_for_loop, # This applies per call to extract_topics
+   reasoning_suffix: str = reasoning_suffix,
    CHOSEN_LOCAL_MODEL_TYPE: str = CHOSEN_LOCAL_MODEL_TYPE,
    progress=Progress(track_tqdm=True) # type: ignore
    ) -> Tuple: # Mimicking the return tuple structure of extract_topics
+    """
+    A wrapper function that iterates through unique values in a specified grouping column
+    and calls the `extract_topics` function for each segment of the data.
+    It accumulates results from each call and returns a consolidated output.
+
+    :param grouping_col: The name of the column to group the data by.
+    :param in_data_file: The input data file object (e.g., Gradio FileData).
+    :param file_data: The full DataFrame containing all data.
+    :param initial_existing_topics_table: Initial DataFrame of existing topics.
+    :param initial_existing_reference_df: Initial DataFrame mapping responses to topics.
+    :param initial_existing_topic_summary_df: Initial DataFrame summarizing topics.
+    :param initial_unique_table_df_display_table_markdown: Initial markdown string for topic display.
+    :param original_file_name: The original name of the input file.
+    :param total_number_of_batches: The total number of batches across all data.
+    :param in_api_key: API key for the chosen LLM.
+    :param temperature: Temperature setting for the LLM.
+    :param chosen_cols: List of columns from `file_data` to be processed.
+    :param model_choice: The chosen LLM model (e.g., "Gemini", "AWS Claude").
+    :param candidate_topics: Optional Gradio FileData for candidate topics (zero-shot).
+    :param initial_first_loop_state: Boolean indicating if this is the very first loop iteration.
+    :param initial_whole_conversation_metadata_str: Initial metadata string for the whole conversation.
+    :param initial_latest_batch_completed: The batch number completed in the previous run.
+    :param initial_time_taken: Initial time taken for processing.
+    :param initial_table_prompt: The initial prompt for table summarization.
+    :param prompt2: The second prompt for LLM interaction.
+    :param prompt3: The third prompt for LLM interaction.
+    :param initial_table_system_prompt: The initial system prompt for table summarization.
+    :param add_existing_topics_system_prompt: System prompt for adding existing topics.
+    :param add_existing_topics_prompt: Prompt for adding existing topics.
+    :param number_of_prompts_used: Number of prompts used in the LLM call.
+    :param batch_size: Number of rows to process in each batch for the LLM.
+    :param context_textbox: Additional context provided by the user.
+    :param sentiment_checkbox: Choice for sentiment assessment (e.g., "Negative, Neutral, or Positive").
+    :param force_zero_shot_radio: Option to force responses into zero-shot topics.
+    :param in_excel_sheets: List of Excel sheet names if applicable.
+    :param force_single_topic_radio: Option to force a single topic per response.
+    :param produce_structures_summary_radio: Option to produce a structured summary.
+    :param aws_access_key_textbox: AWS access key for Bedrock.
+    :param aws_secret_key_textbox: AWS secret key for Bedrock.
+    :param hf_api_key_textbox: Hugging Face API key for local models.
+    :param azure_api_key_textbox: Azure API key for Azure AI Inference.
+    :param output_folder: The folder where output files will be saved.
+    :param force_single_topic_prompt: Prompt for forcing a single topic.
+    :param max_tokens: Maximum tokens for LLM generation.
+    :param model_name_map: Dictionary mapping model names to their properties.
+    :param max_time_for_loop: Maximum time allowed for the processing loop.
+    :param reasoning_suffix: Suffix to append for reasoning.
+    :param CHOSEN_LOCAL_MODEL_TYPE: Type of local model chosen.
+    :param progress: Gradio Progress object for tracking progress.
+    :return: A tuple containing consolidated results, mimicking the return structure of `extract_topics`.
+    """
 
    acc_input_tokens = 0
    acc_output_tokens = 0
@@ -1380,7 +1440,7 @@ def wrapper_extract_topics_per_column_value(
            seg_join_files,
            seg_reference_df_pivot,
            seg_missing_df
-
+           ) = extract_topics(
            in_data_file=in_data_file,
            file_data=filtered_file_data,
            existing_topics_table=pd.DataFrame(), #acc_topics_table.copy(), # Pass the accumulated table
@@ -1389,19 +1449,17 @@ def wrapper_extract_topics_per_column_value(
            unique_table_df_display_table_markdown="", # extract_topics will generate this
            file_name=segment_file_name,
            num_batches=current_num_batches,
-           latest_batch_completed=current_latest_batch_completed, # Reset for each new segment's internal batching
-           first_loop_state=current_first_loop_state, # True only for the very first iteration of wrapper
-           out_message= list(), # Fresh for each call
-           out_file_paths= list(),# Fresh for each call
-           log_files_output_paths= list(),# Fresh for each call
-           whole_conversation_metadata_str="", # Fresh for each call
-           time_taken=0, # Time taken for this specific call, wrapper sums it.
-           # Pass through other parameters
            in_api_key=in_api_key,
            temperature=temperature,
            chosen_cols=chosen_cols,
            model_choice=model_choice,
            candidate_topics=candidate_topics,
+           latest_batch_completed=current_latest_batch_completed, # Reset for each new segment's internal batching
+           out_message= list(), # Fresh for each call
+           out_file_paths= list(),# Fresh for each call
+           log_files_output_paths= list(),# Fresh for each call
+           first_loop_state=current_first_loop_state, # True only for the very first iteration of wrapper
+           whole_conversation_metadata_str="", # Fresh for each call
            initial_table_prompt=initial_table_prompt,
            prompt2=prompt2,
            prompt3=prompt3,
@@ -1411,6 +1469,7 @@ def wrapper_extract_topics_per_column_value(
            number_of_prompts_used=number_of_prompts_used,
            batch_size=batch_size,
            context_textbox=context_textbox,
+           time_taken=0, # Time taken for this specific call, wrapper sums it.
            sentiment_checkbox=sentiment_checkbox,
            force_zero_shot_radio=force_zero_shot_radio,
            in_excel_sheets=in_excel_sheets,
@@ -1422,11 +1481,13 @@ def wrapper_extract_topics_per_column_value(
            aws_access_key_textbox=aws_access_key_textbox,
            aws_secret_key_textbox=aws_secret_key_textbox,
            hf_api_key_textbox=hf_api_key_textbox,
+           azure_api_key_textbox=azure_api_key_textbox,
            max_tokens=max_tokens,
            model_name_map=model_name_map,
            max_time_for_loop=max_time_for_loop,
            CHOSEN_LOCAL_MODEL_TYPE=CHOSEN_LOCAL_MODEL_TYPE,
-
+           reasoning_suffix=reasoning_suffix,
+           progress=progress
            )
 
        # Aggregate results
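A hedged sketch of the per-group pattern the new wrapper_extract_topics_per_column_value docstring describes: filter the data once per unique value of the grouping column, run the extraction step on each segment, and accumulate the results. extract_topics_stub and the sample frame below are placeholders, not the real extract_topics signature.

import pandas as pd

def extract_topics_stub(segment: pd.DataFrame, chosen_col: str) -> pd.DataFrame:
    # Placeholder for the real extract_topics call made per segment
    return pd.DataFrame({"Group": [segment["Group"].iloc[0]], "Responses": [len(segment)]})

file_data = pd.DataFrame({
    "Group": ["A", "A", "B"],
    "Response": ["Good service", "Too slow", "Friendly staff"],
})

results = list()
for group_value in file_data["Group"].unique():
    # Each segment gets a fresh call, mirroring the "Fresh for each call" arguments in the diff
    filtered_file_data = file_data[file_data["Group"] == group_value]
    results.append(extract_topics_stub(filtered_file_data, chosen_col="Response"))

# Aggregate results across segments, as the wrapper does
summary = pd.concat(results, ignore_index=True)
print(summary)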
tools/llm_funcs.py
CHANGED
|
@@ -13,12 +13,16 @@ from google.genai import types
|
|
| 13 |
import gradio as gr
|
| 14 |
from gradio import Progress
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
model_type = None # global variable setup
|
| 17 |
full_text = "" # Define dummy source text (full text) just to enable highlight function to load
|
| 18 |
model = list() # Define empty list for model functions to run
|
| 19 |
tokenizer = list() #[] # Define empty list for model functions to run
|
| 20 |
|
| 21 |
-
from tools.config import AWS_REGION, LLM_TEMPERATURE, LLM_TOP_K, LLM_MIN_P, LLM_TOP_P, LLM_REPETITION_PENALTY, LLM_LAST_N_TOKENS, LLM_MAX_NEW_TOKENS, LLM_SEED, LLM_RESET, LLM_STREAM, LLM_THREADS, LLM_BATCH_SIZE, LLM_CONTEXT_LENGTH, LLM_SAMPLE, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, MAX_COMMENT_CHARS, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, HF_TOKEN, LLM_SEED, LLM_MAX_GPU_LAYERS, SPECULATIVE_DECODING, NUM_PRED_TOKENS, USE_LLAMA_CPP, COMPILE_MODE, MODEL_DTYPE, USE_BITSANDBYTES, COMPILE_TRANSFORMERS, INT8_WITH_OFFLOAD_TO_CPU
|
| 22 |
from tools.prompts import initial_table_assistant_prefill
|
| 23 |
|
| 24 |
if SPECULATIVE_DECODING == "True": SPECULATIVE_DECODING = True
|
|
@@ -500,16 +504,7 @@ def llama_cpp_streaming(history, full_prompt, temperature=temperature):
|
|
| 500 |
def construct_gemini_generative_model(in_api_key: str, temperature: float, model_choice: str, system_prompt: str, max_tokens: int, random_seed=seed) -> Tuple[object, dict]:
|
| 501 |
"""
|
| 502 |
Constructs a GenerativeModel for Gemini API calls.
|
| 503 |
-
|
| 504 |
-
Parameters:
|
| 505 |
-
- in_api_key (str): The API key for authentication.
|
| 506 |
-
- temperature (float): The temperature parameter for the model, controlling the randomness of the output.
|
| 507 |
-
- model_choice (str): The choice of model to use for generation.
|
| 508 |
-
- system_prompt (str): The system prompt to guide the generation.
|
| 509 |
-
- max_tokens (int): The maximum number of tokens to generate.
|
| 510 |
-
|
| 511 |
-
Returns:
|
| 512 |
-
- Tuple[object, dict]: A tuple containing the constructed GenerativeModel and its configuration.
|
| 513 |
"""
|
| 514 |
# Construct a GenerativeModel
|
| 515 |
try:
|
|
@@ -532,6 +527,31 @@ def construct_gemini_generative_model(in_api_key: str, temperature: float, model
|
|
| 532 |
|
| 533 |
return client, config
|
| 534 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 535 |
def call_aws_claude(prompt: str, system_prompt: str, temperature: float, max_tokens: int, model_choice:str, bedrock_runtime:boto3.Session.client, assistant_prefill:str="") -> ResponseObject:
|
| 536 |
"""
|
| 537 |
This function sends a request to AWS Claude with the following parameters:
|
|
@@ -667,15 +687,10 @@ def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCP
|
|
| 667 |
duration = end_time - start_time
|
| 668 |
tokens_per_second = num_generated_tokens / duration
|
| 669 |
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
# print(f"Assistant's Reply: {assistant_reply}")
|
| 675 |
-
# print("\n--- Performance ---")
|
| 676 |
-
# print(f"Time taken: {duration:.2f} seconds")
|
| 677 |
-
# print(f"Generated tokens: {num_generated_tokens}")
|
| 678 |
-
# print(f"Tokens per second: {tokens_per_second:.2f}")
|
| 679 |
|
| 680 |
return assistant_reply, num_input_tokens, num_generated_tokens
|
| 681 |
|
|
@@ -725,6 +740,7 @@ def send_request(prompt: str, conversation_history: List[dict], google_client: a
|
|
| 725 |
|
| 726 |
if i == number_of_api_retry_attempts:
|
| 727 |
return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
|
|
|
|
| 728 |
elif "AWS" in model_source:
|
| 729 |
for i in progress_bar:
|
| 730 |
try:
|
|
@@ -740,6 +756,35 @@ def send_request(prompt: str, conversation_history: List[dict], google_client: a
|
|
| 740 |
|
| 741 |
if i == number_of_api_retry_attempts:
|
| 742 |
return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 743 |
elif "Local" in model_source:
|
| 744 |
# This is the local model
|
| 745 |
for i in progress_bar:
|
|
@@ -776,28 +821,29 @@ def send_request(prompt: str, conversation_history: List[dict], google_client: a
|
|
| 776 |
# Check if is a LLama.cpp model response
|
| 777 |
if isinstance(response, ResponseObject):
|
| 778 |
response_text = response.text
|
| 779 |
-
conversation_history.append({'role': 'assistant', 'parts': [response_text]})
|
| 780 |
elif 'choices' in response: # LLama.cpp model response
|
| 781 |
if "gpt-oss" in model_choice:
|
| 782 |
response_text = response['choices'][0]['message']['content'].split('<|start|>assistant<|channel|>final<|message|>')[1]
|
| 783 |
else:
|
| 784 |
response_text = response['choices'][0]['message']['content']
|
| 785 |
response_text = response_text.strip()
|
| 786 |
-
conversation_history.append({'role': 'assistant', 'parts': [response_text]}) #response['choices'][0]['text']]})
|
| 787 |
elif model_source == "Gemini":
|
| 788 |
response_text = response.text
|
| 789 |
response_text = response_text.strip()
|
| 790 |
-
conversation_history.append({'role': 'assistant', 'parts': [response_text]})
|
| 791 |
else: # Assume transformers model response
|
| 792 |
if "gpt-oss" in model_choice:
|
| 793 |
response_text = response.split('<|start|>assistant<|channel|>final<|message|>')[1]
|
| 794 |
else:
|
| 795 |
response_text = response
|
| 796 |
-
|
|
|
|
| 797 |
|
| 798 |
return response, conversation_history, response_text, num_transformer_input_tokens, num_transformer_generated_tokens
|
| 799 |
|
| 800 |
-
def process_requests(prompts: List[str],
|
|
|
|
|
|
|
|
|
|
| 801 |
"""
|
| 802 |
Processes a list of prompts by sending them to the model, appending the responses to the conversation history, and updating the whole conversation and metadata.
|
| 803 |
|
|
@@ -836,28 +882,27 @@ def process_requests(prompts: List[str], system_prompt: str, conversation_histor
|
|
| 836 |
whole_conversation.append(response_text)
|
| 837 |
|
| 838 |
# Create conversation metadata
|
| 839 |
-
if master == False:
|
| 840 |
-
|
| 841 |
-
else:
|
| 842 |
-
|
| 843 |
-
|
|
|
|
| 844 |
|
| 845 |
# if not isinstance(response, str):
|
| 846 |
try:
|
| 847 |
if "AWS" in model_source:
|
| 848 |
-
#print("Extracting usage metadata from Converse API response...")
|
| 849 |
-
|
| 850 |
-
# Using .get() is safer than direct access, in case a key is missing.
|
| 851 |
output_tokens = response.usage_metadata.get('outputTokens', 0)
|
| 852 |
input_tokens = response.usage_metadata.get('inputTokens', 0)
|
| 853 |
-
|
| 854 |
-
|
| 855 |
-
|
| 856 |
-
elif "Gemini" in model_source:
|
| 857 |
-
|
| 858 |
output_tokens = response.usage_metadata.candidates_token_count
|
| 859 |
input_tokens = response.usage_metadata.prompt_token_count
|
| 860 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 861 |
elif "Local" in model_source:
|
| 862 |
if USE_LLAMA_CPP == "True":
|
| 863 |
output_tokens = response['usage'].get('completion_tokens', 0)
|
|
@@ -1012,20 +1057,8 @@ def calculate_tokens_from_metadata(metadata_string:str, model_choice:str, model_
| 1012 |
| 1013 |     # Regex to find the numbers following the keys in the "Query summary metadata" section
| 1014 |     # This ensures we get the final, aggregated totals for the whole query.
| 1015 | -   #if "Gemini" in model_source:
| 1016 |     input_regex = r"input_tokens: (\d+)"
| 1017 |     output_regex = r"output_tokens: (\d+)"
| 1018 | -   # elif "AWS" in model_source:
| 1019 | -   # input_regex = r"inputTokens: (\d+)"
| 1020 | -   # output_regex = r"outputTokens: (\d+)"
| 1021 | -   # elif "Local" in model_source:
| 1022 | -   # print("Local model source")
| 1023 | -   # input_regex = r"\'prompt_tokens\': (\d+)"
| 1024 | -   # output_regex = r"\'completion_tokens\': (\d+)"
| 1025 | -
| 1026 | -   #print("Metadata string:", metadata_string)
| 1027 | -   #print("Input regex:", input_regex)
| 1028 | -   #print("Output regex:", output_regex)
| 1029 |
| 1030 |     # re.findall returns a list of all matching strings (the captured groups).
| 1031 |     input_token_strings = re.findall(input_regex, metadata_string)
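A short sketch of how the two retained patterns behave on a metadata string. The string below is invented for illustration, and summing the matches is an assumption about how the aggregated totals are used downstream.

import re

metadata_string = "Batch 1: input_tokens: 1200 output_tokens: 350 Batch 2: input_tokens: 900 output_tokens: 410"
input_regex = r"input_tokens: (\d+)"
output_regex = r"output_tokens: (\d+)"
# re.findall returns the captured digit groups as strings; convert and sum them.
total_input = sum(int(n) for n in re.findall(input_regex, metadata_string))
total_output = sum(int(n) for n in re.findall(output_regex, metadata_string))
print(total_input, total_output)  # 2100 760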
| 13 |   import gradio as gr
| 14 |   from gradio import Progress
| 15 |
| 16 | + from azure.ai.inference import ChatCompletionsClient
| 17 | + from azure.core.credentials import AzureKeyCredential
| 18 | + from azure.ai.inference.models import SystemMessage, UserMessage
| 19 | +
| 20 |   model_type = None # global variable setup
| 21 |   full_text = "" # Define dummy source text (full text) just to enable highlight function to load
| 22 |   model = list() # Define empty list for model functions to run
| 23 |   tokenizer = list() #[] # Define empty list for model functions to run
| 24 |
| 25 | + from tools.config import AWS_REGION, LLM_TEMPERATURE, LLM_TOP_K, LLM_MIN_P, LLM_TOP_P, LLM_REPETITION_PENALTY, LLM_LAST_N_TOKENS, LLM_MAX_NEW_TOKENS, LLM_SEED, LLM_RESET, LLM_STREAM, LLM_THREADS, LLM_BATCH_SIZE, LLM_CONTEXT_LENGTH, LLM_SAMPLE, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, MAX_COMMENT_CHARS, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, HF_TOKEN, LLM_SEED, LLM_MAX_GPU_LAYERS, SPECULATIVE_DECODING, NUM_PRED_TOKENS, USE_LLAMA_CPP, COMPILE_MODE, MODEL_DTYPE, USE_BITSANDBYTES, COMPILE_TRANSFORMERS, INT8_WITH_OFFLOAD_TO_CPU, AZURE_INFERENCE_ENDPOINT
| 26 |   from tools.prompts import initial_table_assistant_prefill
| 27 |
| 28 |   if SPECULATIVE_DECODING == "True": SPECULATIVE_DECODING = True
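The three added azure imports assume the azure-ai-inference and azure-core packages are available at import time. A hedged alternative sketch, not what the commit does, would guard them so the module still imports without the SDK installed; AZURE_SDK_AVAILABLE is a hypothetical flag, not a name from the repository.

try:
    from azure.ai.inference import ChatCompletionsClient
    from azure.ai.inference.models import SystemMessage, UserMessage
    from azure.core.credentials import AzureKeyCredential
    AZURE_SDK_AVAILABLE = True   # hypothetical flag, not in the commit
except ImportError:
    # Fall back to None placeholders so the rest of the module can still be imported.
    ChatCompletionsClient = SystemMessage = UserMessage = AzureKeyCredential = None
    AZURE_SDK_AVAILABLE = False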
| 504 |   def construct_gemini_generative_model(in_api_key: str, temperature: float, model_choice: str, system_prompt: str, max_tokens: int, random_seed=seed) -> Tuple[object, dict]:
| 505 |       """
| 506 |       Constructs a GenerativeModel for Gemini API calls.
| 507 | +     ...
| 508 |       """
| 509 |       # Construct a GenerativeModel
| 510 |       try:
| 527 |
| 528 |           return client, config
| 529 |
| 530 | + def construct_azure_client(in_api_key: str, endpoint: str) -> Tuple[object, dict]:
| 531 | +     """
| 532 | +     Constructs a ChatCompletionsClient for Azure AI Inference.
| 533 | +     """
| 534 | +     try:
| 535 | +         key = None
| 536 | +         if in_api_key:
| 537 | +             key = in_api_key
| 538 | +         elif os.environ.get("AZURE_INFERENCE_CREDENTIAL"):
| 539 | +             key = os.environ["AZURE_INFERENCE_CREDENTIAL"]
| 540 | +         elif os.environ.get("AZURE_API_KEY"):
| 541 | +             key = os.environ["AZURE_API_KEY"]
| 542 | +         if not key:
| 543 | +             raise Warning("No Azure API key found.")
| 544 | +
| 545 | +         if not endpoint:
| 546 | +             endpoint = os.environ.get("AZURE_INFERENCE_ENDPOINT", "")
| 547 | +         if not endpoint:
| 548 | +             raise Warning("No Azure inference endpoint found.")
| 549 | +         client = ChatCompletionsClient(endpoint=endpoint, credential=AzureKeyCredential(key))
| 550 | +         return client, {}
| 551 | +     except Exception as e:
| 552 | +         print("Error constructing Azure ChatCompletions client:", e)
| 553 | +         raise
| 554 | +
| 555 |   def call_aws_claude(prompt: str, system_prompt: str, temperature: float, max_tokens: int, model_choice:str, bedrock_runtime:boto3.Session.client, assistant_prefill:str="") -> ResponseObject:
| 556 |       """
| 557 |       This function sends a request to AWS Claude with the following parameters:
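A hedged usage sketch for the new construct_azure_client helper above, assuming it and the azure imports from this file are in scope. The endpoint URL, API key, and model name are placeholders; ChatCompletionsClient.complete() is invoked the same way the Azure branch of send_request does further down.

import os

# Placeholder endpoint; in the app this comes from AZURE_INFERENCE_ENDPOINT in tools/config.py.
os.environ["AZURE_INFERENCE_ENDPOINT"] = "https://example-resource.services.ai.azure.com/models"

client, _config = construct_azure_client(in_api_key="placeholder-key", endpoint="")  # empty endpoint falls back to the env var
response = client.complete(
    messages=[
        SystemMessage(content="You are a concise assistant."),
        UserMessage(content="Summarise the main themes in these consultation comments."),
    ],
    model="my-deployment-name",  # placeholder deployment/model name
)
print(response.choices[0].message.content)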
| 687 |       duration = end_time - start_time
| 688 |       tokens_per_second = num_generated_tokens / duration
| 689 |
| 690 | +     print("\n--- Performance ---")
| 691 | +     print(f"Time taken: {duration:.2f} seconds")
| 692 | +     print(f"Generated tokens: {num_generated_tokens}")
| 693 | +     print(f"Tokens per second: {tokens_per_second:.2f}")
| 694 |
| 695 |       return assistant_reply, num_input_tokens, num_generated_tokens
| 696 |
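The arithmetic behind the new performance lines, shown standalone with the generation call stubbed out; the sleep and token count are invented values.

import time

start_time = time.time()
time.sleep(0.25)                 # stand-in for the actual generation call
num_generated_tokens = 128       # invented count
end_time = time.time()

duration = end_time - start_time
tokens_per_second = num_generated_tokens / duration
print(f"Time taken: {duration:.2f} seconds")
print(f"Tokens per second: {tokens_per_second:.2f}")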
| 740 |
| 741 |               if i == number_of_api_retry_attempts:
| 742 |                   return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
| 743 | +
| 744 |       elif "AWS" in model_source:
| 745 |           for i in progress_bar:
| 746 |               try:
| 756 |
| 757 |               if i == number_of_api_retry_attempts:
| 758 |                   return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
| 759 | +     elif "Azure" in model_source:
| 760 | +         for i in progress_bar:
| 761 | +             try:
| 762 | +                 print("Calling Azure AI Inference model, attempt", i + 1)
| 763 | +                 # Use structured messages for Azure
| 764 | +                 response_raw = google_client.complete(
| 765 | +                     messages=[
| 766 | +                         SystemMessage(content=system_prompt),
| 767 | +                         UserMessage(content=prompt),
| 768 | +                     ],
| 769 | +                     model=model_choice
| 770 | +                 )
| 771 | +                 response_text = response_raw.choices[0].message.content
| 772 | +                 usage = getattr(response_raw, "usage", None)
| 773 | +                 input_tokens = 0
| 774 | +                 output_tokens = 0
| 775 | +                 if usage is not None:
| 776 | +                     input_tokens = getattr(usage, "input_tokens", getattr(usage, "prompt_tokens", 0))
| 777 | +                     output_tokens = getattr(usage, "output_tokens", getattr(usage, "completion_tokens", 0))
| 778 | +                 response = ResponseObject(
| 779 | +                     text=response_text,
| 780 | +                     usage_metadata={'inputTokens': input_tokens, 'outputTokens': output_tokens}
| 781 | +                 )
| 782 | +                 break
| 783 | +             except Exception as e:
| 784 | +                 print("Call to Azure model failed:", e, " Waiting for ", str(timeout_wait), "seconds and trying again.")
| 785 | +                 time.sleep(timeout_wait)
| 786 | +             if i == number_of_api_retry_attempts:
| 787 | +                 return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
| 788 |       elif "Local" in model_source:
| 789 |           # This is the local model
| 790 |           for i in progress_bar:
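A hedged sketch of the retry-and-token-fallback pattern added in the Azure branch above, with the SDK call replaced by a stub so it runs offline. flaky_complete, _Usage, MAX_RETRIES, and TIMEOUT_WAIT are invented names for illustration only.

import time

MAX_RETRIES = 3
TIMEOUT_WAIT = 1  # seconds between attempts

class _Usage:                      # mimics a usage object that exposes prompt/completion token names
    prompt_tokens = 250
    completion_tokens = 80

def flaky_complete(attempt: int):
    if attempt == 0:
        raise RuntimeError("transient error")   # first call fails, like a throttled API
    return _Usage()

for attempt in range(MAX_RETRIES):
    try:
        usage = flaky_complete(attempt)
        # Same double-getattr fallback as the diff: prefer input/output_tokens, else prompt/completion_tokens.
        input_tokens = getattr(usage, "input_tokens", getattr(usage, "prompt_tokens", 0))
        output_tokens = getattr(usage, "output_tokens", getattr(usage, "completion_tokens", 0))
        print(f"attempt {attempt + 1}: input={input_tokens}, output={output_tokens}")
        break
    except Exception as e:
        print(f"attempt {attempt + 1} failed: {e}; retrying in {TIMEOUT_WAIT}s")
        time.sleep(TIMEOUT_WAIT)
else:
    print("all attempts failed")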
| 821 |       # Check if is a LLama.cpp model response
| 822 |       if isinstance(response, ResponseObject):
| 823 |           response_text = response.text
| 824 |       elif 'choices' in response: # LLama.cpp model response
| 825 |           if "gpt-oss" in model_choice:
| 826 |               response_text = response['choices'][0]['message']['content'].split('<|start|>assistant<|channel|>final<|message|>')[1]
| 827 |           else:
| 828 |               response_text = response['choices'][0]['message']['content']
| 829 |           response_text = response_text.strip()
| 830 |       elif model_source == "Gemini":
| 831 |           response_text = response.text
| 832 |           response_text = response_text.strip()
| 833 |       else: # Assume transformers model response
| 834 |           if "gpt-oss" in model_choice:
| 835 |               response_text = response.split('<|start|>assistant<|channel|>final<|message|>')[1]
| 836 |           else:
| 837 |               response_text = response
| 838 | +
| 839 | +     conversation_history.append({'role': 'assistant', 'parts': [response_text]})
| 840 |
| 841 |       return response, conversation_history, response_text, num_transformer_input_tokens, num_transformer_generated_tokens
| 842 |
| 843 | + def process_requests(prompts: List[str],
| 844 | +                      system_prompt: str,
| 845 | +                      conversation_history: List[dict],
| 846 | +                      whole_conversation: List[str], whole_conversation_metadata: List[str], google_client: ai.Client, config: types.GenerateContentConfig, model_choice: str, temperature: float, bedrock_runtime:boto3.Session.client, model_source:str, batch_no:int = 1, local_model = list(), tokenizer=tokenizer, master:bool = False, assistant_prefill="") -> Tuple[List[ResponseObject], List[dict], List[str], List[str]]:
| 847 |       """
| 848 |       Processes a list of prompts by sending them to the model, appending the responses to the conversation history, and updating the whole conversation and metadata.
| 849 |
| 882 |       whole_conversation.append(response_text)
| 883 |
| 884 |       # Create conversation metadata
| 885 | +     # if master == False:
| 886 | +     # whole_conversation_metadata.append(f"Batch {batch_no}:")
| 887 | +     # else:
| 888 | +     # #whole_conversation_metadata.append(f"Query summary metadata:")
| 889 | +
| 890 | +     whole_conversation_metadata.append(f"Batch {batch_no}:")
| 891 |
| 892 |       # if not isinstance(response, str):
| 893 |       try:
| 894 |           if "AWS" in model_source:
| 895 |               output_tokens = response.usage_metadata.get('outputTokens', 0)
| 896 |               input_tokens = response.usage_metadata.get('inputTokens', 0)
| 897 | +
| 898 | +         elif "Gemini" in model_source:
| 899 |               output_tokens = response.usage_metadata.candidates_token_count
| 900 |               input_tokens = response.usage_metadata.prompt_token_count
| 901 |
| 902 | +         elif "Azure" in model_source:
| 903 | +             input_tokens = response.usage_metadata.get('inputTokens', 0)
| 904 | +             output_tokens = response.usage_metadata.get('outputTokens', 0)
| 905 | +
| 906 |           elif "Local" in model_source:
| 907 |               if USE_LLAMA_CPP == "True":
| 908 |                   output_tokens = response['usage'].get('completion_tokens', 0)
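A hedged sketch of how the branching above maps each model_source's usage metadata onto a common (input_tokens, output_tokens) pair; extract_token_counts is an invented helper, not a function in the repository. In this code path the AWS and Azure branches both read Bedrock-style dict keys, while Gemini exposes attributes on its usage object.

def extract_token_counts(model_source: str, usage_metadata) -> tuple:
    if "AWS" in model_source or "Azure" in model_source:
        # Plain dicts with Bedrock-style keys, as built by the Azure branch of send_request.
        return usage_metadata.get('inputTokens', 0), usage_metadata.get('outputTokens', 0)
    if "Gemini" in model_source:
        return usage_metadata.prompt_token_count, usage_metadata.candidates_token_count
    return 0, 0  # Local / transformers paths count tokens elsewhere

print(extract_token_counts("AWS Claude", {'inputTokens': 120, 'outputTokens': 45}))  # (120, 45)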
| 1057 |
| 1058 |     # Regex to find the numbers following the keys in the "Query summary metadata" section
| 1059 |     # This ensures we get the final, aggregated totals for the whole query.
| 1060 |     input_regex = r"input_tokens: (\d+)"
| 1061 |     output_regex = r"output_tokens: (\d+)"
| 1062 |
| 1063 |     # re.findall returns a list of all matching strings (the captured groups).
| 1064 |     input_token_strings = re.findall(input_regex, metadata_string)