import pandas as pd from openpyxl import Workbook from openpyxl.utils.dataframe import dataframe_to_rows from openpyxl.utils import get_column_letter from openpyxl.styles import Alignment, Font from datetime import date, datetime import os from typing import List from tools.config import OUTPUT_FOLDER from tools.helper_functions import convert_reference_table_to_pivot_table, get_basic_response_data, load_in_data_file, clean_column_name def add_cover_sheet( wb:Workbook, intro_paragraphs:list[str], model_name:str, analysis_date:str, analysis_cost:str, number_of_responses:int, number_of_responses_with_text:int, number_of_responses_with_text_five_plus_words:int, llm_call_number:int, input_tokens:int, output_tokens:int, time_taken:float, file_name:str, column_name:str, number_of_responses_with_topic_assignment:int, custom_title:str="Cover Sheet" ): ws = wb.create_sheet(title=custom_title, index=0) # Freeze top row ws.freeze_panes = "A2" # Write title ws["A1"] = "Large Language Model Topic analysis" ws["A1"].font = Font(size=14, bold=True) ws["A1"].alignment = Alignment(wrap_text=True, vertical="top") # Add intro paragraphs row = 3 for paragraph in intro_paragraphs: ws.merge_cells(start_row=row, start_column=1, end_row=row, end_column=2) cell = ws.cell(row=row, column=1, value=paragraph) cell.alignment = Alignment(wrap_text=True, vertical="top") ws.row_dimensions[row].height = 60 # Adjust height as needed row += 2 # Add metadata meta_start = row + 1 metadata = { "Date Excel file created": date.today().strftime("%Y-%m-%d"), "File name": file_name, "Column name": column_name, "Model name": model_name, "Analysis date": analysis_date, #"Analysis cost": analysis_cost, "Number of responses": number_of_responses, "Number of responses with text": number_of_responses_with_text, "Number of responses with text five plus words": number_of_responses_with_text_five_plus_words, "Number of responses with at least one assigned topic": number_of_responses_with_topic_assignment, "Number of LLM calls": llm_call_number, "Total number of input tokens from LLM calls": input_tokens, "Total number of output tokens from LLM calls": output_tokens, "Total time taken for all LLM calls (seconds)": time_taken, } for i, (label, value) in enumerate(metadata.items()): row_num = meta_start + i ws[f"A{row_num}"] = label ws[f"A{row_num}"].font = Font(bold=True) cell = ws[f"B{row_num}"] cell.value = value cell.alignment = Alignment(wrap_text=True) # Optional: Adjust column widths ws.column_dimensions["A"].width = 25 ws.column_dimensions["B"].width = 75 # Ensure first row cells are wrapped on the cover sheet for col_idx in range(1, ws.max_column + 1): header_cell = ws.cell(row=1, column=col_idx) header_cell.alignment = Alignment(wrap_text=True, vertical="center") def csvs_to_excel( csv_files:list[str], output_filename:str, sheet_names:list[str]=None, column_widths:dict=None, # Dict of {sheet_name: {col_letter: width}} wrap_text_columns:dict=None, # Dict of {sheet_name: [col_letters]} intro_text: list[str] = None, model_name:str="", analysis_date:str="", analysis_cost:str="", llm_call_number:int=0, input_tokens:int=0, output_tokens:int=0, time_taken:float=0, number_of_responses:int=0, number_of_responses_with_text:int=0, number_of_responses_with_text_five_plus_words:int=0, column_name:str="", number_of_responses_with_topic_assignment:int=0, file_name:str="", unique_reference_numbers:list=[] ): if intro_text is None: intro_text = list() wb = Workbook() # Remove default sheet wb.remove(wb.active) for idx, csv_path in enumerate(csv_files): # Use provided sheet name or derive from file name sheet_name = sheet_names[idx] if sheet_names and idx < len(sheet_names) else os.path.splitext(os.path.basename(csv_path))[0] df = pd.read_csv(csv_path) if sheet_name == "Original data": try: # Create a copy to avoid modifying the original df_copy = df.copy() # Insert the Reference column at position 0 (first column) df_copy.insert(0, "Reference", unique_reference_numbers) df = df_copy except Exception as e: print("Could not add reference number to original data due to:", e) ws = wb.create_sheet(title=sheet_name) for r_idx, row in enumerate(dataframe_to_rows(df, index=False, header=True), start=1): ws.append(row) for col_idx, value in enumerate(row, start=1): cell = ws.cell(row=r_idx, column=col_idx) # Bold header row if r_idx == 1: cell.font = Font(bold=True) # Set vertical alignment to middle by default cell.alignment = Alignment(vertical="center") # Apply wrap text if needed if wrap_text_columns and sheet_name in wrap_text_columns: for col_letter in wrap_text_columns[sheet_name]: cell = ws[f"{col_letter}{r_idx}"] cell.alignment = Alignment(vertical="center", wrap_text=True) # Freeze top row for all data sheets ws.freeze_panes = "A2" # Ensure all header cells (first row) are wrapped for col_idx in range(1, ws.max_column + 1): header_cell = ws.cell(row=1, column=col_idx) header_cell.alignment = Alignment(vertical="center", wrap_text=True) # Set column widths if column_widths and sheet_name in column_widths: for col_letter, width in column_widths[sheet_name].items(): ws.column_dimensions[col_letter].width = width add_cover_sheet( wb, intro_paragraphs=intro_text, model_name=model_name, analysis_date=analysis_date, analysis_cost=analysis_cost, number_of_responses=number_of_responses, number_of_responses_with_text=number_of_responses_with_text, number_of_responses_with_text_five_plus_words=number_of_responses_with_text_five_plus_words, llm_call_number = llm_call_number, input_tokens = input_tokens, output_tokens = output_tokens, time_taken = time_taken, file_name=file_name, column_name=column_name, number_of_responses_with_topic_assignment=number_of_responses_with_topic_assignment ) wb.save(output_filename) print(f"Output xlsx summary saved as '{output_filename}'") return output_filename ### # Run the functions ### def collect_output_csvs_and_create_excel_output(in_data_files:List, chosen_cols:list[str], reference_data_file_name_textbox:str, in_group_col:str, model_choice:str, master_reference_df_state:pd.DataFrame, master_unique_topics_df_state:pd.DataFrame, summarised_output_df:pd.DataFrame, missing_df_state:pd.DataFrame, excel_sheets:str="", usage_logs_location:str="", model_name_map:dict=dict(), output_folder:str=OUTPUT_FOLDER, structured_summaries:str="No"): ''' Collect together output CSVs from various output boxes and combine them into a single output Excel file. Args: in_data_files (List): A list of paths to the input data files. chosen_cols (list[str]): A list of column names selected for analysis. reference_data_file_name_textbox (str): The name of the reference data file. in_group_col (str): The column used for grouping the data. model_choice (str): The LLM model chosen for the analysis. master_reference_df_state (pd.DataFrame): The master DataFrame containing reference data. master_unique_topics_df_state (pd.DataFrame): The master DataFrame containing unique topics data. summarised_output_df (pd.DataFrame): DataFrame containing the summarised output. missing_df_state (pd.DataFrame): DataFrame containing information about missing data. excel_sheets (str): Information regarding Excel sheets, typically sheet names or structure. usage_logs_location (str, optional): Path to the usage logs CSV file. Defaults to "". model_name_map (dict, optional): A dictionary mapping model choices to their display names. Defaults to {}. output_folder (str, optional): The directory where the output Excel file will be saved. Defaults to OUTPUT_FOLDER. structured_summaries (str, optional): Indicates whether structured summaries are being produced ("Yes" or "No"). Defaults to "No". Returns: tuple: A tuple containing: - list: A list of paths to the generated Excel output files. - list: A duplicate of the list of paths to the generated Excel output files (for UI compatibility). ''' if structured_summaries == "Yes": structured_summaries = True else: structured_summaries = False if not chosen_cols: raise Exception("Could not find chosen column") today_date = datetime.today().strftime('%Y-%m-%d') original_data_file_path = os.path.abspath(in_data_files[0]) csv_files = list() sheet_names = list() column_widths = dict() wrap_text_columns = dict() short_file_name = os.path.basename(reference_data_file_name_textbox) reference_pivot_table = pd.DataFrame() reference_table_csv_path = "" reference_pivot_table_csv_path = "" unique_topic_table_csv_path = "" missing_df_state_csv_path = "" overall_summary_csv_path = "" number_of_responses_with_topic_assignment = 0 if in_group_col: group = in_group_col else: group = "All" overall_summary_csv_path = output_folder + "overall_summary_for_xlsx.csv" if structured_summaries is True and not master_unique_topics_df_state.empty: print("Producing overall summary based on structured summaries.") # Create structured summary from master_unique_topics_df_state structured_summary_data = list() # Group by 'Group' column for group_name, group_df in master_unique_topics_df_state.groupby('Group'): group_summary = f"## {group_name}\n\n" # Group by 'General topic' within each group for general_topic, topic_df in group_df.groupby('General topic'): group_summary += f"### {general_topic}\n\n" # Add subtopics under each general topic for _, row in topic_df.iterrows(): subtopic = row['Subtopic'] summary = row['Summary'] # sentiment = row.get('Sentiment', '') # num_responses = row.get('Number of responses', '') # Create subtopic entry subtopic_entry = f"**{subtopic}**" # if sentiment: # subtopic_entry += f" ({sentiment})" # if num_responses: # subtopic_entry += f" - {num_responses} responses" subtopic_entry += "\n\n" if summary and pd.notna(summary): subtopic_entry += f"{summary}\n\n" group_summary += subtopic_entry # Add to structured summary data structured_summary_data.append({ 'Group': group_name, 'Summary': group_summary.strip() }) # Create DataFrame for structured summary structured_summary_df = pd.DataFrame(structured_summary_data) structured_summary_df.to_csv(overall_summary_csv_path, index=False) else: # Use original summarised_output_df structured_summary_df = summarised_output_df structured_summary_df.to_csv(overall_summary_csv_path, index = None) if not structured_summary_df.empty: csv_files.append(overall_summary_csv_path) sheet_names.append("Overall summary") column_widths["Overall summary"] = {"A": 20, "B": 100} wrap_text_columns["Overall summary"] = ['B'] if not master_reference_df_state.empty: # Simplify table to just responses column and the Response reference number file_data, file_name, num_batches = load_in_data_file(in_data_files, chosen_cols, 1, in_excel_sheets=excel_sheets) basic_response_data = get_basic_response_data(file_data, chosen_cols, verify_titles="No") reference_pivot_table = convert_reference_table_to_pivot_table(master_reference_df_state, basic_response_data) unique_reference_numbers = basic_response_data["Reference"].tolist() try: master_reference_df_state.rename(columns={"Topic_number":"Topic number"}, inplace=True, errors="ignore") except Exception as e: print("Could not rename Topic_number due to", e) number_of_responses_with_topic_assignment = len(master_reference_df_state["Response References"].unique()) reference_table_csv_path = output_folder + "reference_df_for_xlsx.csv" master_reference_df_state.to_csv(reference_table_csv_path, index = None) reference_pivot_table_csv_path = output_folder + "reference_pivot_df_for_xlsx.csv" reference_pivot_table.to_csv(reference_pivot_table_csv_path, index = None) short_file_name = os.path.basename(file_name) if not master_unique_topics_df_state.empty: unique_topic_table_csv_path = output_folder + "unique_topic_table_df_for_xlsx.csv" master_unique_topics_df_state.to_csv(unique_topic_table_csv_path, index = None) if unique_topic_table_csv_path: csv_files.append(unique_topic_table_csv_path) sheet_names.append("Topic summary") column_widths["Topic summary"] = {"A": 25, "B": 25, "C": 15, "D": 15, "F":100} wrap_text_columns["Topic summary"] = ["B", "F"] else: print("Relevant unique topic files not found, excluding from xlsx output.") if reference_table_csv_path: if structured_summaries: print("Structured summaries are being produced, excluding response level data from xlsx output.") else: csv_files.append(reference_table_csv_path) sheet_names.append("Response level data") column_widths["Response level data"] = {"A": 15, "B": 30, "C": 40, "H":100} wrap_text_columns["Response level data"] = ["C", "G"] else: print("Relevant reference files not found, excluding from xlsx output.") if reference_pivot_table_csv_path: if structured_summaries: print("Structured summaries are being produced, excluding topic response pivot table from xlsx output.") else: csv_files.append(reference_pivot_table_csv_path) sheet_names.append("Topic response pivot table") if reference_pivot_table.empty: reference_pivot_table = pd.read_csv(reference_pivot_table_csv_path) # Base widths and wrap column_widths["Topic response pivot table"] = {"A": 25, "B": 100} wrap_text_columns["Topic response pivot table"] = ["B"] num_cols = len(reference_pivot_table.columns) col_letters = [get_column_letter(i) for i in range(3, num_cols + 1)] for col_letter in col_letters: column_widths["Topic response pivot table"][col_letter] = 25 wrap_text_columns["Topic response pivot table"].extend(col_letters) else: print("Relevant reference pivot table files not found, excluding from xlsx output.") if not missing_df_state.empty: missing_df_state_csv_path = output_folder + "missing_df_state_df_for_xlsx.csv" missing_df_state.to_csv(missing_df_state_csv_path, index = None) if missing_df_state_csv_path: if structured_summaries: print("Structured summaries are being produced, excluding missing responses from xlsx output.") else: csv_files.append(missing_df_state_csv_path) sheet_names.append("Missing responses") column_widths["Missing responses"] = {"A": 25, "B": 30, "C": 50} wrap_text_columns["Missing responses"] = ["C"] else: print("Relevant missing responses files not found, excluding from xlsx output.") new_csv_files = csv_files.copy() # Original data file original_ext = os.path.splitext(original_data_file_path)[1].lower() if original_ext == ".csv": csv_files.append(original_data_file_path) else: # Read and convert to CSV if original_ext == ".xlsx": if excel_sheets: df = pd.read_excel(original_data_file_path, sheet_name=excel_sheets) else: df = pd.read_excel(original_data_file_path) elif original_ext == ".parquet": df = pd.read_parquet(original_data_file_path) else: raise Exception(f"Unsupported file type for original data: {original_ext}") # Save as CSV in output folder original_data_csv_path = os.path.join( output_folder, os.path.splitext(os.path.basename(original_data_file_path))[0] + "_for_xlsx.csv" ) df.to_csv(original_data_csv_path, index=False) csv_files.append(original_data_csv_path) sheet_names.append("Original data") column_widths["Original data"] = {"A": 20, "B": 20, "C": 20} wrap_text_columns["Original data"] = ["C"] # Intro page text intro_text = [ "This workbook contains outputs from the large language model topic analysis of open text data. Each sheet corresponds to a different CSV report included in the analysis.", f"The file analysed was {short_file_name}, the column analysed was '{chosen_cols}' and the data was grouped by column '{group}'." "Please contact the LLM Topic Modelling app administrator if you need any explanation on how to use the results." "Large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this analysis **need to be checked by a human** to check for harmful outputs, false information, and bias." ] # Get values for number of rows, number of responses, and number of responses longer than five words number_of_responses = basic_response_data.shape[0] #number_of_responses_with_text = basic_response_data["Response"].str.strip().notnull().sum() number_of_responses_with_text = ( basic_response_data["Response"].str.strip().notnull() & (basic_response_data["Response"].str.split().str.len() >= 1)).sum() number_of_responses_with_text_five_plus_words = ( basic_response_data["Response"].str.strip().notnull() & (basic_response_data["Response"].str.split().str.len() >= 5)).sum() # Get number of LLM calls, input and output tokens if usage_logs_location: try: usage_logs = pd.read_csv(usage_logs_location) relevant_logs = usage_logs.loc[(usage_logs["Reference data file name"] == reference_data_file_name_textbox) & (usage_logs["Large language model for topic extraction and summarisation"]==model_choice) & (usage_logs["Select the open text column of interest. In an Excel file, this shows columns across all sheets."]==chosen_cols),:] llm_call_number = sum(relevant_logs["Total LLM calls"].astype(int)) input_tokens = sum(relevant_logs["Total input tokens"].astype(int)) output_tokens = sum(relevant_logs["Total output tokens"].astype(int)) time_taken = sum(relevant_logs["Estimated time taken (seconds)"].astype(float)) except Exception as e: print("Could not obtain usage logs due to:", e) usage_logs = pd.DataFrame() llm_call_number = 0 input_tokens = 0 output_tokens = 0 time_taken = 0 else: print("LLM call logs location not provided") usage_logs = pd.DataFrame() llm_call_number = 0 input_tokens = 0 output_tokens = 0 time_taken = 0 # Create short filename: model_choice_clean_short = clean_column_name(model_name_map[model_choice]["short_name"], max_length=20, front_characters=False) in_column_cleaned = clean_column_name(chosen_cols, max_length=20) file_name_cleaned = clean_column_name(file_name, max_length=20, front_characters=True) # Save outputs for each batch. If master file created, label file as master file_path_details = f"{file_name_cleaned}_col_{in_column_cleaned}_{model_choice_clean_short}" output_xlsx_filename = output_folder + file_path_details + ("_structured_summaries" if structured_summaries else "_topic_analysis") + ".xlsx" xlsx_output_filename = csvs_to_excel( csv_files = csv_files, output_filename = output_xlsx_filename, sheet_names = sheet_names, column_widths = column_widths, wrap_text_columns = wrap_text_columns, intro_text = intro_text, model_name = model_choice, analysis_date = today_date, analysis_cost = "", llm_call_number = llm_call_number, input_tokens = input_tokens, output_tokens = output_tokens, time_taken = time_taken, number_of_responses = number_of_responses, number_of_responses_with_text = number_of_responses_with_text, number_of_responses_with_text_five_plus_words = number_of_responses_with_text_five_plus_words, column_name=chosen_cols, number_of_responses_with_topic_assignment=number_of_responses_with_topic_assignment, file_name = short_file_name, unique_reference_numbers=unique_reference_numbers ) xlsx_output_filenames = [xlsx_output_filename] # Delete intermediate csv files for csv_file in new_csv_files: os.remove(csv_file) return xlsx_output_filenames, xlsx_output_filenames