import pickle
import numpy as np
import pandas as pd

import nltk
from nltk.stem import SnowballStemmer

# punkt models are required by nltk.word_tokenize (used by _stem_text below).
nltk.download("punkt_tab")

FILE_PATH = "/app/src/ressources/technologies_database.xlsx"

def set_prompt(problem):
    """Build the constraint-extraction prompt for a given technical problem."""
    prompt = """Task: Find all the constraints in this technical problem, making sure each one is premised on the problem only.
    Take into account different technical domains to encompass the whole problem.
    Output each constraint as JSON, such as: {"title of constraint 1": "description 1", "title of constraint N": "description N"}
    Technical problem:
    """ + problem
    return prompt
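
# Sketch of the intended round-trip: build the prompt, send it to an LLM, and
# parse the JSON answer. The LLM call is stubbed with a canned string here;
# no client is part of this module, so treat the stub as purely illustrative.
def _example_constraint_roundtrip():
    import json
    prompt = set_prompt("Design a drone that can fly in heavy rain.")
    # raw_answer = llm_client.complete(prompt)  # hypothetical client call
    raw_answer = '{"Waterproofing": "Electronics must survive heavy rain."}'
    return prompt, json.loads(raw_answer)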

def load_technologies_excel():
    df = pd.read_excel(FILE_PATH)
    return df

def load_technologies():
    """Load the precomputed technology descriptions and their embeddings."""
    EMBEDDINGS_FILE = '/app/src/ressources/global_tech_embeddings.pkl'

    try:
        with open(EMBEDDINGS_FILE, 'rb') as f:
            loaded_data = pickle.load(f)
        global_tech = loaded_data['global_tech']
        global_tech_embedding = loaded_data['global_tech_embeddings']
        return global_tech, global_tech_embedding
    except Exception as e:
        # Return an explicit pair so callers that unpack the result fail clearly.
        print(f"Error loading embeddings from {EMBEDDINGS_FILE}: {e}")
        return None, None

def tech_to_dict(technologies):
    """Parse raw technology description blocks into a list of dicts."""
    tech_dict = []
    for index, tech in enumerate(technologies):
        # Keep entries whose "<title>" tag is absent or at the very start
        # (str.find returns -1 when the tag is missing, 0/1 when it leads).
        if tech.find("<title>") <= 1:
            tab = tech.split("\n")
            tab.pop(0)   # drop the tag line
            tab.pop()    # drop the trailing (empty) line
            tech_dict.append({"title": tab[0][tab[0].find(": ") + 2:],
                              "purpose": tab[1][tab[1].find(": ") + 2:],
                              "key_components": tab[2][tab[2].find(": ") + 2:],
                              "advantages": tab[3][tab[3].find(": ") + 2:],
                              "limitations": tab[4][tab[4].find(": ") + 2:],
                              "id": index})
    return tech_dict
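
# Quick sanity check for tech_to_dict; the sample block is an invented
# stand-in for the raw format the parser assumes (a "<title>" tag line,
# five "Key: value" lines, and a trailing newline):
def _example_tech_to_dict():
    sample = ("<title>\n"
              "Title: Phase-change cooling\n"
              "Purpose: Dissipate high heat flux\n"
              "Key components: Evaporator, condenser\n"
              "Advantages: Compact\n"
              "Limitations: Cost\n")
    return tech_to_dict([sample])  # -> [{'title': 'Phase-change cooling', ..., 'id': 0}]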

def save_dataframe(df, title):
    """Write df to an Excel file named `title` and return that name."""
    pd.DataFrame(df).to_excel(title)
    return title

def _stem_text(stemmer, text):
    """Stem each token separately; SnowballStemmer.stem expects single words."""
    return " ".join(stemmer.stem(token) for token in nltk.word_tokenize(text))

def stem(data, data_type):
    """Stem technology dicts or a {title: description} constraints mapping."""
    stemmer = SnowballStemmer("english")
    processed_data = []
    if data_type == "technologies":
        for t_item in data:
            processed_data.append({
                "title": _stem_text(stemmer, t_item["title"]),
                "purpose": _stem_text(stemmer, t_item["purpose"]),
                "key_components": _stem_text(stemmer, t_item["key_components"]),
                "advantages": _stem_text(stemmer, t_item["advantages"]),
                "limitations": _stem_text(stemmer, t_item["limitations"]),
                "id": t_item["id"]
            })
    else:
        # Constraints arrive as a {title: description} mapping.
        for t_item in data:
            processed_data.append({
                "title": _stem_text(stemmer, t_item),
                "description": _stem_text(stemmer, data[t_item])
            })

    return processed_data
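
# Illustrative call on an invented constraints mapping; any data_type other
# than "technologies" is treated as a {title: description} dict. Titles and
# descriptions come back lower-cased and stemmed token by token.
def _example_stem_constraints():
    constraints = {"Weight limit": "The drone must remain under two kilograms"}
    return stem(constraints, "constraints")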


def get_technologies_by_id(id_list, technologies):
    """Return the technology dicts whose 'id' appears in id_list."""
    id_set = set(id_list)  # set membership keeps the scan linear
    return [tech for tech in technologies if tech.get('id') in id_set]

def save_to_pickle(result_similarites):
    """Build a labeled constraints-by-technologies similarity matrix and pickle it."""
    constraint_titles = sorted({item['constraint']['title'] for item in result_similarites})
    max_id2 = max(item['id2'] for item in result_similarites)

    row_label_to_index = {title: i for i, title in enumerate(constraint_titles)}
    col_labels = list(range(1, max_id2 + 1))

    num_rows = len(constraint_titles)
    num_cols = max_id2

    matrix = np.full((num_rows, num_cols), np.nan, dtype=np.float32)

    for item in result_similarites:
        row_idx = row_label_to_index[item['constraint']['title']]
        col_idx = item['id2'] - 1  # id2 is 1-based; matrix columns are 0-based
        similarity_value = item['similarity'].item()  # unwrap a one-element tensor/array to a Python float

        matrix[row_idx, col_idx] = similarity_value

    print(f"Successfully created matrix with shape: {matrix.shape}")
    print(f"Number of rows (unique constraints): {num_rows}")
    print(f"Number of columns (max id2): {num_cols}")
    print("\nExample 5x5 block of the created matrix (NaN for missing values):")
    print(matrix[:5, :5])

    output_filename = "cosine_similarity_matrix_with_labels.pkl"
    data_to_save = {
        'matrix': matrix,
        'row_labels': constraint_titles,
        'col_labels': col_labels
    }

    with open(output_filename, 'wb') as f:
        pickle.dump(data_to_save, f)

    print(f"\nMatrix and labels saved to {output_filename}")
    return output_filename
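
# Sketch of reading the saved matrix back and looking up one value; file and
# key names match what save_to_pickle writes above.
def _example_load_similarity_matrix(path="cosine_similarity_matrix_with_labels.pkl"):
    with open(path, 'rb') as f:
        data = pickle.load(f)
    first_constraint = data['row_labels'][0]
    row = data['row_labels'].index(first_constraint)
    col = data['col_labels'].index(1)  # column for the technology with id2 == 1
    return data['matrix'][row, col]    # NaN if that pair was never scored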