import pickle

import numpy as np
import pandas as pd
import nltk
from nltk.stem.snowball import SnowballStemmer

# "punkt_tab" supplies the tokenizer models used by nltk.word_tokenize below.
nltk.download("punkt_tab")
FILE_PATH = "/app/src/ressources/technologies_database.xlsx"

def set_prompt(problem):
    # Wrap a free-text technical problem in the constraint-extraction instructions.
    prompt = """Task: Find all the constraints in this technical problem, making sure each one is premised on the problem only.
Take into account different technical domains so as to encompass the whole problem.
Output each constraint in a JSON object such as: {"title of constraint 1": "description 1", "title of constraint N": "description N"}
Technical problem:
""" + problem
    return prompt
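
# Example (a minimal sketch; any free-text problem statement works):
#   prompt = set_prompt("Design a drone that can operate in heavy rain.")
#   # `prompt` now ends with the problem text and can be sent to an LLM.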

def load_technologies_excel():
    # Load the full technology database as a DataFrame.
    df = pd.read_excel(FILE_PATH)
    return df

def load_technologies():
    EMBEDDINGS_FILE = '/app/src/ressources/global_tech_embeddings.pkl'
    try:
        with open(EMBEDDINGS_FILE, 'rb') as f:
            loaded_data = pickle.load(f)
        global_tech = loaded_data['global_tech']
        global_tech_embedding = loaded_data['global_tech_embeddings']
        return global_tech, global_tech_embedding
    except Exception as e:
        # Return an explicit pair so callers that unpack the result fail gracefully.
        print(f"Error loading embeddings from {EMBEDDINGS_FILE}: {e}")
        return None, None

def tech_to_dict(technologies):
    # Parse raw technology blocks (one multi-line string each) into dicts.
    def field_value(line):
        # Each field line looks like "name: value"; keep everything after ": ".
        return line[line.find(": ") + 2:]

    tech_dict = []
    for index, tech in enumerate(technologies):
        # Keep entries whose "<title>" tag is missing or sits at the very start
        # (same entries as the original `not tech.find("<title>") > 1` test).
        if tech.find("<title>") <= 1:
            tab = tech.split("\n")
            tab.pop(0)   # drop the header line
            tab.pop()    # drop the trailing line
            tech_dict.append({"title": field_value(tab[0]),
                              "purpose": field_value(tab[1]),
                              "key_components": field_value(tab[2]),
                              "advantages": field_value(tab[3]),
                              "limitations": field_value(tab[4]),
                              "id": index})
    return tech_dict
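
# The parser above assumes blocks shaped roughly like this (an illustrative guess):
#   <title>
#   title: Self-healing polymer
#   purpose: Repairs micro-cracks autonomously
#   key_components: Microcapsules, catalyst
#   advantages: Longer service life
#   limitations: Limited healing cycles
#   </title>
# tech_to_dict([block]) -> [{"title": "Self-healing polymer", ..., "id": 0}]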

def save_dataframe(df, title):
    # Persist any tabular data to an Excel file named `title`.
    pd.DataFrame(df).to_excel(title)
    return title

def stem(data, data_type):
    # Stem every word of every text field. SnowballStemmer.stem() treats its
    # argument as a single token, so each text is tokenized first (this is what
    # the "punkt_tab" download above is for).
    stemmer = SnowballStemmer("english")

    def stem_text(text):
        return " ".join(stemmer.stem(token) for token in nltk.word_tokenize(text))

    processed_data = []
    if data_type == "technologies":
        for t_item in data:
            processed_data.append({
                "title": stem_text(t_item["title"]),
                "purpose": stem_text(t_item["purpose"]),
                "key_components": stem_text(t_item["key_components"]),
                "advantages": stem_text(t_item["advantages"]),
                "limitations": stem_text(t_item["limitations"]),
                "id": t_item["id"]
            })
    else:
        # Here `data` is a mapping of constraint title -> description.
        for t_item in data:
            processed_data.append({
                "title": stem_text(t_item),
                "description": stem_text(data[t_item])
            })
    return processed_data
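
# Example (a sketch; mirrors the two input shapes handled above):
#   stem([{"title": "Cooling Fans", "purpose": "Moves air", "key_components": "",
#          "advantages": "", "limitations": "", "id": 0}], "technologies")
#   stem({"Weight limit": "The device must stay under 2 kg"}, "constraints")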

def get_technologies_by_id(id_list, technologies):
    # Return the technology dicts whose "id" appears in id_list.
    id_set = set(id_list)
    return [tech for tech in technologies if tech.get('id') in id_set]
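
# Example: get_technologies_by_id([0, 2], tech_dict) keeps the dicts with id 0 or 2.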

def save_to_pickle(result_similarites):
    # Build a (constraint x technology) similarity matrix from pair records and
    # pickle it together with its row/column labels.
    constraint_titles = sorted({item['constraint']['title'] for item in result_similarites})
    max_id2 = max(item['id2'] for item in result_similarites)
    row_label_to_index = {title: i for i, title in enumerate(constraint_titles)}
    col_labels = list(range(1, max_id2 + 1))
    num_rows = len(constraint_titles)
    num_cols = max_id2
    # NaN marks pairs for which no similarity was computed.
    matrix = np.full((num_rows, num_cols), np.nan, dtype=np.float32)
    for item in result_similarites:
        row_idx = row_label_to_index[item['constraint']['title']]
        col_idx = item['id2'] - 1  # id2 is 1-based; matrix columns are 0-based
        # float() accepts plain floats as well as numpy/torch scalar types.
        matrix[row_idx, col_idx] = float(item['similarity'])
    print(f"Successfully created matrix with shape: {matrix.shape}")
    print(f"Number of rows (unique constraints): {num_rows}")
    print(f"Number of columns (max id2): {num_cols}")
    print("\nExample 5x5 block of the created matrix (NaN for missing values):")
    print(matrix[:5, :5])
    output_filename = "cosine_similarity_matrix_with_labels.pkl"
    data_to_save = {
        'matrix': matrix,
        'row_labels': constraint_titles,
        'col_labels': col_labels
    }
    with open(output_filename, 'wb') as f:
        pickle.dump(data_to_save, f)
    print(f"\nMatrix and labels saved to {output_filename}")
    return output_filename
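
# Reading the matrix back (a sketch; assumes the pickle written above):
#   with open("cosine_similarity_matrix_with_labels.pkl", "rb") as f:
#       data = pickle.load(f)
#   row = data['row_labels'].index(data['row_labels'][0])
#   print(data['matrix'][row, 0])  # similarity of first constraint vs id2 == 1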