# insight-finder/src/services/external_process.py
# This file computes the embeddings of the technologies; it can easily be run on Google Colab
#!pip install sentence-transformers
#!pip install nltk
import numpy as np
from sentence_transformers import SentenceTransformer
import pickle
import pandas as pd
import nltk
from nltk.stem import SnowballStemmer
nltk.download("punkt_tab")
print("Loading SentenceTransformer model...")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
print("Model loaded.")
def load_technologies():
    """Load the technologies database from the local Excel file."""
    df = pd.read_excel('technologies_database.xlsx')
    return df
def tech_to_dict(technologies):
    """Parse each raw technology description into a dict of labelled fields."""
    tech_dict = []
    for index, tech in enumerate(technologies):
        # Only parse entries where "<title>" does not appear past the start of the string.
        if not tech.find("<title>") > 1:
            tab = tech.split("\n")
            tab.pop(0)   # drop the leading line
            tab.pop()    # drop the trailing line
            tech_dict.append({"title": tab[0][tab[0].find(": ") + 2:],
                              "purpose": tab[1][tab[1].find(": ") + 2:],
                              "key_components": tab[2][tab[2].find(": ") + 2:],
                              "advantages": tab[3][tab[3].find(": ") + 2:],
                              "limitations": tab[4][tab[4].find(": ") + 2:],
                              "id": index})
    return tech_dict
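
# For illustration only (this layout is inferred from the parsing above and is an
# assumption, not taken from the dataset): each raw description is expected to look
# roughly like the block below. The first and last lines are dropped, and each field
# is read from the text after ": " on its line.
#
#   <header line that gets dropped>
#   Title: <technology name>
#   Purpose: <what the technology is for>
#   Key Components: <parts it is made of>
#   Advantages: <strengths>
#   Limitations: <weaknesses>
#   <trailing line that gets dropped>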
def stem(data, data_type):
    """Stem the text fields of either a technologies list or a {title: description} mapping."""
    stemmer = SnowballStemmer("english")
    processed_data = []
    if data_type == "technologies":
        for t_item in data:
            processed_data.append({
                "title": stemmer.stem(t_item["title"]),
                "purpose": stemmer.stem(t_item["purpose"]),
                "key_components": stemmer.stem(t_item["key_components"]),
                "advantages": stemmer.stem(t_item["advantages"]),
                "limitations": stemmer.stem(t_item["limitations"]),
                "id": t_item["id"]
            })
    else:
        for t_item in data:
            print(t_item)
            processed_data.append({
                "title": stemmer.stem(t_item),
                "description": stemmer.stem(data[t_item])
            })
    return processed_data
def preprocess_tech_data(_df):
    """Filter, stem and sentence-tokenize the technology descriptions from the dataframe."""
    if _df is None or "description" not in _df.columns:
        return [], [], []
    technologies_list = _df["description"].to_list()
    tech_dict_raw = tech_to_dict(technologies_list)
    tech_dict_filtered = [
        t for t in tech_dict_raw if (
            len(t.get("title", "")) >= 5 and
            len(t.get("advantages", "")) >= 5 and
            len(t.get("key_components", "")) >= 5
        )
    ]
    if not tech_dict_filtered:
        return [], [], []
    processed_tech_wt = stem(tech_dict_filtered, "technologies")
    for t_item_wt in processed_tech_wt:
        kc = t_item_wt.get("key_components")
        if isinstance(kc, str):
            # sentence-tokenize the key components and concatenate the sentences back together
            t_item_wt["key_components"] = ''.join(nltk.sent_tokenize(kc))
        else:
            t_item_wt["key_components"] = ""
    original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]
    _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
    return processed_tech_wt, _keys, original_tech_for_display
df = load_technologies()
global_tech, keys, original_tech = preprocess_tech_data(df)
global_tech_purposes = [t["purpose"] for t in global_tech]
# Encode all global_tech purposes into embeddings
print("Encoding global_tech purposes into embeddings... This might take a while for 1000 elements.")
global_tech_embeddings = model.encode(global_tech_purposes, show_progress_bar=True)
print("Global tech embeddings created.")
# Define the filename for the pickle file
output_filename = 'global_tech_embeddings.pkl'
# Save the embeddings and the global_tech data (optional, but good for context)
# Saving global_tech alongside embeddings ensures you have the original data if needed
data_to_save = {
    'global_tech': global_tech,  # The original list of dictionaries
    'global_tech_embeddings': global_tech_embeddings  # The numpy array of embeddings
}
print(f"Saving embeddings and global_tech data to {output_filename}...")
with open(output_filename, 'wb') as f:
    pickle.dump(data_to_save, f)
print(f"Data saved successfully to {output_filename}.")
print(f"\nTo load this file later in your API, use: \n"
f"with open('{output_filename}', 'rb') as f:\n"
f" loaded_data = pickle.load(f)\n"
f"global_tech = loaded_data['global_tech']\n"
f"global_tech_embeddings = loaded_data['global_tech_embeddings']\n")