# This file is used to compute the embeddings of the technologies; it is easily executable on Google Colab
#!pip install sentence-transformers
#!pip install nltk

import numpy as np
from sentence_transformers import SentenceTransformer
import pickle
import pandas as pd
import nltk
from nltk.stem import SnowballStemmer

nltk.download("punkt_tab")

print("Loading SentenceTransformer model...")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
print("Model loaded.")


def load_technologies():
    df = pd.read_excel('technologies_database.xlsx')
    return df


def tech_to_dict(technologies):
    # Parse each raw technology description into a dictionary of labelled fields.
    tech_dict = []
    for index, tech in enumerate(technologies):
        if not tech.find("") > 1:  # .find("") is always 0, so every entry is processed
            tab = tech.split("\n")
            tab.pop(0)        # drop the first line
            tab.pop()         # drop the last line
            tech_dict.append({"title": tab[0][tab[0].find(": ") + 2:],
                              "purpose": tab[1][tab[1].find(": ") + 2:],
                              "key_components": tab[2][tab[2].find(": ") + 2:],
                              "advantages": tab[3][tab[3].find(": ") + 2:],
                              "limitations": tab[4][tab[4].find(": ") + 2:],
                              "id": index})
    return tech_dict


def stem(data, data_type):
    # Apply Snowball stemming to each text field of the parsed entries.
    stemmer = SnowballStemmer("english")
    processed_data = []
    if data_type == "technologies":
        for t_item in data:
            processed_data.append({
                "title": stemmer.stem(t_item["title"]),
                "purpose": stemmer.stem(t_item["purpose"]),
                "key_components": stemmer.stem(t_item["key_components"]),
                "advantages": stemmer.stem(t_item["advantages"]),
                "limitations": stemmer.stem(t_item["limitations"]),
                "id": t_item["id"]
            })
    else:
        for t_item in data:
            print(t_item)
            processed_data.append({
                "title": stemmer.stem(t_item),
                "description": stemmer.stem(data[t_item])
            })
    return processed_data


def preprocess_tech_data(_df):
    # Returns (processed_tech, keys, original_tech_for_display).
    if _df is None or "description" not in _df.columns:
        return [], [], []

    technologies_list = _df["description"].to_list()
    tech_dict_raw = tech_to_dict(technologies_list)

    # Keep only entries whose main fields are non-trivial.
    tech_dict_filtered = [
        t for t in tech_dict_raw if (
            len(t.get("title", "")) >= 5 and
            len(t.get("advantages", "")) >= 5 and
            len(t.get("key_components", "")) >= 5
        )
    ]
    if not tech_dict_filtered:
        return [], [], []

    processed_tech_wt = stem(tech_dict_filtered, "technologies")

    for t_item_wt in processed_tech_wt:
        kc = t_item_wt.get("key_components")
        if isinstance(kc, str):
            t_item_wt["key_components"] = ''.join(nltk.sent_tokenize(kc))
        else:
            t_item_wt["key_components"] = ""

    original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]
    _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
    return processed_tech_wt, _keys, original_tech_for_display


df = load_technologies()
global_tech, keys, original_tech = preprocess_tech_data(df)

global_tech_purposes = [t["purpose"] for t in global_tech]

# Encode all global_tech purposes into embeddings
print("Encoding global_tech purposes into embeddings... This might take a while for 1000 elements.")
global_tech_embeddings = model.encode(global_tech_purposes, show_progress_bar=True)
print("Global tech embeddings created.")

# Define the filename for the pickle file
output_filename = 'global_tech_embeddings.pkl'

# Save the embeddings and the global_tech data (optional, but good for context)
# Saving global_tech alongside embeddings ensures you have the original data if needed
data_to_save = {
    'global_tech': global_tech,                      # The original list of dictionaries
    'global_tech_embeddings': global_tech_embeddings  # The numpy array of embeddings
}

print(f"Saving embeddings and global_tech data to {output_filename}...")
with open(output_filename, 'wb') as f:
    pickle.dump(data_to_save, f)
print(f"Data saved successfully to {output_filename}.")

print(f"\nTo load this file later in your API, use: \n"
      f"with open('{output_filename}', 'rb') as f:\n"
      f"    loaded_data = pickle.load(f)\n"
      f"global_tech = loaded_data['global_tech']\n"
      f"global_tech_embeddings = loaded_data['global_tech_embeddings']\n")
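
# Optional sanity check (a minimal sketch, left commented out so the Colab run above is unchanged):
# reload the pickle and run a cosine-similarity query against the stored purpose embeddings.
# The query string is only a hypothetical example; `util.cos_sim` comes from sentence-transformers.
#
# from sentence_transformers import util
#
# with open(output_filename, 'rb') as f:
#     loaded = pickle.load(f)
# query_embedding = model.encode(["a technology for storing energy"])
# scores = util.cos_sim(query_embedding, loaded['global_tech_embeddings'])[0]
# best_idx = int(scores.argmax())
# print(loaded['global_tech'][best_idx]["title"], float(scores[best_idx]))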