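"""Build and cache sentence embeddings for a technology database.

Pipeline: load records from an Excel file, parse each description into
structured fields, stem the text, embed each technology's "purpose" field
with a SentenceTransformer model, and pickle the results for later reuse.
"""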
import pickle

import nltk
import pandas as pd
from nltk.stem import SnowballStemmer
from sentence_transformers import SentenceTransformer

nltk.download("punkt_tab")

print("Loading SentenceTransformer model...")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
print("Model loaded.")


def load_technologies():
    """Load the technology records from the Excel database."""
    df = pd.read_excel('technologies_database.xlsx')
    return df


def tech_to_dict(technologies):
    """Parse raw description strings into structured dicts.

    Each description is expected to hold one "<label>: <value>" line per
    field, in the order: title, purpose, key components, advantages,
    limitations, wrapped by one header line and one trailer line.
    """
    tech_dict = []
    for index, tech in enumerate(technologies):
        # Keep records where "<title>" is absent or sits at the very start
        # of the string (find() returns -1, 0, or 1).
        if tech.find("<title>") <= 1:
            tab = tech.split("\n")
            tab.pop(0)   # drop the header line
            tab.pop()    # drop the trailer line
            tech_dict.append({
                "title": tab[0][tab[0].find(": ") + 2:],
                "purpose": tab[1][tab[1].find(": ") + 2:],
                "key_components": tab[2][tab[2].find(": ") + 2:],
                "advantages": tab[3][tab[3].find(": ") + 2:],
                "limitations": tab[4][tab[4].find(": ") + 2:],
                "id": index,
            })
    return tech_dict


def stem(data, data_type):
    """Stem text fields with the English Snowball stemmer.

    Note that SnowballStemmer.stem() treats its input as a single token,
    so applying it to a full sentence lowercases the string and only
    alters its trailing suffix.
    """
    stemmer = SnowballStemmer("english")
    processed_data = []
    if data_type == "technologies":
        for t_item in data:
            processed_data.append({
                "title": stemmer.stem(t_item["title"]),
                "purpose": stemmer.stem(t_item["purpose"]),
                "key_components": stemmer.stem(t_item["key_components"]),
                "advantages": stemmer.stem(t_item["advantages"]),
                "limitations": stemmer.stem(t_item["limitations"]),
                "id": t_item["id"]
            })
    else:
        # Any other data_type is treated as a {title: description} mapping.
        for t_item in data:
            processed_data.append({
                "title": stemmer.stem(t_item),
                "description": stemmer.stem(data[t_item])
            })

    return processed_data


def preprocess_tech_data(_df):
    """Parse, filter, and stem the dataframe's "description" column.

    Returns the processed records, the list of their keys, and the
    unprocessed records kept for display.
    """
    if _df is None or "description" not in _df.columns:
        return [], [], []

    technologies_list = _df["description"].to_list()
    tech_dict_raw = tech_to_dict(technologies_list)

    # Drop records whose parsed fields are too short to be meaningful.
    tech_dict_filtered = [
        t for t in tech_dict_raw if (
            len(t.get("title", "")) >= 5 and
            len(t.get("advantages", "")) >= 5 and
            len(t.get("key_components", "")) >= 5
        )
    ]

    if not tech_dict_filtered:
        return [], [], []

    processed_tech_wt = stem(tech_dict_filtered, "technologies")

    # Normalise key components to a single space-joined string of sentences.
    for t_item_wt in processed_tech_wt:
        kc = t_item_wt.get("key_components")
        if isinstance(kc, str):
            t_item_wt["key_components"] = ' '.join(nltk.sent_tokenize(kc))
        else:
            t_item_wt["key_components"] = ""

    original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]

    _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
    return processed_tech_wt, _keys, original_tech_for_display


df = load_technologies()
global_tech, keys, original_tech = preprocess_tech_data(df)
global_tech_purposes = [t["purpose"] for t in global_tech]
print("Encoding global_tech purposes into embeddings... This might take a while for 1000 elements.") |
|
global_tech_embeddings = model.encode(global_tech_purposes, show_progress_bar=True) |
|
print("Global tech embeddings created.") |
|
|
|
|
|
output_filename = 'global_tech_embeddings.pkl' |
|
|
|
|
|
|
|

data_to_save = {
    'global_tech': global_tech,
    'global_tech_embeddings': global_tech_embeddings
}

print(f"Saving embeddings and global_tech data to {output_filename}...")
with open(output_filename, 'wb') as f:
    pickle.dump(data_to_save, f)
print(f"Data saved successfully to {output_filename}.")
print(f"\nTo load this file later in your API, use: \n" |
|
f"with open('{output_filename}', 'rb') as f:\n" |
|
f" loaded_data = pickle.load(f)\n" |
|
f"global_tech = loaded_data['global_tech']\n" |
|
f"global_tech_embeddings = loaded_data['global_tech_embeddings']\n") |
|
|
|
|
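
# Optional sanity check: a minimal sketch that reloads the pickle written
# above and confirms one embedding row exists per technology record.
with open(output_filename, 'rb') as f:
    loaded_data = pickle.load(f)
assert len(loaded_data['global_tech']) == loaded_data['global_tech_embeddings'].shape[0]
print(f"Sanity check passed: {len(loaded_data['global_tech'])} records, "
      f"{loaded_data['global_tech_embeddings'].shape[1]}-dimensional embeddings.")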