# This script computes the embeddings of the technologies; it can easily be run on Google Colab.

#!pip install sentence-transformers
#!pip install nltk

import numpy as np
from sentence_transformers import SentenceTransformer
import pickle
import pandas as pd
import nltk
from nltk.stem import SnowballStemmer

# punkt_tab provides the tokenizer models used by nltk.word_tokenize / nltk.sent_tokenize below.
nltk.download("punkt_tab")


print("Loading SentenceTransformer model...")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
print("Model loaded.")

def load_technologies():
    # The spreadsheet is expected to provide a "description" column
    # (checked in preprocess_tech_data below), one multi-line block per technology.
    df = pd.read_excel('technologies_database.xlsx')
    return df

def tech_to_dict(technologies):
    """Parse each raw multi-line description block into a dict of labelled fields."""
    tech_dict = []
    for index, tech in enumerate(technologies):
        # Keep blocks where "<title>" is absent or appears within the first character.
        if tech.find("<title>") <= 1:
            tab = tech.split("\n")
            tab.pop(0)  # drop the header line
            tab.pop()   # drop the trailing (empty) line
            tech_dict.append({"title": tab[0][tab[0].find(": ") + 2:],
                              "purpose": tab[1][tab[1].find(": ") + 2:],
                              "key_components": tab[2][tab[2].find(": ") + 2:],
                              "advantages": tab[3][tab[3].find(": ") + 2:],
                              "limitations": tab[4][tab[4].find(": ") + 2:],
                              "id": index})
    return tech_dict
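
# The parser above assumes each raw "description" cell looks roughly like the
# sketch below (the field labels are an assumption; the real layout comes from
# technologies_database.xlsx and may differ):
#
# example_block = ("<header line>\n"
#                  "title: Solid-state battery\n"
#                  "purpose: Store energy using a solid electrolyte\n"
#                  "key_components: Solid electrolyte, lithium anode, cathode\n"
#                  "advantages: Higher energy density, improved safety\n"
#                  "limitations: Manufacturing cost, interface resistance\n"
#                  "<trailing line>\n")
# tech_to_dict([example_block]) -> one dict with the five fields plus "id".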

def stem(data, data_type):
    """Stem every word of each text field."""
    stemmer = SnowballStemmer("english")

    def stem_text(text):
        # stemmer.stem() works on single tokens, so tokenize first and re-join.
        return " ".join(stemmer.stem(word) for word in nltk.word_tokenize(text))

    processed_data = []
    if data_type == "technologies":
        for t_item in data:
            processed_data.append({
                "title": stem_text(t_item["title"]),
                "purpose": stem_text(t_item["purpose"]),
                "key_components": stem_text(t_item["key_components"]),
                "advantages": stem_text(t_item["advantages"]),
                "limitations": stem_text(t_item["limitations"]),
                "id": t_item["id"]
            })
    else:
        # Any other data_type is assumed to be a {title: description} mapping.
        for title, description in data.items():
            processed_data.append({
                "title": stem_text(title),
                "description": stem_text(description)
            })

    return processed_data
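
# For illustration (hypothetical input, not taken from the spreadsheet):
#   stem({"Batteries": "Stores electrical energy"}, "materials")
#   -> [{"title": "batteri", "description": "store electric energi"}]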

def preprocess_tech_data(_df):
    if _df is None or "description" not in _df.columns:
        # Return three empty values to match the normal return signature below.
        return [], [], []

    technologies_list = _df["description"].to_list()
    tech_dict_raw = tech_to_dict(technologies_list)

    tech_dict_filtered = [
        t for t in tech_dict_raw if (
            len(t.get("title", "")) >= 5 and
            len(t.get("advantages", "")) >= 5 and
            len(t.get("key_components", "")) >= 5
        )
    ]

    if not tech_dict_filtered:
        return [], [], []

    processed_tech_wt = stem(tech_dict_filtered,"technologies")

    for t_item_wt in processed_tech_wt:
        kc = t_item_wt.get("key_components")
        if isinstance(kc, str):
            # Join with a space: ''.join would glue adjacent sentences together.
            t_item_wt["key_components"] = ' '.join(nltk.sent_tokenize(kc))
        else:
            t_item_wt["key_components"] = ""

    original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]

    _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
    return processed_tech_wt, _keys, original_tech_for_display


df = load_technologies()
global_tech, keys, original_tech = preprocess_tech_data(df)
global_tech_purposes = [t["purpose"] for t in global_tech]

# Encode all global_tech purposes into embeddings
print(f"Encoding {len(global_tech_purposes)} global_tech purposes into embeddings... This might take a while.")
global_tech_embeddings = model.encode(global_tech_purposes, show_progress_bar=True)
print("Global tech embeddings created.")

# Define the filename for the pickle file
output_filename = 'global_tech_embeddings.pkl'

# Save the embeddings and the global_tech data (optional, but good for context)
# Saving global_tech alongside embeddings ensures you have the original data if needed
data_to_save = {
    'global_tech': global_tech, # The original list of dictionaries
    'global_tech_embeddings': global_tech_embeddings # The numpy array of embeddings
}

print(f"Saving embeddings and global_tech data to {output_filename}...")
with open(output_filename, 'wb') as f:
    pickle.dump(data_to_save, f)
print(f"Data saved successfully to {output_filename}.")

print(f"\nTo load this file later in your API, use: \n"
      f"with open('{output_filename}', 'rb') as f:\n"
      f"    loaded_data = pickle.load(f)\n"
      f"global_tech = loaded_data['global_tech']\n"
      f"global_tech_embeddings = loaded_data['global_tech_embeddings']\n")