heymenn committed (verified)
Commit 1f05644 · Parent: 2d541bb

Upload 7 files

.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  app/services/technologies_database.xlsx filter=lfs diff=lfs merge=lfs -text
  src/services/technologies_database.xlsx filter=lfs diff=lfs merge=lfs -text
+ src/ressources/technologies_database.xlsx filter=lfs diff=lfs merge=lfs -text
src/__init__.py ADDED
File without changes
src/core.py ADDED
@@ -0,0 +1,26 @@
+ from src.services.utils import *
+ from src.services.processor import *
+
+ # End-to-end pipeline: extract constraints from the problem statement, match them
+ # against the technology database embeddings, and return the selected technologies.
+ def process_input(data):
+     prompt = set_prompt(data.problem)
+
+     constraints = retrieve_constraints(prompt)
+     constraints_stemmed = stem(constraints, "constraints")
+
+     save_dataframe(constraints_stemmed, "constraints_stemmed.xlsx")
+
+     global_tech, global_tech_embeddings = load_technologies()
+
+     # global_tech, keys, original_tech = preprocess_tech_data(df)
+
+     save_dataframe(global_tech, "global_tech.xlsx")
+
+     result_similarities, matrix = get_contrastive_similarities(constraints_stemmed, global_tech, global_tech_embeddings)
+
+     save_to_pickle(result_similarities)
+
+     best_combinations = find_best_list_combinations(constraints_stemmed, global_tech, matrix)
+     best_technologies_id = select_technologies(best_combinations)
+     best_technologies = get_technologies_by_id(best_technologies_id, global_tech)
+
+     return best_technologies
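
A minimal call sketch for this entry point, assuming `data` is any object exposing a `problem` attribute (the `SimpleNamespace` input below is illustrative, not part of the commit):

    from types import SimpleNamespace
    from src.core import process_input

    # hypothetical problem statement used only for illustration
    data = SimpleNamespace(problem="Reduce the power consumption of a remote environmental sensor without losing measurement accuracy.")
    best_technologies = process_input(data)
    print(best_technologies)  # list of technology dicts selected for the problem's constraints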
src/ressources/global_tech_embeddings.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a62fa7bbe756522e26ed283c36fe42a62ba950ea1c765cc85e35ffb335894993
+ size 1809459
src/ressources/technologies_database.xlsx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:370d7a151085850b5fb7a6f9de41313e83686e4da434b6e8be94da38838c1ef7
+ size 213138
src/services/external_process.py ADDED
@@ -0,0 +1,124 @@
+ # This file computes the embeddings of the technologies; it is easily executable on Google Colab.
+
+ #!pip install sentence-transformers
+ #!pip install nltk
+
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+ import pickle
+ import pandas as pd
+ import nltk
+ from nltk.stem import *
+ nltk.download("punkt_tab")
+
+
+ print("Loading SentenceTransformer model...")
+ model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+ print("Model loaded.")
+
+ def load_technologies():
+     df = pd.read_excel('technologies_database.xlsx')
+     return df
+
+ def tech_to_dict(technologies):
+     tech_dict = []
+     for index, tech in enumerate(technologies):
+         if not tech.find("<title>") > 1:
+             tab = tech.split("\n")
+             tab.pop(0)
+             tab.pop(len(tab) - 1)
+             tech_dict.append({"title": tab[0][tab[0].find(": ") + 2:],
+                               "purpose": tab[1][tab[1].find(": ") + 2:],
+                               "key_components": tab[2][tab[2].find(": ") + 2:],
+                               "advantages": tab[3][tab[3].find(": ") + 2:],
+                               "limitations": tab[4][tab[4].find(": ") + 2:],
+                               "id": index})
+     return tech_dict
+
+ def stem(data, data_type):
+     stemmer = SnowballStemmer("english")
+     processed_data = []
+     if data_type == "technologies":
+         for t_item in data:
+             processed_data.append({
+                 "title": stemmer.stem(t_item["title"]),
+                 "purpose": stemmer.stem(t_item["purpose"]),
+                 "key_components": stemmer.stem(t_item["key_components"]),
+                 "advantages": stemmer.stem(t_item["advantages"]),
+                 "limitations": stemmer.stem(t_item["limitations"]),
+                 "id": t_item["id"]
+             })
+     else:
+         for t_item in data:
+             print(t_item)
+             processed_data.append({
+                 "title": stemmer.stem(t_item),
+                 "description": stemmer.stem(data[t_item])
+             })
+
+     return processed_data
+
+ def preprocess_tech_data(_df):
+     if _df is None or "description" not in _df.columns:
+         return [], [], []
+
+     technologies_list = _df["description"].to_list()
+     tech_dict_raw = tech_to_dict(technologies_list)
+
+     tech_dict_filtered = [
+         t for t in tech_dict_raw if (
+             len(t.get("title", "")) >= 5 and
+             len(t.get("advantages", "")) >= 5 and
+             len(t.get("key_components", "")) >= 5
+         )
+     ]
+
+     if not tech_dict_filtered:
+         return [], [], []
+
+     processed_tech_wt = stem(tech_dict_filtered, "technologies")
+
+     for t_item_wt in processed_tech_wt:
+         kc = t_item_wt.get("key_components")
+         if isinstance(kc, str):
+             t_item_wt["key_components"] = ''.join(nltk.sent_tokenize(kc))
+         else:
+             t_item_wt["key_components"] = ""
+
+     original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]
+
+     _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
+     return processed_tech_wt, _keys, original_tech_for_display
+
+
+ df = load_technologies()
+ global_tech, keys, original_tech = preprocess_tech_data(df)
+ global_tech_purposes = [t["purpose"] for t in global_tech]
+
+ # Encode all global_tech purposes into embeddings
+ print("Encoding global_tech purposes into embeddings... This might take a while for 1000 elements.")
+ global_tech_embeddings = model.encode(global_tech_purposes, show_progress_bar=True)
+ print("Global tech embeddings created.")
+
+ # Define the filename for the pickle file
+ output_filename = 'global_tech_embeddings.pkl'
+
+ # Save the embeddings and the global_tech data (optional, but good for context).
+ # Saving global_tech alongside the embeddings ensures the original data is available if needed.
+ data_to_save = {
+     'global_tech': global_tech,                       # the original list of dictionaries
+     'global_tech_embeddings': global_tech_embeddings  # the numpy array of embeddings
+ }
+
+ print(f"Saving embeddings and global_tech data to {output_filename}...")
+ with open(output_filename, 'wb') as f:
+     pickle.dump(data_to_save, f)
+ print(f"Data saved successfully to {output_filename}.")
+
+ print(f"\nTo load this file later in your API, use: \n"
+       f"with open('{output_filename}', 'rb') as f:\n"
+       f"    loaded_data = pickle.load(f)\n"
+       f"global_tech = loaded_data['global_tech']\n"
+       f"global_tech_embeddings = loaded_data['global_tech_embeddings']\n")
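
As a quick sanity check after running the script, the saved artifact can be reloaded and inspected; this is a minimal sketch mirroring the loading instructions printed above:

    import pickle

    with open('global_tech_embeddings.pkl', 'rb') as f:
        loaded_data = pickle.load(f)

    global_tech = loaded_data['global_tech']
    global_tech_embeddings = loaded_data['global_tech_embeddings']
    # number of technologies and the embedding matrix shape, e.g. (n, 384) for all-MiniLM-L6-v2
    print(len(global_tech), global_tech_embeddings.shape)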
src/services/processor.py ADDED
@@ -0,0 +1,218 @@
+ from src.services.utils import tech_to_dict, stem
+ import requests as r
+ import json
+ import nltk
+ import itertools
+ import numpy as np
+
+ from sentence_transformers import *
+ model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+
+ def retrieve_constraints(prompt):
+     request_input = {"models": ["meta-llama/llama-4-scout-17b-16e-instruct"], "messages": [{"role": "user", "content": prompt}]}
+     response = r.post("https://organizedprogrammers-bettergroqinterface.hf.space/chat", json=request_input)
+     print(f"response : {response}")
+     decoded_content = json.loads(response.content.decode())
+     llm_response = decoded_content["content"][0]["message"]["content"]
+
+     # Extract the JSON object embedded in the LLM answer (text between the first '{' and the next '}').
+     start_marker = '{'
+     end_marker = '}'
+     start_index = llm_response.find(start_marker) + len(start_marker)
+     end_index = llm_response.find(end_marker, start_index)
+     json_str = llm_response[start_index:end_index].strip()
+
+     constraints_json = json.loads("{" + json_str + "}")
+
+     return constraints_json
+
+
+ def preprocess_tech_data(_df):
+     if _df is None or "description" not in _df.columns:
+         return [], [], []
+
+     technologies_list = _df["description"].to_list()
+     tech_dict_raw = tech_to_dict(technologies_list)
+
+     tech_dict_filtered = [
+         t for t in tech_dict_raw if (
+             len(t.get("title", "")) >= 5 and
+             len(t.get("advantages", "")) >= 5 and
+             len(t.get("key_components", "")) >= 5
+         )
+     ]
+
+     if not tech_dict_filtered:
+         return [], [], []
+
+     processed_tech_wt = stem(tech_dict_filtered, "technologies")
+
+     for t_item_wt in processed_tech_wt:
+         kc = t_item_wt.get("key_components")
+         if isinstance(kc, str):
+             t_item_wt["key_components"] = ''.join(nltk.sent_tokenize(kc))
+         else:
+             t_item_wt["key_components"] = ""
+
+     original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]
+
+     _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
+     return processed_tech_wt, _keys, original_tech_for_display
+
+
+ def remove_over_repeated_technologies(result):
+     total_lists = len(result)
+     tech_title = {}
+
+     # Count in how many candidate lists each technology title appears.
+     for item in result:
+         for tech in item['technologies']:
+             title = tech[0]['title']
+             tech_title[title] = tech_title.get(title, 0) + 1
+
+     threshold = total_lists * 0.3
+     print(threshold)
+     print(tech_title)
+     to_delete = []
+     for tech, lists in tech_title.items():
+         if lists > threshold:
+             print("This technology has been found to be over-repeated: " + tech)
+             to_delete.append(tech)
+
+     for idx, item in enumerate(result):
+         result[idx]['technologies'] = [tech for tech in item['technologies'] if tech[0]['title'] not in to_delete]
+
+     return result
+
+ def get_contrastive_similarities(constraints, pre_encoded_tech_data, pre_encoded_tech_embeddings):
+     selected_pairs = []
+     matrix = []
+
+     constraint_descriptions = [c["description"] for c in constraints]
+     constraint_embeddings = model.encode(constraint_descriptions, show_progress_bar=False)
+
+     for i, constraint in enumerate(constraints):
+         constraint_embedding = constraint_embeddings[i]
+         row = []
+
+         for j, tech2 in enumerate(pre_encoded_tech_data):
+             tech_embedding = pre_encoded_tech_embeddings[j]
+
+             purpose_sim = model.similarity(constraint_embedding, tech_embedding)
+
+             if np.isnan(purpose_sim):
+                 purpose_sim = 0.0
+
+             selected_pairs.append({
+                 "constraint": constraint,
+                 "id2": tech2["id"],
+                 "similarity": purpose_sim
+             })
+             row.append(float(purpose_sim))
+
+         matrix.append(row)
+
+     # Return the similarities both as a flat pair list and as a 2D array
+     # indexed as matrix[i, j] by find_best_list_combinations.
+     return selected_pairs, np.array(matrix)
+
+ def find_best_list_combinations(list1: list[dict], list2: list[dict], matrix) -> list[dict]:
+     if not list1 or not list2:
+         print("Warning: One or both input lists are empty. Returning an empty list.")
+         return []
+
+     MIN_SIMILARITY = 0.3
+     MAX_SIMILARITY = 0.8
+
+     possible_matches_for_each_l1 = []
+     for i in range(len(list1)):
+         valid_matches_for_l1_element = []
+         for j in range(len(list2)):
+             score = matrix[i, j]
+
+             if MIN_SIMILARITY <= score <= MAX_SIMILARITY:
+                 valid_matches_for_l1_element.append((list2[j], score))
+
+         if not valid_matches_for_l1_element:
+             print(f"No valid matches found in list2 for '{list1[i]}' from list1 "
+                   f"(score between {MIN_SIMILARITY} and {MAX_SIMILARITY}). "
+                   "Skipping this element.")
+         else:
+             possible_matches_for_each_l1.append((valid_matches_for_l1_element, list1[i]))
+
+     result = []
+     for tech_list, problem in possible_matches_for_each_l1:
+         sorted_list = sorted(
+             tech_list,
+             key=lambda x: x[1].item() if hasattr(x[1], 'item') else float(x[1]),
+             reverse=True
+         )
+         top5 = sorted_list[:5]
+         result.append({
+             'technologies': top5,
+             'problem': problem
+         })
+
+     result = remove_over_repeated_technologies(result)
+     return result
+
+
+ def select_technologies(problem_technology_list):
+     distinct_techs = set()
+     candidate_map = []
+
+     for problem_data in problem_technology_list:
+         cand_dict = {}
+         for tech_info, sim in problem_data['technologies']:
+             tech_id = tech_info['id']
+             distinct_techs.add(tech_id)
+             cand_dict[tech_id] = float(sim)
+         candidate_map.append(cand_dict)
+
+     distinct_techs = sorted(list(distinct_techs))
+     n = len(problem_technology_list)
+
+     if n == 0:
+         return set()
+
+     min_k = None
+     best_set = None
+     best_avg = -1
+
+     print(f"Distinct technologies: {distinct_techs}")
+     print(f"Candidate map: {candidate_map}")
+     print(f"Number of problems: {n}")
+
+     # Look for the smallest set of technologies that covers every problem,
+     # breaking ties by the highest average similarity.
+     for k in range(1, len(distinct_techs) + 1):
+         if min_k is not None and k > min_k:
+             break
+
+         for T in itertools.combinations(distinct_techs, k):
+             total_sim = 0.0
+             covered = True
+             print(f"Trying combination: {T}")
+             for i in range(n):
+                 max_sim = -1.0
+                 found = False
+                 for tech in T:
+                     if tech in candidate_map[i]:
+                         found = True
+                         sim_val = candidate_map[i][tech]
+                         if sim_val > max_sim:
+                             max_sim = sim_val
+                 if not found:
+                     covered = False
+                     break
+                 else:
+                     total_sim += max_sim
+
+             if covered:
+                 avg_sim = total_sim / n
+                 if min_k is None or k < min_k:
+                     min_k = k
+                     best_set = T
+                     best_avg = avg_sim
+                 elif k == min_k and avg_sim > best_avg:
+                     best_set = T
+                     best_avg = avg_sim
+
+         if min_k is not None and k == min_k:
+             break
+
+     if best_set is None:
+         return set()
+     return set(best_set)
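
To illustrate the selection step, here is a small hypothetical input (two problems, three candidate technologies); the function returns the single technology id that covers both problems:

    # toy data for illustration only; real inputs come from find_best_list_combinations
    problems = [
        {'problem': 'p1', 'technologies': [({'id': 1, 'title': 'A'}, 0.7), ({'id': 2, 'title': 'B'}, 0.5)]},
        {'problem': 'p2', 'technologies': [({'id': 2, 'title': 'B'}, 0.6), ({'id': 3, 'title': 'C'}, 0.4)]},
    ]
    print(select_technologies(problems))  # {2}: the smallest set covering every problem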
src/services/utils.py ADDED
@@ -0,0 +1,123 @@
+ import pickle
+ import numpy as np
+ import pandas as pd
+
+ import nltk
+ from nltk.stem import *
+ nltk.download("punkt_tab")
+
+ FILE_PATH = "/app/src/ressources/technologies_database.xlsx"
+
+ def set_prompt(problem):
+     prompt = """Task : Find all the constraints in this technical problem, making sure each is premised on the problem only.
+     Take into account different technical domains to encompass the whole problem.
+     Output each constraint in a json such as : ({"title of the constraint1":"description1","title of the constraintN":"descriptionN"})
+     Technical problem :
+     """ + problem
+     return prompt
+
+ def load_technologies_excel():
+     df = pd.read_excel(FILE_PATH)
+     return df
+
+ def load_technologies():
+     EMBEDDINGS_FILE = '/app/src/ressources/global_tech_embeddings.pkl'
+
+     try:
+         with open(EMBEDDINGS_FILE, 'rb') as f:
+             loaded_data = pickle.load(f)
+             global_tech = loaded_data['global_tech']
+             global_tech_embedding = loaded_data['global_tech_embeddings']
+             return global_tech, global_tech_embedding
+     except Exception as e:
+         print(f"Error: {e}")
+
+ def tech_to_dict(technologies):
+     tech_dict = []
+     for index, tech in enumerate(technologies):
+         if not tech.find("<title>") > 1:
+             tab = tech.split("\n")
+             tab.pop(0)
+             tab.pop(len(tab) - 1)
+             tech_dict.append({"title": tab[0][tab[0].find(": ") + 2:],
+                               "purpose": tab[1][tab[1].find(": ") + 2:],
+                               "key_components": tab[2][tab[2].find(": ") + 2:],
+                               "advantages": tab[3][tab[3].find(": ") + 2:],
+                               "limitations": tab[4][tab[4].find(": ") + 2:],
+                               "id": index})
+     return tech_dict
+
+ def save_dataframe(df, title):
+     pd.DataFrame(df).to_excel(title)
+     return title
+
+ def stem(data, data_type):
+     stemmer = SnowballStemmer("english")
+     processed_data = []
+     if data_type == "technologies":
+         for t_item in data:
+             processed_data.append({
+                 "title": stemmer.stem(t_item["title"]),
+                 "purpose": stemmer.stem(t_item["purpose"]),
+                 "key_components": stemmer.stem(t_item["key_components"]),
+                 "advantages": stemmer.stem(t_item["advantages"]),
+                 "limitations": stemmer.stem(t_item["limitations"]),
+                 "id": t_item["id"]
+             })
+     else:
+         for t_item in data:
+             print(t_item)
+             processed_data.append({
+                 "title": stemmer.stem(t_item),
+                 "description": stemmer.stem(data[t_item])
+             })
+
+     return processed_data
+
+
+ def get_technologies_by_id(id_list, technologies):
+     result = []
+     id_set = set(id_list)
+     for tech in technologies:
+         if tech.get('id') in id_set:
+             result.append(tech)
+     return result
+
+ def save_to_pickle(result_similarities):
+     constraint_titles = sorted(list(set([item['constraint']['title'] for item in result_similarities])))
+     max_id2 = max([item['id2'] for item in result_similarities])
+
+     row_label_to_index = {title: i for i, title in enumerate(constraint_titles)}
+     col_labels = list(range(1, max_id2 + 1))
+
+     num_rows = len(constraint_titles)
+     num_cols = max_id2
+
+     matrix = np.full((num_rows, num_cols), np.nan, dtype=np.float32)
+
+     for item in result_similarities:
+         row_idx = row_label_to_index[item['constraint']['title']]
+         col_idx = item['id2'] - 1  # assumes 1-based technology ids mapped to 0-based columns
+         similarity_value = float(item['similarity'])  # works for tensors and plain floats
+
+         matrix[row_idx, col_idx] = similarity_value
+
+     print(f"Successfully created matrix with shape: {matrix.shape}")
+     print(f"Number of rows (unique constraints): {num_rows}")
+     print(f"Number of columns (max id2): {num_cols}")
+     print("\nExample 5x5 block of the created matrix (NaN for missing values):")
+     print(matrix[:5, :5])
+
+     output_filename = "cosine_similarity_matrix_with_labels.pkl"
+     data_to_save = {
+         'matrix': matrix,
+         'row_labels': constraint_titles,
+         'col_labels': col_labels
+     }
+
+     with open(output_filename, 'wb') as f:
+         pickle.dump(data_to_save, f)
+
+     print(f"\nMatrix and labels saved to {output_filename}")
+     return output_filename
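
A minimal sketch of reading the labeled matrix back, assuming only the keys written by save_to_pickle above:

    import pickle

    with open("cosine_similarity_matrix_with_labels.pkl", "rb") as f:
        saved = pickle.load(f)

    matrix = saved['matrix']          # shape: (number of unique constraints, max technology id)
    row_labels = saved['row_labels']  # constraint titles, one per row
    col_labels = saved['col_labels']  # technology ids 1..max_id2, one per column
    print(matrix.shape, row_labels[:3], col_labels[:3])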