heymenn committed
Commit 2d541bb · verified · 1 Parent(s): dc7081e

Delete src

src/__init__.py DELETED
File without changes
src/core.py DELETED
@@ -1,17 +0,0 @@
- from src.services.utils import *
- from src.services.processor import *
-
- def process_input(data):
-     prompt = set_prompt(data.problem)
-     constraints = retrieve_constraints(prompt)
-     constraints_stemmed = stem(constraints, "constraints")
-     save_dataframe(constraints_stemmed, "constraints_stemmed.xlsx")
-     df = load_technologies()
-     global_tech, keys, original_tech = preprocess_tech_data(df)
-     save_dataframe(global_tech, "global_tech.xlsx")
-     result_similarities, matrix = get_contrastive_similarities(global_tech, constraints_stemmed)
-     save_to_pickle(result_similarities)
-     best_combinations = find_best_list_combinations(constraints_stemmed,global_tech, matrix)
-     best_technologies_id = select_technologies(best_combinations)
-     best_technologies = get_technologies_by_id(best_technologies_id,global_tech)
-     return best_technologies
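For reference, a minimal sketch of how this deleted entry point could be driven from the parent revision dc7081e. The Request dataclass below is purely illustrative (process_input only reads data.problem), and actually running it needs the rest of the deleted src/ package plus the remote chat endpoint that retrieve_constraints calls.

```python
# Hypothetical driver for the deleted pipeline (works only on the parent revision).
from dataclasses import dataclass

from src.core import process_input  # removed by this commit


@dataclass
class Request:
    # process_input only reads data.problem, so any object carrying this
    # attribute would do; the dataclass is illustrative, not original code.
    problem: str


if __name__ == "__main__":
    req = Request(problem="Cool a sealed electronics enclosure without "
                          "forced airflow or additional mass.")
    for tech in process_input(req):  # returns the selected technology dicts
        print(tech["title"])
```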
src/services/processor.py DELETED
@@ -1,218 +0,0 @@
- from src.services.utils import tech_to_dict, stem
- import requests as r
- import json
- import nltk
- import itertools
- import numpy as np
-
- from sentence_transformers import *
- model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-
- def retrieve_constraints(prompt):
-     request_input = {"models": ["meta-llama/llama-4-scout-17b-16e-instruct"], "messages": [{"role":"user", "content":prompt}]}
-     response = r.post("https://organizedprogrammers-bettergroqinterface.hf.space/chat", json=request_input)
-
-     decoded_content = json.loads(response.content.decode())
-     print(f"response : {decoded_content}")
-     llm_response = decoded_content["content"]
-
-     start_marker = '{'
-     end_marker = '}'
-     start_index = llm_response.find(start_marker) + len(start_marker)
-     end_index = llm_response.find(end_marker, start_index)
-     json_str = llm_response[start_index:end_index].strip()
-
-     constraints_json = json.loads("{"+json_str+"}")
-
-     return constraints_json
-
-
- def preprocess_tech_data(_df):
-     if _df is None or "description" not in _df.columns:
-         return [], []
-
-     technologies_list = _df["description"].to_list()
-     tech_dict_raw = tech_to_dict(technologies_list)
-
-     tech_dict_filtered = [
-         t for t in tech_dict_raw if (
-             len(t.get("title", "")) >= 5 and
-             len(t.get("advantages", "")) >= 5 and
-             len(t.get("key_components", "")) >= 5
-         )
-     ]
-
-     if not tech_dict_filtered:
-         return [], []
-
-     processed_tech_wt = stem(tech_dict_filtered,"technologies")
-
-     for t_item_wt in processed_tech_wt:
-         kc = t_item_wt.get("key_components")
-         if isinstance(kc, str):
-             t_item_wt["key_components"] = ''.join(nltk.sent_tokenize(kc))
-         else:
-             t_item_wt["key_components"] = ""
-
-     original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]
-
-
-     _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
-     return processed_tech_wt, _keys, original_tech_for_display
-
-
- def remove_over_repeated_technologies(result):
-     total_lists = len(result)
-     tech_title = {}
-
-     for idx, item in enumerate(result):
-         for tech in item['technologies']:
-             tech_title[tech[0]['title']] = 0 if tech[0]['title'] not in tech_title else tech_title[tech[0]['title']] + 1
-
-     threshold = total_lists * 0.3
-     print(threshold)
-     print(tech_title)
-     to_delete = []
-     for tech, lists in tech_title.items():
-         if lists > threshold:
-             print(f"This technology have been found over repeated : " + tech)
-             to_delete.append(tech)
-
-     for idx, item in enumerate(result):
-         result[idx]['technologies'] = [tech for tech in item['technologies'] if tech[0]['title'] not in to_delete]
-
-     return result
-
- def get_contrastive_similarities(global_tech, constraints):
-     selected_pairs = []
-     matrix = []
-
-     for i, constraint in enumerate(constraints):
-         print(constraint)
-         for j, tech2 in enumerate(global_tech):
-             if i >= j:
-                 continue
-
-             purpose_sim = model.similarity(model.encode(constraint["description"]), model.encode(tech2["purpose"]))
-
-             print(f"Constraint: {constraint}, Tech 2: {tech2['title']}")
-             print(f"Purpose Similarity: {purpose_sim}")
-             selected_pairs.append({
-                 "constraint": constraint,
-                 "id2": tech2["id"],
-                 "similarity": purpose_sim
-             })
-             if purpose_sim == np.float32(None):
-                 purpose_sim = 0.0
-             matrix.append(purpose_sim)
-
-     return selected_pairs,matrix
-
-
- def find_best_list_combinations(list1: list[str], list2: list[str], matrix) -> list[dict]:
-     if not list1 or not list2:
-         print("Warning: One or both input lists are empty. Returning an empty list.")
-         return []
-
-     MIN_SIMILARITY = 0.3
-     MAX_SIMILARITY = 0.8
-
-     possible_matches_for_each_l1 = []
-     for i in range(len(list1)):
-         valid_matches_for_l1_element = []
-         for j in range(len(list2)):
-             score = matrix[i, j]
-
-             if MIN_SIMILARITY <= score <= MAX_SIMILARITY:
-                 valid_matches_for_l1_element.append((list2[j], score))
-
-         if not valid_matches_for_l1_element:
-             print(f"No valid matches found in list2 for '{list1[i]}' from list1 "
-                   f"(score between {MIN_SIMILARITY} and {MAX_SIMILARITY}). "
-                   "Returning an empty list as no complete combinations can be formed.")
-
-         else:
-             possible_matches_for_each_l1.append((valid_matches_for_l1_element, list1[i]))
-
-     result = []
-     for tech_list, problem in possible_matches_for_each_l1:
-         sorted_list = sorted(
-             tech_list,
-             key=lambda x: x[1].item() if hasattr(x[1], 'item') else float(x[1]),
-             reverse=True
-         )
-         top5 = sorted_list[:5]
-         result.append({
-             'technologies': top5,
-             'problem': problem
-         })
-
-     result = remove_over_repeated_technologies(result)
-     return result
-
-
- def select_technologies(problem_technology_list):
-     distinct_techs = set()
-     candidate_map = []
-
-     for problem_data in problem_technology_list:
-         cand_dict = {}
-         for tech_info, sim in problem_data['technologies']:
-             tech_id = tech_info['id']
-             distinct_techs.add(tech_id)
-             cand_dict[tech_id] = float(sim)
-         candidate_map.append(cand_dict)
-
-     distinct_techs = sorted(list(distinct_techs))
-     n = len(problem_technology_list)
-
-     if n == 0:
-         return set()
-
-     min_k = None
-     best_set = None
-     best_avg = -1
-
-     print(f"Distinct technologies: {distinct_techs}")
-     print(f"Candidate map: {candidate_map}")
-     print(f"Number of problems: {n}")
-
-     for k in range(1, len(distinct_techs)+1):
-         if min_k is not None and k > min_k:
-             break
-
-         for T in itertools.combinations(distinct_techs, k):
-             total_sim = 0.0
-             covered = True
-             print(f"Trying combination: {T}")
-             for i in range(n):
-                 max_sim = -1.0
-                 found = False
-                 for tech in T:
-                     if tech in candidate_map[i]:
-                         found = True
-                         sim_val = candidate_map[i][tech]
-                         if sim_val > max_sim:
-                             max_sim = sim_val
-                 if not found:
-                     covered = False
-                     break
-                 else:
-                     total_sim += max_sim
-
-             if covered:
-                 avg_sim = total_sim / n
-                 if min_k is None or k < min_k:
-                     min_k = k
-                     best_set = T
-                     best_avg = avg_sim
-                 elif k == min_k and avg_sim > best_avg:
-                     best_set = T
-                     best_avg = avg_sim
-
-         if min_k is not None and k == min_k:
-             break
-
-     if best_set is None:
-         return set()
-     return set(best_set)
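A toy illustration of the deleted select_technologies may help: it searches for the smallest set of technology ids that covers every constraint, breaking ties by the highest average best-match similarity. The constraint titles and similarity scores below are invented, and the import only resolves on the parent revision.

```python
# Hypothetical usage of the deleted select_technologies (parent revision only);
# the input data is made up for illustration.
from src.services.processor import select_technologies  # removed by this commit

problems = [
    {  # candidates matched to the first constraint
        "problem": {"title": "heat"},
        "technologies": [({"id": 1, "title": "heat pipe"}, 0.62),
                         ({"id": 2, "title": "fan"}, 0.41)],
    },
    {  # candidates matched to the second constraint
        "problem": {"title": "noise"},
        "technologies": [({"id": 2, "title": "fan"}, 0.35),
                         ({"id": 3, "title": "damper"}, 0.58)],
    },
]

# Technology 2 alone covers both constraints, so the search prefers the
# single-element set {2} over any larger combination, even though the
# individual similarities of {1, 3} are higher.
print(select_technologies(problems))  # expected: {2}
```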
src/services/technologies_database.xlsx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:370d7a151085850b5fb7a6f9de41313e83686e4da434b6e8be94da38838c1ef7
- size 213138
src/services/utils.py DELETED
@@ -1,111 +0,0 @@
- import pickle
- import numpy as np
- import pandas as pd
-
- import nltk
- from nltk.stem import *
- nltk.download("punkt_tab")
-
- TECH_PATH = "/app/src/services/technologies_database.xlsx"
-
- def set_prompt(problem):
-     prompt = """Task : Find all the constraints in this technical problem making sure each are premised on the problem only.
-     Take into account different technical domains to encompass the whole problem.
-     Output each constraints in a json such as : ({"title of the constraints1":"description1","title of the constraintsN":"descriptionN"})
-     Technical problem :
-     """ + problem
-     return prompt
-
- def load_technologies():
-     df = pd.read_excel(TECH_PATH)
-     return df
-
- def tech_to_dict(technologies):
-     tech_dict = []
-     for index, tech in enumerate(technologies):
-         if not tech.find("<title>") > 1:
-             tab = tech.split("\n")
-             tab.pop(0)
-             tab.pop(len(tab)-1)
-             tech_dict.append({"title": tab[0][tab[0].find(": ")+2:],
-                               "purpose": tab[1][tab[1].find(": ")+2:],
-                               "key_components": tab[2][tab[2].find(": ")+2:],
-                               "advantages": tab[3][tab[3].find(": ")+2:],
-                               "limitations": tab[4][tab[4].find(": ")+2:],
-                               "id": index})
-     return tech_dict
-
- def save_dataframe(df, title):
-     pd.DataFrame(df).to_excel(title)
-     return title
-
- def stem(data,data_type):
-     stemmer = SnowballStemmer("english")
-     processed_data = []
-     if data_type == "technologies":
-         for t_item in data:
-             processed_data.append({
-                 "title": stemmer.stem(t_item["title"]),
-                 "purpose": stemmer.stem(t_item["purpose"]),
-                 "key_components": stemmer.stem(t_item["key_components"]),
-                 "advantages": stemmer.stem(t_item["advantages"]),
-                 "limitations": stemmer.stem(t_item["limitations"]),
-                 "id": t_item["id"]
-             })
-     else:
-         for t_item in data:
-             print(t_item)
-             processed_data.append({
-                 "title": stemmer.stem(t_item),
-                 "description": stemmer.stem(data[t_item])
-             })
-
-     return processed_data
-
-
- def get_technologies_by_id(id_list, technologies):
-     result = []
-     id_set = set(id_list)
-     for tech in technologies:
-         if tech.get('id') in id_set:
-             result.append(tech)
-     return result
-
- def save_to_pickle(result_similarites):
-
-     constraint_titles = sorted(list(set([item['constraint']['title'] for item in result_similarites])))
-     max_id2 = max([item['id2'] for item in result_similarites])
-
-     row_label_to_index = {title: i for i, title in enumerate(constraint_titles)}
-     col_labels = list(range(1, max_id2 + 1))
-
-     num_rows = len(constraint_titles)
-     num_cols = max_id2
-
-     matrix = np.full((num_rows, num_cols), np.nan, dtype=np.float32)
-
-     for item in result_similarites:
-         row_idx = row_label_to_index[item['constraint']['title']]
-         col_idx = item['id2'] - 1
-         similarity_value = item['similarity'].item()
-
-         matrix[row_idx, col_idx] = similarity_value
-
-     print(f"Successfully created matrix with shape: {matrix.shape}")
-     print(f"Number of rows (unique constraints): {num_rows}")
-     print(f"Number of columns (max id2): {num_cols}")
-     print("\nExample 5x5 block of the created matrix (NaN for missing values):")
-     print(matrix[:5, :5])
-
-     output_filename = "cosine_similarity_matrix_with_labels.pkl"
-     data_to_save = {
-         'matrix': matrix,
-         'row_labels': constraint_titles,
-         'col_labels': col_labels
-     }
-
-     with open(output_filename, 'wb') as f:
-         pickle.dump(data_to_save, f)
-
-     print(f"\nMatrix and labels saved to {output_filename}")
-     return output_filename
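The deleted tech_to_dict assumed each spreadsheet description was a newline-separated block whose first and last lines are discarded and whose remaining lines are "Field: value" pairs in a fixed order (title, purpose, key components, advantages, limitations). The sample string below is a guess at that format, not taken from the deleted spreadsheet, and the import only resolves on the parent revision.

```python
# Hypothetical description block in the layout tech_to_dict parses
# (parent revision only); the content is invented for illustration.
from src.services.utils import tech_to_dict  # removed by this commit

sample = "\n".join([
    "Technology 1",                                               # dropped (header line)
    "Title: Phase-change heat pipe",
    "Purpose: Passive heat transport",
    "Key components: wick, working fluid, sealed envelope",
    "Advantages: no moving parts, high effective conductivity",
    "Limitations: orientation sensitivity, dry-out at high flux",
    "---",                                                        # dropped (footer line)
])

print(tech_to_dict([sample]))
# -> [{'title': 'Phase-change heat pipe', 'purpose': 'Passive heat transport', ...}]
```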