Upload 6 files
- .gitattributes +1 -0
- app.py +20 -0
- src/__init__.py +0 -0
- src/core.py +17 -0
- src/services/processor.py +217 -0
- src/services/technologies_database.xlsx +3 -0
- src/services/utils.py +110 -0
.gitattributes
CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 app/services/technologies_database.xlsx filter=lfs diff=lfs merge=lfs -text
+src/services/technologies_database.xlsx filter=lfs diff=lfs merge=lfs -text
app.py
ADDED
@@ -0,0 +1,20 @@
+from fastapi import FastAPI
+from pydantic import BaseModel
+
+from src.core import process_input
+
+app = FastAPI(
+    title="Insight Finder",
+    description="Find relevant technologies from a problem",
+)
+
+class InputData(BaseModel):
+    problem: str
+
+class OutputData(BaseModel):
+    technologies: list
+
+@app.post("/process", response_model=OutputData)
+async def process(data: InputData):
+    result = process_input(data.model_dump())  # pass a plain dict; set_prompt indexes data['problem']
+    return {"technologies": result}  # wrap the list so it validates against OutputData
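With the app running (for example via "uvicorn app:app", an assumed start command since none is included in this commit), a minimal sketch of calling the endpoint from Python:

    import requests

    # Hypothetical local URL; adjust host and port to your deployment.
    resp = requests.post(
        "http://localhost:8000/process",
        json={"problem": "Reduce heat buildup in a compact electric motor"},
    )
    print(resp.json()["technologies"])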
src/__init__.py
ADDED
File without changes
src/core.py
ADDED
@@ -0,0 +1,17 @@
+from src.services.utils import *
+from src.services.processor import *
+
+def process_input(data):
+    prompt = set_prompt(data)
+    constraints = retrieve_constraints(prompt)
+    constraints_stemmed = stem(constraints, "constraints")
+    save_dataframe(constraints_stemmed, "constraints_stemmed.xlsx")
+    df = load_technologies()
+    global_tech, keys, original_tech = preprocess_tech_data(df)
+    save_dataframe(global_tech, "global_tech.xlsx")
+    result_similarities, matrix = get_contrastive_similarities(global_tech, constraints_stemmed)
+    save_to_pickle(result_similarities)
+    best_combinations = find_best_list_combinations(constraints_stemmed, global_tech, matrix)
+    best_technologies_id = select_technologies(best_combinations)
+    best_technologies = get_technologies_by_id(best_technologies_id, global_tech)
+    return best_technologies
src/services/processor.py
ADDED
@@ -0,0 +1,217 @@
+from src.services.utils import tech_to_dict, stem
+import requests as r
+import json
+import nltk
+import itertools
+import numpy as np
+
+from sentence_transformers import SentenceTransformer
+model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+
+def retrieve_constraints(prompt):
+    request_input = {"models": ["meta-llama/llama-4-scout-17b-16e-instruct"], "messages": [{"role": "user", "content": prompt}]}
+    response = r.post("https://organizedprogrammers-bettergroqinterface.hf.space/chat", json=request_input)
+
+    decoded_content = json.loads(response.content.decode())
+    llm_response = decoded_content["content"][0]["message"]["content"]
+
+    # Extract the first {...} block of the LLM response and parse it as JSON.
+    start_marker = '{'
+    end_marker = '}'
+    start_index = llm_response.find(start_marker) + len(start_marker)
+    end_index = llm_response.find(end_marker, start_index)
+    json_str = llm_response[start_index:end_index].strip()
+
+    constraints_json = json.loads("{" + json_str + "}")
+
+    return constraints_json
+
+
+def preprocess_tech_data(_df):
+    if _df is None or "description" not in _df.columns:
+        return [], [], []  # fixed: callers unpack three values
+
+    technologies_list = _df["description"].to_list()
+    tech_dict_raw = tech_to_dict(technologies_list)
+
+    # Keep only entries whose key fields carry real content.
+    tech_dict_filtered = [
+        t for t in tech_dict_raw if (
+            len(t.get("title", "")) >= 5 and
+            len(t.get("advantages", "")) >= 5 and
+            len(t.get("key_components", "")) >= 5
+        )
+    ]
+
+    if not tech_dict_filtered:
+        return [], [], []
+
+    processed_tech_wt = stem(tech_dict_filtered, "technologies")
+
+    for t_item_wt in processed_tech_wt:
+        kc = t_item_wt.get("key_components")
+        if isinstance(kc, str):
+            t_item_wt["key_components"] = ' '.join(nltk.sent_tokenize(kc))
+        else:
+            t_item_wt["key_components"] = ""
+
+    original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]
+
+    _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
+    return processed_tech_wt, _keys, original_tech_for_display
+
+
+def remove_over_repeated_technologies(result):
+    total_lists = len(result)
+    tech_title = {}
+
+    # Count in how many constraint lists each technology title appears.
+    for item in result:
+        for tech in item['technologies']:
+            title = tech[0]['title']
+            tech_title[title] = tech_title.get(title, 0) + 1
+
+    threshold = total_lists * 0.3
+    to_delete = []
+    for tech, lists in tech_title.items():
+        if lists > threshold:
+            print(f"Technology found in too many lists, removing it: {tech}")
+            to_delete.append(tech)
+
+    for idx, item in enumerate(result):
+        result[idx]['technologies'] = [tech for tech in item['technologies'] if tech[0]['title'] not in to_delete]
+
+    return result
+
+def get_contrastive_similarities(global_tech, constraints):
+    selected_pairs = []
+    # One row per constraint, one column per technology, as expected by
+    # find_best_list_combinations, which indexes matrix[i, j].
+    matrix = np.zeros((len(constraints), len(global_tech)), dtype=np.float32)
+
+    for i, constraint in enumerate(constraints):
+        for j, tech2 in enumerate(global_tech):
+            purpose_sim = model.similarity(model.encode(constraint["description"]), model.encode(tech2["purpose"]))
+
+            selected_pairs.append({
+                "constraint": constraint,
+                "id2": tech2["id"],
+                "similarity": purpose_sim
+            })
+            sim_val = float(purpose_sim)
+            matrix[i, j] = 0.0 if np.isnan(sim_val) else sim_val
+
+    return selected_pairs, matrix
+
+
+def find_best_list_combinations(list1: list[dict], list2: list[dict], matrix) -> list[dict]:
+    if not list1 or not list2:
+        print("Warning: One or both input lists are empty. Returning an empty list.")
+        return []
+
+    MIN_SIMILARITY = 0.3
+    MAX_SIMILARITY = 0.8
+
+    possible_matches_for_each_l1 = []
+    for i in range(len(list1)):
+        valid_matches_for_l1_element = []
+        for j in range(len(list2)):
+            score = matrix[i, j]
+
+            if MIN_SIMILARITY <= score <= MAX_SIMILARITY:
+                valid_matches_for_l1_element.append((list2[j], score))
+
+        if not valid_matches_for_l1_element:
+            print(f"No valid matches found in list2 for '{list1[i]}' from list1 "
+                  f"(score between {MIN_SIMILARITY} and {MAX_SIMILARITY}). "
+                  "Skipping this element.")
+        else:
+            possible_matches_for_each_l1.append((valid_matches_for_l1_element, list1[i]))
+
+    result = []
+    for tech_list, problem in possible_matches_for_each_l1:
+        # Sort matches by similarity, highest first, and keep the top 5.
+        sorted_list = sorted(
+            tech_list,
+            key=lambda x: x[1].item() if hasattr(x[1], 'item') else float(x[1]),
+            reverse=True
+        )
+        top5 = sorted_list[:5]
+        result.append({
+            'technologies': top5,
+            'problem': problem
+        })
+
+    result = remove_over_repeated_technologies(result)
+    return result
+
+
+def select_technologies(problem_technology_list):
+    distinct_techs = set()
+    candidate_map = []
+
+    for problem_data in problem_technology_list:
+        cand_dict = {}
+        for tech_info, sim in problem_data['technologies']:
+            tech_id = tech_info['id']
+            distinct_techs.add(tech_id)
+            cand_dict[tech_id] = float(sim)
+        candidate_map.append(cand_dict)
+
+    distinct_techs = sorted(list(distinct_techs))
+    n = len(problem_technology_list)
+
+    if n == 0:
+        return set()
+
+    min_k = None
+    best_set = None
+    best_avg = -1
+
+    # Exhaustive search: smallest set of technologies covering every problem,
+    # ties broken by the highest average best-match similarity.
+    for k in range(1, len(distinct_techs) + 1):
+        if min_k is not None and k > min_k:
+            break
+
+        for T in itertools.combinations(distinct_techs, k):
+            total_sim = 0.0
+            covered = True
+            for i in range(n):
+                max_sim = -1.0
+                found = False
+                for tech in T:
+                    if tech in candidate_map[i]:
+                        found = True
+                        sim_val = candidate_map[i][tech]
+                        if sim_val > max_sim:
+                            max_sim = sim_val
+                if not found:
+                    covered = False
+                    break
+                else:
+                    total_sim += max_sim
+
+            if covered:
+                avg_sim = total_sim / n
+                if min_k is None or k < min_k:
+                    min_k = k
+                    best_set = T
+                    best_avg = avg_sim
+                elif k == min_k and avg_sim > best_avg:
+                    best_set = T
+                    best_avg = avg_sim
+
+        if min_k is not None and k == min_k:
+            break
+
+    if best_set is None:
+        return set()
+    return set(best_set)
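For reference, select_technologies performs an exhaustive search for the smallest set of technology ids that covers every problem, breaking ties by the highest average best-match similarity. A toy illustration (hypothetical data, shaped like the output of find_best_list_combinations):

    problems = [
        {"problem": "cooling", "technologies": [({"id": 1, "title": "heat pipe"}, 0.7),
                                                ({"id": 2, "title": "fan"}, 0.5)]},
        {"problem": "noise", "technologies": [({"id": 1, "title": "heat pipe"}, 0.4)]},
    ]
    print(select_technologies(problems))  # {1}: a single tech covers both problems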
src/services/technologies_database.xlsx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:370d7a151085850b5fb7a6f9de41313e83686e4da434b6e8be94da38838c1ef7
+size 213138
src/services/utils.py
ADDED
@@ -0,0 +1,110 @@
+import pickle
+import numpy as np
+import pandas as pd
+
+import nltk
+from nltk.stem import SnowballStemmer
+nltk.download("punkt_tab")
+
+
+def set_prompt(data):
+    prompt = """Task: Find all the constraints in this technical problem, making sure each is premised on the problem only.
+Take into account different technical domains to encompass the whole problem.
+Output each constraint in a json such as: ({"title of the constraint1":"description1","title of the constraintN":"descriptionN"})
+Technical problem:
+""" + data['problem']
+    return prompt
+
+def load_technologies():
+    # Path is relative to the repo root, where app.py is launched.
+    df = pd.read_excel('src/services/technologies_database.xlsx')
+    return df
+
+def tech_to_dict(technologies):
+    # Each description is expected to be a block of "field: value" lines.
+    tech_dict = []
+    for index, tech in enumerate(technologies):
+        if not tech.find("<title>") > 1:
+            tab = tech.split("\n")
+            tab.pop(0)
+            tab.pop(len(tab) - 1)
+            tech_dict.append({"title": tab[0][tab[0].find(": ") + 2:],
+                              "purpose": tab[1][tab[1].find(": ") + 2:],
+                              "key_components": tab[2][tab[2].find(": ") + 2:],
+                              "advantages": tab[3][tab[3].find(": ") + 2:],
+                              "limitations": tab[4][tab[4].find(": ") + 2:],
+                              "id": index})
+    return tech_dict
+
+def save_dataframe(df, title):
+    pd.DataFrame(df).to_excel(title)
+    return title
+
+def stem(data, data_type):
+    stemmer = SnowballStemmer("english")
+    processed_data = []
+    if data_type == "technologies":
+        for t_item in data:
+            processed_data.append({
+                "title": stemmer.stem(t_item["title"]),
+                "purpose": stemmer.stem(t_item["purpose"]),
+                "key_components": stemmer.stem(t_item["key_components"]),
+                "advantages": stemmer.stem(t_item["advantages"]),
+                "limitations": stemmer.stem(t_item["limitations"]),
+                "id": t_item["id"]
+            })
+    else:
+        # Constraints arrive as a {title: description} mapping.
+        for t_item in data:
+            processed_data.append({
+                "title": stemmer.stem(t_item),
+                "description": stemmer.stem(data[t_item])
+            })
+
+    return processed_data
+
+
+def get_technologies_by_id(id_list, technologies):
+    result = []
+    id_set = set(id_list)
+    for tech in technologies:
+        if tech.get('id') in id_set:
+            result.append(tech)
+    return result
+
+def save_to_pickle(result_similarities):
+
+    constraint_titles = sorted(list(set([item['constraint']['title'] for item in result_similarities])))
+    max_id2 = max([item['id2'] for item in result_similarities])
+
+    row_label_to_index = {title: i for i, title in enumerate(constraint_titles)}
+    col_labels = list(range(1, max_id2 + 1))
+
+    num_rows = len(constraint_titles)
+    num_cols = max_id2
+
+    matrix = np.full((num_rows, num_cols), np.nan, dtype=np.float32)
+
+    for item in result_similarities:
+        row_idx = row_label_to_index[item['constraint']['title']]
+        col_idx = item['id2'] - 1  # ids are 1-based, columns 0-based
+        similarity_value = item['similarity'].item()
+
+        matrix[row_idx, col_idx] = similarity_value
+
+    print(f"Successfully created matrix with shape: {matrix.shape}")
+    print(f"Number of rows (unique constraints): {num_rows}")
+    print(f"Number of columns (max id2): {num_cols}")
+    print("\nExample 5x5 block of the created matrix (NaN for missing values):")
+    print(matrix[:5, :5])
+
+    output_filename = "cosine_similarity_matrix_with_labels.pkl"
+    data_to_save = {
+        'matrix': matrix,
+        'row_labels': constraint_titles,
+        'col_labels': col_labels
+    }
+
+    with open(output_filename, 'wb') as f:
+        pickle.dump(data_to_save, f)
+
+    print(f"\nMatrix and labels saved to {output_filename}")
+    return output_filename
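The pickle written by save_to_pickle stores the matrix together with its labels, so it can be reloaded independently of the pipeline; a short sketch:

    import pickle

    with open("cosine_similarity_matrix_with_labels.pkl", "rb") as f:
        data = pickle.load(f)
    # Rows are constraint titles, columns are technology ids 1..max_id2.
    print(data["matrix"].shape, data["row_labels"][:3], data["col_labels"][:3])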