heymenn committed
Commit b03d3b6 · verified · 1 Parent(s): 6cf2fcb

Upload 6 files

.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 app/services/technologies_database.xlsx filter=lfs diff=lfs merge=lfs -text
+src/services/technologies_database.xlsx filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,20 @@
+from fastapi import FastAPI
+from pydantic import BaseModel
+
+from src.core import process_input
+
+app = FastAPI(
+    title="Insight Finder",
+    description="Find relevant technologies from a problem",
+)
+
+class InputData(BaseModel):
+    problem: str
+
+class OutputData(BaseModel):
+    technologies: list
+
+@app.post("/process", response_model=OutputData)
+async def process(data: InputData):
+    # process_input expects a plain dict (set_prompt indexes data["problem"])
+    # and returns a list, so convert on the way in and wrap on the way out
+    # to satisfy the response model.
+    result = process_input(data.model_dump())
+    return {"technologies": result}
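A quick way to exercise this route once the app is running (e.g. with uvicorn app:app) is sketched below; the host, port, and sample problem text are assumptions for illustration, not taken from the repo:

import requests

resp = requests.post(
    "http://127.0.0.1:8000/process",  # assumed default uvicorn host/port
    json={"problem": "Reduce heat buildup in a sealed electronics enclosure"},
)
print(resp.json()["technologies"])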
src/__init__.py ADDED
File without changes
src/core.py ADDED
@@ -0,0 +1,17 @@
+from src.services.utils import (
+    set_prompt, stem, save_dataframe, load_technologies,
+    save_to_pickle, get_technologies_by_id,
+)
+from src.services.processor import (
+    retrieve_constraints, preprocess_tech_data,
+    get_contrastive_similarities, find_best_list_combinations,
+    select_technologies,
+)
+
+def process_input(data):
+    # Extract constraints from the problem via the LLM, stem them, score every
+    # (constraint, technology) pair, then pick the smallest covering tech set.
+    prompt = set_prompt(data)
+    constraints = retrieve_constraints(prompt)
+    constraints_stemmed = stem(constraints, "constraints")
+    save_dataframe(constraints_stemmed, "constraints_stemmed.xlsx")
+    df = load_technologies()
+    global_tech, keys, original_tech = preprocess_tech_data(df)
+    save_dataframe(global_tech, "global_tech.xlsx")
+    result_similarities, matrix = get_contrastive_similarities(global_tech, constraints_stemmed)
+    save_to_pickle(result_similarities)
+    best_combinations = find_best_list_combinations(constraints_stemmed, global_tech, matrix)
+    best_technologies_id = select_technologies(best_combinations)
+    best_technologies = get_technologies_by_id(best_technologies_id, global_tech)
+    return best_technologies
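The shapes flowing through this pipeline are easy to lose track of. As a hedged illustration (the constraint below is invented, not real pipeline output): retrieve_constraints yields a {title: description} dict, and stem(..., "constraints") turns it into the list of dicts the similarity step consumes:

# Hypothetical intermediate data, for illustration only:
constraints = {
    "Thermal dissipation": "The enclosure must shed heat without a fan.",
}
# stem(constraints, "constraints") then yields roughly:
constraints_stemmed = [
    {"title": "thermal dissip", "description": "the enclosure must shed heat without a fan."},
]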
src/services/processor.py ADDED
@@ -0,0 +1,217 @@
+from src.services.utils import tech_to_dict, stem
+import requests as r
+import json
+import nltk
+import itertools
+import numpy as np
+
+from sentence_transformers import SentenceTransformer
+model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+
+def retrieve_constraints(prompt):
+    # Ask the hosted LLM for the constraints, then pull the first {...}
+    # block out of its free-text answer.
+    request_input = {"models": ["meta-llama/llama-4-scout-17b-16e-instruct"], "messages": [{"role": "user", "content": prompt}]}
+    response = r.post("https://organizedprogrammers-bettergroqinterface.hf.space/chat", json=request_input)
+
+    decoded_content = json.loads(response.content.decode())
+    llm_response = decoded_content["content"][0]["message"]["content"]
+
+    start_marker = '{'
+    end_marker = '}'
+    start_index = llm_response.find(start_marker) + len(start_marker)
+    end_index = llm_response.find(end_marker, start_index)
+    json_str = llm_response[start_index:end_index].strip()
+
+    constraints_json = json.loads("{" + json_str + "}")
+
+    return constraints_json
+
+
+def preprocess_tech_data(_df):
+    # Callers unpack three values, so every early return must match.
+    if _df is None or "description" not in _df.columns:
+        return [], [], []
+
+    technologies_list = _df["description"].to_list()
+    tech_dict_raw = tech_to_dict(technologies_list)
+
+    tech_dict_filtered = [
+        t for t in tech_dict_raw if (
+            len(t.get("title", "")) >= 5 and
+            len(t.get("advantages", "")) >= 5 and
+            len(t.get("key_components", "")) >= 5
+        )
+    ]
+
+    if not tech_dict_filtered:
+        return [], [], []
+
+    processed_tech_wt = stem(tech_dict_filtered, "technologies")
+
+    for t_item_wt in processed_tech_wt:
+        kc = t_item_wt.get("key_components")
+        if isinstance(kc, str):
+            t_item_wt["key_components"] = ' '.join(nltk.sent_tokenize(kc))
+        else:
+            t_item_wt["key_components"] = ""
+
+    original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]
+
+    _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
+    return processed_tech_wt, _keys, original_tech_for_display
+
+
+def remove_over_repeated_technologies(result):
+    # Drop technologies that appear in more than 30% of the per-constraint
+    # top-5 lists; they are too generic to discriminate between constraints.
+    total_lists = len(result)
+    tech_title = {}
+
+    for item in result:
+        for tech in item['technologies']:
+            title = tech[0]['title']
+            tech_title[title] = tech_title.get(title, 0) + 1
+
+    threshold = total_lists * 0.3
+    to_delete = []
+    for tech, count in tech_title.items():
+        if count > threshold:
+            print(f"Over-repeated technology removed: {tech}")
+            to_delete.append(tech)
+
+    for idx, item in enumerate(result):
+        result[idx]['technologies'] = [tech for tech in item['technologies'] if tech[0]['title'] not in to_delete]
+
+    return result
+
+def get_contrastive_similarities(global_tech, constraints):
+    # Score every (constraint, technology) pair by the embedding similarity
+    # between the constraint description and the technology purpose. The
+    # matrix is constraint-major, matching find_best_list_combinations.
+    selected_pairs = []
+    matrix = np.zeros((len(constraints), len(global_tech)), dtype=np.float32)
+
+    for i, constraint in enumerate(constraints):
+        constraint_emb = model.encode(constraint["description"])
+        for j, tech2 in enumerate(global_tech):
+            purpose_sim = model.similarity(constraint_emb, model.encode(tech2["purpose"])).item()
+            if np.isnan(purpose_sim):
+                purpose_sim = 0.0
+
+            selected_pairs.append({
+                "constraint": constraint,
+                "id2": tech2["id"],
+                "similarity": purpose_sim
+            })
+            matrix[i, j] = purpose_sim
+
+    return selected_pairs, matrix
+
+
+def find_best_list_combinations(list1: list[dict], list2: list[dict], matrix) -> list[dict]:
+    if not list1 or not list2:
+        print("Warning: One or both input lists are empty. Returning an empty list.")
+        return []
+
+    MIN_SIMILARITY = 0.3
+    MAX_SIMILARITY = 0.8
+
+    # For each constraint, keep the technologies whose similarity falls inside
+    # the [MIN_SIMILARITY, MAX_SIMILARITY] band, then keep only the top 5.
+    possible_matches_for_each_l1 = []
+    for i in range(len(list1)):
+        valid_matches_for_l1_element = []
+        for j in range(len(list2)):
+            score = matrix[i, j]
+            if MIN_SIMILARITY <= score <= MAX_SIMILARITY:
+                valid_matches_for_l1_element.append((list2[j], score))
+
+        if not valid_matches_for_l1_element:
+            print(f"No valid matches found in list2 for '{list1[i]}' "
+                  f"(no score between {MIN_SIMILARITY} and {MAX_SIMILARITY}); "
+                  "skipping this constraint.")
+        else:
+            possible_matches_for_each_l1.append((valid_matches_for_l1_element, list1[i]))
+
+    result = []
+    for tech_list, problem in possible_matches_for_each_l1:
+        sorted_list = sorted(
+            tech_list,
+            key=lambda x: x[1].item() if hasattr(x[1], 'item') else float(x[1]),
+            reverse=True
+        )
+        top5 = sorted_list[:5]
+        result.append({
+            'technologies': top5,
+            'problem': problem
+        })
+
+    result = remove_over_repeated_technologies(result)
+    return result
+
+
+def select_technologies(problem_technology_list):
+    # Exhaustive search for the smallest set of technologies that covers every
+    # problem, breaking ties by the highest average best-match similarity.
+    distinct_techs = set()
+    candidate_map = []
+
+    for problem_data in problem_technology_list:
+        cand_dict = {}
+        for tech_info, sim in problem_data['technologies']:
+            tech_id = tech_info['id']
+            distinct_techs.add(tech_id)
+            cand_dict[tech_id] = float(sim)
+        candidate_map.append(cand_dict)
+
+    distinct_techs = sorted(distinct_techs)
+    n = len(problem_technology_list)
+
+    if n == 0:
+        return set()
+
+    min_k = None
+    best_set = None
+    best_avg = -1
+
+    print(f"Distinct technologies: {distinct_techs}")
+    print(f"Number of problems: {n}")
+
+    for k in range(1, len(distinct_techs) + 1):
+        if min_k is not None and k > min_k:
+            break
+
+        for T in itertools.combinations(distinct_techs, k):
+            total_sim = 0.0
+            covered = True
+            for i in range(n):
+                # Best similarity problem i gets from any tech in T.
+                sims = [candidate_map[i][tech] for tech in T if tech in candidate_map[i]]
+                if not sims:
+                    covered = False
+                    break
+                total_sim += max(sims)
+
+            if covered:
+                avg_sim = total_sim / n
+                if min_k is None or k < min_k:
+                    min_k = k
+                    best_set = T
+                    best_avg = avg_sim
+                elif k == min_k and avg_sim > best_avg:
+                    best_set = T
+                    best_avg = avg_sim
+
+        if min_k is not None and k == min_k:
+            break
+
+    if best_set is None:
+        return set()
+    return set(best_set)
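To make the covering search concrete, here is a tiny hand-built example (ids and scores invented, for illustration only): both problems list technology 3 among their candidates, so the search finds a full cover at k=1 and never tries larger sets:

# Hypothetical input for select_technologies:
problems = [
    {'problem': 'p1', 'technologies': [({'id': 3}, 0.62), ({'id': 1}, 0.55)]},
    {'problem': 'p2', 'technologies': [({'id': 3}, 0.48), ({'id': 2}, 0.71)]},
]
print(select_technologies(problems))  # {3}: covers both problems with one tech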
src/services/technologies_database.xlsx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:370d7a151085850b5fb7a6f9de41313e83686e4da434b6e8be94da38838c1ef7
+size 213138
src/services/utils.py ADDED
@@ -0,0 +1,110 @@
+import pickle
+import numpy as np
+import pandas as pd
+
+import nltk
+from nltk.stem.snowball import SnowballStemmer
+nltk.download("punkt_tab")
+
+
+def set_prompt(input_data):
+    prompt = """Task : Find all the constraints in this technical problem, making sure each is premised on the problem only.
+Take into account different technical domains to encompass the whole problem.
+Output each constraint in a json such as : ({"title of the constraint1":"description1","title of the constraintN":"descriptionN"})
+Technical problem :
+""" + input_data['problem']
+    return prompt
+
+def load_technologies():
+    # The spreadsheet is tracked by git-lfs under src/services/ (see .gitattributes).
+    df = pd.read_excel('src/services/technologies_database.xlsx')
+    return df
+
+def tech_to_dict(technologies):
+    # Each raw description is a newline-separated block of "field: value"
+    # lines: title, purpose, key_components, advantages, limitations.
+    tech_dict = []
+    for index, tech in enumerate(technologies):
+        if tech.find("<title>") <= 1:  # keep blocks where "<title>" is at the start (or absent)
+            tab = tech.split("\n")
+            tab.pop(0)
+            tab.pop()
+            tech_dict.append({"title": tab[0][tab[0].find(": ") + 2:],
+                              "purpose": tab[1][tab[1].find(": ") + 2:],
+                              "key_components": tab[2][tab[2].find(": ") + 2:],
+                              "advantages": tab[3][tab[3].find(": ") + 2:],
+                              "limitations": tab[4][tab[4].find(": ") + 2:],
+                              "id": index})
+    return tech_dict
+
+def save_dataframe(df, title):
+    pd.DataFrame(df).to_excel(title)
+    return title
+
+def stem(data, data_type):
+    # Note: SnowballStemmer.stem works on single words, so applying it to a
+    # whole field mostly lowercases it and stems only the final token.
+    stemmer = SnowballStemmer("english")
+    processed_data = []
+    if data_type == "technologies":
+        for t_item in data:
+            processed_data.append({
+                "title": stemmer.stem(t_item["title"]),
+                "purpose": stemmer.stem(t_item["purpose"]),
+                "key_components": stemmer.stem(t_item["key_components"]),
+                "advantages": stemmer.stem(t_item["advantages"]),
+                "limitations": stemmer.stem(t_item["limitations"]),
+                "id": t_item["id"]
+            })
+    else:
+        # Constraints arrive as a {title: description} dict.
+        for title, description in data.items():
+            processed_data.append({
+                "title": stemmer.stem(title),
+                "description": stemmer.stem(description)
+            })
+
+    return processed_data
+
+
+def get_technologies_by_id(id_list, technologies):
+    result = []
+    id_set = set(id_list)
+    for tech in technologies:
+        if tech.get('id') in id_set:
+            result.append(tech)
+    return result
+
+def save_to_pickle(result_similarities):
+    constraint_titles = sorted({item['constraint']['title'] for item in result_similarities})
+    max_id2 = max(item['id2'] for item in result_similarities)
+
+    row_label_to_index = {title: i for i, title in enumerate(constraint_titles)}
+    # Technology ids come from enumerate and start at 0, so the matrix needs
+    # max_id2 + 1 columns and id2 indexes the columns directly.
+    col_labels = list(range(max_id2 + 1))
+
+    num_rows = len(constraint_titles)
+    num_cols = max_id2 + 1
+
+    matrix = np.full((num_rows, num_cols), np.nan, dtype=np.float32)
+
+    for item in result_similarities:
+        row_idx = row_label_to_index[item['constraint']['title']]
+        col_idx = item['id2']
+        matrix[row_idx, col_idx] = float(item['similarity'])
+
+    print(f"Successfully created matrix with shape: {matrix.shape}")
+    print(f"Number of rows (unique constraints): {num_rows}")
+    print(f"Number of columns (technology ids): {num_cols}")
+
+    output_filename = "cosine_similarity_matrix_with_labels.pkl"
+    data_to_save = {
+        'matrix': matrix,
+        'row_labels': constraint_titles,
+        'col_labels': col_labels
+    }
+
+    with open(output_filename, 'wb') as f:
+        pickle.dump(data_to_save, f)
+
+    print(f"\nMatrix and labels saved to {output_filename}")
+    return output_filename
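Reading the saved matrix back is symmetric; a minimal sketch, assuming save_to_pickle has already run in the current working directory:

import pickle

with open("cosine_similarity_matrix_with_labels.pkl", "rb") as f:
    data = pickle.load(f)
# NaN cells mark (constraint, technology) pairs with no recorded similarity.
print(data['matrix'].shape, data['row_labels'][:3], data['col_labels'][:3])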