heymenn committed
Commit 2d541bb · verified · 1 Parent(s): dc7081e

Delete src

src/__init__.py DELETED
File without changes
src/core.py DELETED
@@ -1,17 +0,0 @@
- from src.services.utils import *
- from src.services.processor import *
-
- def process_input(data):
-     prompt = set_prompt(data.problem)
-     constraints = retrieve_constraints(prompt)
-     constraints_stemmed = stem(constraints, "constraints")
-     save_dataframe(constraints_stemmed, "constraints_stemmed.xlsx")
-     df = load_technologies()
-     global_tech, keys, original_tech = preprocess_tech_data(df)
-     save_dataframe(global_tech, "global_tech.xlsx")
-     result_similarities, matrix = get_contrastive_similarities(global_tech, constraints_stemmed)
-     save_to_pickle(result_similarities)
-     best_combinations = find_best_list_combinations(constraints_stemmed,global_tech, matrix)
-     best_technologies_id = select_technologies(best_combinations)
-     best_technologies = get_technologies_by_id(best_technologies_id,global_tech)
-     return best_technologies
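For reference, a minimal sketch of how this deleted entry point could be driven from the parent revision dc7081e. The Request dataclass below is purely illustrative (process_input only reads data.problem), and actually running it needs the rest of the deleted src/ package plus the remote chat endpoint that retrieve_constraints calls.

```python
# Hypothetical driver for the deleted pipeline (works only on the parent revision).
from dataclasses import dataclass

from src.core import process_input  # removed by this commit


@dataclass
class Request:
    # process_input only reads data.problem, so any object carrying this
    # attribute would do; the dataclass is illustrative, not original code.
    problem: str


if __name__ == "__main__":
    req = Request(problem="Cool a sealed electronics enclosure without "
                          "forced airflow or additional mass.")
    for tech in process_input(req):  # returns the selected technology dicts
        print(tech["title"])
```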
src/services/processor.py DELETED
@@ -1,218 +0,0 @@
- from src.services.utils import tech_to_dict, stem
- import requests as r
- import json
- import nltk
- import itertools
- import numpy as np
-
- from sentence_transformers import *
- model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-
- def retrieve_constraints(prompt):
-     request_input = {"models": ["meta-llama/llama-4-scout-17b-16e-instruct"], "messages": [{"role":"user", "content":prompt}]}
-     response = r.post("https://organizedprogrammers-bettergroqinterface.hf.space/chat", json=request_input)
-
-     decoded_content = json.loads(response.content.decode())
-     print(f"response : {decoded_content}")
-     llm_response = decoded_content["content"]
-
-     start_marker = '{'
-     end_marker = '}'
-     start_index = llm_response.find(start_marker) + len(start_marker)
-     end_index = llm_response.find(end_marker, start_index)
-     json_str = llm_response[start_index:end_index].strip()
-
-     constraints_json = json.loads("{"+json_str+"}")
-
-     return constraints_json
-
-
- def preprocess_tech_data(_df):
-     if _df is None or "description" not in _df.columns:
-         return [], []
-
-     technologies_list = _df["description"].to_list()
-     tech_dict_raw = tech_to_dict(technologies_list)
-
-     tech_dict_filtered = [
-         t for t in tech_dict_raw if (
-             len(t.get("title", "")) >= 5 and
-             len(t.get("advantages", "")) >= 5 and
-             len(t.get("key_components", "")) >= 5
-         )
-     ]
-
-     if not tech_dict_filtered:
-         return [], []
-
-     processed_tech_wt = stem(tech_dict_filtered,"technologies")
-
-     for t_item_wt in processed_tech_wt:
-         kc = t_item_wt.get("key_components")
-         if isinstance(kc, str):
-             t_item_wt["key_components"] = ''.join(nltk.sent_tokenize(kc))
-         else:
-             t_item_wt["key_components"] = ""
-
-     original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]
-
-
-     _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
-     return processed_tech_wt, _keys, original_tech_for_display
-
-
- def remove_over_repeated_technologies(result):
-     total_lists = len(result)
-     tech_title = {}
-
-     for idx, item in enumerate(result):
-         for tech in item['technologies']:
-             tech_title[tech[0]['title']] = 0 if tech[0]['title'] not in tech_title else tech_title[tech[0]['title']] + 1
-
-     threshold = total_lists * 0.3
-     print(threshold)
-     print(tech_title)
-     to_delete = []
-     for tech, lists in tech_title.items():
-         if lists > threshold:
-             print(f"This technology have been found over repeated : " + tech)
-             to_delete.append(tech)
-
-     for idx, item in enumerate(result):
-         result[idx]['technologies'] = [tech for tech in item['technologies'] if tech[0]['title'] not in to_delete]
-
-     return result
-
- def get_contrastive_similarities(global_tech, constraints):
-     selected_pairs = []
-     matrix = []
-
-     for i, constraint in enumerate(constraints):
-         print(constraint)
-         for j, tech2 in enumerate(global_tech):
-             if i >= j:
-                 continue
-
-             purpose_sim = model.similarity(model.encode(constraint["description"]), model.encode(tech2["purpose"]))
-
-             print(f"Constraint: {constraint}, Tech 2: {tech2['title']}")
-             print(f"Purpose Similarity: {purpose_sim}")
-             selected_pairs.append({
-                 "constraint": constraint,
-                 "id2": tech2["id"],
-                 "similarity": purpose_sim
-             })
-             if purpose_sim == np.float32(None):
-                 purpose_sim = 0.0
-             matrix.append(purpose_sim)
-
-     return selected_pairs,matrix
-
-
- def find_best_list_combinations(list1: list[str], list2: list[str], matrix) -> list[dict]:
-     if not list1 or not list2:
-         print("Warning: One or both input lists are empty. Returning an empty list.")
-         return []
-
-     MIN_SIMILARITY = 0.3
-     MAX_SIMILARITY = 0.8
-
-     possible_matches_for_each_l1 = []
-     for i in range(len(list1)):
-         valid_matches_for_l1_element = []
-         for j in range(len(list2)):
-             score = matrix[i, j]
-
-             if MIN_SIMILARITY <= score <= MAX_SIMILARITY:
-                 valid_matches_for_l1_element.append((list2[j], score))
-
-         if not valid_matches_for_l1_element:
-             print(f"No valid matches found in list2 for '{list1[i]}' from list1 "
-                   f"(score between {MIN_SIMILARITY} and {MAX_SIMILARITY}). "
-                   "Returning an empty list as no complete combinations can be formed.")
-
-         else:
-             possible_matches_for_each_l1.append((valid_matches_for_l1_element, list1[i]))
-
-     result = []
-     for tech_list, problem in possible_matches_for_each_l1:
-         sorted_list = sorted(
-             tech_list,
-             key=lambda x: x[1].item() if hasattr(x[1], 'item') else float(x[1]),
-             reverse=True
-         )
-         top5 = sorted_list[:5]
-         result.append({
-             'technologies': top5,
-             'problem': problem
-         })
-
-     result = remove_over_repeated_technologies(result)
-     return result
-
-
- def select_technologies(problem_technology_list):
-     distinct_techs = set()
-     candidate_map = []
-
-     for problem_data in problem_technology_list:
-         cand_dict = {}
-         for tech_info, sim in problem_data['technologies']:
-             tech_id = tech_info['id']
-             distinct_techs.add(tech_id)
-             cand_dict[tech_id] = float(sim)
-         candidate_map.append(cand_dict)
-
-     distinct_techs = sorted(list(distinct_techs))
-     n = len(problem_technology_list)
-
-     if n == 0:
-         return set()
-
-     min_k = None
-     best_set = None
-     best_avg = -1
-
-     print(f"Distinct technologies: {distinct_techs}")
-     print(f"Candidate map: {candidate_map}")
-     print(f"Number of problems: {n}")
-
-     for k in range(1, len(distinct_techs)+1):
-         if min_k is not None and k > min_k:
-             break
-
-         for T in itertools.combinations(distinct_techs, k):
-             total_sim = 0.0
-             covered = True
-             print(f"Trying combination: {T}")
-             for i in range(n):
-                 max_sim = -1.0
-                 found = False
-                 for tech in T:
-                     if tech in candidate_map[i]:
-                         found = True
-                         sim_val = candidate_map[i][tech]
-                         if sim_val > max_sim:
-                             max_sim = sim_val
-                 if not found:
-                     covered = False
-                     break
-                 else:
-                     total_sim += max_sim
-
-             if covered:
-                 avg_sim = total_sim / n
-                 if min_k is None or k < min_k:
-                     min_k = k
-                     best_set = T
-                     best_avg = avg_sim
-                 elif k == min_k and avg_sim > best_avg:
-                     best_set = T
-                     best_avg = avg_sim
-
-         if min_k is not None and k == min_k:
-             break
-
-     if best_set is None:
-         return set()
-     return set(best_set)
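A toy illustration of the deleted select_technologies may help: it searches for the smallest set of technology ids that covers every constraint, breaking ties by the highest average best-match similarity. The constraint titles and similarity scores below are invented, and the import only resolves on the parent revision.

```python
# Hypothetical usage of the deleted select_technologies (parent revision only);
# the input data is made up for illustration.
from src.services.processor import select_technologies  # removed by this commit

problems = [
    {  # candidates matched to the first constraint
        "problem": {"title": "heat"},
        "technologies": [({"id": 1, "title": "heat pipe"}, 0.62),
                         ({"id": 2, "title": "fan"}, 0.41)],
    },
    {  # candidates matched to the second constraint
        "problem": {"title": "noise"},
        "technologies": [({"id": 2, "title": "fan"}, 0.35),
                         ({"id": 3, "title": "damper"}, 0.58)],
    },
]

# Technology 2 alone covers both constraints, so the search prefers the
# single-element set {2} over any larger combination, even though the
# individual similarities of {1, 3} are higher.
print(select_technologies(problems))  # expected: {2}
```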
src/services/technologies_database.xlsx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:370d7a151085850b5fb7a6f9de41313e83686e4da434b6e8be94da38838c1ef7
- size 213138
src/services/utils.py DELETED
@@ -1,111 +0,0 @@
- import pickle
- import numpy as np
- import pandas as pd
-
- import nltk
- from nltk.stem import *
- nltk.download("punkt_tab")
-
- TECH_PATH = "/app/src/services/technologies_database.xlsx"
-
- def set_prompt(problem):
-     prompt = """Task : Find all the constraints in this technical problem making sure each are premised on the problem only.
-     Take into account different technical domains to encompass the whole problem.
-     Output each constraints in a json such as : ({"title of the constraints1":"description1","title of the constraintsN":"descriptionN"})
-     Technical problem :
-     """ + problem
-     return prompt
-
- def load_technologies():
-     df = pd.read_excel(TECH_PATH)
-     return df
-
- def tech_to_dict(technologies):
-     tech_dict = []
-     for index, tech in enumerate(technologies):
-         if not tech.find("<title>") > 1:
-             tab = tech.split("\n")
-             tab.pop(0)
-             tab.pop(len(tab)-1)
-             tech_dict.append({"title": tab[0][tab[0].find(": ")+2:],
-                               "purpose": tab[1][tab[1].find(": ")+2:],
-                               "key_components": tab[2][tab[2].find(": ")+2:],
-                               "advantages": tab[3][tab[3].find(": ")+2:],
-                               "limitations": tab[4][tab[4].find(": ")+2:],
-                               "id": index})
-     return tech_dict
-
- def save_dataframe(df, title):
-     pd.DataFrame(df).to_excel(title)
-     return title
-
- def stem(data,data_type):
-     stemmer = SnowballStemmer("english")
-     processed_data = []
-     if data_type == "technologies":
-         for t_item in data:
-             processed_data.append({
-                 "title": stemmer.stem(t_item["title"]),
-                 "purpose": stemmer.stem(t_item["purpose"]),
-                 "key_components": stemmer.stem(t_item["key_components"]),
-                 "advantages": stemmer.stem(t_item["advantages"]),
-                 "limitations": stemmer.stem(t_item["limitations"]),
-                 "id": t_item["id"]
-             })
-     else:
-         for t_item in data:
-             print(t_item)
-             processed_data.append({
-                 "title": stemmer.stem(t_item),
-                 "description": stemmer.stem(data[t_item])
-             })
-
-     return processed_data
-
-
- def get_technologies_by_id(id_list, technologies):
-     result = []
-     id_set = set(id_list)
-     for tech in technologies:
-         if tech.get('id') in id_set:
-             result.append(tech)
-     return result
-
- def save_to_pickle(result_similarites):
-
-     constraint_titles = sorted(list(set([item['constraint']['title'] for item in result_similarites])))
-     max_id2 = max([item['id2'] for item in result_similarites])
-
-     row_label_to_index = {title: i for i, title in enumerate(constraint_titles)}
-     col_labels = list(range(1, max_id2 + 1))
-
-     num_rows = len(constraint_titles)
-     num_cols = max_id2
-
-     matrix = np.full((num_rows, num_cols), np.nan, dtype=np.float32)
-
-     for item in result_similarites:
-         row_idx = row_label_to_index[item['constraint']['title']]
-         col_idx = item['id2'] - 1
-         similarity_value = item['similarity'].item()
-
-         matrix[row_idx, col_idx] = similarity_value
-
-     print(f"Successfully created matrix with shape: {matrix.shape}")
-     print(f"Number of rows (unique constraints): {num_rows}")
-     print(f"Number of columns (max id2): {num_cols}")
-     print("\nExample 5x5 block of the created matrix (NaN for missing values):")
-     print(matrix[:5, :5])
-
-     output_filename = "cosine_similarity_matrix_with_labels.pkl"
-     data_to_save = {
-         'matrix': matrix,
-         'row_labels': constraint_titles,
-         'col_labels': col_labels
-     }
-
-     with open(output_filename, 'wb') as f:
-         pickle.dump(data_to_save, f)
-
-     print(f"\nMatrix and labels saved to {output_filename}")
-     return output_filename
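The deleted tech_to_dict assumed each spreadsheet description was a newline-separated block whose first and last lines are discarded and whose remaining lines are "Field: value" pairs in a fixed order (title, purpose, key components, advantages, limitations). The sample string below is a guess at that format, not taken from the deleted spreadsheet, and the import only resolves on the parent revision.

```python
# Hypothetical description block in the layout tech_to_dict parses
# (parent revision only); the content is invented for illustration.
from src.services.utils import tech_to_dict  # removed by this commit

sample = "\n".join([
    "Technology 1",                                               # dropped (header line)
    "Title: Phase-change heat pipe",
    "Purpose: Passive heat transport",
    "Key components: wick, working fluid, sealed envelope",
    "Advantages: no moving parts, high effective conductivity",
    "Limitations: orientation sensitivity, dry-out at high flux",
    "---",                                                        # dropped (footer line)
])

print(tech_to_dict([sample]))
# -> [{'title': 'Phase-change heat pipe', 'purpose': 'Passive heat transport', ...}]
```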