heymenn committed (verified)
Commit 1f05644 · Parent: 2d541bb

Upload 7 files

.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  app/services/technologies_database.xlsx filter=lfs diff=lfs merge=lfs -text
  src/services/technologies_database.xlsx filter=lfs diff=lfs merge=lfs -text
+ src/ressources/technologies_database.xlsx filter=lfs diff=lfs merge=lfs -text
src/__init__.py ADDED
File without changes
src/core.py ADDED
@@ -0,0 +1,26 @@
+ from src.services.utils import *
+ from src.services.processor import *
+
+ # End-to-end pipeline: extract constraints from the problem statement, match them
+ # against the technology database embeddings, and return the selected technologies.
+ def process_input(data):
+     prompt = set_prompt(data.problem)
+
+     constraints = retrieve_constraints(prompt)
+     constraints_stemmed = stem(constraints, "constraints")
+
+     save_dataframe(constraints_stemmed, "constraints_stemmed.xlsx")
+
+     global_tech, global_tech_embeddings = load_technologies()
+
+     # global_tech, keys, original_tech = preprocess_tech_data(df)
+
+     save_dataframe(global_tech, "global_tech.xlsx")
+
+     result_similarities, matrix = get_contrastive_similarities(constraints_stemmed, global_tech, global_tech_embeddings)
+
+     save_to_pickle(result_similarities)
+
+     best_combinations = find_best_list_combinations(constraints_stemmed, global_tech, matrix)
+     best_technologies_id = select_technologies(best_combinations)
+     best_technologies = get_technologies_by_id(best_technologies_id, global_tech)
+
+     return best_technologies
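
A minimal call sketch for this entry point, assuming `data` is any object exposing a `problem` attribute (the `SimpleNamespace` input below is illustrative, not part of the commit):

    from types import SimpleNamespace
    from src.core import process_input

    # hypothetical problem statement used only for illustration
    data = SimpleNamespace(problem="Reduce the power consumption of a remote environmental sensor without losing measurement accuracy.")
    best_technologies = process_input(data)
    print(best_technologies)  # list of technology dicts selected for the problem's constraints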
src/ressources/global_tech_embeddings.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a62fa7bbe756522e26ed283c36fe42a62ba950ea1c765cc85e35ffb335894993
+ size 1809459
src/ressources/technologies_database.xlsx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:370d7a151085850b5fb7a6f9de41313e83686e4da434b6e8be94da38838c1ef7
+ size 213138
src/services/external_process.py ADDED
@@ -0,0 +1,124 @@
+ # This file computes the embeddings of the technologies; it is easily executable on Google Colab.
+
+ #!pip install sentence-transformers
+ #!pip install nltk
+
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+ import pickle
+ import pandas as pd
+ import nltk
+ from nltk.stem import *
+ nltk.download("punkt_tab")
+
+
+ print("Loading SentenceTransformer model...")
+ model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+ print("Model loaded.")
+
+ def load_technologies():
+     df = pd.read_excel('technologies_database.xlsx')
+     return df
+
+ def tech_to_dict(technologies):
+     tech_dict = []
+     for index, tech in enumerate(technologies):
+         if not tech.find("<title>") > 1:
+             tab = tech.split("\n")
+             tab.pop(0)
+             tab.pop(len(tab) - 1)
+             tech_dict.append({"title": tab[0][tab[0].find(": ") + 2:],
+                               "purpose": tab[1][tab[1].find(": ") + 2:],
+                               "key_components": tab[2][tab[2].find(": ") + 2:],
+                               "advantages": tab[3][tab[3].find(": ") + 2:],
+                               "limitations": tab[4][tab[4].find(": ") + 2:],
+                               "id": index})
+     return tech_dict
+
+ def stem(data, data_type):
+     stemmer = SnowballStemmer("english")
+     processed_data = []
+     if data_type == "technologies":
+         for t_item in data:
+             processed_data.append({
+                 "title": stemmer.stem(t_item["title"]),
+                 "purpose": stemmer.stem(t_item["purpose"]),
+                 "key_components": stemmer.stem(t_item["key_components"]),
+                 "advantages": stemmer.stem(t_item["advantages"]),
+                 "limitations": stemmer.stem(t_item["limitations"]),
+                 "id": t_item["id"]
+             })
+     else:
+         for t_item in data:
+             print(t_item)
+             processed_data.append({
+                 "title": stemmer.stem(t_item),
+                 "description": stemmer.stem(data[t_item])
+             })
+
+     return processed_data
+
+ def preprocess_tech_data(_df):
+     if _df is None or "description" not in _df.columns:
+         return [], [], []
+
+     technologies_list = _df["description"].to_list()
+     tech_dict_raw = tech_to_dict(technologies_list)
+
+     tech_dict_filtered = [
+         t for t in tech_dict_raw if (
+             len(t.get("title", "")) >= 5 and
+             len(t.get("advantages", "")) >= 5 and
+             len(t.get("key_components", "")) >= 5
+         )
+     ]
+
+     if not tech_dict_filtered:
+         return [], [], []
+
+     processed_tech_wt = stem(tech_dict_filtered, "technologies")
+
+     for t_item_wt in processed_tech_wt:
+         kc = t_item_wt.get("key_components")
+         if isinstance(kc, str):
+             t_item_wt["key_components"] = ''.join(nltk.sent_tokenize(kc))
+         else:
+             t_item_wt["key_components"] = ""
+
+     original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]
+
+     _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
+     return processed_tech_wt, _keys, original_tech_for_display
+
+
+ df = load_technologies()
+ global_tech, keys, original_tech = preprocess_tech_data(df)
+ global_tech_purposes = [t["purpose"] for t in global_tech]
+
+ # Encode all global_tech purposes into embeddings
+ print("Encoding global_tech purposes into embeddings... This might take a while for 1000 elements.")
+ global_tech_embeddings = model.encode(global_tech_purposes, show_progress_bar=True)
+ print("Global tech embeddings created.")
+
+ # Define the filename for the pickle file
+ output_filename = 'global_tech_embeddings.pkl'
+
+ # Save the embeddings and the global_tech data (optional, but good for context).
+ # Saving global_tech alongside the embeddings ensures the original data is available if needed.
+ data_to_save = {
+     'global_tech': global_tech,                       # the original list of dictionaries
+     'global_tech_embeddings': global_tech_embeddings  # the numpy array of embeddings
+ }
+
+ print(f"Saving embeddings and global_tech data to {output_filename}...")
+ with open(output_filename, 'wb') as f:
+     pickle.dump(data_to_save, f)
+ print(f"Data saved successfully to {output_filename}.")
+
+ print(f"\nTo load this file later in your API, use: \n"
+       f"with open('{output_filename}', 'rb') as f:\n"
+       f"    loaded_data = pickle.load(f)\n"
+       f"global_tech = loaded_data['global_tech']\n"
+       f"global_tech_embeddings = loaded_data['global_tech_embeddings']\n")
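
As a quick sanity check after running the script, the saved artifact can be reloaded and inspected; this is a minimal sketch mirroring the loading instructions printed above:

    import pickle

    with open('global_tech_embeddings.pkl', 'rb') as f:
        loaded_data = pickle.load(f)

    global_tech = loaded_data['global_tech']
    global_tech_embeddings = loaded_data['global_tech_embeddings']
    # number of technologies and the embedding matrix shape, e.g. (n, 384) for all-MiniLM-L6-v2
    print(len(global_tech), global_tech_embeddings.shape)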
src/services/processor.py ADDED
@@ -0,0 +1,218 @@
+ from src.services.utils import tech_to_dict, stem
+ import requests as r
+ import json
+ import nltk
+ import itertools
+ import numpy as np
+
+ from sentence_transformers import *
+ model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+
+ def retrieve_constraints(prompt):
+     request_input = {"models": ["meta-llama/llama-4-scout-17b-16e-instruct"], "messages": [{"role": "user", "content": prompt}]}
+     response = r.post("https://organizedprogrammers-bettergroqinterface.hf.space/chat", json=request_input)
+     print(f"response : {response}")
+     decoded_content = json.loads(response.content.decode())
+     llm_response = decoded_content["content"][0]["message"]["content"]
+
+     # Extract the JSON object embedded in the LLM answer (text between the first '{' and the next '}').
+     start_marker = '{'
+     end_marker = '}'
+     start_index = llm_response.find(start_marker) + len(start_marker)
+     end_index = llm_response.find(end_marker, start_index)
+     json_str = llm_response[start_index:end_index].strip()
+
+     constraints_json = json.loads("{" + json_str + "}")
+
+     return constraints_json
+
+
+ def preprocess_tech_data(_df):
+     if _df is None or "description" not in _df.columns:
+         return [], [], []
+
+     technologies_list = _df["description"].to_list()
+     tech_dict_raw = tech_to_dict(technologies_list)
+
+     tech_dict_filtered = [
+         t for t in tech_dict_raw if (
+             len(t.get("title", "")) >= 5 and
+             len(t.get("advantages", "")) >= 5 and
+             len(t.get("key_components", "")) >= 5
+         )
+     ]
+
+     if not tech_dict_filtered:
+         return [], [], []
+
+     processed_tech_wt = stem(tech_dict_filtered, "technologies")
+
+     for t_item_wt in processed_tech_wt:
+         kc = t_item_wt.get("key_components")
+         if isinstance(kc, str):
+             t_item_wt["key_components"] = ''.join(nltk.sent_tokenize(kc))
+         else:
+             t_item_wt["key_components"] = ""
+
+     original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]
+
+     _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
+     return processed_tech_wt, _keys, original_tech_for_display
+
+
+ def remove_over_repeated_technologies(result):
+     total_lists = len(result)
+     tech_title = {}
+
+     # Count in how many candidate lists each technology title appears.
+     for item in result:
+         for tech in item['technologies']:
+             title = tech[0]['title']
+             tech_title[title] = tech_title.get(title, 0) + 1
+
+     threshold = total_lists * 0.3
+     print(threshold)
+     print(tech_title)
+     to_delete = []
+     for tech, lists in tech_title.items():
+         if lists > threshold:
+             print("This technology has been found to be over-repeated: " + tech)
+             to_delete.append(tech)
+
+     for idx, item in enumerate(result):
+         result[idx]['technologies'] = [tech for tech in item['technologies'] if tech[0]['title'] not in to_delete]
+
+     return result
+
+ def get_contrastive_similarities(constraints, pre_encoded_tech_data, pre_encoded_tech_embeddings):
+     selected_pairs = []
+     matrix = []
+
+     constraint_descriptions = [c["description"] for c in constraints]
+     constraint_embeddings = model.encode(constraint_descriptions, show_progress_bar=False)
+
+     for i, constraint in enumerate(constraints):
+         constraint_embedding = constraint_embeddings[i]
+         row = []
+
+         for j, tech2 in enumerate(pre_encoded_tech_data):
+             tech_embedding = pre_encoded_tech_embeddings[j]
+
+             purpose_sim = model.similarity(constraint_embedding, tech_embedding)
+
+             if np.isnan(purpose_sim):
+                 purpose_sim = 0.0
+
+             selected_pairs.append({
+                 "constraint": constraint,
+                 "id2": tech2["id"],
+                 "similarity": purpose_sim
+             })
+             row.append(float(purpose_sim))
+
+         matrix.append(row)
+
+     # Return the similarities both as a flat pair list and as a 2D array
+     # indexed as matrix[i, j] by find_best_list_combinations.
+     return selected_pairs, np.array(matrix)
+
+ def find_best_list_combinations(list1: list[dict], list2: list[dict], matrix) -> list[dict]:
+     if not list1 or not list2:
+         print("Warning: One or both input lists are empty. Returning an empty list.")
+         return []
+
+     MIN_SIMILARITY = 0.3
+     MAX_SIMILARITY = 0.8
+
+     possible_matches_for_each_l1 = []
+     for i in range(len(list1)):
+         valid_matches_for_l1_element = []
+         for j in range(len(list2)):
+             score = matrix[i, j]
+
+             if MIN_SIMILARITY <= score <= MAX_SIMILARITY:
+                 valid_matches_for_l1_element.append((list2[j], score))
+
+         if not valid_matches_for_l1_element:
+             print(f"No valid matches found in list2 for '{list1[i]}' from list1 "
+                   f"(score between {MIN_SIMILARITY} and {MAX_SIMILARITY}). "
+                   "Skipping this element.")
+         else:
+             possible_matches_for_each_l1.append((valid_matches_for_l1_element, list1[i]))
+
+     result = []
+     for tech_list, problem in possible_matches_for_each_l1:
+         sorted_list = sorted(
+             tech_list,
+             key=lambda x: x[1].item() if hasattr(x[1], 'item') else float(x[1]),
+             reverse=True
+         )
+         top5 = sorted_list[:5]
+         result.append({
+             'technologies': top5,
+             'problem': problem
+         })
+
+     result = remove_over_repeated_technologies(result)
+     return result
+
+
+ def select_technologies(problem_technology_list):
+     distinct_techs = set()
+     candidate_map = []
+
+     for problem_data in problem_technology_list:
+         cand_dict = {}
+         for tech_info, sim in problem_data['technologies']:
+             tech_id = tech_info['id']
+             distinct_techs.add(tech_id)
+             cand_dict[tech_id] = float(sim)
+         candidate_map.append(cand_dict)
+
+     distinct_techs = sorted(list(distinct_techs))
+     n = len(problem_technology_list)
+
+     if n == 0:
+         return set()
+
+     min_k = None
+     best_set = None
+     best_avg = -1
+
+     print(f"Distinct technologies: {distinct_techs}")
+     print(f"Candidate map: {candidate_map}")
+     print(f"Number of problems: {n}")
+
+     # Look for the smallest set of technologies that covers every problem,
+     # breaking ties by the highest average similarity.
+     for k in range(1, len(distinct_techs) + 1):
+         if min_k is not None and k > min_k:
+             break
+
+         for T in itertools.combinations(distinct_techs, k):
+             total_sim = 0.0
+             covered = True
+             print(f"Trying combination: {T}")
+             for i in range(n):
+                 max_sim = -1.0
+                 found = False
+                 for tech in T:
+                     if tech in candidate_map[i]:
+                         found = True
+                         sim_val = candidate_map[i][tech]
+                         if sim_val > max_sim:
+                             max_sim = sim_val
+                 if not found:
+                     covered = False
+                     break
+                 else:
+                     total_sim += max_sim
+
+             if covered:
+                 avg_sim = total_sim / n
+                 if min_k is None or k < min_k:
+                     min_k = k
+                     best_set = T
+                     best_avg = avg_sim
+                 elif k == min_k and avg_sim > best_avg:
+                     best_set = T
+                     best_avg = avg_sim
+
+         if min_k is not None and k == min_k:
+             break
+
+     if best_set is None:
+         return set()
+     return set(best_set)
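
To illustrate the selection step, here is a small hypothetical input (two problems, three candidate technologies); the function returns the single technology id that covers both problems:

    # toy data for illustration only; real inputs come from find_best_list_combinations
    problems = [
        {'problem': 'p1', 'technologies': [({'id': 1, 'title': 'A'}, 0.7), ({'id': 2, 'title': 'B'}, 0.5)]},
        {'problem': 'p2', 'technologies': [({'id': 2, 'title': 'B'}, 0.6), ({'id': 3, 'title': 'C'}, 0.4)]},
    ]
    print(select_technologies(problems))  # {2}: the smallest set covering every problem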
src/services/utils.py ADDED
@@ -0,0 +1,123 @@
+ import pickle
+ import numpy as np
+ import pandas as pd
+
+ import nltk
+ from nltk.stem import *
+ nltk.download("punkt_tab")
+
+ FILE_PATH = "/app/src/ressources/technologies_database.xlsx"
+
+ def set_prompt(problem):
+     prompt = """Task : Find all the constraints in this technical problem, making sure each is premised on the problem only.
+     Take into account different technical domains to encompass the whole problem.
+     Output each constraint in a json such as : ({"title of the constraint1":"description1","title of the constraintN":"descriptionN"})
+     Technical problem :
+     """ + problem
+     return prompt
+
+ def load_technologies_excel():
+     df = pd.read_excel(FILE_PATH)
+     return df
+
+ def load_technologies():
+     EMBEDDINGS_FILE = '/app/src/ressources/global_tech_embeddings.pkl'
+
+     try:
+         with open(EMBEDDINGS_FILE, 'rb') as f:
+             loaded_data = pickle.load(f)
+             global_tech = loaded_data['global_tech']
+             global_tech_embedding = loaded_data['global_tech_embeddings']
+             return global_tech, global_tech_embedding
+     except Exception as e:
+         print(f"Error: {e}")
+
+ def tech_to_dict(technologies):
+     tech_dict = []
+     for index, tech in enumerate(technologies):
+         if not tech.find("<title>") > 1:
+             tab = tech.split("\n")
+             tab.pop(0)
+             tab.pop(len(tab) - 1)
+             tech_dict.append({"title": tab[0][tab[0].find(": ") + 2:],
+                               "purpose": tab[1][tab[1].find(": ") + 2:],
+                               "key_components": tab[2][tab[2].find(": ") + 2:],
+                               "advantages": tab[3][tab[3].find(": ") + 2:],
+                               "limitations": tab[4][tab[4].find(": ") + 2:],
+                               "id": index})
+     return tech_dict
+
+ def save_dataframe(df, title):
+     pd.DataFrame(df).to_excel(title)
+     return title
+
+ def stem(data, data_type):
+     stemmer = SnowballStemmer("english")
+     processed_data = []
+     if data_type == "technologies":
+         for t_item in data:
+             processed_data.append({
+                 "title": stemmer.stem(t_item["title"]),
+                 "purpose": stemmer.stem(t_item["purpose"]),
+                 "key_components": stemmer.stem(t_item["key_components"]),
+                 "advantages": stemmer.stem(t_item["advantages"]),
+                 "limitations": stemmer.stem(t_item["limitations"]),
+                 "id": t_item["id"]
+             })
+     else:
+         for t_item in data:
+             print(t_item)
+             processed_data.append({
+                 "title": stemmer.stem(t_item),
+                 "description": stemmer.stem(data[t_item])
+             })
+
+     return processed_data
+
+
+ def get_technologies_by_id(id_list, technologies):
+     result = []
+     id_set = set(id_list)
+     for tech in technologies:
+         if tech.get('id') in id_set:
+             result.append(tech)
+     return result
+
+ def save_to_pickle(result_similarities):
+     constraint_titles = sorted(list(set([item['constraint']['title'] for item in result_similarities])))
+     max_id2 = max([item['id2'] for item in result_similarities])
+
+     row_label_to_index = {title: i for i, title in enumerate(constraint_titles)}
+     col_labels = list(range(1, max_id2 + 1))
+
+     num_rows = len(constraint_titles)
+     num_cols = max_id2
+
+     matrix = np.full((num_rows, num_cols), np.nan, dtype=np.float32)
+
+     for item in result_similarities:
+         row_idx = row_label_to_index[item['constraint']['title']]
+         col_idx = item['id2'] - 1  # assumes 1-based technology ids mapped to 0-based columns
+         similarity_value = float(item['similarity'])  # works for tensors and plain floats
+
+         matrix[row_idx, col_idx] = similarity_value
+
+     print(f"Successfully created matrix with shape: {matrix.shape}")
+     print(f"Number of rows (unique constraints): {num_rows}")
+     print(f"Number of columns (max id2): {num_cols}")
+     print("\nExample 5x5 block of the created matrix (NaN for missing values):")
+     print(matrix[:5, :5])
+
+     output_filename = "cosine_similarity_matrix_with_labels.pkl"
+     data_to_save = {
+         'matrix': matrix,
+         'row_labels': constraint_titles,
+         'col_labels': col_labels
+     }
+
+     with open(output_filename, 'wb') as f:
+         pickle.dump(data_to_save, f)
+
+     print(f"\nMatrix and labels saved to {output_filename}")
+     return output_filename
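
A minimal sketch of reading the labeled matrix back, assuming only the keys written by save_to_pickle above:

    import pickle

    with open("cosine_similarity_matrix_with_labels.pkl", "rb") as f:
        saved = pickle.load(f)

    matrix = saved['matrix']          # shape: (number of unique constraints, max technology id)
    row_labels = saved['row_labels']  # constraint titles, one per row
    col_labels = saved['col_labels']  # technology ids 1..max_id2, one per column
    print(matrix.shape, row_labels[:3], col_labels[:3])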