Delete src
- src/__init__.py +0 -0
- src/core.py +0 -17
- src/services/processor.py +0 -218
- src/services/technologies_database.xlsx +0 -3
- src/services/utils.py +0 -111
src/__init__.py
DELETED
File without changes
src/core.py
DELETED
@@ -1,17 +0,0 @@

from src.services.utils import *
from src.services.processor import *


# End-to-end pipeline: extract constraints from the problem statement,
# score them against the technology database, and return the best set.
def process_input(data):
    prompt = set_prompt(data.problem)
    constraints = retrieve_constraints(prompt)
    constraints_stemmed = stem(constraints, "constraints")
    save_dataframe(constraints_stemmed, "constraints_stemmed.xlsx")
    df = load_technologies()
    global_tech, keys, original_tech = preprocess_tech_data(df)
    save_dataframe(global_tech, "global_tech.xlsx")
    result_similarities, matrix = get_contrastive_similarities(global_tech, constraints_stemmed)
    save_to_pickle(result_similarities)
    best_combinations = find_best_list_combinations(constraints_stemmed, global_tech, matrix)
    best_technologies_id = select_technologies(best_combinations)
    best_technologies = get_technologies_by_id(best_technologies_id, global_tech)
    return best_technologies
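For orientation, a minimal sketch of how this deleted entry point might have been driven; the SimpleNamespace wrapper and the problem text below are illustrative assumptions, not part of the removed code:

from types import SimpleNamespace
from src.core import process_input

# Hypothetical input: process_input only reads data.problem.
data = SimpleNamespace(problem="Design a low-power water purification unit for remote areas.")
best_technologies = process_input(data)
for tech in best_technologies:
    print(tech["id"], tech["title"])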
src/services/processor.py
DELETED
@@ -1,218 +0,0 @@

from src.services.utils import tech_to_dict, stem
import requests as r
import json
import nltk
import itertools
import numpy as np

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def retrieve_constraints(prompt):
    request_input = {"models": ["meta-llama/llama-4-scout-17b-16e-instruct"],
                     "messages": [{"role": "user", "content": prompt}]}
    response = r.post("https://organizedprogrammers-bettergroqinterface.hf.space/chat", json=request_input)

    decoded_content = json.loads(response.content.decode())
    print(f"response : {decoded_content}")
    llm_response = decoded_content["content"]

    # Extract the first {...} block from the LLM response and re-parse it as JSON.
    start_marker = '{'
    end_marker = '}'
    start_index = llm_response.find(start_marker) + len(start_marker)
    end_index = llm_response.find(end_marker, start_index)
    json_str = llm_response[start_index:end_index].strip()

    constraints_json = json.loads("{" + json_str + "}")

    return constraints_json


def preprocess_tech_data(_df):
    # The caller unpacks three values, so every return must have arity 3
    # (the original returned two-element tuples on the early exits).
    if _df is None or "description" not in _df.columns:
        return [], [], []

    technologies_list = _df["description"].to_list()
    tech_dict_raw = tech_to_dict(technologies_list)

    tech_dict_filtered = [
        t for t in tech_dict_raw if (
            len(t.get("title", "")) >= 5 and
            len(t.get("advantages", "")) >= 5 and
            len(t.get("key_components", "")) >= 5
        )
    ]

    if not tech_dict_filtered:
        return [], [], []

    processed_tech_wt = stem(tech_dict_filtered, "technologies")

    for t_item_wt in processed_tech_wt:
        kc = t_item_wt.get("key_components")
        if isinstance(kc, str):
            # Join with a space so sentence boundaries are not glued together.
            t_item_wt["key_components"] = ' '.join(nltk.sent_tokenize(kc))
        else:
            t_item_wt["key_components"] = ""

    original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]

    _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
    return processed_tech_wt, _keys, original_tech_for_display


def remove_over_repeated_technologies(result):
    total_lists = len(result)
    tech_title = {}

    # Count how many lists each technology appears in (the original
    # initialised the first occurrence to 0, undercounting by one).
    for item in result:
        for tech in item['technologies']:
            title = tech[0]['title']
            tech_title[title] = tech_title.get(title, 0) + 1

    threshold = total_lists * 0.3
    print(threshold)
    print(tech_title)
    to_delete = []
    for tech, lists in tech_title.items():
        if lists > threshold:
            print(f"This technology is over-represented: {tech}")
            to_delete.append(tech)

    for idx, item in enumerate(result):
        result[idx]['technologies'] = [tech for tech in item['technologies'] if tech[0]['title'] not in to_delete]

    return result

def get_contrastive_similarities(global_tech, constraints):
    selected_pairs = []
    # A 2D array so that find_best_list_combinations can index matrix[i, j];
    # the original appended scores to a flat list.
    matrix = np.zeros((len(constraints), len(global_tech)), dtype=np.float32)

    # The original guard `if i >= j: continue` skipped most pairs even though
    # constraints and technologies are unrelated lists, so every pair is scored here.
    for i, constraint in enumerate(constraints):
        for j, tech2 in enumerate(global_tech):
            purpose_sim = model.similarity(model.encode(constraint["description"]),
                                           model.encode(tech2["purpose"]))
            # `np.float32(None)` raises TypeError, so guard for None/NaN directly.
            if purpose_sim is None or np.isnan(float(purpose_sim)):
                purpose_sim = 0.0

            print(f"Constraint: {constraint}, Tech 2: {tech2['title']}")
            print(f"Purpose Similarity: {purpose_sim}")
            selected_pairs.append({
                "constraint": constraint,
                "id2": tech2["id"],
                "similarity": purpose_sim
            })
            matrix[i, j] = float(purpose_sim)

    return selected_pairs, matrix


def find_best_list_combinations(list1: list[str], list2: list[str], matrix) -> list[dict]:
    if not list1 or not list2:
        print("Warning: One or both input lists are empty. Returning an empty list.")
        return []

    MIN_SIMILARITY = 0.3
    MAX_SIMILARITY = 0.8

    possible_matches_for_each_l1 = []
    for i in range(len(list1)):
        valid_matches_for_l1_element = []
        for j in range(len(list2)):
            score = matrix[i, j]

            if MIN_SIMILARITY <= score <= MAX_SIMILARITY:
                valid_matches_for_l1_element.append((list2[j], score))

        if not valid_matches_for_l1_element:
            # Unmatched elements are skipped, not fatal, so the message should not
            # claim an empty list is returned (as the original print did).
            print(f"No valid matches found in list2 for '{list1[i]}' from list1 "
                  f"(score between {MIN_SIMILARITY} and {MAX_SIMILARITY}); skipping it.")
        else:
            possible_matches_for_each_l1.append((valid_matches_for_l1_element, list1[i]))

    result = []
    for tech_list, problem in possible_matches_for_each_l1:
        sorted_list = sorted(
            tech_list,
            key=lambda x: x[1].item() if hasattr(x[1], 'item') else float(x[1]),
            reverse=True
        )
        top5 = sorted_list[:5]
        result.append({
            'technologies': top5,
            'problem': problem
        })

    result = remove_over_repeated_technologies(result)
    return result


def select_technologies(problem_technology_list):
    # Find the smallest set of technologies that covers every problem,
    # breaking ties by the highest average best-match similarity.
    distinct_techs = set()
    candidate_map = []

    for problem_data in problem_technology_list:
        cand_dict = {}
        for tech_info, sim in problem_data['technologies']:
            tech_id = tech_info['id']
            distinct_techs.add(tech_id)
            cand_dict[tech_id] = float(sim)
        candidate_map.append(cand_dict)

    distinct_techs = sorted(list(distinct_techs))
    n = len(problem_technology_list)

    if n == 0:
        return set()

    min_k = None
    best_set = None
    best_avg = -1

    print(f"Distinct technologies: {distinct_techs}")
    print(f"Candidate map: {candidate_map}")
    print(f"Number of problems: {n}")

    for k in range(1, len(distinct_techs) + 1):
        if min_k is not None and k > min_k:
            break

        for T in itertools.combinations(distinct_techs, k):
            total_sim = 0.0
            covered = True
            print(f"Trying combination: {T}")
            for i in range(n):
                # Best similarity this combination offers for problem i.
                max_sim = -1.0
                found = False
                for tech in T:
                    if tech in candidate_map[i]:
                        found = True
                        sim_val = candidate_map[i][tech]
                        if sim_val > max_sim:
                            max_sim = sim_val
                if not found:
                    covered = False
                    break
                else:
                    total_sim += max_sim

            if covered:
                avg_sim = total_sim / n
                if min_k is None or k < min_k:
                    min_k = k
                    best_set = T
                    best_avg = avg_sim
                elif k == min_k and avg_sim > best_avg:
                    best_set = T
                    best_avg = avg_sim

        if min_k is not None and k == min_k:
            break

    if best_set is None:
        return set()
    return set(best_set)
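To make the set-cover behaviour of select_technologies concrete, a toy run under invented data (the ids, titles, and similarity scores are all hypothetical):

from src.services.processor import select_technologies

# Two problems, three candidate technologies; only id 1 covers both.
problem_technology_list = [
    {"problem": "low power",
     "technologies": [({"id": 0, "title": "solar"}, 0.7), ({"id": 1, "title": "battery"}, 0.5)]},
    {"problem": "clean water",
     "technologies": [({"id": 1, "title": "battery"}, 0.4), ({"id": 2, "title": "membrane"}, 0.6)]},
]
print(select_technologies(problem_technology_list))  # -> {1}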
src/services/technologies_database.xlsx
DELETED
@@ -1,3 +0,0 @@

version https://git-lfs.github.com/spec/v1
oid sha256:370d7a151085850b5fb7a6f9de41313e83686e4da434b6e8be94da38838c1ef7
size 213138
src/services/utils.py
DELETED
@@ -1,111 +0,0 @@

import pickle
import numpy as np
import pandas as pd

import nltk
from nltk.stem.snowball import SnowballStemmer
nltk.download("punkt_tab")

TECH_PATH = "/app/src/services/technologies_database.xlsx"

def set_prompt(problem):
    prompt = """Task : Find all the constraints in this technical problem, making sure each is premised on the problem only.
Take into account different technical domains to encompass the whole problem.
Output each constraint in a json such as : ({"title of the constraint1":"description1","title of the constraintN":"descriptionN"})
Technical problem :
""" + problem
    return prompt

def load_technologies():
    df = pd.read_excel(TECH_PATH)
    return df

def tech_to_dict(technologies):
    tech_dict = []
    for index, tech in enumerate(technologies):
        # True when "<title>" is absent or sits at the very start of the entry.
        if not tech.find("<title>") > 1:
            tab = tech.split("\n")
            tab.pop(0)
            tab.pop()
            tech_dict.append({"title": tab[0][tab[0].find(": ") + 2:],
                              "purpose": tab[1][tab[1].find(": ") + 2:],
                              "key_components": tab[2][tab[2].find(": ") + 2:],
                              "advantages": tab[3][tab[3].find(": ") + 2:],
                              "limitations": tab[4][tab[4].find(": ") + 2:],
                              "id": index})
    return tech_dict

def save_dataframe(df, title):
    pd.DataFrame(df).to_excel(title)
    return title

def stem(data, data_type):
    stemmer = SnowballStemmer("english")
    processed_data = []
    if data_type == "technologies":
        for t_item in data:
            processed_data.append({
                "title": stemmer.stem(t_item["title"]),
                "purpose": stemmer.stem(t_item["purpose"]),
                "key_components": stemmer.stem(t_item["key_components"]),
                "advantages": stemmer.stem(t_item["advantages"]),
                "limitations": stemmer.stem(t_item["limitations"]),
                "id": t_item["id"]
            })
    else:
        # Constraints arrive as a {title: description} mapping.
        for t_item in data:
            print(t_item)
            processed_data.append({
                "title": stemmer.stem(t_item),
                "description": stemmer.stem(data[t_item])
            })

    return processed_data


def get_technologies_by_id(id_list, technologies):
    result = []
    id_set = set(id_list)
    for tech in technologies:
        if tech.get('id') in id_set:
            result.append(tech)
    return result

def save_to_pickle(result_similarities):
    constraint_titles = sorted(set(item['constraint']['title'] for item in result_similarities))
    max_id2 = max(item['id2'] for item in result_similarities)

    row_label_to_index = {title: i for i, title in enumerate(constraint_titles)}
    col_labels = list(range(1, max_id2 + 1))

    num_rows = len(constraint_titles)
    num_cols = max_id2

    matrix = np.full((num_rows, num_cols), np.nan, dtype=np.float32)

    for item in result_similarities:
        row_idx = row_label_to_index[item['constraint']['title']]
        col_idx = item['id2'] - 1  # ids are assumed 1-based, columns 0-based
        # float() handles both tensors and plain floats, unlike .item().
        similarity_value = float(item['similarity'])

        matrix[row_idx, col_idx] = similarity_value

    print(f"Successfully created matrix with shape: {matrix.shape}")
    print(f"Number of rows (unique constraints): {num_rows}")
    print(f"Number of columns (max id2): {num_cols}")
    print("\nExample 5x5 block of the created matrix (NaN for missing values):")
    print(matrix[:5, :5])

    output_filename = "cosine_similarity_matrix_with_labels.pkl"
    data_to_save = {
        'matrix': matrix,
        'row_labels': constraint_titles,
        'col_labels': col_labels
    }

    with open(output_filename, 'wb') as f:
        pickle.dump(data_to_save, f)

    print(f"\nMatrix and labels saved to {output_filename}")
    return output_filename
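A small illustration of the constraints branch of stem; the dict below is an invented example. Note that SnowballStemmer.stem lowercases and stems its whole argument as if it were a single token:

from src.services.utils import stem

constraints = {"Power supply": "The unit must run on intermittent solar power."}
print(stem(constraints, "constraints"))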