jzou1995 committed on
Commit
be75490
·
verified ·
1 Parent(s): f1b652f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -154
app.py CHANGED
@@ -32,157 +32,121 @@ def initialize_gemini(api_key: str):
32
  )
33
  return model
34
 
35
- def google_search_company_info(company_name: str) -> str:
36
  """
37
- Search for basic company information to help with NAICS classification
 
 
 
38
  """
39
  company_info = ""
 
40
 
41
- # Create search queries focused on company information
42
- queries = [
43
- f"what is {company_name} company",
44
- f"{company_name} company about us",
45
- f"{company_name} business description",
46
- f"{company_name} company profile",
47
- f"what does {company_name} company do"
48
  ]
49
 
50
- try:
51
- print(f"πŸ” Searching for information about '{company_name}'...")
52
-
53
- for query in queries[:2]: # Limit to first 2 queries to save time
54
- try:
55
- # Search with each query
56
- search_results = search(query, stop=2, pause=2)
57
-
58
- for result_url in search_results:
59
- try:
60
- response = requests.get(result_url, timeout=5)
61
- if response.status_code == 200:
62
- # Extract text from paragraphs
63
- soup = BeautifulSoup(response.text, 'html.parser')
64
- paragraphs = soup.find_all('p')
65
-
66
- # Get text from first 3 substantial paragraphs
67
- for p in paragraphs:
68
- text = p.get_text().strip()
69
- if len(text) > 100 and company_name.lower() in text.lower():
70
- company_info += text + "\n\n"
71
- if len(company_info) > 500:
72
- break
73
-
74
- if len(company_info) > 500:
75
- break
76
- except Exception as e:
77
- print(f" ⚠️ Error fetching {result_url}: {e}")
78
-
79
- if len(company_info) > 500:
80
- break
81
- except Exception as e:
82
- print(f" ⚠️ Error with query '{query}': {e}")
83
- continue
84
-
85
- return company_info.strip()
86
- except Exception as e:
87
- print(f"❌ Error searching for company info: {str(e)}")
88
- return ""
89
-
90
- def google_search_naics(company_name: str) -> List[str]:
91
- """
92
- Find potential NAICS codes for a company using multiple targeted Google searches
93
- Uses more specific search queries to improve results
94
- """
95
- naics_codes = set()
96
-
97
- # Create multiple search queries for better results
98
- queries = [
99
- f"2022 NAICS code for {company_name}",
100
- f"NAICS 2022 classification for {company_name}",
101
- f"{company_name} business NAICS 2022 code",
102
- f"{company_name} industry NAICS code 2022",
103
- f"what is {company_name} company NAICS code"
104
  ]
105
 
 
 
106
  try:
107
- print(f"πŸ”Ž Searching Google for NAICS codes for '{company_name}'...")
108
 
109
- for query in queries:
110
  print(f" Query: {query}")
111
  try:
112
- # Search with each query, limiting to 3 results per query
113
  search_results = search(query, stop=3, pause=2)
114
 
115
  for result_url in search_results:
116
  try:
117
  response = requests.get(result_url, timeout=5)
118
  if response.status_code == 200:
119
- # Extract 6-digit NAICS codes
120
  found_codes = re.findall(r'\b\d{6}\b', response.text)
121
- naics_codes.update(found_codes)
122
-
123
- # If we find codes, print them
124
  if found_codes:
 
125
  print(f" Found codes in {result_url}: {found_codes}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  except Exception as e:
127
  print(f" ⚠️ Error fetching {result_url}: {e}")
 
 
 
 
 
128
  except Exception as e:
129
  print(f" ⚠️ Error with query '{query}': {e}")
130
  continue
131
 
132
- # Return unique codes, limited to 10 most common
133
- return list(naics_codes)[:10]
134
  except Exception as e:
135
- print(f"❌ Error performing Google search: {str(e)}")
136
- return []
137
 
138
- def get_naics_classification(model, company_name: str, context: str, candidates: List[str]) -> dict:
139
  """
140
- Use Gemini AI to determine the most appropriate NAICS code from candidates
141
- First provides reasoning, then returns the NAICS code and explanation
142
  """
143
  try:
144
  print("πŸ€– AI is analyzing NAICS classification...")
145
 
146
- # Get additional company information from Google
147
- company_info = google_search_company_info(company_name)
148
  if company_info:
149
- print(f"πŸ“ Found additional company information:\n{company_info[:200]}...")
150
- # Add the found information to the context
151
  if context:
152
- context = f"{context}\n\nAdditional information found online:\n{company_info}"
153
  else:
154
- context = f"Information found online:\n{company_info}"
155
-
156
- # If we have candidate codes from Google search
157
- if candidates:
158
- # Create a prompt that asks for research on the candidates
 
159
  prompt = f"""
160
- You are a NAICS code classification expert. Based on the company information provided and the NAICS code candidates found from Google search, determine the most appropriate NAICS code.
161
 
162
  Company Name: {company_name}
163
- Context Information: {context}
164
 
165
- NAICS Code Candidates from Google Search: {candidates}
166
 
167
- First, research what these NAICS codes represent:
168
- 1. For each NAICS code candidate, briefly explain what industry or business activity it corresponds to.
169
- 2. Then explain which industry classification best matches this company based on the name and context provided.
170
- 3. Finally, select the single most appropriate NAICS code from the candidates, or suggest a different one if none match.
171
 
172
  Your response should be in this format:
173
- RESEARCH: [Brief explanation of what each NAICS code represents]
174
- REASONING: [Your detailed reasoning about why the chosen industry classification is most appropriate for this company]
175
  NAICS_CODE: [6-digit NAICS code]
176
  """
177
- # If no candidates were found from Google search
178
  else:
179
  prompt = f"""
180
  You are a NAICS code classification expert. Based on the company information provided, determine the most appropriate NAICS code.
181
 
182
  Company Name: {company_name}
183
- Context Information: {context}
184
 
185
- First, analyze what industry this company likely belongs to based on its name and the provided context.
186
  Consider standard business classifications and determine the most appropriate category.
187
  Then provide the single most appropriate 6-digit NAICS code.
188
 
@@ -196,12 +160,6 @@ NAICS_CODE: [6-digit NAICS code]
196
  # Create result dictionary
197
  result = {}
198
 
199
- # Extract research if available
200
- if "RESEARCH:" in response_text:
201
- research_match = re.search(r'RESEARCH:(.*?)REASONING:', response_text, re.DOTALL | re.IGNORECASE)
202
- if research_match:
203
- result["research"] = research_match.group(1).strip()
204
-
205
  # Extract reasoning
206
  reasoning_match = re.search(r'REASONING:(.*?)NAICS_CODE:', response_text, re.DOTALL | re.IGNORECASE)
207
  result["reasoning"] = reasoning_match.group(1).strip() if reasoning_match else "No reasoning provided."
@@ -225,15 +183,7 @@ NAICS_CODE: [6-digit NAICS code]
225
 
226
  def find_naics_code(company_name: str, context: str = "", api_key: Optional[str] = None) -> Dict:
227
  """
228
- Core function to find NAICS code for a company that can be called from different interfaces
229
-
230
- Args:
231
- company_name: Name of the company
232
- context: Brief description of the company (optional)
233
- api_key: Google Gemini API key (if None, will try to get from environment variable)
234
-
235
- Returns:
236
- Dictionary with NAICS code, reasoning, and optional research
237
  """
238
  # Get API key from environment if not provided
239
  if not api_key:
@@ -255,20 +205,16 @@ def find_naics_code(company_name: str, context: str = "", api_key: Optional[str]
255
  "reasoning": f"Error: {str(e)}"
256
  }
257
 
258
- # Find NAICS Code Candidates via Google search
259
- naics_candidates = google_search_naics(company_name)
260
 
261
- # Get classification from Gemini
262
- if not naics_candidates:
263
- print("No NAICS codes found from Google search.")
264
- result = get_naics_classification(model, company_name, context, [])
265
- else:
266
- print(f"Found {len(naics_candidates)} NAICS candidates: {naics_candidates}")
267
- result = get_naics_classification(model, company_name, context, naics_candidates)
268
 
269
  # Add metadata
270
  result["company_name"] = company_name
271
  result["context"] = context
 
272
  result["candidates"] = naics_candidates
273
 
274
  return result
@@ -280,7 +226,7 @@ def create_gradio_interface():
280
 
281
  with gr.Blocks(title="NAICS Code Finder") as demo:
282
  gr.Markdown("# NAICS Code Finder")
283
- gr.Markdown("Enter a company name to find its appropriate NAICS code. The tool will search for information about the company and relevant NAICS codes online.")
284
 
285
  with gr.Row():
286
  with gr.Column():
@@ -304,69 +250,55 @@ def create_gradio_interface():
304
  naics_output = gr.Markdown(label="NAICS Code")
305
  with gr.Accordion("Company Information", open=False):
306
  company_info_output = gr.Markdown()
307
- with gr.Accordion("NAICS Codes Research", open=False):
308
- research_output = gr.Markdown()
309
  with gr.Accordion("Classification Reasoning", open=True):
310
  reasoning_output = gr.Markdown()
311
 
312
  # Functions for the interface
313
  def process_company(company_name, company_description, api_key):
314
  if not company_name:
315
- return "Please enter a company name", "", "", "", ""
316
 
317
  # Use API key from input or environment
318
  key_to_use = api_key if api_key else os.environ.get('GEMINI_API_KEY')
319
  if not key_to_use:
320
- return "No API key provided. Please enter your Gemini API key.", "", "", "", ""
321
 
322
- status_md = "πŸ” Searching for company information...\n\n"
323
- yield status_md, "", "", "", ""
324
-
325
- # Get company info first
326
- company_info = google_search_company_info(company_name)
327
- if company_info:
328
- company_info_md = f"## Information found about {company_name}\n\n{company_info}"
329
- status_md += "βœ… Found company information\n\n"
330
- else:
331
- company_info_md = f"No detailed information found for {company_name}"
332
- status_md += "⚠️ No company information found\n\n"
333
-
334
- yield status_md, "", company_info_md, "", ""
335
-
336
- # Get NAICS candidates
337
- status_md += "πŸ” Searching for NAICS codes...\n\n"
338
- yield status_md, "", company_info_md, "", ""
339
 
340
  # Run the core functionality
341
  result = find_naics_code(company_name, company_description, key_to_use)
342
 
 
 
 
 
 
 
 
 
343
  if "candidates" in result and result["candidates"]:
344
- status_md += f"βœ… Found {len(result['candidates'])} potential NAICS codes\n\n"
345
  else:
346
  status_md += "⚠️ No specific NAICS codes found in search results\n\n"
347
 
348
  status_md += "πŸ€– Analyzing classification...\n\n"
349
- yield status_md, "", company_info_md, "", ""
350
 
351
  # Format the NAICS code output
352
  naics_code_md = f"## NAICS Code: {result['naics_code']}"
353
-
354
- # Format the research output
355
- research_md = ""
356
- if "research" in result and result["research"]:
357
- research_md = f"## Research on NAICS Codes\n\n{result['research']}"
358
 
359
  # Format the reasoning output
360
  reasoning_md = f"## Analysis\n\n{result['reasoning']}"
361
 
362
  status_md += "βœ… Classification complete!"
363
 
364
- return status_md, naics_code_md, company_info_md, research_md, reasoning_md
365
 
366
  submit_btn.click(
367
  process_company,
368
  inputs=[company_name, company_description, api_key],
369
- outputs=[status_output, naics_output, company_info_output, research_output, reasoning_output]
370
  )
371
 
372
  gr.Examples(
 
32
  )
33
  return model
34
 
35
def combined_google_search(company_name: str) -> Tuple[str, List[str]]:
    """
    Search the web for a company's description and candidate NAICS codes.

    Runs a fixed list of description-oriented and NAICS-oriented queries,
    downloads the result pages, and scans each page for (a) six-digit numbers
    whose first two digits are a valid NAICS sector and (b) paragraphs that
    mention the company name.

    Args:
        company_name: Name of the company to research.

    Returns:
        Tuple ``(company_info, naics_code_candidates)``. ``company_info`` is
        the collected paragraph text (stripped; empty when nothing matched).
        ``naics_code_candidates`` holds at most 10 codes, ordered by the
        number of distinct pages they appeared on, ties broken by code value
        so the result is deterministic. ``("", [])`` is returned for a blank
        name or when the search fails as a whole.
    """
    # Function-scope import: the file's import block is outside this block.
    from collections import Counter

    needle = (company_name or "").strip().lower()
    if not needle:
        # An empty needle is a substring of every paragraph, which would
        # collect unrelated text; there is nothing meaningful to search for.
        return "", []

    # The first two digits of a NAICS code identify its sector. Filtering on
    # them discards most unrelated six-digit numbers (hex colours, IDs, ...).
    valid_sectors = frozenset({
        "11", "21", "22", "23", "31", "32", "33", "42", "44", "45", "48", "49",
        "51", "52", "53", "54", "55", "56", "61", "62", "71", "72", "81", "92",
    })
    six_digit_re = re.compile(r'\b\d{6}\b')
    info_limit = 1000  # stop collecting description text past this length

    company_info = ""
    page_hits = Counter()  # code -> number of distinct pages mentioning it
    seen_urls = set()      # avoid re-downloading a page returned by two queries

    # Create comprehensive search queries
    info_queries = [
        f"what is {company_name} company business industry sector",
        f"{company_name} company about us business description",
        f"{company_name} company profile what they do",
    ]

    naics_queries = [
        f"2022 NAICS code for {company_name} company",
        f"{company_name} NAICS 2022 classification",
        f"what is {company_name} industry NAICS code 2022",
    ]

    all_queries = info_queries + naics_queries

    try:
        print(f"🔍 Searching for information about '{company_name}'...")

        for query in all_queries:
            print(f" Query: {query}")
            try:
                # `search` is imported elsewhere in the file; stop=3 is
                # presumably the per-query result cap -- TODO confirm.
                search_results = search(query, stop=3, pause=2)

                for result_url in search_results:
                    if result_url in seen_urls:
                        continue
                    seen_urls.add(result_url)

                    try:
                        response = requests.get(result_url, timeout=5)
                        if response.status_code == 200:
                            # Extract plausible NAICS codes, once per page
                            found_codes = sorted({
                                code
                                for code in six_digit_re.findall(response.text)
                                if code[:2] in valid_sectors
                            })
                            if found_codes:
                                page_hits.update(found_codes)
                                print(f" Found codes in {result_url}: {found_codes}")

                            # Extract company information only while more is needed
                            if len(company_info) < info_limit:
                                soup = BeautifulSoup(response.text, 'html.parser')

                                # Keep substantial paragraphs that mention the company
                                for p in soup.find_all('p'):
                                    text = p.get_text().strip()
                                    if len(text) > 80 and needle in text.lower():
                                        company_info += text + "\n\n"
                                        if len(company_info) > info_limit:
                                            break

                    except Exception as e:
                        # Best effort: one unreachable page must not abort the search
                        print(f" ⚠️ Error fetching {result_url}: {e}")

                    # If we have enough information, move to the next query
                    if len(company_info) > info_limit and page_hits:
                        break

            except Exception as e:
                print(f" ⚠️ Error with query '{query}': {e}")
                continue

        # Most widely cited codes first; code value breaks ties deterministically
        ranked = sorted(page_hits, key=lambda code: (-page_hits[code], code))
        return company_info.strip(), ranked[:10]
    except Exception as e:
        print(f"❌ Error during Google search: {str(e)}")
        return "", []
108
 
109
+ def analyze_naics_code(model, company_name: str, context: str, company_info: str, naics_candidates: List[str]) -> dict:
110
  """
111
+ Use Gemini AI to determine the most appropriate NAICS code
 
112
  """
113
  try:
114
  print("πŸ€– AI is analyzing NAICS classification...")
115
 
116
+ # Combine provided context with discovered company info
 
117
  if company_info:
 
 
118
  if context:
119
+ combined_context = f"{context}\n\nAdditional information found online:\n{company_info}"
120
  else:
121
+ combined_context = f"Information found online:\n{company_info}"
122
+ else:
123
+ combined_context = context
124
+
125
+ # Create the prompt based on whether we have candidate codes
126
+ if naics_candidates:
127
  prompt = f"""
128
+ You are a NAICS code classification expert. Based on the company information provided and any NAICS code candidates found from online research, determine the most appropriate NAICS code.
129
 
130
  Company Name: {company_name}
131
+ Information about the company: {combined_context}
132
 
133
+ NAICS Code Candidates found in research: {naics_candidates}
134
 
135
+ First, analyze what these NAICS codes represent and which industry this company belongs to based on the information provided.
136
+ Then select the single most appropriate 6-digit NAICS code.
 
 
137
 
138
  Your response should be in this format:
139
+ REASONING: [Your detailed reasoning about why the chosen industry classification is most appropriate for this company, including what business activities it performs]
 
140
  NAICS_CODE: [6-digit NAICS code]
141
  """
 
142
  else:
143
  prompt = f"""
144
  You are a NAICS code classification expert. Based on the company information provided, determine the most appropriate NAICS code.
145
 
146
  Company Name: {company_name}
147
+ Information about the company: {combined_context}
148
 
149
+ Analyze what industry this company likely belongs to based on its name and the provided information.
150
  Consider standard business classifications and determine the most appropriate category.
151
  Then provide the single most appropriate 6-digit NAICS code.
152
 
 
160
  # Create result dictionary
161
  result = {}
162
 
 
 
 
 
 
 
163
  # Extract reasoning
164
  reasoning_match = re.search(r'REASONING:(.*?)NAICS_CODE:', response_text, re.DOTALL | re.IGNORECASE)
165
  result["reasoning"] = reasoning_match.group(1).strip() if reasoning_match else "No reasoning provided."
 
183
 
184
  def find_naics_code(company_name: str, context: str = "", api_key: Optional[str] = None) -> Dict:
185
  """
186
+ Core function to find NAICS code for a company
 
 
 
 
 
 
 
 
187
  """
188
  # Get API key from environment if not provided
189
  if not api_key:
 
205
  "reasoning": f"Error: {str(e)}"
206
  }
207
 
208
+ # Run the combined search
209
+ company_info, naics_candidates = combined_google_search(company_name)
210
 
211
+ # Get AI analysis
212
+ result = analyze_naics_code(model, company_name, context, company_info, naics_candidates)
 
 
 
 
 
213
 
214
  # Add metadata
215
  result["company_name"] = company_name
216
  result["context"] = context
217
+ result["company_info"] = company_info
218
  result["candidates"] = naics_candidates
219
 
220
  return result
 
226
 
227
  with gr.Blocks(title="NAICS Code Finder") as demo:
228
  gr.Markdown("# NAICS Code Finder")
229
+ gr.Markdown("Enter a company name to find its appropriate NAICS code. The tool will search for information about the company and find the most appropriate classification.")
230
 
231
  with gr.Row():
232
  with gr.Column():
 
250
  naics_output = gr.Markdown(label="NAICS Code")
251
  with gr.Accordion("Company Information", open=False):
252
  company_info_output = gr.Markdown()
 
 
253
  with gr.Accordion("Classification Reasoning", open=True):
254
  reasoning_output = gr.Markdown()
255
 
256
  # Functions for the interface
257
  def process_company(company_name, company_description, api_key):
258
  if not company_name:
259
+ return "Please enter a company name", "", "", ""
260
 
261
  # Use API key from input or environment
262
  key_to_use = api_key if api_key else os.environ.get('GEMINI_API_KEY')
263
  if not key_to_use:
264
+ return "No API key provided. Please enter your Gemini API key.", "", "", ""
265
 
266
+ status_md = "πŸ” Searching for company information and NAICS codes...\n\n"
267
+ yield status_md, "", "", ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
 
269
  # Run the core functionality
270
  result = find_naics_code(company_name, company_description, key_to_use)
271
 
272
+ # Update status based on results
273
+ if "company_info" in result and result["company_info"]:
274
+ status_md += "βœ… Found company information\n\n"
275
+ company_info_md = f"## Information found about {company_name}\n\n{result['company_info']}"
276
+ else:
277
+ status_md += "⚠️ Limited company information found\n\n"
278
+ company_info_md = f"Limited information found for {company_name}"
279
+
280
  if "candidates" in result and result["candidates"]:
281
+ status_md += f"βœ… Found {len(result['candidates'])} potential NAICS codes: {', '.join(result['candidates'])}\n\n"
282
  else:
283
  status_md += "⚠️ No specific NAICS codes found in search results\n\n"
284
 
285
  status_md += "πŸ€– Analyzing classification...\n\n"
286
+ yield status_md, "", company_info_md, ""
287
 
288
  # Format the NAICS code output
289
  naics_code_md = f"## NAICS Code: {result['naics_code']}"
 
 
 
 
 
290
 
291
  # Format the reasoning output
292
  reasoning_md = f"## Analysis\n\n{result['reasoning']}"
293
 
294
  status_md += "βœ… Classification complete!"
295
 
296
+ return status_md, naics_code_md, company_info_md, reasoning_md
297
 
298
  submit_btn.click(
299
  process_company,
300
  inputs=[company_name, company_description, api_key],
301
+ outputs=[status_output, naics_output, company_info_output, reasoning_output]
302
  )
303
 
304
  gr.Examples(