jzou1995's picture
Update app.py
203605e verified
raw
history blame
8.25 kB
import os
import re
import json
import requests
from typing import List, Dict
from googlesearch import search
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold
def initialize_gemini(api_key: str):
    """Configure the Gemini client and return the model used for classification.

    Args:
        api_key: Google Gemini API key passed to ``genai.configure``.

    Returns:
        A ``genai.GenerativeModel`` for ``gemini-1.5-flash`` created with the
        sampling settings and safety thresholds defined below.
    """
    genai.configure(api_key=api_key)

    # Harm categories whose threshold is set to BLOCK_NONE for this model.
    unblocked_categories = (
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
    )

    return genai.GenerativeModel(
        model_name="gemini-1.5-flash",
        generation_config=dict(
            temperature=0.2,
            top_p=0.8,
            top_k=40,
            max_output_tokens=1024,
        ),
        safety_settings={
            category: HarmBlockThreshold.BLOCK_NONE
            for category in unblocked_categories
        },
    )
# Two-digit sector prefixes that start every NAICS code (the 31-33, 44-45 and
# 48-49 ranges are expanded).  Six-digit numbers with any other prefix cannot
# be NAICS codes and are discarded as page noise (IDs, phone fragments, ...).
_NAICS_SECTOR_PREFIXES = frozenset({
    "11", "21", "22", "23", "31", "32", "33", "42", "44", "45", "48", "49",
    "51", "52", "53", "54", "55", "56", "61", "62", "71", "72", "81", "92",
})
_NAICS_PAGE_CODE_RE = re.compile(r"\b\d{6}\b")
_MAX_NAICS_CANDIDATES = 10


def _extract_naics_codes(page_text: str) -> List[str]:
    """Return the distinct plausible NAICS codes in *page_text*, first-seen order."""
    plausible = (
        code
        for code in _NAICS_PAGE_CODE_RE.findall(page_text)
        if code[:2] in _NAICS_SECTOR_PREFIXES
    )
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    return list(dict.fromkeys(plausible))


def google_search_naics(company_name: str) -> List[str]:
    """
    Find potential NAICS codes for a company using multiple targeted Google searches.

    Several differently worded queries are issued; every result page that
    answers with HTTP 200 is scanned for six-digit numbers that start with a
    valid NAICS sector prefix.  Codes are ranked by the number of distinct
    pages that mention them, so the returned list really is the "most common"
    candidates (ties keep first-seen order) instead of an arbitrary slice of
    a set.

    The lookup is best-effort: a failing query or page is reported on stdout
    and skipped rather than raised.

    Args:
        company_name: Name of the company to look up.

    Returns:
        Up to 10 candidate NAICS codes, most frequently seen first; an empty
        list when nothing was found or the search failed.
    """
    # Create multiple search queries for better results
    queries = [
        f"NAICS code for {company_name}",
        f"what is {company_name} company NAICS code",
        f"{company_name} business entity NAICS classification",
        f"{company_name} industry classification NAICS",
        f"{company_name} company information NAICS"
    ]
    page_hits: Dict[str, int] = {}  # code -> number of pages mentioning it
    visited_urls = set()
    try:
        print(f"πŸ”Ž Searching Google for NAICS codes for '{company_name}'...")
        for query in queries:
            print(f" Query: {query}")
            try:
                # Search with each query, limiting to 3 results per query
                for result_url in search(query, stop=3, pause=2):
                    # Different queries often return the same page; fetching
                    # it again would waste time and inflate its codes' counts.
                    if result_url in visited_urls:
                        continue
                    visited_urls.add(result_url)
                    try:
                        response = requests.get(result_url, timeout=5)
                        if response.status_code != 200:
                            continue
                        found_codes = _extract_naics_codes(response.text)
                        for code in found_codes:
                            page_hits[code] = page_hits.get(code, 0) + 1
                        # If we find codes, print them
                        if found_codes:
                            print(f" Found codes in {result_url}: {found_codes}")
                    except Exception as e:
                        print(f" ⚠️ Error fetching {result_url}: {e}")
            except Exception as e:
                print(f" ⚠️ Error with query '{query}': {e}")
                continue
        # sorted() is stable, so equally frequent codes keep first-seen order.
        ranked_codes = sorted(page_hits, key=page_hits.get, reverse=True)
        return ranked_codes[:_MAX_NAICS_CANDIDATES]
    except Exception as e:
        print(f"❌ Error performing Google search: {str(e)}")
        return []
# Prompt used when the web search produced candidate codes.
_PROMPT_WITH_CANDIDATES = """
You are a NAICS code classification expert. Based on the company information provided and the NAICS code candidates found from Google search, determine the most appropriate NAICS code.
Company Name: {company_name}
Context Information: {context}
NAICS Code Candidates from Google Search: {candidates}
First, research what these NAICS codes represent:
1. For each NAICS code candidate, briefly explain what industry or business activity it corresponds to.
2. Then explain which industry classification best matches this company based on the name and context provided.
3. Finally, select the single most appropriate NAICS code from the candidates, or suggest a different one if none match.
Your response should be in this format:
RESEARCH: [Brief explanation of what each NAICS code represents]
REASONING: [Your detailed reasoning about why the chosen industry classification is most appropriate for this company]
NAICS_CODE: [6-digit NAICS code]
"""

# Prompt used when no candidates were found from the web search.
_PROMPT_WITHOUT_CANDIDATES = """
You are a NAICS code classification expert. Based on the company information provided, determine the most appropriate NAICS code.
Company Name: {company_name}
Context Information: {context}
First, analyze what industry this company likely belongs to based on its name and the provided context.
Consider standard business classifications and determine the most appropriate category.
Then provide the single most appropriate 6-digit NAICS code.
Your response should be in this format:
REASONING: [Your detailed reasoning about the company's industry classification, including what business activities it likely performs]
NAICS_CODE: [6-digit NAICS code]
"""

# The answer label: "NAICS_CODE" anywhere (as requested in the prompt), or the
# spaced spelling "NAICS Code" only at the start of a line so that ordinary
# prose mentioning "NAICS code" does not cut a section short.
_ANSWER_LABEL = r"(?:NAICS_CODE|^[ \t>*_#-]*NAICS CODE)[*_ \t]*:"
_SECTION_FLAGS = re.DOTALL | re.IGNORECASE | re.MULTILINE
_RESEARCH_SECTION_RE = re.compile(
    r"RESEARCH[*_ \t]*:(.*?)REASONING[*_ \t]*:", _SECTION_FLAGS
)
_REASONING_SECTION_RE = re.compile(
    r"REASONING[*_ \t]*:(.*?)(?:" + _ANSWER_LABEL + r"|\Z)", _SECTION_FLAGS
)
_LABELLED_CODE_RE = re.compile(
    _ANSWER_LABEL + r".*?(?<!\d)(\d{6})(?!\d)", _SECTION_FLAGS
)
_STANDALONE_CODE_RE = re.compile(r"(?<!\d)\d{6}(?!\d)")
# Whitespace plus the markdown emphasis/heading characters that models wrap
# around labels (e.g. "**REASONING:**") and that would otherwise leak into
# the extracted section text.
_MARKDOWN_EDGE_CHARS = "*_# \t\r\n"


def _parse_classification_response(response_text: str) -> dict:
    """Split a model reply into ``research`` / ``reasoning`` / ``naics_code``."""
    result = {}

    research_match = _RESEARCH_SECTION_RE.search(response_text)
    if research_match:
        result["research"] = research_match.group(1).strip(_MARKDOWN_EDGE_CHARS)

    # Reasoning runs up to the answer label, or to the end of the reply when
    # the model omitted the label, so the explanation is not thrown away.
    reasoning_match = _REASONING_SECTION_RE.search(response_text)
    reasoning = (
        reasoning_match.group(1).strip(_MARKDOWN_EDGE_CHARS)
        if reasoning_match
        else ""
    )
    result["reasoning"] = reasoning or "No reasoning provided."

    # Look for the labelled answer only after the REASONING label: the
    # RESEARCH section lists every candidate and must not be mistaken for it.
    answer_region = (
        response_text[reasoning_match.start():] if reasoning_match else response_text
    )
    labelled_code = _LABELLED_CODE_RE.search(answer_region)
    if labelled_code:
        result["naics_code"] = labelled_code.group(1)
    else:
        # No usable label: the conclusion normally comes last, so prefer the
        # last six-digit number over the first (typically the first candidate).
        standalone_codes = _STANDALONE_CODE_RE.findall(response_text)
        result["naics_code"] = standalone_codes[-1] if standalone_codes else "000000"
    return result


def get_naics_classification(model, company_name: str, context: str, candidates: List[str]) -> dict:
    """
    Use Gemini AI to determine the most appropriate NAICS code for a company.

    The model is asked to research the candidates (when there are any), give
    its reasoning, and finish with a ``NAICS_CODE:`` line.  The reply is parsed
    leniently: labels are matched case-insensitively and may be wrapped in
    markdown emphasis.

    Args:
        model: Object exposing ``generate_content(prompt)`` whose return value
            has a ``text`` attribute (the Gemini model in this script).
        company_name: Name of the company to classify.
        context: Free-text description of the company; may be empty.
        candidates: NAICS codes found by the web search; an empty list makes
            the model propose a code on its own.

    Returns:
        A dict with ``"naics_code"`` (six digits, ``"000000"`` when none could
        be determined), ``"reasoning"``, and ``"research"`` when the reply
        contained a research section.  Failures are reported on stdout and
        returned as ``"000000"`` with the error text as reasoning; nothing is
        raised.
    """
    try:
        print("πŸ€– AI is analyzing NAICS classification...")
        template = _PROMPT_WITH_CANDIDATES if candidates else _PROMPT_WITHOUT_CANDIDATES
        prompt = template.format(
            company_name=company_name,
            context=context,
            candidates=candidates,
        )
        response = model.generate_content(prompt)
        return _parse_classification_response(response.text.strip())
    except Exception as e:
        print(f"❌ Error getting NAICS classification: {str(e)}")
        return {
            "naics_code": "000000",
            "reasoning": f"Error analyzing company: {str(e)}"
        }
def main():
    """Run the interactive NAICS lookup loop on the terminal.

    Asks once for the Gemini API key, then repeatedly asks for a company name
    and optional description, searches the web for candidate codes, lets the
    model choose one, and prints the outcome.  Typing ``exit`` (in any letter
    case) as the company name ends the loop.
    """
    print("πŸš€ NAICS Code Finder\n")

    # Step 1: the API key is read first, then used to build the model.
    model = initialize_gemini(input("Enter your Google Gemini API Key: "))

    while True:
        # Step 2: company details for this round.
        company_name = input("\nEnter the company name (or 'exit' to quit): ")
        if company_name.lower() == 'exit':
            return
        context = input("Enter a brief description of the company (or press Enter for none): ")

        # Step 3: candidate codes from the web, then the model's decision.
        naics_candidates = google_search_naics(company_name)
        if naics_candidates:
            print(f"βœ… Found {len(naics_candidates)} NAICS candidates: {naics_candidates}")
            # The model picks the best code among the candidates.
            result = get_naics_classification(model, company_name, context, naics_candidates)
        else:
            print("❌ No NAICS codes found from Google search.")
            # With no candidates the model is asked to suggest a code itself.
            result = get_naics_classification(model, company_name, context, [])

        # The research section is only present for some results.
        if "research" in result:
            print(f"\nπŸ“Š NAICS Code Research:\n{result['research']}")
        print(f"\n🧠 Reasoning:\n{result['reasoning']}")
        print(f"\nπŸ† NAICS Code: {result['naics_code']}")
        print("-" * 80)
# Start the interactive session only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()