Spaces:
Running
Running
Johnny
feat: Complete Format_Resume.py system with OpenAI GPT-4o integration and template preservation - Added Format_Resume.py Streamlit page with OpenAI GPT-4o primary extraction, HF Cloud backup, 5-tier fallback system, template preservation with Qvell branding, contact info extraction, skills cleaning, career timeline generation, and comprehensive utils restructure (10/11 files required). Renamed app.py to TalentLens.py, added blank_resume.docx template, updated .gitignore for Salesforce exclusion.
c2f9ec8
import json | |
import re | |
from typing import Dict, List, Any | |
import requests | |
import os | |
from datetime import datetime | |
import logging | |
# Configure logging
# NOTE: basicConfig() is called at import time, so merely importing this
# module requests INFO-level logging on the root logger.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the extractor below for status/warning/error output.
logger = logging.getLogger(__name__)
class AIResumeExtractor:
    """Extract structured resume sections using the Hugging Face Inference API.

    Each section (name, summary, skills, experience, education) is located with
    a mix of hosted models and regular expressions. Every model-backed step has
    a regex-only fallback that is used when the request fails or yields nothing
    usable.
    """

    # Keywords that are searched for directly in the resume text, independent
    # of whether a skills section or the NER model produced anything.
    _COMMON_SKILLS = (
        'Python', 'Java', 'JavaScript', 'TypeScript', 'C++', 'C#', 'SQL', 'NoSQL',
        'React', 'Angular', 'Vue', 'Node.js', 'Django', 'Flask', 'Spring',
        'AWS', 'Azure', 'GCP', 'Docker', 'Kubernetes', 'Jenkins',
        'Git', 'GitHub', 'GitLab', 'Jira', 'Confluence',
        'TensorFlow', 'PyTorch', 'Scikit-learn', 'Pandas', 'NumPy', 'Matplotlib',
        'MySQL', 'PostgreSQL', 'MongoDB', 'Redis',
        'Linux', 'Windows', 'MacOS', 'Ubuntu',
        'Selenium', 'Pytest', 'TestNG', 'Postman',
        'AWS Glue', 'AWS SageMaker', 'REST APIs', 'Apex', 'Bash',
    )

    def __init__(self, api_key: str = None, model_name: str = "microsoft/DialoGPT-medium"):
        """Initialize the AI extractor with Hugging Face API key

        Args:
            api_key: Hugging Face token. When omitted, the ``HF_API_TOKEN`` and
                then ``HUGGINGFACE_API_KEY`` environment variables are tried.
            model_name: Stored on the instance as ``model_name``; the
                extraction methods of this class pick their models from
                ``self.models`` instead.
        """
        self.api_key = api_key or os.getenv('HF_API_TOKEN') or os.getenv('HUGGINGFACE_API_KEY')
        self.model_name = model_name
        self.base_url = "https://api-inference.huggingface.co/models"
        # Available models for different tasks
        self.models = {
            "text_generation": "microsoft/DialoGPT-medium",
            "instruction_following": "microsoft/DialoGPT-medium",
            "question_answering": "deepset/roberta-base-squad2",
            "summarization": "facebook/bart-large-cnn",
            "ner": "dbmdz/bert-large-cased-finetuned-conll03-english"
        }
        if not self.api_key:
            logger.warning("No Hugging Face API key found. Set HF_API_TOKEN or HUGGINGFACE_API_KEY environment variable.")

    def _make_api_request(self, model_name: str, payload: Dict[str, Any], max_retries: int = 3) -> Dict[str, Any]:
        """
        Make a request to Hugging Face Inference API with retry logic

        Args:
            model_name: Model identifier appended to ``self.base_url``.
            payload: JSON-serializable request body.
            max_retries: Maximum number of POST attempts.

        Returns:
            The decoded JSON body of the first HTTP 200 response.

        Raises:
            Exception: When no attempt produced an HTTP 200 response.
        """
        # Imported here (once, not per loop iteration) because the module's
        # import block does not provide `time`.
        import time

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        url = f"{self.base_url}/{model_name}"
        last_attempt = max_retries - 1

        for attempt in range(max_retries):
            try:
                response = requests.post(url, headers=headers, json=payload, timeout=60)
                if response.status_code == 200:
                    return response.json()
                elif response.status_code == 503:
                    # Model is loading, wait and retry
                    logger.info(f"Model {model_name} is loading, waiting...")
                    # Sleeping after the final attempt would only delay the
                    # failure below, so wait only when another try follows.
                    if attempt < last_attempt:
                        time.sleep(15)
                    continue
                else:
                    # Any other status is treated as non-retryable.
                    logger.error(f"API request failed: {response.status_code} - {response.text}")
                    break
            except requests.exceptions.RequestException as e:
                logger.error(f"Request failed (attempt {attempt + 1}): {e}")
                if attempt < last_attempt:
                    time.sleep(3)
                    continue
                break

        raise Exception(f"Failed to get response from {model_name} after {max_retries} attempts")

    @staticmethod
    def _fallback_extraction(text: str) -> Dict[str, Any]:
        """Run the project's regex/spaCy extractor used when the AI path is unavailable."""
        # Imported lazily, as in the original code paths that fall back.
        from utils.extractor_fixed import extract_sections_spacy_fixed
        return extract_sections_spacy_fixed(text)

    def extract_sections_ai(self, text: str) -> Dict[str, Any]:
        """
        Use Hugging Face AI models to extract resume sections in a structured format

        Args:
            text: Raw resume text.

        Returns:
            A dict with the keys ``Name``, ``Summary``, ``Skills``,
            ``StructuredExperiences``, ``Education`` and ``Training`` when the
            AI path succeeds; otherwise whatever
            ``utils.extractor_fixed.extract_sections_spacy_fixed`` returns.
        """
        if not self.api_key:
            logger.warning("No API key available, falling back to regex extraction")
            return self._fallback_extraction(text)

        try:
            # Extract different sections using Hugging Face models
            result = {
                "Name": self._extract_name_hf(text),
                "Summary": self._extract_summary_hf(text),
                "Skills": self._extract_skills_hf(text),
                "StructuredExperiences": self._extract_experiences_hf(text),
                "Education": self._extract_education_hf(text),
                "Training": []
            }
            # NOTE(review): the leading glyph of this message looks mis-encoded
            # in this copy of the file - confirm against the original source.
            logger.info("β Hugging Face AI extraction completed")
            return self._post_process_extraction(result)
        except Exception as e:
            logger.error(f"Hugging Face AI extraction failed: {e}")
            # Fallback to regex-based extraction
            return self._fallback_extraction(text)

    def _extract_name_hf(self, text: str) -> str:
        """Extract name using Hugging Face question-answering model

        Only the first 1000 characters are sent as context. The model's answer
        is accepted only if it starts with two capitalized words; otherwise the
        regex fallback is used.
        """
        try:
            payload = {
                "inputs": {
                    "question": "What is the person's full name?",
                    "context": text[:1000]  # First 1000 chars should contain name
                }
            }
            response = self._make_api_request(self.models["question_answering"], payload)
            if response and "answer" in response:
                name = response["answer"].strip()
                # Validate name format
                if re.match(r'^[A-Z][a-z]+ [A-Z][a-z]+', name):
                    return name
        except Exception as e:
            logger.warning(f"HF name extraction failed: {e}")

        # Fallback to regex
        return self._extract_name_regex(text)

    def _extract_summary_hf(self, text: str) -> str:
        """Extract summary using Hugging Face summarization model

        The summary section is located with a regex. Sections longer than 500
        characters are sent to the summarization model; shorter ones are
        returned as found. If no section is found or an error occurs, the regex
        fallback is used.
        """
        try:
            # Find summary section first
            summary_match = re.search(
                r'(?i)(?:professional\s+)?summary[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))',
                text, re.DOTALL
            )
            if summary_match:
                summary_text = summary_match.group(1).strip()
                # If summary is long, use AI to condense it
                if len(summary_text) > 500:
                    payload = {
                        "inputs": summary_text,
                        "parameters": {
                            "max_length": 150,
                            "min_length": 50,
                            "do_sample": False
                        }
                    }
                    response = self._make_api_request(self.models["summarization"], payload)
                    if response and isinstance(response, list) and len(response) > 0:
                        return response[0].get("summary_text", summary_text)
                return summary_text
        except Exception as e:
            logger.warning(f"HF summary extraction failed: {e}")

        # Fallback to regex
        return self._extract_summary_regex(text)

    @staticmethod
    def _keyword_pattern(keyword: str) -> str:
        """Return a regex that matches *keyword* as a standalone term.

        ``\\b`` cannot be used on a side where the keyword has a non-word
        character: in "C++," there is no word boundary after the final ``+``,
        so ``\\bC\\+\\+\\b`` never matches. A lookaround is therefore added only
        on sides where the keyword starts/ends with a word character.
        """
        prefix = r'(?<!\w)' if re.match(r'\w', keyword) else ''
        suffix = r'(?!\w)' if re.search(r'\w$', keyword) else ''
        return f"{prefix}{re.escape(keyword)}{suffix}"

    def _extract_skills_hf(self, text: str) -> List[str]:
        """Extract skills using Hugging Face NER model and regex patterns

        Skills are gathered from three sources: "Category: a, b, c" bullet
        lines of the technical-skills section, high-confidence MISC/ORG
        entities from the NER model (first 2000 characters only), and a
        case-insensitive scan for the keywords in ``_COMMON_SKILLS``.

        Returns:
            The unique skills, sorted.
        """
        skills = set()
        try:
            # First, find the technical skills section using regex
            skills_match = re.search(
                r'(?i)technical\s+skills?[:\s]*\n(.*?)(?=\n\s*(?:professional\s+experience|experience|education|projects?))',
                text, re.DOTALL
            )
            if skills_match:
                skills_text = skills_match.group(1)
                # Parse bullet-pointed skills
                # NOTE(review): the bullet glyph in this pattern looks
                # mis-encoded in this copy of the file - confirm it matches the
                # bullets used in real resumes.
                bullet_lines = re.findall(r'β\s*([^β\n]+)', skills_text)
                for line in bullet_lines:
                    if ':' in line:
                        # Format: "Category: skill1, skill2, skill3"
                        skills_part = line.split(':', 1)[1].strip()
                        for skill in re.split(r',\s*', skills_part):
                            skill = skill.strip()
                            if skill and len(skill) > 1:
                                skills.add(skill)

            # Use NER model to find additional technical terms
            try:
                payload = {
                    "inputs": text[:2000]  # Limit text length for NER
                }
                response = self._make_api_request(self.models["ner"], payload)
                if response and isinstance(response, list):
                    for entity in response:
                        if entity.get("entity_group") in ["MISC", "ORG"] and entity.get("score", 0) > 0.8:
                            word = entity.get("word", "").strip()
                            # Filter for technical-looking terms
                            if re.match(r'^[A-Za-z][A-Za-z0-9\.\-]*$', word) and len(word) > 2:
                                skills.add(word)
            except Exception as e:
                logger.warning(f"NER extraction failed: {e}")
        except Exception as e:
            logger.warning(f"HF skills extraction failed: {e}")

        # Enhanced common technical skills detection as fallback
        for skill in self._COMMON_SKILLS:
            if re.search(self._keyword_pattern(skill), text, re.IGNORECASE):
                skills.add(skill)

        return sorted(skills)

    def _extract_experiences_hf(self, text: str) -> List[Dict[str, Any]]:
        """Extract work experiences using Hugging Face question-answering model

        Job headers are found with a regex expecting four pipe-separated
        fields ("Company | Location | Title | Date"). Responsibilities for each
        job come from the QA model, with a regex fallback when the model yields
        fewer than two items or fails.

        Returns:
            A list of dicts with ``title``, ``company`` (company and location
            joined by ", "), ``date_range`` and ``responsibilities``.
        """
        experiences = []
        try:
            # First find the experience section using regex
            exp_pattern = r'(?i)(?:professional\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|$))'
            section_match = re.search(exp_pattern, text, re.DOTALL)
            if not section_match:
                return experiences
            exp_text = section_match.group(1)

            # Parse job entries with improved patterns
            # Pattern 1: Company | Location | Title | Date
            pattern1 = r'([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)'
            for company, location, title, dates in re.findall(pattern1, exp_text):
                company = company.strip()
                title = title.strip()

                # Extract responsibilities using QA model
                responsibilities = []
                try:
                    # Find the section for this specific job
                    job_section = self._find_job_section(exp_text, company, title)
                    if job_section:
                        # Use QA model to extract responsibilities
                        payload = {
                            "inputs": {
                                "question": "What are the main responsibilities and achievements?",
                                "context": job_section
                            }
                        }
                        response = self._make_api_request(self.models["question_answering"], payload)
                        if response and "answer" in response:
                            resp_text = response["answer"]
                            # Split into individual responsibilities
                            # NOTE(review): the bullet glyphs in this character
                            # class look mis-encoded - confirm against the
                            # original source.
                            responsibilities = [r.strip() for r in re.split(r'[β’β\n]', resp_text) if r.strip()]
                    # Fallback to regex if QA didn't work well
                    if len(responsibilities) < 2:
                        responsibilities = self._extract_responsibilities_regex(exp_text, company, title)
                except Exception as e:
                    logger.warning(f"HF responsibility extraction failed: {e}")
                    responsibilities = self._extract_responsibilities_regex(exp_text, company, title)

                experiences.append({
                    "title": title,
                    "company": f"{company}, {location.strip()}",
                    "date_range": dates.strip(),
                    "responsibilities": responsibilities
                })
        except Exception as e:
            logger.warning(f"HF experience extraction failed: {e}")

        return experiences

    def _extract_education_hf(self, text: str) -> List[str]:
        """Extract education using Hugging Face question-answering model

        The model's answer is split on commas/semicolons and pieces longer than
        five characters are kept. When nothing is produced, the regex fallback
        is used.
        """
        education = []
        try:
            payload = {
                "inputs": {
                    "question": "What education, degrees, or certifications does this person have?",
                    "context": text
                }
            }
            response = self._make_api_request(self.models["question_answering"], payload)
            if response and "answer" in response:
                edu_text = response["answer"]
                # Parse the education information
                for item in re.split(r'[,;]', edu_text):
                    item = item.strip()
                    if item and len(item) > 5:  # Reasonable length
                        education.append(item)
        except Exception as e:
            logger.warning(f"HF education extraction failed: {e}")

        # Fallback to regex if HF extraction didn't work
        if not education:
            education = self._extract_education_regex(text)
        return education

    def _find_job_section(self, exp_text: str, company: str, title: str) -> str:
        """Find the specific section for a job in the experience text

        Collection starts at the first line containing both *company* and
        *title* and stops before the next line that looks like a job header
        (starts with a capital letter and contains at least three pipes).

        Returns:
            The collected lines joined by newlines, or "" if no line matched.
        """
        job_lines = []
        in_job_section = False
        for line in exp_text.split('\n'):
            if company in line and title in line:
                in_job_section = True
                job_lines.append(line)
            elif in_job_section:
                if re.match(r'^[A-Z].*\|.*\|.*\|', line):  # Next job entry
                    break
                job_lines.append(line)
        return '\n'.join(job_lines)

    def _extract_name_regex(self, text: str) -> str:
        """Fallback regex name extraction

        Scans the first five lines, skipping lines that look like contact
        details, and returns the first two- or three-word capitalized name
        found at the start of a line, or "" if there is none.
        """
        for line in text.split('\n')[:5]:
            line = line.strip()
            # NOTE(review): the trailing alternatives look like mis-encoded
            # emoji in this copy of the file - confirm against the original.
            if re.search(r'@|phone|email|linkedin|github|π§|π|π', line.lower()):
                continue
            name_match = re.match(r'^([A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)', line)
            if name_match:
                return name_match.group(1)
        return ""

    def _extract_summary_regex(self, text: str) -> str:
        """Fallback regex summary extraction

        Tries a "summary" heading and then an "objective" heading; returns the
        first section whose whitespace-normalized text is longer than 50
        characters, or "" if none qualifies.
        """
        summary_patterns = [
            r'(?i)(?:professional\s+)?summary[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))',
            r'(?i)objective[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))'
        ]
        for pattern in summary_patterns:
            match = re.search(pattern, text, re.DOTALL)
            if match:
                summary = match.group(1).strip()
                summary = re.sub(r'\n+', ' ', summary)
                summary = re.sub(r'\s+', ' ', summary)
                if len(summary) > 50:
                    return summary
        return ""

    def _extract_responsibilities_regex(self, exp_text: str, company: str, title: str) -> List[str]:
        """Extract responsibilities using regex patterns

        Returns the bullet items longer than 20 characters from the job's
        section, or an empty list when the job section cannot be found.
        """
        responsibilities = []
        # Find the section for this specific job
        job_section = self._find_job_section(exp_text, company, title)
        if job_section:
            # Look for bullet points
            for bullet in re.findall(r'β\s*([^β\n]+)', job_section):
                resp = bullet.strip()
                if len(resp) > 20:  # Substantial responsibility
                    responsibilities.append(resp)
        return responsibilities

    def _extract_education_regex(self, text: str) -> List[str]:
        """Fallback regex education extraction

        Returns the bullet items longer than 10 characters from the education
        section, or an empty list when no such section is found.
        """
        education = []
        # Look for education section
        edu_pattern = r'(?i)education[:\s]*\n(.*?)(?=\n\s*(?:certifications?|projects?|$))'
        match = re.search(edu_pattern, text, re.DOTALL)
        if match:
            edu_text = match.group(1)
            # Look for degree patterns
            for bullet in re.findall(r'β\s*([^β\n]+)', edu_text):
                edu_item = bullet.strip()
                if len(edu_item) > 10:
                    education.append(edu_item)
        return education

    def _post_process_extraction(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Clean up and validate the AI-extracted data

        Missing top-level keys are filled with empty defaults, skills are
        stripped/de-duplicated/sorted (single-character entries dropped),
        experience dicts get any missing fields, and blank items are removed
        from responsibilities, education and training.

        Args:
            data: Extraction result; it is modified in place.

        Returns:
            The same ``data`` dict.
        """
        # Ensure all required fields exist
        default_structure = {
            "Name": "",
            "Summary": "",
            "Skills": [],
            "StructuredExperiences": [],
            "Education": [],
            "Training": []
        }
        # Merge with defaults
        for key, default_value in default_structure.items():
            data.setdefault(key, default_value)

        # Clean up skills (remove duplicates, empty entries)
        if data["Skills"]:
            data["Skills"] = sorted({
                skill.strip()
                for skill in data["Skills"]
                if skill and skill.strip() and len(skill.strip()) > 1
            })

        # Clean up experiences
        for exp in data["StructuredExperiences"]:
            # Ensure all experience fields exist
            exp.setdefault("title", "")
            exp.setdefault("company", "")
            exp.setdefault("date_range", "")
            exp.setdefault("responsibilities", [])
            # Clean up responsibilities
            if exp["responsibilities"]:
                exp["responsibilities"] = [
                    resp.strip()
                    for resp in exp["responsibilities"]
                    if resp and resp.strip()
                ]

        # Clean up education and training
        for field in ["Education", "Training"]:
            if data[field]:
                data[field] = [
                    item.strip()
                    for item in data[field]
                    if item and item.strip()
                ]
        return data
# Convenience function for backward compatibility | |
def extract_sections_ai(text: str) -> Dict[str, Any]:
    """
    Module-level shortcut: build a default AIResumeExtractor and run it on *text*.

    A new extractor is created on every call, so the API key is resolved from
    the environment each time. Returns whatever the extractor's
    ``extract_sections_ai`` method returns.
    """
    return AIResumeExtractor().extract_sections_ai(text)
# Test function | |
def test_ai_extraction():
    """Test the Hugging Face AI extraction with sample resume

    Manual smoke test: runs AIResumeExtractor on an embedded sample resume,
    prints the result as indented JSON and returns it. It makes no assertions.
    """
    # NOTE(review): several glyphs in the sample below (bullets, contact icons)
    # look mis-encoded in this copy of the file - confirm against the original.
    sample_text = """
Jonathan Generic Smith
πSan Diego, CA | 321-123-1234 | π§ testemail@icloud.com
Summary
Results-driven Automation Test Engineer with 8 years of experience in Selenium and Java,
specializing in automation frameworks for financial and insurance domains. Expert in designing,
developing, and executing automated test scripts, ensuring quality software delivery with CI/CD
integration. Adept at working with Agile methodologies and cross-functional teams to improve
software reliability
Technical Skills
β Selenium WebDriver, Java, TestNG, Cucumber, Jenkins, Maven
β GIT, REST APIs, Apex, Bash
β Jira, Agile, CI/CD, Docker, Kubernetes
Professional Experience
Senior Automation Test Engineer | ABC Financial Services | Jan 2021 - Present
β Led automation framework enhancements using Selenium and Java, improving test efficiency.
β Automated end-to-end UI and API testing for financial applications, reducing manual effort by 40%.
Automation Test Engineer | XYZ Insurance Solutions | Jun 2017 - Dec 2020
β Designed and implemented Selenium automation framework using Java and TestNG.
β Developed automated test scripts for insurance policy management applications.
Education
β Bachelor of Technology in Computer Science | ABC University | 2015
"""
    print("Testing Hugging Face AI extraction...")
    # The extractor resolves its API key from the environment; without one it
    # logs a warning and uses the fallback extractor instead.
    extractor = AIResumeExtractor()
    result = extractor.extract_sections_ai(sample_text)
    print("Hugging Face AI Extraction Results:")
    print(json.dumps(result, indent=2))
    return result
# Run the sample extraction only when this file is executed as a script.
if __name__ == "__main__":
    test_ai_extraction()