# TalentLensAI / utils / hf_cloud_extractor.py
# Author: Johnny — commit c2f9ec8
# feat: Complete Format_Resume.py system with OpenAI GPT-4o integration and
# template preservation - Added Format_Resume.py Streamlit page with OpenAI
# GPT-4o primary extraction, HF Cloud backup, 5-tier fallback system, template
# preservation with Qvell branding, contact info extraction, skills cleaning,
# career timeline generation, and comprehensive utils restructure (10/11 files
# required). Renamed app.py to TalentLens.py, added blank_resume.docx template,
# updated .gitignore for Salesforce exclusion.
#!/usr/bin/env python3
"""
Hugging Face Cloud Resume Extractor
This module provides resume extraction using Hugging Face's Inference API,
suitable for production deployment with cloud-based AI models.
"""
import json
import re
import logging
import requests
import os
from typing import Dict, Any, List, Optional
from time import sleep
# Configure module-level logging; INFO so extraction progress is visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class HuggingFaceCloudExtractor:
"""
Production-ready resume extractor using Hugging Face Inference API
"""
def __init__(self, api_key: Optional[str] = None, model_name: str = "microsoft/DialoGPT-medium"):
"""
Initialize the cloud extractor
Args:
api_key: Hugging Face API key (optional, will use env var if not provided)
model_name: Name of the Hugging Face model to use
"""
self.api_key = api_key or os.getenv('HF_API_TOKEN') or os.getenv('HUGGINGFACE_API_KEY')
self.model_name = model_name
self.base_url = "https://api-inference.huggingface.co/models"
# Available models for different tasks
self.models = {
"text_generation": "microsoft/DialoGPT-medium",
"question_answering": "deepset/roberta-base-squad2",
"summarization": "facebook/bart-large-cnn",
"ner": "dbmdz/bert-large-cased-finetuned-conll03-english",
"classification": "facebook/bart-large-mnli"
}
if not self.api_key:
logger.warning("No Hugging Face API key found. Set HF_API_TOKEN or HUGGINGFACE_API_KEY environment variable.")
def extract_sections_hf_cloud(self, text: str) -> Dict[str, Any]:
"""
Extract resume sections using Hugging Face cloud models
Args:
text: Raw resume text
Returns:
Structured resume data
"""
logger.info("Starting Hugging Face cloud extraction...")
if not self.api_key:
logger.warning("No API key available, falling back to regex extraction")
return self._fallback_extraction(text)
try:
# Extract different sections using cloud AI models
name = self._extract_name_cloud(text)
summary = self._extract_summary_cloud(text)
skills = self._extract_skills_cloud(text)
experiences = self._extract_experiences_cloud(text)
education = self._extract_education_cloud(text)
contact_info = self._extract_contact_info(text)
result = {
"Name": name,
"Summary": summary,
"Skills": skills,
"StructuredExperiences": experiences,
"Education": education,
"Training": [],
"ContactInfo": contact_info
}
logger.info("βœ… Hugging Face cloud extraction completed")
return result
except Exception as e:
logger.error(f"Hugging Face cloud extraction failed: {e}")
return self._fallback_extraction(text)
def _make_api_request(self, model_name: str, payload: Dict[str, Any], max_retries: int = 3) -> Dict[str, Any]:
"""
Make a request to Hugging Face Inference API with retry logic
Args:
model_name: Name of the model to use
payload: Request payload
max_retries: Maximum number of retries
Returns:
API response
"""
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
}
url = f"{self.base_url}/{model_name}"
for attempt in range(max_retries):
try:
response = requests.post(url, headers=headers, json=payload, timeout=30)
if response.status_code == 200:
return response.json()
elif response.status_code == 503:
# Model is loading, wait and retry
logger.info(f"Model {model_name} is loading, waiting...")
sleep(10)
continue
else:
logger.error(f"API request failed: {response.status_code} - {response.text}")
break
except requests.exceptions.RequestException as e:
logger.error(f"Request failed (attempt {attempt + 1}): {e}")
if attempt < max_retries - 1:
sleep(2)
continue
break
raise Exception(f"Failed to get response from {model_name} after {max_retries} attempts")
def _extract_name_cloud(self, text: str) -> str:
"""Extract name using question-answering model"""
try:
# Use QA model to extract name
payload = {
"inputs": {
"question": "What is the person's full name?",
"context": text[:1000] # First 1000 chars should contain name
}
}
response = self._make_api_request(self.models["question_answering"], payload)
if response and "answer" in response:
name = response["answer"].strip()
# Validate name format
if re.match(r'^[A-Z][a-z]+ [A-Z][a-z]+', name):
return name
except Exception as e:
logger.warning(f"Cloud name extraction failed: {e}")
# Fallback to regex
return self._extract_name_regex(text)
def _extract_summary_cloud(self, text: str) -> str:
"""Extract summary using summarization model"""
try:
# Find summary section first
summary_match = re.search(
r'(?i)(?:professional\s+)?summary[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))',
text, re.DOTALL
)
if summary_match:
summary_text = summary_match.group(1).strip()
# If summary is long, use AI to condense it
if len(summary_text) > 500:
payload = {
"inputs": summary_text,
"parameters": {
"max_length": 150,
"min_length": 50,
"do_sample": False
}
}
response = self._make_api_request(self.models["summarization"], payload)
if response and isinstance(response, list) and len(response) > 0:
return response[0].get("summary_text", summary_text)
return summary_text
except Exception as e:
logger.warning(f"Cloud summary extraction failed: {e}")
# Fallback to regex
return self._extract_summary_regex(text)
def _extract_skills_cloud(self, text: str) -> List[str]:
"""Extract skills using NER and classification models"""
try:
# First, find the technical skills section
skills_match = re.search(
r'(?i)technical\s+skills?[:\s]*\n(.*?)(?=\n\s*(?:professional\s+experience|experience|education|projects?))',
text, re.DOTALL
)
if skills_match:
skills_text = skills_match.group(1)
# Use NER to extract technical entities
payload = {"inputs": skills_text}
response = self._make_api_request(self.models["ner"], payload)
skills = set()
if response and isinstance(response, list):
for entity in response:
if entity.get("entity_group") in ["MISC", "ORG"] or "TECH" in entity.get("entity", ""):
word = entity.get("word", "").replace("##", "").strip()
if len(word) > 2:
skills.add(word)
# Also extract from bullet points using regex
regex_skills = self._extract_skills_regex(text)
skills.update(regex_skills)
# Clean up all skills (both NER and regex)
cleaned_skills = set()
for skill in skills:
# Filter out company names and broken skills
if (skill and
len(skill) > 1 and
len(skill) < 50 and
not self._is_company_name_skill(skill) and
not self._is_broken_skill(skill)):
# Fix common parsing issues
fixed_skill = self._fix_skill_name(skill)
if fixed_skill:
cleaned_skills.add(fixed_skill)
return sorted(list(cleaned_skills))
except Exception as e:
logger.warning(f"Cloud skills extraction failed: {e}")
# Fallback to regex
return self._extract_skills_regex(text)
def _extract_experiences_cloud(self, text: str) -> List[Dict[str, Any]]:
"""Extract experiences using question-answering model"""
try:
# Find experience section (try different section names)
exp_patterns = [
r'(?i)(?:work\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|page\s+\d+|$))',
r'(?i)(?:professional\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|page\s+\d+|$))'
]
exp_match = None
for pattern in exp_patterns:
exp_match = re.search(pattern, text, re.DOTALL)
if exp_match:
break
if exp_match:
exp_text = exp_match.group(1)
# Use QA to extract structured information
experiences = []
# Extract job entries using regex first
# Try 3-part format: Title | Company | Date
job_pattern_3 = r'([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)'
matches_3 = re.findall(job_pattern_3, exp_text)
# Try 4-part format: Company | Location | Title | Date
job_pattern_4 = r'([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)'
matches_4 = re.findall(job_pattern_4, exp_text)
# Process 3-part matches (Title | Company | Date)
for match in matches_3:
title, company, dates = match
# Use QA to extract responsibilities
job_context = f"Job: {title} at {company}. {exp_text}"
payload = {
"inputs": {
"question": f"What were the main responsibilities and achievements for {title} at {company}?",
"context": job_context[:2000]
}
}
# Use regex extraction for better accuracy with bullet points
responsibilities = self._extract_responsibilities_regex(exp_text, company.strip(), title.strip())
experience = {
"title": title.strip(),
"company": company.strip(),
"date_range": dates.strip(),
"responsibilities": responsibilities
}
experiences.append(experience)
# Process 4-part matches (Company | Location | Title | Date)
for match in matches_4:
company, location, title, dates = match
# Use QA to extract responsibilities
job_context = f"Job: {title} at {company}. {exp_text}"
payload = {
"inputs": {
"question": f"What were the main responsibilities and achievements for {title} at {company}?",
"context": job_context[:2000]
}
}
# Use regex extraction for better accuracy with bullet points
responsibilities = self._extract_responsibilities_regex(exp_text, company.strip(), title.strip())
experience = {
"title": title.strip(),
"company": f"{company.strip()}, {location.strip()}",
"date_range": dates.strip(),
"responsibilities": responsibilities
}
experiences.append(experience)
return experiences
except Exception as e:
logger.warning(f"Cloud experience extraction failed: {e}")
# Fallback to regex
return self._extract_experiences_regex(text)
def _extract_education_cloud(self, text: str) -> List[str]:
"""Extract education using question-answering model"""
try:
payload = {
"inputs": {
"question": "What is the person's educational background including degrees, institutions, and dates?",
"context": text
}
}
response = self._make_api_request(self.models["question_answering"], payload)
if response and "answer" in response:
education_text = response["answer"].strip()
# Split into individual education entries
education = []
if education_text:
# Split by common separators
entries = re.split(r'[;,]', education_text)
for entry in entries:
entry = entry.strip()
if len(entry) > 10:
education.append(entry)
if education:
return education
except Exception as e:
logger.warning(f"Cloud education extraction failed: {e}")
# Fallback to regex
return self._extract_education_regex(text)
def _extract_contact_info(self, text: str) -> Dict[str, str]:
"""Extract contact information (email, phone, LinkedIn)"""
contact_info = {}
# Extract email
email_match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
if email_match:
contact_info["email"] = email_match.group(0)
# Extract phone
phone_patterns = [
r'\+?1?[-.\s]?\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})',
r'(\d{3})[-.\s](\d{3})[-.\s](\d{4})',
r'\+\d{1,3}[-.\s]?\d{3}[-.\s]?\d{3}[-.\s]?\d{4}'
]
for pattern in phone_patterns:
phone_match = re.search(pattern, text)
if phone_match:
contact_info["phone"] = phone_match.group(0)
break
# Extract LinkedIn
linkedin_patterns = [
r'linkedin\.com/in/[\w-]+',
r'LinkedIn:\s*([\w-]+)',
r'linkedin\.com/[\w-]+'
]
for pattern in linkedin_patterns:
linkedin_match = re.search(pattern, text, re.IGNORECASE)
if linkedin_match:
contact_info["linkedin"] = linkedin_match.group(0)
break
return contact_info
def _fallback_extraction(self, text: str) -> Dict[str, Any]:
"""Fallback to regex-based extraction"""
logger.info("Using regex fallback extraction...")
try:
from utils.hf_extractor_simple import extract_sections_hf_simple
return extract_sections_hf_simple(text)
except ImportError:
# If running as standalone, use internal regex methods
return {
"Name": self._extract_name_regex(text),
"Summary": self._extract_summary_regex(text),
"Skills": self._extract_skills_regex(text),
"StructuredExperiences": self._extract_experiences_regex(text),
"Education": self._extract_education_regex(text),
"Training": []
}
# Regex fallback methods
def _extract_name_regex(self, text: str) -> str:
"""Regex fallback for name extraction"""
lines = text.split('\n')[:5]
for line in lines:
line = line.strip()
if re.search(r'@|phone|email|linkedin|github|πŸ“§|πŸ“ž|πŸ“', line.lower()):
continue
if len(re.findall(r'[^\w\s]', line)) > 3:
continue
name_match = re.match(r'^([A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)', line)
if name_match:
return name_match.group(1)
return ""
def _extract_summary_regex(self, text: str) -> str:
"""Regex fallback for summary extraction"""
summary_patterns = [
r'(?i)(?:professional\s+)?summary[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))',
r'(?i)objective[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))',
]
for pattern in summary_patterns:
match = re.search(pattern, text, re.DOTALL)
if match:
summary = match.group(1).strip()
summary = re.sub(r'\n+', ' ', summary)
summary = re.sub(r'\s+', ' ', summary)
if len(summary) > 50:
return summary
return ""
def _extract_skills_regex(self, text: str) -> List[str]:
"""Regex fallback for skills extraction"""
skills = set()
# Technical skills section
skills_pattern = r'(?i)technical\s+skills?[:\s]*\n(.*?)(?=\n\s*(?:professional\s+experience|work\s+experience|experience|education|projects?))'
match = re.search(skills_pattern, text, re.DOTALL)
if match:
skills_text = match.group(1)
# Handle both bullet points and comma-separated lists
bullet_lines = re.findall(r'●\s*([^●\n]+)', skills_text)
if not bullet_lines:
# If no bullets, treat as comma-separated list
bullet_lines = [skills_text.strip()]
for line in bullet_lines:
if ':' in line:
skills_part = line.split(':', 1)[1].strip()
else:
skills_part = line.strip()
# Split by commas and clean up
individual_skills = re.split(r',\s*', skills_part)
for skill in individual_skills:
skill = skill.strip()
skill = re.sub(r'\([^)]*\)', '', skill).strip() # Remove parentheses
skill = re.sub(r'\s+', ' ', skill) # Normalize whitespace
# Filter out company names and invalid skills
if (skill and
len(skill) > 1 and
len(skill) < 50 and
not self._is_company_name_skill(skill) and
not self._is_broken_skill(skill)):
skills.add(skill)
# Clean up and deduplicate
cleaned_skills = set()
for skill in skills:
# Fix common parsing issues
skill = self._fix_skill_name(skill)
if skill:
cleaned_skills.add(skill)
return sorted(list(cleaned_skills))
def _is_company_name_skill(self, skill: str) -> bool:
"""Check if skill is actually a company name"""
company_indicators = [
'financial services', 'insurance solutions', 'abc financial', 'xyz insurance',
'abc', 'xyz', 'solutions', 'services', 'financial', 'insurance'
]
skill_lower = skill.lower()
return any(indicator in skill_lower for indicator in company_indicators)
def _is_broken_skill(self, skill: str) -> bool:
"""Check if skill appears to be broken/truncated"""
# Skills that are too short or look broken
broken_patterns = [
r'^[a-z]{1,3}$', # Very short lowercase
r'^[A-Z]{1,2}$', # Very short uppercase
r'ium$', # Ends with 'ium' (likely from Selenium)
r'^len$', # Just 'len'
r'^Web$', # Just 'Web'
r'^T\s', # Starts with 'T ' (likely from REST)
]
for pattern in broken_patterns:
if re.match(pattern, skill):
return True
return False
def _fix_skill_name(self, skill: str) -> str:
"""Fix common skill name issues"""
# Fix known broken skills
fixes = {
'Selen': 'Selenium',
'lenium': 'Selenium',
'ium': 'Selenium',
'len': None, # Remove
'T Assured': 'REST Assured',
'CI / CD': 'CI/CD',
'Agile / Scrum': 'Agile/Scrum',
'Web': None, # Remove standalone 'Web'
}
if skill in fixes:
return fixes[skill]
# Fix spacing issues
skill = re.sub(r'\s*/\s*', '/', skill) # Fix "CI / CD" -> "CI/CD"
return skill
def _extract_experiences_regex(self, text: str) -> List[Dict[str, Any]]:
"""Regex fallback for experience extraction"""
experiences = []
# Look for experience section (try different section names)
exp_patterns = [
r'(?i)(?:work\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|page\s+\d+|$))',
r'(?i)(?:professional\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|page\s+\d+|$))'
]
exp_text = ""
for pattern in exp_patterns:
match = re.search(pattern, text, re.DOTALL)
if match:
exp_text = match.group(1)
break
if exp_text:
# Try 3-part format: Title | Company | Date
pattern_3 = r'([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)'
matches_3 = re.findall(pattern_3, exp_text)
# Try 4-part format: Company | Location | Title | Date
pattern_4 = r'([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)'
matches_4 = re.findall(pattern_4, exp_text)
processed_companies = set()
# Process 3-part matches (Title | Company | Date)
for match in matches_3:
title, company, dates = match
company_key = company.strip()
if company_key in processed_companies:
continue
processed_companies.add(company_key)
responsibilities = self._extract_responsibilities_regex(exp_text, company.strip(), title.strip())
experience = {
"title": title.strip(),
"company": company_key,
"date_range": dates.strip(),
"responsibilities": responsibilities
}
experiences.append(experience)
# Process 4-part matches (Company | Location | Title | Date)
for match in matches_4:
company, location, title, dates = match
company_key = f"{company.strip()}, {location.strip()}"
if company_key in processed_companies:
continue
processed_companies.add(company_key)
responsibilities = self._extract_responsibilities_regex(exp_text, company.strip(), title.strip())
experience = {
"title": title.strip(),
"company": company_key,
"date_range": dates.strip(),
"responsibilities": responsibilities
}
experiences.append(experience)
return experiences
def _extract_responsibilities_regex(self, exp_text: str, company: str, title: str) -> List[str]:
"""Regex fallback for responsibilities extraction"""
responsibilities = []
# Look for the job section - try different patterns
job_patterns = [
rf'{re.escape(title)}.*?{re.escape(company)}.*?\n(.*?)(?=\n[A-Z][^|\n-]*\s*\||$)',
rf'{re.escape(company)}.*?{re.escape(title)}.*?\n(.*?)(?=\n[A-Z][^|\n-]*\s*\||$)'
]
for pattern in job_patterns:
match = re.search(pattern, exp_text, re.DOTALL | re.IGNORECASE)
if match:
resp_text = match.group(1)
# Look for bullet points (● or -)
bullets = re.findall(r'[●-]\s*([^●\n-]+)', resp_text)
# Clean and fix responsibilities
for bullet in bullets:
bullet = bullet.strip()
bullet = re.sub(r'\s+', ' ', bullet)
# Fix common truncation issues
bullet = self._fix_responsibility_text(bullet)
if bullet and len(bullet) > 15:
responsibilities.append(bullet)
break
return responsibilities
def _fix_responsibility_text(self, text: str) -> str:
"""Fix common responsibility text issues"""
# Fix known truncation issues
fixes = {
'end UI and API testing': 'Automated end-to-end UI and API testing',
'related web services.': 'for policy-related web services.',
}
for broken, fixed in fixes.items():
if text.startswith(broken):
return fixed + text[len(broken):]
if text.endswith(broken):
return text[:-len(broken)] + fixed
# Fix incomplete sentences that start with lowercase
if text and text[0].islower() and not text.startswith('e.g.'):
# Likely a continuation, try to fix common patterns
if text.startswith('end '):
text = 'Automated ' + text
elif text.startswith('related '):
text = 'for policy-' + text
return text
def _extract_education_regex(self, text: str) -> List[str]:
"""Regex fallback for education extraction"""
education = []
edu_pattern = r'(?i)education[:\s]*\n(.*?)(?=\n\s*(?:certifications?|projects?|$))'
match = re.search(edu_pattern, text, re.DOTALL)
if match:
edu_text = match.group(1)
edu_lines = re.findall(r'●\s*([^●\n]+)', edu_text)
if not edu_lines:
edu_lines = [line.strip() for line in edu_text.split('\n') if line.strip()]
for line in edu_lines:
line = line.strip()
line = re.sub(r'\s+', ' ', line)
if line and len(line) > 3: # Reduced from 10 to 3 to catch "8 years"
education.append(line)
return education
# Convenience function for easy usage
def extract_sections_hf_cloud(text: str, api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Module-level convenience wrapper around HuggingFaceCloudExtractor.

    Args:
        text: Raw resume text.
        api_key: Optional Hugging Face API key; environment variables are
            consulted when omitted.

    Returns:
        Structured resume data.
    """
    return HuggingFaceCloudExtractor(api_key=api_key).extract_sections_hf_cloud(text)
# Test function
def test_hf_cloud_extraction():
    """Smoke-test the extractor against a small sample resume and print the result."""
    sample_text = """
Jonathan Edward Nguyen
πŸ“San Diego, CA | 858-900-5036 | πŸ“§ jonatngu@icloud.com
Summary
Sun Diego-based Software Engineer, and Developer Hackathon 2025 winner who loves building scalable
automation solutions, AI development, and optimizing workflows.
Technical Skills
● Programming Languages: Python, Java, SQL, Apex, Bash
● Frameworks & Libraries: TensorFlow, PyTorch, Scikit-learn, NumPy, Pandas
● Cloud Platforms: AWS Glue, AWS SageMaker, AWS Orchestration, REST APIs
Professional Experience
TalentLens.AI | Remote | AI Developer | Feb 2025 – Present
● Built an automated test suite for LLM prompts that export reports with performance metrics
● Architected and developed an AI-powered resume screening application using Streamlit
GoFundMe | San Diego, CA | Senior Developer in Test | Oct 2021 – Dec 2024
● Built and maintained robust API and UI test suites in Python, reducing defects by 37%
● Automated environment builds using Apex and Bash, improving deployment times by 30%
Education
● California State San Marcos (May 2012): Bachelor of Arts, Literature and Writing
"""
    hf_extractor = HuggingFaceCloudExtractor()
    extraction = hf_extractor.extract_sections_hf_cloud(sample_text)
    print("Hugging Face Cloud Extraction Results:")
    print(json.dumps(extraction, indent=2))
    return extraction
# Script entry point: run the smoke test when executed directly.
if __name__ == "__main__":
    test_hf_cloud_extraction()