Spaces:
Running
Running
Johnny
feat: Complete Format_Resume.py system with OpenAI GPT-4o integration and template preservation - Added Format_Resume.py Streamlit page with OpenAI GPT-4o primary extraction, HF Cloud backup, 5-tier fallback system, template preservation with Qvell branding, contact info extraction, skills cleaning, career timeline generation, and comprehensive utils restructure (10/11 files required). Renamed app.py to TalentLens.py, added blank_resume.docx template, updated .gitignore for Salesforce exclusion.
c2f9ec8
#!/usr/bin/env python3
"""
Simplified Hugging Face Resume Extractor

This module provides resume extraction using primarily regex patterns
with minimal Hugging Face model usage for specific tasks only.
This approach is more reliable and faster than full model-based extraction.
"""

import json
import logging
import re
from typing import Dict, Any, List, Optional

# Configure module-level logging so extraction progress is visible by default.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class SimpleHFResumeExtractor:
    """
    Simplified resume extractor using primarily regex with minimal HF model usage.

    Every section parser (`name`, `summary`, `skills`, `experience`, `education`)
    is pure regex; the Hugging Face model hook exists but is currently disabled
    (see ``__init__``). NOTE(review): the bullet character ``β`` and the glyphs
    ``π§``/``π`` in the patterns below appear to be mojibake for the original
    emoji bullets — they are kept as-is because the documents this parser
    targets (see the sample in ``test_simple_hf_extraction``) contain the same
    bytes. Confirm against real input before "fixing" them.
    """

    def __init__(self):
        """Initialize the simple extractor (model usage is disabled for now)."""
        self.model_available = False
        # Try to load a lightweight model for name extraction only.
        try:
            # Only load if really needed and use the smallest possible model.
            logger.info("Simple HF extractor initialized (regex-based)")
            self.model_available = False  # Disable model usage for now
        except Exception as e:
            logger.info("No HF model loaded, using pure regex approach: %s", e)
            self.model_available = False

    def extract_sections_hf_simple(self, text: str) -> Dict[str, Any]:
        """
        Extract resume sections using the simplified approach.

        Args:
            text: Raw resume text.

        Returns:
            Structured resume data with keys ``Name``, ``Summary``, ``Skills``,
            ``StructuredExperiences``, ``Education`` and ``Training`` (always
            empty here). On any parsing error, falls back to the project's
            spaCy-based extractor.
        """
        logger.info("Starting simplified HF extraction...")
        try:
            # Extract different sections using optimized regex patterns.
            name = self._extract_name_simple(text)
            summary = self._extract_summary_simple(text)
            skills = self._extract_skills_simple(text)
            experiences = self._extract_experiences_simple(text)
            education = self._extract_education_simple(text)
            result = {
                "Name": name,
                "Summary": summary,
                "Skills": skills,
                "StructuredExperiences": experiences,
                "Education": education,
                "Training": [],
            }
            logger.info("β Simplified HF extraction completed")
            return result
        except Exception as e:
            logger.error("Simplified HF extraction failed: %s", e)
            # Fallback to regex-based extraction from the project utils.
            from utils.extractor_fixed import extract_sections_spacy_fixed
            return extract_sections_spacy_fixed(text)

    def _extract_name_simple(self, text: str) -> str:
        """Extract a candidate name from the first 5 lines using regex patterns.

        Returns "" when no plausible name line is found.
        """
        lines = text.split('\n')[:5]  # Check first 5 lines
        for line in lines:
            line = line.strip()
            # Skip lines with contact info (glyphs are mojibake, see class note).
            if re.search(r'@|phone|email|linkedin|github|π§|π|π', line.lower()):
                continue
            # Skip lines with too many special characters.
            if len(re.findall(r'[^\w\s]', line)) > 3:
                continue
            # Look for 2-3 capitalized words at the start of the line.
            name_match = re.match(r'^([A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)', line)
            if name_match:
                return name_match.group(1)
        return ""

    def _extract_summary_simple(self, text: str) -> str:
        """Extract the professional summary/objective/profile section.

        Returns "" unless a section of more than 50 characters is found.
        """
        # Section body runs until the next known heading (skills/experience/education).
        summary_patterns = [
            r'(?i)(?:professional\s+)?summary[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))',
            r'(?i)objective[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))',
            r'(?i)profile[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))',
        ]
        for pattern in summary_patterns:
            match = re.search(pattern, text, re.DOTALL)
            if match:
                summary = match.group(1).strip()
                # Collapse newlines and runs of whitespace into single spaces.
                summary = re.sub(r'\n+', ' ', summary)
                summary = re.sub(r'\s+', ' ', summary)
                if len(summary) > 50:  # Ensure it's substantial
                    return summary
        return ""

    def _extract_skills_simple(self, text: str) -> List[str]:
        """Extract a sorted, de-duplicated list of skills.

        Combines two sources: a bullet-pointed "Technical Skills" section
        (``Category: skill1, skill2`` lines) and a whole-text scan for a
        fixed list of common technical skill names.
        """
        skills = set()
        # Technical-skills section body runs until the next known heading.
        skills_pattern = r'(?i)technical\s+skills?[:\s]*\n(.*?)(?=\n\s*(?:professional\s+experience|experience|education|projects?))'
        match = re.search(skills_pattern, text, re.DOTALL)
        if match:
            skills_text = match.group(1)
            # Parse bullet-pointed skills (bullet char is mojibake, see class note).
            bullet_lines = re.findall(r'β\s*([^β\n]+)', skills_text)
            for line in bullet_lines:
                if ':' in line:
                    # Format: "Category: skill1, skill2, skill3"
                    skills_part = line.split(':', 1)[1].strip()
                    individual_skills = re.split(r',\s*', skills_part)
                    for skill in individual_skills:
                        skill = skill.strip()
                        # Drop parenthetical qualifiers, e.g. "Python (3.x)" -> "Python".
                        skill = re.sub(r'\([^)]*\)', '', skill).strip()
                        if skill and 1 < len(skill) < 50:  # Reasonable length
                            skills.add(skill)
        # Whole-text detection of well-known technical skills.
        common_skills = [
            'Python', 'Java', 'JavaScript', 'TypeScript', 'C++', 'C#', 'SQL', 'NoSQL',
            'React', 'Angular', 'Vue', 'Node.js', 'Django', 'Flask', 'Spring',
            'AWS', 'Azure', 'GCP', 'Docker', 'Kubernetes', 'Jenkins',
            'Git', 'GitHub', 'GitLab', 'Jira', 'Confluence',
            'TensorFlow', 'PyTorch', 'Scikit-learn', 'Pandas', 'NumPy', 'Matplotlib', 'Seaborn',
            'MySQL', 'PostgreSQL', 'MongoDB', 'Redis',
            'Linux', 'Windows', 'MacOS', 'Ubuntu',
            'Selenium', 'Pytest', 'TestNG', 'Postman',
            'AWS Glue', 'AWS SageMaker', 'REST APIs', 'Apex', 'Bash',
        ]
        for skill in common_skills:
            if re.search(rf'\b{re.escape(skill)}\b', text, re.IGNORECASE):
                skills.add(skill)
        return sorted(skills)

    def _extract_experiences_simple(self, text: str) -> List[Dict[str, Any]]:
        """Extract work experiences as dicts with title/company/date_range/responsibilities.

        Only handles pipe-delimited job headers:
        ``Company | Location | Title | Date``.
        """
        experiences = []
        # Experience section body runs until the next known heading or EOF.
        exp_pattern = r'(?i)(?:professional\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|$))'
        match = re.search(exp_pattern, text, re.DOTALL)
        if not match:
            return experiences
        exp_text = match.group(1)
        # Pattern 1: Company | Location | Title | Date
        pattern1 = r'([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)'
        matches1 = re.findall(pattern1, exp_text)
        processed_companies = set()  # Track to avoid duplicate entries per company
        for match in matches1:
            company, location, title, dates = match
            company_key = f"{company.strip()}, {location.strip()}"
            # Skip if we've already processed this company/location pair.
            if company_key in processed_companies:
                continue
            processed_companies.add(company_key)
            # Extract responsibilities for this specific job.
            responsibilities = self._extract_responsibilities_simple(
                exp_text, company.strip(), title.strip()
            )
            experience = {
                "title": title.strip(),
                "company": company_key,
                "date_range": dates.strip(),
                "responsibilities": responsibilities,
            }
            experiences.append(experience)
        return experiences

    def _extract_responsibilities_simple(self, exp_text: str, company: str, title: str) -> List[str]:
        """Extract bullet-pointed responsibilities for one specific job entry.

        Locates the ``company ... title`` header inside the experience section
        and captures bullets until the next pipe-delimited job header (or EOF).
        """
        responsibilities = []
        job_pattern = rf'{re.escape(company)}.*?{re.escape(title)}.*?\n(.*?)(?=\n[A-Z][^|\n]*\s*\||$)'
        match = re.search(job_pattern, exp_text, re.DOTALL | re.IGNORECASE)
        if match:
            resp_text = match.group(1)
            # Extract bullet points (bullet char is mojibake, see class note).
            bullets = re.findall(r'β\s*([^β\n]+)', resp_text)
            for bullet in bullets:
                bullet = bullet.strip()
                bullet = re.sub(r'\s+', ' ', bullet)  # Normalize whitespace
                if bullet and len(bullet) > 15:  # Ensure substantial content
                    responsibilities.append(bullet)
        return responsibilities

    def _extract_education_simple(self, text: str) -> List[str]:
        """Extract education entries as a list of cleaned strings."""
        education = []
        # Education section body runs until the next known heading or EOF.
        edu_pattern = r'(?i)education[:\s]*\n(.*?)(?=\n\s*(?:certifications?|projects?|$))'
        match = re.search(edu_pattern, text, re.DOTALL)
        if match:
            edu_text = match.group(1)
            # Prefer bullet points; fall back to raw lines for non-bulleted sections.
            edu_lines = re.findall(r'β\s*([^β\n]+)', edu_text)
            if not edu_lines:
                edu_lines = [line.strip() for line in edu_text.split('\n') if line.strip()]
            for line in edu_lines:
                line = line.strip()
                line = re.sub(r'\s+', ' ', line)  # Normalize whitespace
                if line and len(line) > 3:  # Reduced to catch short entries like "8 years"
                    education.append(line)
        return education
# Convenience function for easy usage
def extract_sections_hf_simple(text: str) -> Dict[str, Any]:
    """
    Extract resume sections using the simplified Hugging Face approach.

    Args:
        text: Raw resume text.

    Returns:
        Structured resume data (same shape as
        ``SimpleHFResumeExtractor.extract_sections_hf_simple``).
    """
    return SimpleHFResumeExtractor().extract_sections_hf_simple(text)
# Test function
def test_simple_hf_extraction():
    """Run the simplified HF extraction against a sample resume and print the result."""
    sample_text = """
    Jonathan Edward Nguyen
    πSan Diego, CA | 858-900-5036 | π§ jonatngu@icloud.com
    Summary
    Sun Diego-based Software Engineer, and Developer Hackathon 2025 winner who loves building scalable
    automation solutions, AI development, and optimizing workflows.
    Technical Skills
    β Programming Languages: Python, Java, SQL, Apex, Bash
    β Frameworks & Libraries: TensorFlow, PyTorch, Scikit-learn, NumPy, Pandas
    β Cloud Platforms: AWS Glue, AWS SageMaker, AWS Orchestration, REST APIs
    Professional Experience
    TalentLens.AI | Remote | AI Developer | Feb 2025 β Present
    β Built an automated test suite for LLM prompts that export reports with performance metrics
    β Architected and developed an AI-powered resume screening application using Streamlit
    GoFundMe | San Diego, CA | Senior Developer in Test | Oct 2021 β Dec 2024
    β Built and maintained robust API and UI test suites in Python, reducing defects by 37%
    β Automated environment builds using Apex and Bash, improving deployment times by 30%
    Education
    β California State San Marcos (May 2012): Bachelor of Arts, Literature and Writing
    """
    parsed = SimpleHFResumeExtractor().extract_sections_hf_simple(sample_text)
    print("Simplified HF Extraction Results:")
    print(json.dumps(parsed, indent=2))
    return parsed
# Allow running this module directly as a quick smoke test.
if __name__ == "__main__":
    test_simple_hf_extraction()