#!/usr/bin/env python3
"""
Simplified Hugging Face Resume Extractor

This module provides resume extraction using primarily regex patterns
with minimal Hugging Face model usage for specific tasks only.
This approach is more reliable and faster than full model-based extraction.
"""

import json
import re
import logging
from typing import Dict, Any, List, Optional

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Non-ASCII characters used inside patterns are written as escapes so the
# patterns keep working even if the file is re-saved with a wrong encoding.
_BULLET = "\u25cf"  # BLACK CIRCLE, the bullet glyph used in the resumes

# One bullet item: the bullet glyph followed by the rest of its line.
_BULLET_RE = re.compile(rf'{_BULLET}\s*([^{_BULLET}\n]+)')

# Lines carrying contact details (tested against the lower-cased line).
# The three escapes are the e-mail, telephone and map-pin emoji.
_CONTACT_RE = re.compile(
    "@|phone|email|linkedin|github|\U0001F4E7|\U0001F4DE|\U0001F4CD"
)

# Two or three capitalised words at the start of a line.
_NAME_RE = re.compile(r'^([A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)')

_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')
_PARENTHETICAL_RE = re.compile(r'\([^)]*\)')

# A summary ends at the next recognised section heading.
_SUMMARY_BOUNDARY = (
    r'(?=\n\s*(?:technical\s+skills?|skills?'
    r'|(?:professional\s+|work\s+)?experience|education))'
)
_SUMMARY_RES = tuple(
    re.compile(
        rf'{heading}[:\s]*\n(.*?){_SUMMARY_BOUNDARY}',
        re.IGNORECASE | re.DOTALL,
    )
    for heading in (r'(?:professional\s+)?summary', r'objective', r'profile')
)

_SKILLS_SECTION_RE = re.compile(
    r'technical\s+skills?[:\s]*\n(.*?)'
    r'(?=\n\s*(?:professional\s+experience|experience|education|projects?))',
    re.IGNORECASE | re.DOTALL,
)

# For the two patterns below, the section ends at the next known heading or
# at the end of the text; ``\s*\Z`` lets a trailing section match even when
# the text has no final newline.
_EXPERIENCE_SECTION_RE = re.compile(
    r'(?:professional\s+)?experience[:\s]*\n(.*?)'
    r'(?=\n\s*(?:education|projects?|certifications?)|\s*\Z)',
    re.IGNORECASE | re.DOTALL,
)
_EDUCATION_SECTION_RE = re.compile(
    r'education[:\s]*\n(.*?)'
    r'(?=\n\s*(?:certifications?|projects?)|\s*\Z)',
    re.IGNORECASE | re.DOTALL,
)

# Job header line: Company | Location | Title | Date
_JOB_HEADER_RE = re.compile(
    r'([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)'
)

_COMMON_SKILLS = (
    'Python', 'Java', 'JavaScript', 'TypeScript', 'C++', 'C#', 'SQL', 'NoSQL',
    'React', 'Angular', 'Vue', 'Node.js', 'Django', 'Flask', 'Spring',
    'AWS', 'Azure', 'GCP', 'Docker', 'Kubernetes', 'Jenkins',
    'Git', 'GitHub', 'GitLab', 'Jira', 'Confluence',
    'TensorFlow', 'PyTorch', 'Scikit-learn', 'Pandas', 'NumPy', 'Matplotlib',
    'Seaborn', 'MySQL', 'PostgreSQL', 'MongoDB', 'Redis',
    'Linux', 'Windows', 'MacOS', 'Ubuntu',
    'Selenium', 'Pytest', 'TestNG', 'Postman',
    'AWS Glue', 'AWS SageMaker', 'REST APIs', 'Apex', 'Bash',
)

# ``\b`` cannot be used around names such as "C++" or "C#": a word boundary
# after "+" or "#" only exists when a word character follows, so "C++," would
# never match.  Look-arounds express "not glued to a word character" and work
# for every entry.
_COMMON_SKILL_RES = tuple(
    (skill, re.compile(rf'(?<!\w){re.escape(skill)}(?!\w)', re.IGNORECASE))
    for skill in _COMMON_SKILLS
)


class SimpleHFResumeExtractor:
    """
    Simplified resume extractor using primarily regex with minimal HF model usage
    """

    def __init__(self):
        """Initialize the simple extractor (no model is loaded)."""
        # Model usage is disabled for now; extraction is purely regex-based.
        self.model_available = False
        logger.info("Simple HF extractor initialized (regex-based)")

    def extract_sections_hf_simple(self, text: str) -> Dict[str, Any]:
        """
        Extract resume sections using simplified approach

        Args:
            text: Raw resume text

        Returns:
            Structured resume data with the keys "Name", "Summary", "Skills",
            "StructuredExperiences", "Education" and "Training".
        """
        logger.info("Starting simplified HF extraction...")

        try:
            result = {
                "Name": self._extract_name_simple(text),
                "Summary": self._extract_summary_simple(text),
                "Skills": self._extract_skills_simple(text),
                "StructuredExperiences": self._extract_experiences_simple(text),
                "Education": self._extract_education_simple(text),
                "Training": [],
            }
        except Exception as e:
            # Deliberate best-effort boundary: any failure falls back to the
            # project's other regex-based extractor.
            logger.error("Simplified HF extraction failed: %s", e)
            from utils.extractor_fixed import extract_sections_spacy_fixed
            return extract_sections_spacy_fixed(text)

        logger.info("✅ Simplified HF extraction completed")
        return result

    def _extract_name_simple(self, text: str) -> str:
        """
        Extract the candidate name from the first five lines of the text.

        Args:
            text: Raw resume text

        Returns:
            The first run of two or three capitalised words found at the start
            of a line that is not a contact line, or "" when none is found.
        """
        for line in text.split('\n')[:5]:
            line = line.strip()

            # Skip lines with contact info
            if _CONTACT_RE.search(line.lower()):
                continue

            # Skip lines with too many special characters
            if len(_PUNCTUATION_RE.findall(line)) > 3:
                continue

            name_match = _NAME_RE.match(line)
            if name_match:
                return name_match.group(1)

        return ""

    def _extract_summary_simple(self, text: str) -> str:
        """
        Extract the professional summary.

        Tries a "Summary", then an "Objective", then a "Profile" heading and
        takes the text up to the next skills / experience / education heading.

        Args:
            text: Raw resume text

        Returns:
            The summary collapsed onto one line, or "" when no candidate longer
            than 50 characters is found.
        """
        for pattern in _SUMMARY_RES:
            match = pattern.search(text)
            if not match:
                continue

            summary = _WHITESPACE_RE.sub(' ', match.group(1).strip())
            if len(summary) > 50:  # Ensure it's substantial
                return summary

        return ""

    def _extract_skills_simple(self, text: str) -> List[str]:
        """
        Extract skills from the "Technical Skills" section and the whole text.

        Bulleted lines of the form "Category: skill1, skill2" contribute their
        individual skills; in addition every entry of the built-in common
        skills list that occurs anywhere in the text is included.

        Args:
            text: Raw resume text

        Returns:
            Sorted list of unique skill names.
        """
        skills = set()

        section = _SKILLS_SECTION_RE.search(text)
        if section:
            for line in _BULLET_RE.findall(section.group(1)):
                if ':' not in line:
                    continue

                # Format: "Category: skill1, skill2, skill3"
                skills_part = line.split(':', 1)[1]
                # Drop parenthetical notes BEFORE splitting on commas, so that
                # "Python (NumPy, Pandas)" is not torn into broken fragments.
                skills_part = _PARENTHETICAL_RE.sub('', skills_part)

                for skill in skills_part.split(','):
                    skill = skill.strip()
                    if 1 < len(skill) < 50:  # Reasonable length
                        skills.add(skill)

        for skill, pattern in _COMMON_SKILL_RES:
            if pattern.search(text):
                skills.add(skill)

        return sorted(skills)

    def _extract_experiences_simple(self, text: str) -> List[Dict[str, Any]]:
        """
        Extract work experiences from the experience section.

        Job entries are recognised by a header line of the form
        "Company | Location | Title | Date"; the bullets between one header
        and the next belong to that job.

        Args:
            text: Raw resume text

        Returns:
            One dict per job with the keys "title", "company"
            ("Company, Location"), "date_range" and "responsibilities".
        """
        experiences = []

        section = _EXPERIENCE_SECTION_RE.search(text)
        if not section:
            return experiences

        exp_text = section.group(1)
        headers = list(_JOB_HEADER_RE.finditer(exp_text))
        seen = set()  # Track exact duplicates only

        for index, header in enumerate(headers):
            company, location, title, dates = (
                part.strip() for part in header.groups()
            )

            # Two roles at the same company are distinct jobs, so the key
            # covers the whole header rather than just company and location.
            key = (company, location, title, dates)
            if key in seen:
                continue
            seen.add(key)

            # Restrict the search to this job's own slice of the section so a
            # company or title mentioned elsewhere cannot be picked up.
            if index + 1 < len(headers):
                job_end = headers[index + 1].start()
            else:
                job_end = len(exp_text)
            job_text = exp_text[header.start():job_end]

            experiences.append({
                "title": title,
                "company": f"{company}, {location}",
                "date_range": dates,
                "responsibilities": self._extract_responsibilities_simple(
                    job_text, company, title
                ),
            })

        return experiences

    def _extract_responsibilities_simple(self, exp_text: str, company: str, title: str) -> List[str]:
        """
        Extract the bullet points that follow a specific job header.

        Args:
            exp_text: Experience text containing the job header line
            company: Company name as it appears on the header line
            title: Job title as it appears on the header line

        Returns:
            Whitespace-normalised bullets longer than 15 characters, in order;
            empty when the header is not found.
        """
        # Company and title must sit on the SAME line (the header); the bullets
        # run until the next line that starts with a letter and contains a
        # pipe, or until the end of the text.  IGNORECASE makes [A-Z] match
        # lower-case letters as well.
        job_pattern = (
            rf'{re.escape(company)}[^\n]*?{re.escape(title)}[^\n]*\n'
            r'(.*?)(?=\n[A-Z][^|\n]*\s*\||$)'
        )
        match = re.search(job_pattern, exp_text, re.DOTALL | re.IGNORECASE)
        if not match:
            return []

        responsibilities = []
        for bullet in _BULLET_RE.findall(match.group(1)):
            bullet = _WHITESPACE_RE.sub(' ', bullet.strip())
            if len(bullet) > 15:  # Ensure substantial content
                responsibilities.append(bullet)

        return responsibilities

    def _extract_education_simple(self, text: str) -> List[str]:
        """
        Extract education entries from the education section.

        Bulleted entries are preferred; when the section has no bullets every
        non-empty line is treated as an entry.

        Args:
            text: Raw resume text

        Returns:
            Whitespace-normalised entries longer than 3 characters.
        """
        section = _EDUCATION_SECTION_RE.search(text)
        if not section:
            return []

        edu_text = section.group(1)
        edu_lines = _BULLET_RE.findall(edu_text)
        if not edu_lines:
            # Try line-by-line for non-bulleted education
            edu_lines = [line for line in edu_text.split('\n') if line.strip()]

        education = []
        for line in edu_lines:
            line = _WHITESPACE_RE.sub(' ', line.strip())
            if len(line) > 3:  # Low threshold to catch short entries like "8 years"
                education.append(line)

        return education


# Convenience function for easy usage
def extract_sections_hf_simple(text: str) -> Dict[str, Any]:
    """
    Extract resume sections using simplified Hugging Face approach

    Args:
        text: Raw resume text

    Returns:
        Structured resume data
    """
    extractor = SimpleHFResumeExtractor()
    return extractor.extract_sections_hf_simple(text)


# Test function
def test_simple_hf_extraction():
    """Run the extractor on a sample resume, print the result and return it."""

    sample_text = """
Jonathan Edward Nguyen
📍San Diego, CA | 858-900-5036 | 📧 jonatngu@icloud.com

Summary
Sun Diego-based Software Engineer, and Developer Hackathon 2025 winner who loves building scalable automation solutions, AI development, and optimizing workflows.

Technical Skills
● Programming Languages: Python, Java, SQL, Apex, Bash
● Frameworks & Libraries: TensorFlow, PyTorch, Scikit-learn, NumPy, Pandas
● Cloud Platforms: AWS Glue, AWS SageMaker, AWS Orchestration, REST APIs

Professional Experience
TalentLens.AI | Remote | AI Developer | Feb 2025 – Present
● Built an automated test suite for LLM prompts that export reports with performance metrics
● Architected and developed an AI-powered resume screening application using Streamlit

GoFundMe | San Diego, CA | Senior Developer in Test | Oct 2021 – Dec 2024
● Built and maintained robust API and UI test suites in Python, reducing defects by 37%
● Automated environment builds using Apex and Bash, improving deployment times by 30%

Education
● California State San Marcos (May 2012): Bachelor of Arts, Literature and Writing
"""

    extractor = SimpleHFResumeExtractor()
    result = extractor.extract_sections_hf_simple(sample_text)

    print("Simplified HF Extraction Results:")
    print(json.dumps(result, indent=2))

    return result


# This is a manual demo that returns its result, not a pytest test case; keep
# pytest from collecting it when the module is star-imported into a test file.
test_simple_hf_extraction.__test__ = False


if __name__ == "__main__":
    test_simple_hf_extraction()