#!/usr/bin/env python3
"""
Hugging Face Cloud Resume Extractor

This module provides resume extraction using Hugging Face's Inference API,
suitable for production deployment with cloud-based AI models.
"""

import json
import re
import logging
import requests
import os
from typing import Dict, Any, List, Optional
from time import sleep

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class HuggingFaceCloudExtractor:
    """
    Production-ready resume extractor using Hugging Face Inference API
    """

    def __init__(self, api_key: Optional[str] = None,
                 model_name: str = "microsoft/DialoGPT-medium"):
        """
        Initialize the cloud extractor

        Args:
            api_key: Hugging Face API key (optional, will use env var if not provided)
            model_name: Name of the Hugging Face model to use
        """
        self.api_key = api_key or os.getenv('HF_API_TOKEN') or os.getenv('HUGGINGFACE_API_KEY')
        self.model_name = model_name
        self.base_url = "https://api-inference.huggingface.co/models"

        # Available models for different tasks
        self.models = {
            "text_generation": "microsoft/DialoGPT-medium",
            "question_answering": "deepset/roberta-base-squad2",
            "summarization": "facebook/bart-large-cnn",
            "ner": "dbmdz/bert-large-cased-finetuned-conll03-english",
            "classification": "facebook/bart-large-mnli"
        }

        if not self.api_key:
            logger.warning("No Hugging Face API key found. Set HF_API_TOKEN or "
                           "HUGGINGFACE_API_KEY environment variable.")

    def extract_sections_hf_cloud(self, text: str) -> Dict[str, Any]:
        """
        Extract resume sections using Hugging Face cloud models

        Args:
            text: Raw resume text

        Returns:
            Structured resume data
        """
        logger.info("Starting Hugging Face cloud extraction...")

        if not self.api_key:
            logger.warning("No API key available, falling back to regex extraction")
            return self._fallback_extraction(text)

        try:
            # Extract different sections using cloud AI models
            name = self._extract_name_cloud(text)
            summary = self._extract_summary_cloud(text)
            skills = self._extract_skills_cloud(text)
            experiences = self._extract_experiences_cloud(text)
            education = self._extract_education_cloud(text)
            contact_info = self._extract_contact_info(text)

            result = {
                "Name": name,
                "Summary": summary,
                "Skills": skills,
                "StructuredExperiences": experiences,
                "Education": education,
                "Training": [],
                "ContactInfo": contact_info
            }

            logger.info("✅ Hugging Face cloud extraction completed")
            return result

        except Exception as e:
            logger.error(f"Hugging Face cloud extraction failed: {e}")
            return self._fallback_extraction(text)

    def _make_api_request(self, model_name: str, payload: Dict[str, Any],
                          max_retries: int = 3) -> Dict[str, Any]:
        """
        Make a request to Hugging Face Inference API with retry logic

        Args:
            model_name: Name of the model to use
            payload: Request payload
            max_retries: Maximum number of retries

        Returns:
            API response
        """
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

        url = f"{self.base_url}/{model_name}"

        for attempt in range(max_retries):
            try:
                response = requests.post(url, headers=headers, json=payload, timeout=30)

                if response.status_code == 200:
                    return response.json()
                elif response.status_code == 503:
                    # Model is loading, wait and retry
                    logger.info(f"Model {model_name} is loading, waiting...")
                    sleep(10)
                    continue
                else:
                    logger.error(f"API request failed: {response.status_code} - {response.text}")
                    break

            except requests.exceptions.RequestException as e:
                logger.error(f"Request failed (attempt {attempt + 1}): {e}")
                if attempt < max_retries - 1:
                    sleep(2)
                    continue
                break

        raise Exception(f"Failed to get response from {model_name} after {max_retries} attempts")
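    # Illustrative request/response shapes for _make_api_request (comment
    # only, not executed). The payload follows the hosted Inference API's
    # question-answering convention; exact response fields can vary by model,
    # so treat this as an assumed sketch rather than a guaranteed schema:
    #
    #   payload  = {"inputs": {"question": "What is the person's full name?",
    #                          "context": "Jane Doe\nSan Diego, CA ..."}}
    #   response = {"score": 0.98, "start": 0, "end": 8, "answer": "Jane Doe"}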
    def _extract_name_cloud(self, text: str) -> str:
        """Extract name using question-answering model"""
        try:
            # Use QA model to extract name
            payload = {
                "inputs": {
                    "question": "What is the person's full name?",
                    "context": text[:1000]  # First 1000 chars should contain name
                }
            }

            response = self._make_api_request(self.models["question_answering"], payload)

            if response and "answer" in response:
                name = response["answer"].strip()
                # Validate name format
                if re.match(r'^[A-Z][a-z]+ [A-Z][a-z]+', name):
                    return name

        except Exception as e:
            logger.warning(f"Cloud name extraction failed: {e}")

        # Fallback to regex
        return self._extract_name_regex(text)

    def _extract_summary_cloud(self, text: str) -> str:
        """Extract summary using summarization model"""
        try:
            # Find summary section first
            summary_match = re.search(
                r'(?i)(?:professional\s+)?summary[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))',
                text, re.DOTALL
            )

            if summary_match:
                summary_text = summary_match.group(1).strip()

                # If summary is long, use AI to condense it
                if len(summary_text) > 500:
                    payload = {
                        "inputs": summary_text,
                        "parameters": {
                            "max_length": 150,
                            "min_length": 50,
                            "do_sample": False
                        }
                    }

                    response = self._make_api_request(self.models["summarization"], payload)

                    if response and isinstance(response, list) and len(response) > 0:
                        return response[0].get("summary_text", summary_text)

                return summary_text

        except Exception as e:
            logger.warning(f"Cloud summary extraction failed: {e}")

        # Fallback to regex
        return self._extract_summary_regex(text)

    def _extract_skills_cloud(self, text: str) -> List[str]:
        """Extract skills using NER and classification models"""
        try:
            # First, find the technical skills section
            skills_match = re.search(
                r'(?i)technical\s+skills?[:\s]*\n(.*?)(?=\n\s*(?:professional\s+experience|experience|education|projects?))',
                text, re.DOTALL
            )

            if skills_match:
                skills_text = skills_match.group(1)

                # Use NER to extract technical entities
                payload = {"inputs": skills_text}
                response = self._make_api_request(self.models["ner"], payload)

                skills = set()
                if response and isinstance(response, list):
                    for entity in response:
                        if entity.get("entity_group") in ["MISC", "ORG"] or "TECH" in entity.get("entity", ""):
                            word = entity.get("word", "").replace("##", "").strip()
                            if len(word) > 2:
                                skills.add(word)

                # Also extract from bullet points using regex
                regex_skills = self._extract_skills_regex(text)
                skills.update(regex_skills)

                # Clean up all skills (both NER and regex)
                cleaned_skills = set()
                for skill in skills:
                    # Filter out company names and broken skills
                    if (skill and len(skill) > 1 and len(skill) < 50 and
                            not self._is_company_name_skill(skill) and
                            not self._is_broken_skill(skill)):
                        # Fix common parsing issues
                        fixed_skill = self._fix_skill_name(skill)
                        if fixed_skill:
                            cleaned_skills.add(fixed_skill)

                return sorted(list(cleaned_skills))

        except Exception as e:
            logger.warning(f"Cloud skills extraction failed: {e}")

        # Fallback to regex
        return self._extract_skills_regex(text)
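    # Illustrative NER response consumed by _extract_skills_cloud (assumed
    # shape, based on typical token-classification output; scores abridged):
    #
    #   [{"entity_group": "MISC", "word": "Python", "score": 0.99},
    #    {"entity_group": "ORG",  "word": "AWS",    "score": 0.97}]
    #
    # BERT wordpieces can surface as "##" fragments (e.g. "##Flow"), which is
    # why words are cleaned with replace("##", "") before being kept.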
    def _extract_experiences_cloud(self, text: str) -> List[Dict[str, Any]]:
        """Extract structured experiences (regex bullet parsing is more
        accurate than QA for this task, so it is used directly)"""
        try:
            # Find experience section (try different section names)
            exp_patterns = [
                r'(?i)(?:work\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|page\s+\d+|$))',
                r'(?i)(?:professional\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|page\s+\d+|$))'
            ]

            exp_match = None
            for pattern in exp_patterns:
                exp_match = re.search(pattern, text, re.DOTALL)
                if exp_match:
                    break

            if exp_match:
                exp_text = exp_match.group(1)
                experiences = []

                # Extract job header lines. Patterns are anchored to whole
                # lines (^...$ with re.MULTILINE) so a four-part header is not
                # also consumed by the three-part pattern, which would create
                # duplicate entries.
                # 3-part format: Title | Company | Date
                job_pattern_3 = r'^([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)\s*$'
                matches_3 = re.findall(job_pattern_3, exp_text, re.MULTILINE)

                # 4-part format: Company | Location | Title | Date
                job_pattern_4 = r'^([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)\s*$'
                matches_4 = re.findall(job_pattern_4, exp_text, re.MULTILINE)

                # Process 3-part matches (Title | Company | Date). Regex
                # extraction of bullet points is used for responsibilities.
                for title, company, dates in matches_3:
                    responsibilities = self._extract_responsibilities_regex(
                        exp_text, company.strip(), title.strip())

                    experiences.append({
                        "title": title.strip(),
                        "company": company.strip(),
                        "date_range": dates.strip(),
                        "responsibilities": responsibilities
                    })

                # Process 4-part matches (Company | Location | Title | Date)
                for company, location, title, dates in matches_4:
                    responsibilities = self._extract_responsibilities_regex(
                        exp_text, company.strip(), title.strip())

                    experiences.append({
                        "title": title.strip(),
                        "company": f"{company.strip()}, {location.strip()}",
                        "date_range": dates.strip(),
                        "responsibilities": responsibilities
                    })

                return experiences

        except Exception as e:
            logger.warning(f"Cloud experience extraction failed: {e}")

        # Fallback to regex
        return self._extract_experiences_regex(text)
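    # The two pipe-delimited job-header layouts handled above:
    #
    #   3-part (Title | Company | Date), e.g. a hypothetical
    #     "AI Developer | TalentLens.AI | Feb 2025 – Present"
    #   4-part (Company | Location | Title | Date), as in the sample resume
    #     at the bottom of this file:
    #     "TalentLens.AI | Remote | AI Developer | Feb 2025 – Present"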
    def _extract_education_cloud(self, text: str) -> List[str]:
        """Extract education using question-answering model"""
        try:
            payload = {
                "inputs": {
                    "question": "What is the person's educational background including degrees, institutions, and dates?",
                    "context": text
                }
            }

            response = self._make_api_request(self.models["question_answering"], payload)

            if response and "answer" in response:
                education_text = response["answer"].strip()

                # Split into individual education entries
                education = []
                if education_text:
                    # Split by common separators
                    entries = re.split(r'[;,]', education_text)
                    for entry in entries:
                        entry = entry.strip()
                        if len(entry) > 10:
                            education.append(entry)

                if education:
                    return education

        except Exception as e:
            logger.warning(f"Cloud education extraction failed: {e}")

        # Fallback to regex
        return self._extract_education_regex(text)

    def _extract_contact_info(self, text: str) -> Dict[str, str]:
        """Extract contact information (email, phone, LinkedIn)"""
        contact_info = {}

        # Extract email
        email_match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
        if email_match:
            contact_info["email"] = email_match.group(0)

        # Extract phone
        phone_patterns = [
            r'\+?1?[-.\s]?\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})',
            r'(\d{3})[-.\s](\d{3})[-.\s](\d{4})',
            r'\+\d{1,3}[-.\s]?\d{3}[-.\s]?\d{3}[-.\s]?\d{4}'
        ]

        for pattern in phone_patterns:
            phone_match = re.search(pattern, text)
            if phone_match:
                contact_info["phone"] = phone_match.group(0)
                break

        # Extract LinkedIn
        linkedin_patterns = [
            r'linkedin\.com/in/[\w-]+',
            r'LinkedIn:\s*([\w-]+)',
            r'linkedin\.com/[\w-]+'
        ]

        for pattern in linkedin_patterns:
            linkedin_match = re.search(pattern, text, re.IGNORECASE)
            if linkedin_match:
                contact_info["linkedin"] = linkedin_match.group(0)
                break

        return contact_info

    def _fallback_extraction(self, text: str) -> Dict[str, Any]:
        """Fallback to regex-based extraction"""
        logger.info("Using regex fallback extraction...")

        try:
            from utils.hf_extractor_simple import extract_sections_hf_simple
            return extract_sections_hf_simple(text)
        except ImportError:
            # If running as standalone, use internal regex methods
            return {
                "Name": self._extract_name_regex(text),
                "Summary": self._extract_summary_regex(text),
                "Skills": self._extract_skills_regex(text),
                "StructuredExperiences": self._extract_experiences_regex(text),
                "Education": self._extract_education_regex(text),
                "Training": []
            }

    # Regex fallback methods

    def _extract_name_regex(self, text: str) -> str:
        """Regex fallback for name extraction"""
        lines = text.split('\n')[:5]
        for line in lines:
            line = line.strip()
            if re.search(r'@|phone|email|linkedin|github|📧|📞|📍', line.lower()):
                continue
            if len(re.findall(r'[^\w\s]', line)) > 3:
                continue
            name_match = re.match(r'^([A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)', line)
            if name_match:
                return name_match.group(1)
        return ""

    def _extract_summary_regex(self, text: str) -> str:
        """Regex fallback for summary extraction"""
        summary_patterns = [
            r'(?i)(?:professional\s+)?summary[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))',
            r'(?i)objective[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))',
        ]

        for pattern in summary_patterns:
            match = re.search(pattern, text, re.DOTALL)
            if match:
                summary = match.group(1).strip()
                summary = re.sub(r'\n+', ' ', summary)
                summary = re.sub(r'\s+', ' ', summary)
                if len(summary) > 50:
                    return summary
        return ""

    def _extract_skills_regex(self, text: str) -> List[str]:
        """Regex fallback for skills extraction"""
        skills = set()

        # Technical skills section
        skills_pattern = r'(?i)technical\s+skills?[:\s]*\n(.*?)(?=\n\s*(?:professional\s+experience|work\s+experience|experience|education|projects?))'
        match = re.search(skills_pattern, text, re.DOTALL)

        if match:
            skills_text = match.group(1)

            # Handle both bullet points and comma-separated lists
            bullet_lines = re.findall(r'●\s*([^●\n]+)', skills_text)
            if not bullet_lines:
                # If no bullets, treat as comma-separated list
                bullet_lines = [skills_text.strip()]

            for line in bullet_lines:
                if ':' in line:
                    skills_part = line.split(':', 1)[1].strip()
                else:
                    skills_part = line.strip()

                # Split by commas and clean up
                individual_skills = re.split(r',\s*', skills_part)
                for skill in individual_skills:
                    skill = skill.strip()
                    skill = re.sub(r'\([^)]*\)', '', skill).strip()  # Remove parentheses
                    skill = re.sub(r'\s+', ' ', skill)  # Normalize whitespace

                    # Filter out company names and invalid skills
                    if (skill and len(skill) > 1 and len(skill) < 50 and
                            not self._is_company_name_skill(skill) and
                            not self._is_broken_skill(skill)):
                        skills.add(skill)

        # Clean up and deduplicate
        cleaned_skills = set()
        for skill in skills:
            # Fix common parsing issues
            skill = self._fix_skill_name(skill)
            if skill:
                cleaned_skills.add(skill)

        return sorted(list(cleaned_skills))
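    # Worked example for _extract_skills_regex, using a bullet from the
    # sample resume below:
    #
    #   "● Programming Languages: Python, Java, SQL, Apex, Bash"
    #
    # splits on the first ":" to "Python, Java, SQL, Apex, Bash", then on
    # commas, yielding the skills {"Python", "Java", "SQL", "Apex", "Bash"}.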
    def _is_company_name_skill(self, skill: str) -> bool:
        """Check if skill is actually a company name"""
        company_indicators = [
            'financial services', 'insurance solutions', 'abc financial',
            'xyz insurance', 'abc', 'xyz', 'solutions', 'services',
            'financial', 'insurance'
        ]

        skill_lower = skill.lower()
        return any(indicator in skill_lower for indicator in company_indicators)

    def _is_broken_skill(self, skill: str) -> bool:
        """Check if skill appears to be broken/truncated"""
        # Skills that are too short or look broken
        broken_patterns = [
            r'^[a-z]{1,3}$',   # Very short lowercase
            r'^[A-Z]{1,2}$',   # Very short uppercase
            r'ium$',           # Ends with 'ium' (likely from Selenium)
            r'^len$',          # Just 'len'
            r'^Web$',          # Just 'Web'
            r'^T\s',           # Starts with 'T ' (likely from REST)
        ]

        for pattern in broken_patterns:
            if re.match(pattern, skill):
                return True
        return False

    def _fix_skill_name(self, skill: str) -> Optional[str]:
        """Fix common skill name issues; returns None for skills to drop"""
        # Fix known broken skills
        fixes = {
            'Selen': 'Selenium',
            'lenium': 'Selenium',
            'ium': 'Selenium',
            'len': None,  # Remove
            'T Assured': 'REST Assured',
            'CI / CD': 'CI/CD',
            'Agile / Scrum': 'Agile/Scrum',
            'Web': None,  # Remove standalone 'Web'
        }

        if skill in fixes:
            return fixes[skill]

        # Fix spacing issues
        skill = re.sub(r'\s*/\s*', '/', skill)  # Fix "CI / CD" -> "CI/CD"

        return skill
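    # A few _fix_skill_name mappings in action (taken from the fixes table
    # and the spacing substitution above):
    #
    #   "CI / CD"  -> "CI/CD"
    #   "Selen"    -> "Selenium"
    #   "Web"      -> None   (callers drop falsy results)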
    def _extract_experiences_regex(self, text: str) -> List[Dict[str, Any]]:
        """Regex fallback for experience extraction"""
        experiences = []

        # Look for experience section (try different section names)
        exp_patterns = [
            r'(?i)(?:work\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|page\s+\d+|$))',
            r'(?i)(?:professional\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|page\s+\d+|$))'
        ]

        exp_text = ""
        for pattern in exp_patterns:
            match = re.search(pattern, text, re.DOTALL)
            if match:
                exp_text = match.group(1)
                break

        if exp_text:
            # Patterns are anchored to whole lines so a four-part header is
            # not also matched by the three-part pattern (see the cloud
            # variant above).
            # 3-part format: Title | Company | Date
            pattern_3 = r'^([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)\s*$'
            matches_3 = re.findall(pattern_3, exp_text, re.MULTILINE)

            # 4-part format: Company | Location | Title | Date
            pattern_4 = r'^([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)\s*$'
            matches_4 = re.findall(pattern_4, exp_text, re.MULTILINE)

            processed_companies = set()

            # Process 3-part matches (Title | Company | Date)
            for title, company, dates in matches_3:
                company_key = company.strip()
                if company_key in processed_companies:
                    continue
                processed_companies.add(company_key)

                responsibilities = self._extract_responsibilities_regex(
                    exp_text, company.strip(), title.strip())

                experiences.append({
                    "title": title.strip(),
                    "company": company_key,
                    "date_range": dates.strip(),
                    "responsibilities": responsibilities
                })

            # Process 4-part matches (Company | Location | Title | Date)
            for company, location, title, dates in matches_4:
                company_key = f"{company.strip()}, {location.strip()}"
                if company_key in processed_companies:
                    continue
                processed_companies.add(company_key)

                responsibilities = self._extract_responsibilities_regex(
                    exp_text, company.strip(), title.strip())

                experiences.append({
                    "title": title.strip(),
                    "company": company_key,
                    "date_range": dates.strip(),
                    "responsibilities": responsibilities
                })

        return experiences

    def _extract_responsibilities_regex(self, exp_text: str, company: str, title: str) -> List[str]:
        """Regex fallback for responsibilities extraction"""
        responsibilities = []

        # Look for the job section - try different patterns
        job_patterns = [
            rf'{re.escape(title)}.*?{re.escape(company)}.*?\n(.*?)(?=\n[A-Z][^|\n-]*\s*\||$)',
            rf'{re.escape(company)}.*?{re.escape(title)}.*?\n(.*?)(?=\n[A-Z][^|\n-]*\s*\||$)'
        ]

        for pattern in job_patterns:
            match = re.search(pattern, exp_text, re.DOTALL | re.IGNORECASE)
            if match:
                resp_text = match.group(1)

                # Look for bullet points (● or -)
                bullets = re.findall(r'[●-]\s*([^●\n-]+)', resp_text)

                # Clean and fix responsibilities
                for bullet in bullets:
                    bullet = bullet.strip()
                    bullet = re.sub(r'\s+', ' ', bullet)
                    # Fix common truncation issues
                    bullet = self._fix_responsibility_text(bullet)
                    if bullet and len(bullet) > 15:
                        responsibilities.append(bullet)
                break

        return responsibilities

    def _fix_responsibility_text(self, text: str) -> str:
        """Fix common responsibility text issues"""
        # Fix known truncation issues
        fixes = {
            'end UI and API testing': 'Automated end-to-end UI and API testing',
            'related web services.': 'for policy-related web services.',
        }

        for broken, fixed in fixes.items():
            if text.startswith(broken):
                return fixed + text[len(broken):]
            if text.endswith(broken):
                return text[:-len(broken)] + fixed

        # Fix incomplete sentences that start with lowercase
        if text and text[0].islower() and not text.startswith('e.g.'):
            # Likely a continuation, try to fix common patterns
            if text.startswith('end '):
                text = 'Automated ' + text
            elif text.startswith('related '):
                text = 'for policy-' + text

        return text

    def _extract_education_regex(self, text: str) -> List[str]:
        """Regex fallback for education extraction"""
        education = []

        edu_pattern = r'(?i)education[:\s]*\n(.*?)(?=\n\s*(?:certifications?|projects?|$))'
        match = re.search(edu_pattern, text, re.DOTALL)

        if match:
            edu_text = match.group(1)
            edu_lines = re.findall(r'●\s*([^●\n]+)', edu_text)
            if not edu_lines:
                edu_lines = [line.strip() for line in edu_text.split('\n') if line.strip()]

            for line in edu_lines:
                line = line.strip()
                line = re.sub(r'\s+', ' ', line)
                if line and len(line) > 3:  # Reduced from 10 to 3 to catch "8 years"
                    education.append(line)

        return education


# Convenience function for easy usage
def extract_sections_hf_cloud(text: str, api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Extract resume sections using Hugging Face cloud models

    Args:
        text: Raw resume text
        api_key: Hugging Face API key (optional)

    Returns:
        Structured resume data
    """
    extractor = HuggingFaceCloudExtractor(api_key=api_key)
    return extractor.extract_sections_hf_cloud(text)


# Test function
def test_hf_cloud_extraction():
    """Test the Hugging Face cloud extraction with a sample resume"""
    sample_text = """
    Jonathan Edward Nguyen
    📍San Diego, CA | 858-900-5036 | 📧 jonatngu@icloud.com

    Summary
    Sun Diego-based Software Engineer, and Developer Hackathon 2025 winner who
    loves building scalable automation solutions, AI development, and
    optimizing workflows.

    Technical Skills
    ● Programming Languages: Python, Java, SQL, Apex, Bash
    ● Frameworks & Libraries: TensorFlow, PyTorch, Scikit-learn, NumPy, Pandas
    ● Cloud Platforms: AWS Glue, AWS SageMaker, AWS Orchestration, REST APIs

    Professional Experience
    TalentLens.AI | Remote | AI Developer | Feb 2025 – Present
    ● Built an automated test suite for LLM prompts that export reports with performance metrics
    ● Architected and developed an AI-powered resume screening application using Streamlit

    GoFundMe | San Diego, CA | Senior Developer in Test | Oct 2021 – Dec 2024
    ● Built and maintained robust API and UI test suites in Python, reducing defects by 37%
    ● Automated environment builds using Apex and Bash, improving deployment times by 30%

    Education
    ● California State San Marcos (May 2012): Bachelor of Arts, Literature and Writing
    """

    extractor = HuggingFaceCloudExtractor()
    result = extractor.extract_sections_hf_cloud(sample_text)

    print("Hugging Face Cloud Extraction Results:")
    print(json.dumps(result, indent=2))

    return result


if __name__ == "__main__":
    test_hf_cloud_extraction()
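# Usage sketch (assumes HF_API_TOKEN is exported, and that this file is
# importable as utils.hf_cloud_extractor -- that module path is an
# assumption; adjust it to wherever the file actually lives):
#
#   from utils.hf_cloud_extractor import extract_sections_hf_cloud
#   with open("resume.txt") as f:
#       data = extract_sections_hf_cloud(f.read())
#   print(data["Name"], data["Skills"])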