"""
Hybrid Resume Extractor

This module provides a robust resume extraction system that combines:
1. AI-powered extraction (primary) - handles diverse formats
2. Regex-based extraction (fallback) - reliable backup
3. Post-processing validation - ensures quality
"""

import os
import json
from typing import Any, Callable, Dict, List, Optional, Tuple
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class HybridResumeExtractor:
    """
    A hybrid resume extractor that combines AI and regex approaches.

    AI back-ends are tried in priority order; the first result that passes
    `_validate_extraction_quality` wins. If none does (or `prefer_ai` is
    False) the regex extractor is used, and if that raises too an empty
    structure is returned so callers never crash.
    """

    def __init__(self, prefer_ai: bool = True, use_openai: bool = True,
                 use_huggingface: bool = False, use_hf_cloud: bool = False,
                 api_key: Optional[str] = None):
        """
        Initialize the hybrid extractor

        Args:
            prefer_ai: Whether to try AI extraction first
            use_openai: Whether to use OpenAI GPT-4 (recommended)
            use_huggingface: Whether to use Hugging Face models locally (simplified)
            use_hf_cloud: Whether to use Hugging Face cloud API
            api_key: API key (will auto-detect OpenAI or HF based on use_openai flag)
        """
        self.prefer_ai = prefer_ai
        self.use_openai = use_openai
        self.use_huggingface = use_huggingface
        self.use_hf_cloud = use_hf_cloud

        # Set appropriate API key based on preference; an explicit argument
        # always wins over the environment.
        if use_openai:
            self.api_key = api_key or os.getenv('OPENAI_API_KEY')
        else:
            self.api_key = (api_key
                            or os.getenv('HF_API_TOKEN')
                            or os.getenv('HUGGINGFACE_API_KEY'))

        # Track which method was used for analytics
        self.last_method_used = None

    def _build_ai_methods(self) -> List[Tuple[str, Callable[[str], Dict[str, Any]], str]]:
        """Return (display name, callable, method id) tuples in priority order."""
        methods = []

        if self.use_openai and self.api_key:
            methods.append(("OpenAI GPT-4o", self._extract_with_openai, "openai_gpt4o"))

        if self.use_hf_cloud:
            methods.append(("Hugging Face Cloud", self._extract_with_hf_cloud, "huggingface_cloud"))

        if self.api_key and not self.use_openai:
            methods.append(("Hugging Face AI", self._extract_with_ai, "huggingface_ai"))

        if self.use_huggingface:
            methods.append(("Hugging Face Local", self._extract_with_hf, "huggingface_local"))

        # If no specific methods enabled, try local as fallback
        if not methods:
            methods.append(("Hugging Face Local", self._extract_with_hf, "huggingface_local"))

        return methods

    def extract_sections(self, text: str) -> Dict[str, Any]:
        """
        Extract resume sections using hybrid approach

        Args:
            text: Raw resume text

        Returns:
            Structured resume data. When every method fails, the empty
            structure from `_get_empty_structure` is returned and
            `last_method_used` is left as None.
        """
        # Reset first so a fully failed run never reports the method of a
        # previous, unrelated call through get_extraction_stats().
        self.last_method_used = None

        if self.prefer_ai:
            # Try each method in sequence until one succeeds
            for method_name, method_func, method_id in self._build_ai_methods():
                try:
                    logger.info("Attempting %s extraction...", method_name)
                    result = method_func(text)

                    # Validate AI result quality
                    if self._validate_extraction_quality(result):
                        logger.info("✅ %s extraction successful", method_name)
                        self.last_method_used = method_id
                        return result

                    # An empty (or non-dict) result most likely means the
                    # back-end call itself failed rather than parsed badly.
                    if not isinstance(result, dict) or not any(result.values()):
                        logger.warning(
                            "⚠️ %s failed (likely API key issue), trying next method...",
                            method_name)
                    else:
                        logger.warning(
                            "⚠️ %s extraction quality insufficient, trying next method...",
                            method_name)
                except Exception as e:
                    # Deliberate best-effort boundary: any back-end error
                    # (missing module, network, bad payload) moves on.
                    logger.warning(
                        "⚠️ %s extraction failed: %s, trying next method...",
                        method_name, e)

        # Fall back to regex extraction
        try:
            logger.info("Using regex extraction...")
            result = self._extract_with_regex(text)
            self.last_method_used = "regex"
            logger.info("✅ Regex extraction completed")
            return result
        except Exception as e:
            logger.error("❌ Both extraction methods failed: %s", e)
            # Return minimal structure to prevent crashes
            return self._get_empty_structure()

    def _extract_with_openai(self, text: str) -> Dict[str, Any]:
        """Extract using OpenAI GPT-4o"""
        # Imported lazily so the module loads without the optional back-end.
        from utils.openai_extractor import extract_sections_openai
        return extract_sections_openai(text, api_key=self.api_key)

    def _extract_with_ai(self, text: str) -> Dict[str, Any]:
        """Extract using Hugging Face AI models"""
        from utils.ai_extractor import extract_sections_ai
        return extract_sections_ai(text)

    def _extract_with_hf(self, text: str) -> Dict[str, Any]:
        """Extract using Hugging Face models (simplified approach)"""
        from utils.hf_extractor_simple import extract_sections_hf_simple
        return extract_sections_hf_simple(text)

    def _extract_with_hf_cloud(self, text: str) -> Dict[str, Any]:
        """Extract using Hugging Face Cloud API"""
        from utils.hf_cloud_extractor import extract_sections_hf_cloud
        return extract_sections_hf_cloud(text)

    def _extract_with_regex(self, text: str) -> Dict[str, Any]:
        """Extract using regex approach"""
        from utils.extractor_fixed import extract_sections_spacy_fixed
        return extract_sections_spacy_fixed(text)

    def _validate_extraction_quality(self, result: Dict[str, Any]) -> bool:
        """
        Validate the quality of extraction results

        Fields that are missing or explicitly None are treated as empty, so
        a back-end that emits nulls is judged on its content instead of
        raising.

        Args:
            result: Extraction result to validate

        Returns:
            True if quality is acceptable, False otherwise
        """
        if not isinstance(result, dict):
            return False

        # Check if basic fields are present
        if not result.get("Name"):
            return False

        summary = result.get("Summary")
        if not isinstance(summary, str):
            summary = ""
        experiences = result.get("StructuredExperiences") or []
        skills = result.get("Skills") or []

        # Check if we have either summary or experiences
        has_summary = bool(summary.strip())
        has_experiences = bool(experiences)
        if not (has_summary or has_experiences):
            return False

        # For professional resumes, we expect structured work experience.
        # If we have a summary mentioning years of experience but no
        # structured experiences, the extraction likely failed.
        lowered = summary.lower()
        mentions_experience = ("years of experience" in lowered
                               or "experience in" in lowered)
        if mentions_experience and not has_experiences:
            return False

        # Check skills quality (should have reasonable number)
        if len(skills) > 100:  # Too many skills suggests noise
            return False

        # Each experience should be a mapping with title and company
        for exp in experiences:
            if not isinstance(exp, dict):
                return False
            if not exp.get("title") or not exp.get("company"):
                return False

        return True

    def _get_empty_structure(self) -> Dict[str, Any]:
        """Return empty structure as last resort"""
        return {
            "Name": "",
            "Summary": "",
            "Skills": [],
            "StructuredExperiences": [],
            "Education": [],
            "Training": []
        }

    def get_extraction_stats(self) -> Dict[str, Any]:
        """Get statistics about the last extraction"""
        return {
            "method_used": self.last_method_used,
            "ai_available": bool(self.api_key) or self.use_huggingface or self.use_hf_cloud,
            "prefer_ai": self.prefer_ai,
            "use_openai": self.use_openai,
            "use_huggingface": self.use_huggingface,
            "use_hf_cloud": self.use_hf_cloud
        }


# Convenience function for easy usage
def extract_resume_sections(text: str, prefer_ai: bool = True, use_openai: bool = True,
                            use_huggingface: bool = False, use_hf_cloud: bool = False,
                            api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Extract resume sections using hybrid approach

    Args:
        text: Raw resume text
        prefer_ai: Whether to prefer AI extraction over regex
        use_openai: Whether to use OpenAI GPT-4 (recommended for best results)
        use_huggingface: Whether to use Hugging Face models locally
        use_hf_cloud: Whether to use Hugging Face cloud API
        api_key: Optional API key; when omitted the extractor falls back to
            the environment variables it reads in its constructor

    Returns:
        Structured resume data
    """
    extractor = HybridResumeExtractor(prefer_ai=prefer_ai, use_openai=use_openai,
                                      use_huggingface=use_huggingface,
                                      use_hf_cloud=use_hf_cloud, api_key=api_key)
    return extractor.extract_sections(text)


# Test function
def test_hybrid_extraction():
    """Test the hybrid extraction with sample resumes"""

    # Test with Jonathan's resume
    jonathan_resume = '''Jonathan Edward Nguyen
📍San Diego, CA | 858-900-5036 | 📧 jonatngu@icloud.com

Summary
Sun Diego-based Software Engineer, and Developer Hackathon 2025 winner who loves building scalable automation solutions, AI development, and optimizing workflows.

Technical Skills
● Programming Languages: Python, Java, SQL, Apex, Bash
● Frameworks & Libraries: TensorFlow, PyTorch, Scikit-learn, NumPy, Pandas

Professional Experience
TalentLens.AI | Remote | AI Developer | Feb 2025 – Present
● Built an automated test suite for LLM prompts that export reports with performance metrics
● Architected and developed an AI-powered resume screening application using Streamlit

Education
● California State San Marcos (May 2012): Bachelor of Arts, Literature and Writing'''

    print("🧪 TESTING HYBRID EXTRACTION")
    print("=" * 50)

    # Test with AI preference
    extractor = HybridResumeExtractor(prefer_ai=True)
    result = extractor.extract_sections(jonathan_resume)
    stats = extractor.get_extraction_stats()

    print(f"Method used: {stats['method_used']}")
    print(f"Name: {result.get('Name')}")
    print(f"Skills count: {len(result.get('Skills', []))}")
    print(f"Experiences count: {len(result.get('StructuredExperiences', []))}")

    if result.get('StructuredExperiences'):
        exp = result['StructuredExperiences'][0]
        print(f"First job: {exp.get('title')} at {exp.get('company')}")
        print(f"Responsibilities: {len(exp.get('responsibilities', []))}")

    return result


if __name__ == "__main__":
    test_hybrid_extraction()