Spaces:

gauravbox
/

TalentLensAI

Running

File size: 10,924 Bytes

c2f9ec8

"""
Hybrid Resume Extractor

This module provides a robust resume extraction system that combines:
1. AI-powered extraction (primary) - handles diverse formats
2. Regex-based extraction (fallback) - reliable backup
3. Post-processing validation - ensures quality
"""

import os
import json
from typing import Dict, Any, Optional
import logging

# Configure logging
# NOTE: this runs at import time and sets up the root logger at INFO level.
# Per the stdlib docs, basicConfig() does nothing if the root logger already
# has handlers, so an application that configured logging earlier is unaffected.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used by everything below.
logger = logging.getLogger(__name__)

class HybridResumeExtractor:
    """
    A hybrid resume extractor that combines AI and regex approaches.

    AI back-ends are tried in a fixed priority order; the first result that
    passes ``_validate_extraction_quality`` is returned. If none passes (or AI
    is not preferred), the regex extractor is used, and if that raises too an
    empty structure is returned so callers never crash.
    """

    # Upper bound on a plausible skills list; longer lists are treated as noise.
    MAX_REASONABLE_SKILLS = 100

    def __init__(self, prefer_ai: bool = True, use_openai: bool = True, use_huggingface: bool = False, use_hf_cloud: bool = False, api_key: Optional[str] = None):
        """
        Initialize the hybrid extractor

        Args:
            prefer_ai: Whether to try AI extraction first
            use_openai: Whether to use OpenAI GPT-4 (recommended)
            use_huggingface: Whether to use Hugging Face models locally (simplified)
            use_hf_cloud: Whether to use Hugging Face cloud API
            api_key: API key. When omitted, it is read from ``OPENAI_API_KEY``
                if ``use_openai`` is true, otherwise from ``HF_API_TOKEN`` and
                then ``HUGGINGFACE_API_KEY``.
        """
        self.prefer_ai = prefer_ai
        self.use_openai = use_openai
        self.use_huggingface = use_huggingface
        self.use_hf_cloud = use_hf_cloud

        # Set appropriate API key based on preference
        if use_openai:
            self.api_key = api_key or os.getenv('OPENAI_API_KEY')
        else:
            self.api_key = api_key or os.getenv('HF_API_TOKEN') or os.getenv('HUGGINGFACE_API_KEY')

        # Track which method was used for analytics
        self.last_method_used = None

    def _ai_extraction_methods(self) -> list:
        """Return the enabled AI back-ends as (label, callable, id) tuples in priority order."""
        methods = []

        if self.use_openai and self.api_key:
            methods.append(("OpenAI GPT-4o", self._extract_with_openai, "openai_gpt4o"))

        if self.use_hf_cloud:
            methods.append(("Hugging Face Cloud", self._extract_with_hf_cloud, "huggingface_cloud"))

        if self.api_key and not self.use_openai:
            methods.append(("Hugging Face AI", self._extract_with_ai, "huggingface_ai"))

        if self.use_huggingface:
            methods.append(("Hugging Face Local", self._extract_with_hf, "huggingface_local"))

        # If no specific methods enabled, try local as fallback
        if not methods:
            methods.append(("Hugging Face Local", self._extract_with_hf, "huggingface_local"))

        return methods

    def extract_sections(self, text: str) -> Dict[str, Any]:
        """
        Extract resume sections using hybrid approach

        Args:
            text: Raw resume text

        Returns:
            Structured resume data. On total failure the empty structure from
            ``_get_empty_structure`` is returned and ``last_method_used`` is
            left as ``None``.
        """
        # Reset first so a total failure cannot report the method of an
        # earlier, unrelated call through get_extraction_stats().
        self.last_method_used = None

        if self.prefer_ai:
            # Try each method in sequence until one succeeds
            for method_name, method_func, method_id in self._ai_extraction_methods():
                try:
                    logger.info("Attempting %s extraction...", method_name)
                    result = method_func(text)

                    # Validate AI result quality
                    if self._validate_extraction_quality(result):
                        logger.info("✅ %s extraction successful", method_name)
                        self.last_method_used = method_id
                        return result

                    # A non-dict or all-empty result most likely means the
                    # back-end call itself failed rather than parsed badly.
                    if not isinstance(result, dict) or not any(result.values()):
                        logger.warning("⚠️ %s failed (likely API key issue), trying next method...", method_name)
                    else:
                        logger.warning("⚠️ %s extraction quality insufficient, trying next method...", method_name)

                except Exception as e:
                    # Deliberately broad: any back-end error (missing module,
                    # network, bad payload) must fall through to the next method.
                    logger.warning("⚠️ %s extraction failed: %s, trying next method...", method_name, e)

        # Fall back to regex extraction
        try:
            logger.info("Using regex extraction...")
            result = self._extract_with_regex(text)
            self.last_method_used = "regex"
            logger.info("✅ Regex extraction completed")
            return result

        except Exception as e:
            # Last resort: keep the traceback in the log, but never raise.
            logger.error("❌ All extraction methods failed: %s", e, exc_info=True)
            # Return minimal structure to prevent crashes
            return self._get_empty_structure()

    def _extract_with_openai(self, text: str) -> Dict[str, Any]:
        """Extract using OpenAI GPT-4o"""
        # Imported lazily so the module loads even when this back-end is absent.
        from utils.openai_extractor import extract_sections_openai
        return extract_sections_openai(text, api_key=self.api_key)

    def _extract_with_ai(self, text: str) -> Dict[str, Any]:
        """Extract using Hugging Face AI models"""
        # NOTE(review): self.api_key is not forwarded here - presumably the
        # helper reads the token from the environment; confirm.
        from utils.ai_extractor import extract_sections_ai
        return extract_sections_ai(text)

    def _extract_with_hf(self, text: str) -> Dict[str, Any]:
        """Extract using Hugging Face models (simplified approach)"""
        from utils.hf_extractor_simple import extract_sections_hf_simple
        return extract_sections_hf_simple(text)

    def _extract_with_hf_cloud(self, text: str) -> Dict[str, Any]:
        """Extract using Hugging Face Cloud API"""
        # NOTE(review): self.api_key is not forwarded here either; confirm the
        # helper obtains its token on its own.
        from utils.hf_cloud_extractor import extract_sections_hf_cloud
        return extract_sections_hf_cloud(text)

    def _extract_with_regex(self, text: str) -> Dict[str, Any]:
        """Extract using regex approach"""
        from utils.extractor_fixed import extract_sections_spacy_fixed
        return extract_sections_spacy_fixed(text)

    def _validate_extraction_quality(self, result: Dict[str, Any]) -> bool:
        """
        Validate the quality of extraction results

        Tolerates malformed payloads: a non-dict result, ``None`` field values
        and non-dict experience entries are handled without raising.

        Args:
            result: Extraction result to validate

        Returns:
            True if quality is acceptable, False otherwise
        """
        if not isinstance(result, dict):
            return False

        # Check if basic fields are present
        if not result.get("Name"):
            return False

        # Back-ends may emit null for missing fields; normalise before use.
        raw_summary = result.get("Summary")
        summary = raw_summary.strip() if isinstance(raw_summary, str) else ""
        experiences = result.get("StructuredExperiences") or []
        skills = result.get("Skills") or []

        # Check if we have either summary or experiences
        has_summary = bool(summary)
        has_experiences = bool(experiences)
        if not (has_summary or has_experiences):
            return False

        # For professional resumes, we expect structured work experience.
        # A summary mentioning experience with no structured experiences
        # suggests the extraction missed that section.
        lowered = summary.lower()
        if ("years of experience" in lowered or "experience in" in lowered) and not has_experiences:
            return False

        # Check skills quality (should have reasonable number)
        if len(skills) > self.MAX_REASONABLE_SKILLS:  # Too many skills suggests noise
            return False

        # Check experience quality: each entry needs a title and a company
        for exp in experiences:
            if not isinstance(exp, dict):
                return False
            if not exp.get("title") or not exp.get("company"):
                return False

        return True

    def _get_empty_structure(self) -> Dict[str, Any]:
        """Return empty structure as last resort"""
        return {
            "Name": "",
            "Summary": "",
            "Skills": [],
            "StructuredExperiences": [],
            "Education": [],
            "Training": []
        }

    def get_extraction_stats(self) -> Dict[str, Any]:
        """Get statistics about the last extraction

        ``method_used`` is ``None`` before the first call and after a call in
        which every method failed.
        """
        return {
            "method_used": self.last_method_used,
            "ai_available": bool(self.api_key) or self.use_huggingface or self.use_hf_cloud,
            "prefer_ai": self.prefer_ai,
            "use_huggingface": self.use_huggingface,
            "use_hf_cloud": self.use_hf_cloud
        }

# Convenience function for easy usage
def extract_resume_sections(text: str, prefer_ai: bool = True, use_openai: bool = True, use_huggingface: bool = False, use_hf_cloud: bool = False, api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Extract resume sections using hybrid approach

    Builds a one-off ``HybridResumeExtractor`` with the given options and
    returns the result of its ``extract_sections`` call.

    Args:
        text: Raw resume text
        prefer_ai: Whether to prefer AI extraction over regex
        use_openai: Whether to use OpenAI GPT-4 (recommended for best results)
        use_huggingface: Whether to use Hugging Face models locally
        use_hf_cloud: Whether to use Hugging Face cloud API
        api_key: Optional API key forwarded to the extractor. When omitted the
            extractor falls back to its environment-variable lookup, which is
            the behaviour this function had before the parameter existed.

    Returns:
        Structured resume data
    """
    extractor = HybridResumeExtractor(
        prefer_ai=prefer_ai,
        use_openai=use_openai,
        use_huggingface=use_huggingface,
        use_hf_cloud=use_hf_cloud,
        api_key=api_key,
    )
    return extractor.extract_sections(text)

# Test function
def test_hybrid_extraction():
    """Run a manual smoke test of the hybrid extractor on a built-in sample resume.

    Prints which method produced the result plus a few headline counts, and
    returns the extracted data so it can be inspected further.
    """

    # Sample input: Jonathan's resume
    sample_text = '''Jonathan Edward Nguyen
📍San Diego, CA | 858-900-5036 | 📧 jonatngu@icloud.com

Summary
Sun Diego-based Software Engineer, and Developer Hackathon 2025 winner who loves building scalable
automation solutions, AI development, and optimizing workflows.

Technical Skills
● Programming Languages: Python, Java, SQL, Apex, Bash
● Frameworks & Libraries: TensorFlow, PyTorch, Scikit-learn, NumPy, Pandas

Professional Experience
TalentLens.AI | Remote | AI Developer | Feb 2025 – Present
● Built an automated test suite for LLM prompts that export reports with performance metrics
● Architected and developed an AI-powered resume screening application using Streamlit

Education
● California State San Marcos (May 2012): Bachelor of Arts, Literature and Writing'''

    print("🧪 TESTING HYBRID EXTRACTION")
    print("=" * 50)

    # Exercise the AI-first path
    hybrid = HybridResumeExtractor(prefer_ai=True)
    parsed = hybrid.extract_sections(sample_text)
    method = hybrid.get_extraction_stats()['method_used']

    skill_total = len(parsed.get('Skills', []))
    job_total = len(parsed.get('StructuredExperiences', []))

    print(f"Method used: {method}")
    print(f"Name: {parsed.get('Name')}")
    print(f"Skills count: {skill_total}")
    print(f"Experiences count: {job_total}")

    # Nothing more to report when no structured experience came back
    if not parsed.get('StructuredExperiences'):
        return parsed

    first_job = parsed['StructuredExperiences'][0]
    duty_total = len(first_job.get('responsibilities', []))
    print(f"First job: {first_job.get('title')} at {first_job.get('company')}")
    print(f"Responsibilities: {duty_total}")

    return parsed

# Run the smoke test only when this file is executed as a script, not on import.
if __name__ == "__main__":
    test_hybrid_extraction()