File size: 10,924 Bytes
c2f9ec8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
"""
Hybrid Resume Extractor

This module provides a robust resume extraction system that combines:
1. AI-powered extraction (primary) - handles diverse formats
2. Regex-based extraction (fallback) - reliable backup
3. Post-processing validation - ensures quality
"""

import os
import json
from typing import Dict, Any, Optional
import logging

# Configure logging.
# basicConfig() runs at import time and sets up the root logger at INFO level
# (per the logging docs it is a no-op when the root logger already has handlers).
logging.basicConfig(level=logging.INFO)
# Module-level logger used by every log call in this file.
logger = logging.getLogger(__name__)

class HybridResumeExtractor:
    """
    A hybrid resume extractor that combines AI and regex approaches.

    When ``prefer_ai`` is set, the enabled AI back ends are tried one after
    another and the first result that passes ``_validate_extraction_quality``
    is returned. If none succeeds (or AI is not preferred) the regex extractor
    is used, and if that raises too an empty structure is returned so callers
    never see an exception from ``extract_sections``.
    """

    def __init__(self, prefer_ai: bool = True, use_openai: bool = True, use_huggingface: bool = False, use_hf_cloud: bool = False, api_key: Optional[str] = None):
        """
        Initialize the hybrid extractor

        Args:
            prefer_ai: Whether to try AI extraction first
            use_openai: Whether to use OpenAI GPT-4 (recommended)
            use_huggingface: Whether to use Hugging Face models locally (simplified)
            use_hf_cloud: Whether to use Hugging Face cloud API
            api_key: API key. When omitted it is read from the environment:
                ``OPENAI_API_KEY`` if ``use_openai`` is true, otherwise
                ``HF_API_TOKEN`` and then ``HUGGINGFACE_API_KEY``.
        """
        self.prefer_ai = prefer_ai
        self.use_openai = use_openai
        self.use_huggingface = use_huggingface
        self.use_hf_cloud = use_hf_cloud

        # Set appropriate API key based on preference
        if use_openai:
            self.api_key = api_key or os.getenv('OPENAI_API_KEY')
        else:
            self.api_key = api_key or os.getenv('HF_API_TOKEN') or os.getenv('HUGGINGFACE_API_KEY')

        # Track which method was used for analytics
        self.last_method_used = None

    def _ai_extraction_methods(self) -> list:
        """Return the enabled AI back ends as (label, callable, id) tuples in priority order."""
        extraction_methods = []

        if self.use_openai and self.api_key:
            extraction_methods.append(("OpenAI GPT-4o", self._extract_with_openai, "openai_gpt4o"))

        if self.use_hf_cloud:
            extraction_methods.append(("Hugging Face Cloud", self._extract_with_hf_cloud, "huggingface_cloud"))

        if self.api_key and not self.use_openai:
            extraction_methods.append(("Hugging Face AI", self._extract_with_ai, "huggingface_ai"))

        if self.use_huggingface:
            extraction_methods.append(("Hugging Face Local", self._extract_with_hf, "huggingface_local"))

        # If no specific methods enabled, try local as fallback
        if not extraction_methods:
            extraction_methods.append(("Hugging Face Local", self._extract_with_hf, "huggingface_local"))

        return extraction_methods

    def extract_sections(self, text: str) -> Dict[str, Any]:
        """
        Extract resume sections using hybrid approach

        Args:
            text: Raw resume text

        Returns:
            Structured resume data. If every extraction method fails, the
            empty structure from ``_get_empty_structure`` is returned and
            ``last_method_used`` is left as ``None``.
        """
        # Forget the previous call's outcome so that a total failure is not
        # reported by get_extraction_stats() with a stale method id.
        self.last_method_used = None

        if self.prefer_ai:
            # Try each method in sequence until one succeeds
            for method_name, method_func, method_id in self._ai_extraction_methods():
                try:
                    logger.info(f"Attempting {method_name} extraction...")
                    result = method_func(text)

                    # Validate AI result quality
                    if self._validate_extraction_quality(result):
                        logger.info(f"βœ… {method_name} extraction successful")
                        self.last_method_used = method_id
                        return result

                    # A non-dict or all-empty result is treated as an API failure
                    if not isinstance(result, dict) or not any(result.values()):
                        logger.warning(f"⚠️ {method_name} failed (likely API key issue), trying next method...")
                    else:
                        logger.warning(f"⚠️ {method_name} extraction quality insufficient, trying next method...")

                except Exception as e:
                    # Best effort: any back-end error just moves on to the next method
                    logger.warning(f"⚠️ {method_name} extraction failed: {e}, trying next method...")

        # Fall back to regex extraction
        try:
            logger.info("Using regex extraction...")
            result = self._extract_with_regex(text)
            self.last_method_used = "regex"
            logger.info("βœ… Regex extraction completed")
            return result

        except Exception as e:
            logger.error(f"❌ Both extraction methods failed: {e}")
            # Return minimal structure to prevent crashes
            return self._get_empty_structure()

    def _extract_with_openai(self, text: str) -> Dict[str, Any]:
        """Extract using OpenAI GPT-4o (delegates to utils.openai_extractor, passing the API key)"""
        from utils.openai_extractor import extract_sections_openai
        return extract_sections_openai(text, api_key=self.api_key)

    def _extract_with_ai(self, text: str) -> Dict[str, Any]:
        """Extract using Hugging Face AI models (delegates to utils.ai_extractor)"""
        from utils.ai_extractor import extract_sections_ai
        return extract_sections_ai(text)

    def _extract_with_hf(self, text: str) -> Dict[str, Any]:
        """Extract using Hugging Face models (simplified approach, utils.hf_extractor_simple)"""
        from utils.hf_extractor_simple import extract_sections_hf_simple
        return extract_sections_hf_simple(text)

    def _extract_with_hf_cloud(self, text: str) -> Dict[str, Any]:
        """Extract using Hugging Face Cloud API (delegates to utils.hf_cloud_extractor)"""
        from utils.hf_cloud_extractor import extract_sections_hf_cloud
        return extract_sections_hf_cloud(text)

    def _extract_with_regex(self, text: str) -> Dict[str, Any]:
        """Extract using regex approach (delegates to utils.extractor_fixed)"""
        from utils.extractor_fixed import extract_sections_spacy_fixed
        return extract_sections_spacy_fixed(text)

    def _validate_extraction_quality(self, result: Dict[str, Any]) -> bool:
        """
        Validate the quality of extraction results

        Missing, ``None`` or wrongly-typed fields are treated as empty rather
        than raising, so a back end that returns ``"Summary": None`` alongside
        good experiences is still accepted.

        Args:
            result: Extraction result to validate

        Returns:
            True if quality is acceptable, False otherwise
        """
        # Anything that is not a mapping cannot be a usable result
        if not isinstance(result, dict):
            return False

        # Check if basic fields are present
        if not result.get("Name"):
            return False

        raw_summary = result.get("Summary")
        summary = raw_summary.strip() if isinstance(raw_summary, str) else ""
        experiences = result.get("StructuredExperiences") or []

        # Check if we have either summary or experiences
        has_summary = bool(summary)
        has_experiences = bool(experiences)
        if not (has_summary or has_experiences):
            return False

        # For professional resumes, we expect structured work experience
        # If we have a summary mentioning years of experience but no structured experiences,
        # the extraction likely failed
        summary_lower = summary.lower()
        if ("years of experience" in summary_lower or "experience in" in summary_lower) and not has_experiences:
            return False

        # Check skills quality (should have reasonable number)
        skills = result.get("Skills") or []
        if len(skills) > 100:  # Too many skills suggests noise
            return False

        # Check experience quality: each experience should have title and company
        for exp in experiences:
            if not isinstance(exp, dict) or not exp.get("title") or not exp.get("company"):
                return False

        return True

    def _get_empty_structure(self) -> Dict[str, Any]:
        """Return empty structure as last resort"""
        return {
            "Name": "",
            "Summary": "",
            "Skills": [],
            "StructuredExperiences": [],
            "Education": [],
            "Training": []
        }

    def get_extraction_stats(self) -> Dict[str, Any]:
        """Get statistics about the last extraction (method id, AI availability and flags)"""
        return {
            "method_used": self.last_method_used,
            "ai_available": bool(self.api_key) or self.use_huggingface or self.use_hf_cloud,
            "prefer_ai": self.prefer_ai,
            "use_huggingface": self.use_huggingface,
            "use_hf_cloud": self.use_hf_cloud
        }

# Convenience function for easy usage
def extract_resume_sections(text: str, prefer_ai: bool = True, use_openai: bool = True, use_huggingface: bool = False, use_hf_cloud: bool = False, api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Extract resume sections using hybrid approach

    Builds a one-off ``HybridResumeExtractor`` with the given options and
    returns the result of its ``extract_sections`` call.

    Args:
        text: Raw resume text
        prefer_ai: Whether to prefer AI extraction over regex
        use_openai: Whether to use OpenAI GPT-4 (recommended for best results)
        use_huggingface: Whether to use Hugging Face models locally
        use_hf_cloud: Whether to use Hugging Face cloud API
        api_key: Optional API key forwarded to the extractor; when omitted the
            extractor falls back to its environment-variable lookup

    Returns:
        Structured resume data
    """
    extractor = HybridResumeExtractor(
        prefer_ai=prefer_ai,
        use_openai=use_openai,
        use_huggingface=use_huggingface,
        use_hf_cloud=use_hf_cloud,
        api_key=api_key,
    )
    return extractor.extract_sections(text)

# Test function
def test_hybrid_extraction():
    """
    Smoke-test the hybrid extractor against a built-in sample resume.

    Runs one extraction with AI preferred, prints which method was used along
    with a few headline counts, and returns the extracted data.
    """

    # Sample input: Jonathan's resume (literal is flush-left on purpose)
    jonathan_resume = '''Jonathan Edward Nguyen
πŸ“San Diego, CA | 858-900-5036 | πŸ“§ jonatngu@icloud.com

Summary
Sun Diego-based Software Engineer, and Developer Hackathon 2025 winner who loves building scalable
automation solutions, AI development, and optimizing workflows.

Technical Skills
● Programming Languages: Python, Java, SQL, Apex, Bash
● Frameworks & Libraries: TensorFlow, PyTorch, Scikit-learn, NumPy, Pandas

Professional Experience
TalentLens.AI | Remote | AI Developer | Feb 2025 – Present
● Built an automated test suite for LLM prompts that export reports with performance metrics
● Architected and developed an AI-powered resume screening application using Streamlit

Education
● California State San Marcos (May 2012): Bachelor of Arts, Literature and Writing'''

    # Banner
    print("πŸ§ͺ TESTING HYBRID EXTRACTION")
    print("=" * 50)

    # Run one extraction with the AI path preferred
    hybrid = HybridResumeExtractor(prefer_ai=True)
    parsed = hybrid.extract_sections(jonathan_resume)
    method_used = hybrid.get_extraction_stats()['method_used']

    # Headline numbers
    print(f"Method used: {method_used}")
    print(f"Name: {parsed.get('Name')}")
    skill_list = parsed.get('Skills', [])
    print(f"Skills count: {len(skill_list)}")
    job_list = parsed.get('StructuredExperiences', [])
    print(f"Experiences count: {len(job_list)}")

    # Details of the first job, when any were extracted
    if job_list:
        first_job = job_list[0]
        print(f"First job: {first_job.get('title')} at {first_job.get('company')}")
        duties = first_job.get('responsibilities', [])
        print(f"Responsibilities: {len(duties)}")

    return parsed

# Run the sample extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test_hybrid_extraction()