# NOTE: Stripped non-Python page chrome left over from the original capture
# (status banner, file-size line, commit c2f9ec8, and the line-number gutter).
"""
Hybrid Resume Extractor
This module provides a robust resume extraction system that combines:
1. AI-powered extraction (primary) - handles diverse formats
2. Regex-based extraction (fallback) - reliable backup
3. Post-processing validation - ensures quality
"""
import os
import json
from typing import Dict, Any, Optional
import logging
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class HybridResumeExtractor:
    """
    A hybrid resume extractor that combines AI and regex approaches.

    AI backends (OpenAI, Hugging Face cloud/local) are tried in priority
    order and each result is quality-checked; if every AI method fails or
    produces a low-quality result, a regex-based extractor runs, and as a
    last resort an empty-but-well-formed structure is returned.
    """

    def __init__(self, prefer_ai: bool = True, use_openai: bool = True, use_huggingface: bool = False, use_hf_cloud: bool = False, api_key: Optional[str] = None):
        """
        Initialize the hybrid extractor.

        Args:
            prefer_ai: Whether to try AI extraction first.
            use_openai: Whether to use OpenAI GPT-4 (recommended).
            use_huggingface: Whether to use Hugging Face models locally (simplified).
            use_hf_cloud: Whether to use Hugging Face cloud API.
            api_key: API key (auto-detects OpenAI or HF based on use_openai flag).
        """
        self.prefer_ai = prefer_ai
        self.use_openai = use_openai
        self.use_huggingface = use_huggingface
        self.use_hf_cloud = use_hf_cloud

        # Resolve the API key from the argument or the environment,
        # matching whichever backend family is preferred.
        if use_openai:
            self.api_key = api_key or os.getenv('OPENAI_API_KEY')
        else:
            self.api_key = api_key or os.getenv('HF_API_TOKEN') or os.getenv('HUGGINGFACE_API_KEY')

        # Track which method was used for analytics (None until first run).
        self.last_method_used = None

    def extract_sections(self, text: str) -> Dict[str, Any]:
        """
        Extract resume sections using the hybrid approach.

        Args:
            text: Raw resume text.

        Returns:
            Structured resume data (Name, Summary, Skills,
            StructuredExperiences, Education, Training).
        """
        if self.prefer_ai:
            # Build the priority-ordered list of AI extraction methods.
            extraction_methods = []
            if self.use_openai and self.api_key:
                extraction_methods.append(("OpenAI GPT-4o", self._extract_with_openai, "openai_gpt4o"))
            if self.use_hf_cloud:
                extraction_methods.append(("Hugging Face Cloud", self._extract_with_hf_cloud, "huggingface_cloud"))
            if self.api_key and not self.use_openai:
                extraction_methods.append(("Hugging Face AI", self._extract_with_ai, "huggingface_ai"))
            if self.use_huggingface:
                extraction_methods.append(("Hugging Face Local", self._extract_with_hf, "huggingface_local"))

            # If no specific methods are enabled, try the local model as a fallback.
            if not extraction_methods:
                extraction_methods.append(("Hugging Face Local", self._extract_with_hf, "huggingface_local"))

            # Try each method in sequence until one produces a valid result.
            for method_name, method_func, method_id in extraction_methods:
                try:
                    logger.info(f"Attempting {method_name} extraction...")
                    result = method_func(text)

                    # Validate AI result quality before accepting it.
                    if self._validate_extraction_quality(result):
                        logger.info(f"✅ {method_name} extraction successful")
                        self.last_method_used = method_id
                        return result
                    # An all-empty result usually means the API call itself failed.
                    if not any(result.values()):
                        logger.warning(f"⚠️ {method_name} failed (likely API key issue), trying next method...")
                    else:
                        logger.warning(f"⚠️ {method_name} extraction quality insufficient, trying next method...")
                except Exception as e:
                    logger.warning(f"⚠️ {method_name} extraction failed: {e}, trying next method...")

        # Fall back to regex extraction.
        try:
            logger.info("Using regex extraction...")
            result = self._extract_with_regex(text)
            self.last_method_used = "regex"
            logger.info("✅ Regex extraction completed")
            return result
        except Exception as e:
            logger.error(f"❌ Both extraction methods failed: {e}")
            # Return a minimal structure to prevent crashes downstream.
            return self._get_empty_structure()

    def _extract_with_openai(self, text: str) -> Dict[str, Any]:
        """Extract using OpenAI GPT-4o."""
        from utils.openai_extractor import extract_sections_openai
        return extract_sections_openai(text, api_key=self.api_key)

    def _extract_with_ai(self, text: str) -> Dict[str, Any]:
        """Extract using Hugging Face AI models."""
        from utils.ai_extractor import extract_sections_ai
        return extract_sections_ai(text)

    def _extract_with_hf(self, text: str) -> Dict[str, Any]:
        """Extract using Hugging Face models (simplified local approach)."""
        from utils.hf_extractor_simple import extract_sections_hf_simple
        return extract_sections_hf_simple(text)

    def _extract_with_hf_cloud(self, text: str) -> Dict[str, Any]:
        """Extract using the Hugging Face Cloud API."""
        from utils.hf_cloud_extractor import extract_sections_hf_cloud
        return extract_sections_hf_cloud(text)

    def _extract_with_regex(self, text: str) -> Dict[str, Any]:
        """Extract using the regex-based fallback approach."""
        from utils.extractor_fixed import extract_sections_spacy_fixed
        return extract_sections_spacy_fixed(text)

    def _validate_extraction_quality(self, result: Dict[str, Any]) -> bool:
        """
        Validate the quality of extraction results.

        Args:
            result: Extraction result to validate.

        Returns:
            True if quality is acceptable, False otherwise.
        """
        # A name is the minimum requirement.
        if not result.get("Name"):
            return False

        # Require at least a summary or structured experiences.
        # `or ""` / `or []` guard against explicit None values (robustness fix).
        summary_text = (result.get("Summary") or "").strip()
        experiences = result.get("StructuredExperiences") or []
        if not (summary_text or experiences):
            return False

        # A summary that mentions work experience while no structured entries
        # were extracted suggests the experience section failed to parse.
        summary = summary_text.lower()
        if ("years of experience" in summary or "experience in" in summary) and not experiences:
            return False

        # Too many skills suggests the extractor picked up noise.
        skills = result.get("Skills", [])
        if len(skills) > 100:
            return False

        # Every structured experience needs at least a title and a company.
        for exp in experiences:
            if not exp.get("title") or not exp.get("company"):
                return False

        return True

    def _get_empty_structure(self) -> Dict[str, Any]:
        """Return an empty, well-formed structure as a last resort."""
        return {
            "Name": "",
            "Summary": "",
            "Skills": [],
            "StructuredExperiences": [],
            "Education": [],
            "Training": []
        }

    def get_extraction_stats(self) -> Dict[str, Any]:
        """Get statistics about the last extraction run."""
        return {
            "method_used": self.last_method_used,
            "ai_available": bool(self.api_key) or self.use_huggingface or self.use_hf_cloud,
            "prefer_ai": self.prefer_ai,
            "use_huggingface": self.use_huggingface,
            "use_hf_cloud": self.use_hf_cloud
        }
# Convenience function for easy usage
def extract_resume_sections(text: str, prefer_ai: bool = True, use_openai: bool = True, use_huggingface: bool = False, use_hf_cloud: bool = False) -> Dict[str, Any]:
    """
    One-shot helper: build a HybridResumeExtractor and run it on *text*.

    Args:
        text: Raw resume text.
        prefer_ai: Whether to prefer AI extraction over regex.
        use_openai: Whether to use OpenAI GPT-4 (recommended for best results).
        use_huggingface: Whether to use Hugging Face models locally.
        use_hf_cloud: Whether to use Hugging Face cloud API.

    Returns:
        Structured resume data.
    """
    return HybridResumeExtractor(
        prefer_ai=prefer_ai,
        use_openai=use_openai,
        use_huggingface=use_huggingface,
        use_hf_cloud=use_hf_cloud,
    ).extract_sections(text)
# Test function
def test_hybrid_extraction():
    """Smoke-test the hybrid extraction pipeline on a sample resume."""
    # Sample resume text (kept verbatim from the original test fixture).
    sample_resume = '''Jonathan Edward Nguyen
πSan Diego, CA | 858-900-5036 | π§ jonatngu@icloud.com
Summary
Sun Diego-based Software Engineer, and Developer Hackathon 2025 winner who loves building scalable
automation solutions, AI development, and optimizing workflows.
Technical Skills
β Programming Languages: Python, Java, SQL, Apex, Bash
β Frameworks & Libraries: TensorFlow, PyTorch, Scikit-learn, NumPy, Pandas
Professional Experience
TalentLens.AI | Remote | AI Developer | Feb 2025 β Present
β Built an automated test suite for LLM prompts that export reports with performance metrics
β Architected and developed an AI-powered resume screening application using Streamlit
Education
β California State San Marcos (May 2012): Bachelor of Arts, Literature and Writing'''

    print("π§ͺ TESTING HYBRID EXTRACTION")
    print("=" * 50)

    # Run the extractor with AI preferred and report what happened.
    hybrid = HybridResumeExtractor(prefer_ai=True)
    sections = hybrid.extract_sections(sample_resume)
    info = hybrid.get_extraction_stats()

    print(f"Method used: {info['method_used']}")
    print(f"Name: {sections.get('Name')}")
    print(f"Skills count: {len(sections.get('Skills', []))}")
    print(f"Experiences count: {len(sections.get('StructuredExperiences', []))}")

    jobs = sections.get('StructuredExperiences')
    if jobs:
        first = jobs[0]
        print(f"First job: {first.get('title')} at {first.get('company')}")
        print(f"Responsibilities: {len(first.get('responsibilities', []))}")

    return sections
# Stray " |" gutter residue removed from the guard body (it was a syntax error).
if __name__ == "__main__":
    test_hybrid_extraction()