# TalentLensAI/utils/hybrid_extractor.py
# Author: Johnny
# Commit c2f9ec8 — feat: Complete Format_Resume.py system with OpenAI GPT-4o
# integration and template preservation. Added Format_Resume.py Streamlit page
# with OpenAI GPT-4o primary extraction, HF Cloud backup, 5-tier fallback
# system, template preservation with Qvell branding, contact info extraction,
# skills cleaning, career timeline generation, and comprehensive utils
# restructure (10/11 files required). Renamed app.py to TalentLens.py, added
# blank_resume.docx template, updated .gitignore for Salesforce exclusion.
"""
Hybrid Resume Extractor
This module provides a robust resume extraction system that combines:
1. AI-powered extraction (primary) - handles diverse formats
2. Regex-based extraction (fallback) - reliable backup
3. Post-processing validation - ensures quality
"""
import os
import json
from typing import Dict, Any, Optional
import logging
# Configure logging
# NOTE(review): basicConfig() at import time configures the process-wide root
# logger; confirm that is intended for a library module (callers usually own
# logging configuration).
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module per stdlib convention.
logger = logging.getLogger(__name__)
class HybridResumeExtractor:
    """
    A hybrid resume extractor that combines AI and regex approaches.

    When ``prefer_ai`` is True, AI extraction methods are attempted in
    priority order (OpenAI GPT-4o, Hugging Face Cloud, Hugging Face AI,
    Hugging Face local) and each result is quality-checked; regex-based
    extraction is the final fallback so callers always receive a dict.
    """

    def __init__(self, prefer_ai: bool = True, use_openai: bool = True, use_huggingface: bool = False, use_hf_cloud: bool = False, api_key: Optional[str] = None):
        """
        Initialize the hybrid extractor.

        Args:
            prefer_ai: Whether to try AI extraction first
            use_openai: Whether to use OpenAI GPT-4 (recommended)
            use_huggingface: Whether to use Hugging Face models locally (simplified)
            use_hf_cloud: Whether to use Hugging Face cloud API
            api_key: API key (will auto-detect OpenAI or HF based on use_openai flag)
        """
        self.prefer_ai = prefer_ai
        self.use_openai = use_openai
        self.use_huggingface = use_huggingface
        self.use_hf_cloud = use_hf_cloud
        # Resolve the API key: explicit argument wins, then the environment
        # variable matching the selected provider.
        if use_openai:
            self.api_key = api_key or os.getenv('OPENAI_API_KEY')
        else:
            self.api_key = api_key or os.getenv('HF_API_TOKEN') or os.getenv('HUGGINGFACE_API_KEY')
        # Track which method was used for analytics; None until
        # extract_sections() has run at least once.
        self.last_method_used = None

    def extract_sections(self, text: str) -> Dict[str, Any]:
        """
        Extract resume sections using the hybrid approach.

        Args:
            text: Raw resume text

        Returns:
            Structured resume data dict (see _get_empty_structure for keys).
            Never raises: falls back to regex, then to an empty structure.
        """
        if self.prefer_ai:
            # Build the priority-ordered list of (name, callable, id) AI methods.
            extraction_methods = []
            if self.use_openai and self.api_key:
                extraction_methods.append(("OpenAI GPT-4o", self._extract_with_openai, "openai_gpt4o"))
            if self.use_hf_cloud:
                # HF Cloud extractor resolves its own credentials, so no
                # api_key check here.
                extraction_methods.append(("Hugging Face Cloud", self._extract_with_hf_cloud, "huggingface_cloud"))
            if self.api_key and not self.use_openai:
                extraction_methods.append(("Hugging Face AI", self._extract_with_ai, "huggingface_ai"))
            if self.use_huggingface:
                extraction_methods.append(("Hugging Face Local", self._extract_with_hf, "huggingface_local"))
            # If no specific methods are enabled, try local models as a fallback.
            if not extraction_methods:
                extraction_methods.append(("Hugging Face Local", self._extract_with_hf, "huggingface_local"))

            # Try each method in sequence until one produces a valid result.
            for method_name, method_func, method_id in extraction_methods:
                try:
                    logger.info(f"Attempting {method_name} extraction...")
                    result = method_func(text)
                    if self._validate_extraction_quality(result):
                        logger.info(f"✅ {method_name} extraction successful")
                        self.last_method_used = method_id
                        return result
                    else:
                        # An all-empty result usually means the API call itself
                        # failed (bad/missing key) rather than a parsing miss.
                        if not any(result.values()):
                            logger.warning(f"⚠️ {method_name} failed (likely API key issue), trying next method...")
                        else:
                            logger.warning(f"⚠️ {method_name} extraction quality insufficient, trying next method...")
                except Exception as e:
                    logger.warning(f"⚠️ {method_name} extraction failed: {e}, trying next method...")

        # Fall back to regex extraction (also the direct path when
        # prefer_ai is False).
        try:
            logger.info("Using regex extraction...")
            result = self._extract_with_regex(text)
            self.last_method_used = "regex"
            logger.info("✅ Regex extraction completed")
            return result
        except Exception as e:
            logger.error(f"❌ Both extraction methods failed: {e}")
            # Return a minimal structure to prevent crashes downstream.
            return self._get_empty_structure()

    def _extract_with_openai(self, text: str) -> Dict[str, Any]:
        """Extract using OpenAI GPT-4o."""
        from utils.openai_extractor import extract_sections_openai
        return extract_sections_openai(text, api_key=self.api_key)

    def _extract_with_ai(self, text: str) -> Dict[str, Any]:
        """Extract using Hugging Face AI models."""
        from utils.ai_extractor import extract_sections_ai
        return extract_sections_ai(text)

    def _extract_with_hf(self, text: str) -> Dict[str, Any]:
        """Extract using Hugging Face models (simplified approach)."""
        from utils.hf_extractor_simple import extract_sections_hf_simple
        return extract_sections_hf_simple(text)

    def _extract_with_hf_cloud(self, text: str) -> Dict[str, Any]:
        """Extract using Hugging Face Cloud API."""
        from utils.hf_cloud_extractor import extract_sections_hf_cloud
        return extract_sections_hf_cloud(text)

    def _extract_with_regex(self, text: str) -> Dict[str, Any]:
        """Extract using the regex/spaCy approach."""
        from utils.extractor_fixed import extract_sections_spacy_fixed
        return extract_sections_spacy_fixed(text)

    def _validate_extraction_quality(self, result: Dict[str, Any]) -> bool:
        """
        Validate the quality of extraction results.

        Args:
            result: Extraction result to validate

        Returns:
            True if quality is acceptable, False otherwise
        """
        # A name is the minimum signal that extraction found anything.
        if not result.get("Name"):
            return False
        # Coalesce None values: dict.get's default only applies to MISSING
        # keys, so a present-but-null field (common in AI JSON output) would
        # otherwise raise AttributeError/TypeError here.
        summary_text = (result.get("Summary") or "").strip()
        experiences = result.get("StructuredExperiences") or []
        # Require at least a summary or some structured experience.
        if not (summary_text or experiences):
            return False
        # For professional resumes we expect structured work experience; a
        # summary that mentions experience with nothing structured alongside
        # it suggests the extraction failed.
        summary = summary_text.lower()
        if ("years of experience" in summary or "experience in" in summary) and not experiences:
            return False
        # Too many skills suggests noise rather than a real skills section.
        skills = result.get("Skills") or []
        if len(skills) > 100:
            return False
        # Each experience entry must carry at least a title and a company.
        for exp in experiences:
            if not exp.get("title") or not exp.get("company"):
                return False
        return True

    def _get_empty_structure(self) -> Dict[str, Any]:
        """Return an all-empty result structure as the last resort."""
        return {
            "Name": "",
            "Summary": "",
            "Skills": [],
            "StructuredExperiences": [],
            "Education": [],
            "Training": []
        }

    def get_extraction_stats(self) -> Dict[str, Any]:
        """Get statistics about the last extraction."""
        return {
            "method_used": self.last_method_used,
            "ai_available": bool(self.api_key) or self.use_huggingface or self.use_hf_cloud,
            "prefer_ai": self.prefer_ai,
            "use_huggingface": self.use_huggingface,
            "use_hf_cloud": self.use_hf_cloud
        }
# Convenience function for easy usage
def extract_resume_sections(text: str, prefer_ai: bool = True, use_openai: bool = True, use_huggingface: bool = False, use_hf_cloud: bool = False) -> Dict[str, Any]:
    """
    Extract resume sections via a one-shot HybridResumeExtractor.

    Args:
        text: Raw resume text
        prefer_ai: Whether to prefer AI extraction over regex
        use_openai: Whether to use OpenAI GPT-4 (recommended for best results)
        use_huggingface: Whether to use Hugging Face models locally
        use_hf_cloud: Whether to use Hugging Face cloud API

    Returns:
        Structured resume data
    """
    return HybridResumeExtractor(
        prefer_ai=prefer_ai,
        use_openai=use_openai,
        use_huggingface=use_huggingface,
        use_hf_cloud=use_hf_cloud,
    ).extract_sections(text)
# Test function
def test_hybrid_extraction():
    """Smoke-test the hybrid extraction pipeline against a sample resume."""
    # Sample input: Jonathan's resume (kept verbatim).
    sample_resume = '''Jonathan Edward Nguyen
📍San Diego, CA | 858-900-5036 | 📧 jonatngu@icloud.com
Summary
Sun Diego-based Software Engineer, and Developer Hackathon 2025 winner who loves building scalable
automation solutions, AI development, and optimizing workflows.
Technical Skills
● Programming Languages: Python, Java, SQL, Apex, Bash
● Frameworks & Libraries: TensorFlow, PyTorch, Scikit-learn, NumPy, Pandas
Professional Experience
TalentLens.AI | Remote | AI Developer | Feb 2025 – Present
● Built an automated test suite for LLM prompts that export reports with performance metrics
● Architected and developed an AI-powered resume screening application using Streamlit
Education
● California State San Marcos (May 2012): Bachelor of Arts, Literature and Writing'''

    print("🧪 TESTING HYBRID EXTRACTION")
    print("=" * 50)

    # Run with AI preferred, then report what happened.
    hybrid = HybridResumeExtractor(prefer_ai=True)
    parsed = hybrid.extract_sections(sample_resume)
    stats = hybrid.get_extraction_stats()

    print(f"Method used: {stats['method_used']}")
    print(f"Name: {parsed.get('Name')}")
    print(f"Skills count: {len(parsed.get('Skills', []))}")
    print(f"Experiences count: {len(parsed.get('StructuredExperiences', []))}")

    jobs = parsed.get('StructuredExperiences')
    if jobs:
        first_job = jobs[0]
        print(f"First job: {first_job.get('title')} at {first_job.get('company')}")
        print(f"Responsibilities: {len(first_job.get('responsibilities', []))}")

    return parsed
# Allow running this module directly as a smoke test.
if __name__ == "__main__":
    test_hybrid_extraction()