# Provenance: Hugging Face Space (status: Running) — commit c2f9ec8
# feat: Complete Format_Resume.py system with OpenAI GPT-4o integration and
# template preservation - Added Format_Resume.py Streamlit page with OpenAI
# GPT-4o primary extraction, HF Cloud backup, 5-tier fallback system, template
# preservation with Qvell branding, contact info extraction, skills cleaning,
# career timeline generation, and comprehensive utils restructure (10/11 files
# required). Renamed app.py to TalentLens.py, added blank_resume.docx template,
# updated .gitignore for Salesforce exclusion.
""" | |
Hybrid Resume Extractor | |
This module provides a robust resume extraction system that combines: | |
1. AI-powered extraction (primary) - handles diverse formats | |
2. Regex-based extraction (fallback) - reliable backup | |
3. Post-processing validation - ensures quality | |
""" | |
import json
import logging
import os
from typing import Any, Dict, Optional

# Module-level logger; basicConfig is a no-op if the host app already
# configured logging, so this is safe inside a larger application.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class HybridResumeExtractor:
    """
    A hybrid resume extractor that combines AI and regex approaches.

    AI backends are tried in priority order (OpenAI GPT-4o, Hugging Face
    cloud, Hugging Face local); when none yields an acceptable result, a
    regex-based extractor serves as the final fallback, and an empty
    structure is returned if even that fails.
    """

    def __init__(self, prefer_ai: bool = True, use_openai: bool = True, use_huggingface: bool = False, use_hf_cloud: bool = False, api_key: Optional[str] = None):
        """
        Initialize the hybrid extractor.

        Args:
            prefer_ai: Whether to try AI extraction first
            use_openai: Whether to use OpenAI GPT-4 (recommended)
            use_huggingface: Whether to use Hugging Face models locally (simplified)
            use_hf_cloud: Whether to use Hugging Face cloud API
            api_key: API key (will auto-detect OpenAI or HF based on use_openai flag)
        """
        self.prefer_ai = prefer_ai
        self.use_openai = use_openai
        self.use_huggingface = use_huggingface
        self.use_hf_cloud = use_hf_cloud
        # Select the API key matching the chosen backend; fall back to the
        # conventional environment variables when none is passed explicitly.
        if use_openai:
            self.api_key = api_key or os.getenv('OPENAI_API_KEY')
        else:
            self.api_key = api_key or os.getenv('HF_API_TOKEN') or os.getenv('HUGGINGFACE_API_KEY')
        # Track which method was used for analytics (None until first extraction).
        self.last_method_used = None

    def extract_sections(self, text: str) -> Dict[str, Any]:
        """
        Extract resume sections using hybrid approach.

        Args:
            text: Raw resume text

        Returns:
            Structured resume data; an empty structure (see
            _get_empty_structure) if every method fails.
        """
        if self.prefer_ai:
            # Build the priority-ordered list of AI extraction methods.
            extraction_methods = []
            if self.use_openai and self.api_key:
                extraction_methods.append(("OpenAI GPT-4o", self._extract_with_openai, "openai_gpt4o"))
            if self.use_hf_cloud:
                extraction_methods.append(("Hugging Face Cloud", self._extract_with_hf_cloud, "huggingface_cloud"))
            if self.api_key and not self.use_openai:
                extraction_methods.append(("Hugging Face AI", self._extract_with_ai, "huggingface_ai"))
            if self.use_huggingface:
                extraction_methods.append(("Hugging Face Local", self._extract_with_hf, "huggingface_local"))
            # If no specific methods are enabled, try local as a fallback.
            if not extraction_methods:
                extraction_methods.append(("Hugging Face Local", self._extract_with_hf, "huggingface_local"))
            # Try each method in sequence until one produces an acceptable result.
            for method_name, method_func, method_id in extraction_methods:
                try:
                    logger.info(f"Attempting {method_name} extraction...")
                    result = method_func(text)
                    # Validate AI result quality before accepting it.
                    if self._validate_extraction_quality(result):
                        logger.info(f"✅ {method_name} extraction successful")
                        self.last_method_used = method_id
                        return result
                    # An all-empty result usually means the API call itself failed.
                    if not any(result.values()):
                        logger.warning(f"⚠️ {method_name} failed (likely API key issue), trying next method...")
                    else:
                        logger.warning(f"⚠️ {method_name} extraction quality insufficient, trying next method...")
                except Exception as e:
                    logger.warning(f"⚠️ {method_name} extraction failed: {e}, trying next method...")
        # Fall back to regex extraction.
        try:
            logger.info("Using regex extraction...")
            result = self._extract_with_regex(text)
            self.last_method_used = "regex"
            logger.info("✅ Regex extraction completed")
            return result
        except Exception as e:
            logger.error(f"❌ Both extraction methods failed: {e}")
            # Return a minimal structure to prevent crashes downstream.
            return self._get_empty_structure()

    def _extract_with_openai(self, text: str) -> Dict[str, Any]:
        """Extract using OpenAI GPT-4o (lazy import keeps startup cheap)."""
        from utils.openai_extractor import extract_sections_openai
        return extract_sections_openai(text, api_key=self.api_key)

    def _extract_with_ai(self, text: str) -> Dict[str, Any]:
        """Extract using Hugging Face AI models."""
        from utils.ai_extractor import extract_sections_ai
        return extract_sections_ai(text)

    def _extract_with_hf(self, text: str) -> Dict[str, Any]:
        """Extract using Hugging Face models (simplified local approach)."""
        from utils.hf_extractor_simple import extract_sections_hf_simple
        return extract_sections_hf_simple(text)

    def _extract_with_hf_cloud(self, text: str) -> Dict[str, Any]:
        """Extract using Hugging Face Cloud API."""
        from utils.hf_cloud_extractor import extract_sections_hf_cloud
        return extract_sections_hf_cloud(text)

    def _extract_with_regex(self, text: str) -> Dict[str, Any]:
        """Extract using the regex-based fallback approach."""
        from utils.extractor_fixed import extract_sections_spacy_fixed
        return extract_sections_spacy_fixed(text)

    def _validate_extraction_quality(self, result: Dict[str, Any]) -> bool:
        """
        Validate the quality of extraction results.

        Args:
            result: Extraction result to validate

        Returns:
            True if quality is acceptable, False otherwise
        """
        # A name is the minimum signal that extraction found anything.
        if not result.get("Name"):
            return False
        # Require at least a summary or structured experiences.
        # `or ""` / `or []` guard against explicit None values in AI output.
        has_summary = bool((result.get("Summary") or "").strip())
        has_experiences = bool(result.get("StructuredExperiences") or [])
        if not (has_summary or has_experiences):
            return False
        # For professional resumes we expect structured work experience:
        # a summary that mentions experience with nothing extracted means
        # the extraction likely failed.
        summary = (result.get("Summary") or "").lower()
        if ("years of experience" in summary or "experience in" in summary) and not has_experiences:
            return False
        # Too many skills suggests the extractor picked up noise.
        skills = result.get("Skills") or []
        if len(skills) > 100:
            return False
        # Each experience entry must at least identify a title and company.
        for exp in (result.get("StructuredExperiences") or []):
            if not exp.get("title") or not exp.get("company"):
                return False
        return True

    def _get_empty_structure(self) -> Dict[str, Any]:
        """Return an empty result structure as the last resort."""
        return {
            "Name": "",
            "Summary": "",
            "Skills": [],
            "StructuredExperiences": [],
            "Education": [],
            "Training": []
        }

    def get_extraction_stats(self) -> Dict[str, Any]:
        """Get statistics about the last extraction (for analytics/UI)."""
        return {
            "method_used": self.last_method_used,
            "ai_available": bool(self.api_key) or self.use_huggingface or self.use_hf_cloud,
            "prefer_ai": self.prefer_ai,
            "use_huggingface": self.use_huggingface,
            "use_hf_cloud": self.use_hf_cloud
        }
# Convenience function for easy usage | |
def extract_resume_sections(text: str, prefer_ai: bool = True, use_openai: bool = True, use_huggingface: bool = False, use_hf_cloud: bool = False) -> Dict[str, Any]: | |
""" | |
Extract resume sections using hybrid approach | |
Args: | |
text: Raw resume text | |
prefer_ai: Whether to prefer AI extraction over regex | |
use_openai: Whether to use OpenAI GPT-4 (recommended for best results) | |
use_huggingface: Whether to use Hugging Face models locally | |
use_hf_cloud: Whether to use Hugging Face cloud API | |
Returns: | |
Structured resume data | |
""" | |
extractor = HybridResumeExtractor(prefer_ai=prefer_ai, use_openai=use_openai, use_huggingface=use_huggingface, use_hf_cloud=use_hf_cloud) | |
return extractor.extract_sections(text) | |
# Test function | |
def test_hybrid_extraction(): | |
"""Test the hybrid extraction with sample resumes""" | |
# Test with Jonathan's resume | |
jonathan_resume = '''Jonathan Edward Nguyen | |
πSan Diego, CA | 858-900-5036 | π§ jonatngu@icloud.com | |
Summary | |
Sun Diego-based Software Engineer, and Developer Hackathon 2025 winner who loves building scalable | |
automation solutions, AI development, and optimizing workflows. | |
Technical Skills | |
β Programming Languages: Python, Java, SQL, Apex, Bash | |
β Frameworks & Libraries: TensorFlow, PyTorch, Scikit-learn, NumPy, Pandas | |
Professional Experience | |
TalentLens.AI | Remote | AI Developer | Feb 2025 β Present | |
β Built an automated test suite for LLM prompts that export reports with performance metrics | |
β Architected and developed an AI-powered resume screening application using Streamlit | |
Education | |
β California State San Marcos (May 2012): Bachelor of Arts, Literature and Writing''' | |
print("π§ͺ TESTING HYBRID EXTRACTION") | |
print("=" * 50) | |
# Test with AI preference | |
extractor = HybridResumeExtractor(prefer_ai=True) | |
result = extractor.extract_sections(jonathan_resume) | |
stats = extractor.get_extraction_stats() | |
print(f"Method used: {stats['method_used']}") | |
print(f"Name: {result.get('Name')}") | |
print(f"Skills count: {len(result.get('Skills', []))}") | |
print(f"Experiences count: {len(result.get('StructuredExperiences', []))}") | |
if result.get('StructuredExperiences'): | |
exp = result['StructuredExperiences'][0] | |
print(f"First job: {exp.get('title')} at {exp.get('company')}") | |
print(f"Responsibilities: {len(exp.get('responsibilities', []))}") | |
return result | |
if __name__ == "__main__": | |
test_hybrid_extraction() |