# TalentLensAI/utils/hybrid_extractor.py
# Author: Johnny
# Commit c2f9ec8 — feat: Complete Format_Resume.py system with OpenAI GPT-4o
# integration and template preservation. Added Format_Resume.py Streamlit page
# with OpenAI GPT-4o primary extraction, HF Cloud backup, 5-tier fallback
# system, template preservation with Qvell branding, contact info extraction,
# skills cleaning, career timeline generation, and comprehensive utils
# restructure (10/11 files required). Renamed app.py to TalentLens.py, added
# blank_resume.docx template, updated .gitignore for Salesforce exclusion.
"""
Hybrid Resume Extractor
This module provides a robust resume extraction system that combines:
1. AI-powered extraction (primary) - handles diverse formats
2. Regex-based extraction (fallback) - reliable backup
3. Post-processing validation - ensures quality
"""
import os
import json
from typing import Dict, Any, Optional
import logging
# Configure logging
# NOTE(review): basicConfig() at import time configures the process-wide root
# logger; confirm that is intended for a library module (callers usually own
# logging configuration).
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module per stdlib convention.
logger = logging.getLogger(__name__)
class HybridResumeExtractor:
    """
    A hybrid resume extractor that combines AI and regex approaches.

    When ``prefer_ai`` is True, AI extraction methods are attempted in
    priority order (OpenAI GPT-4o, Hugging Face Cloud, Hugging Face AI,
    Hugging Face local) and each result is quality-checked; regex-based
    extraction is the final fallback so callers always receive a dict.
    """

    def __init__(self, prefer_ai: bool = True, use_openai: bool = True, use_huggingface: bool = False, use_hf_cloud: bool = False, api_key: Optional[str] = None):
        """
        Initialize the hybrid extractor.

        Args:
            prefer_ai: Whether to try AI extraction first
            use_openai: Whether to use OpenAI GPT-4 (recommended)
            use_huggingface: Whether to use Hugging Face models locally (simplified)
            use_hf_cloud: Whether to use Hugging Face cloud API
            api_key: API key (will auto-detect OpenAI or HF based on use_openai flag)
        """
        self.prefer_ai = prefer_ai
        self.use_openai = use_openai
        self.use_huggingface = use_huggingface
        self.use_hf_cloud = use_hf_cloud
        # Resolve the API key: explicit argument wins, then the environment
        # variable matching the selected provider.
        if use_openai:
            self.api_key = api_key or os.getenv('OPENAI_API_KEY')
        else:
            self.api_key = api_key or os.getenv('HF_API_TOKEN') or os.getenv('HUGGINGFACE_API_KEY')
        # Track which method was used for analytics; None until
        # extract_sections() has run at least once.
        self.last_method_used = None

    def extract_sections(self, text: str) -> Dict[str, Any]:
        """
        Extract resume sections using the hybrid approach.

        Args:
            text: Raw resume text

        Returns:
            Structured resume data dict (see _get_empty_structure for keys).
            Never raises: falls back to regex, then to an empty structure.
        """
        if self.prefer_ai:
            # Build the priority-ordered list of (name, callable, id) AI methods.
            extraction_methods = []
            if self.use_openai and self.api_key:
                extraction_methods.append(("OpenAI GPT-4o", self._extract_with_openai, "openai_gpt4o"))
            if self.use_hf_cloud:
                # HF Cloud extractor resolves its own credentials, so no
                # api_key check here.
                extraction_methods.append(("Hugging Face Cloud", self._extract_with_hf_cloud, "huggingface_cloud"))
            if self.api_key and not self.use_openai:
                extraction_methods.append(("Hugging Face AI", self._extract_with_ai, "huggingface_ai"))
            if self.use_huggingface:
                extraction_methods.append(("Hugging Face Local", self._extract_with_hf, "huggingface_local"))
            # If no specific methods are enabled, try local models as a fallback.
            if not extraction_methods:
                extraction_methods.append(("Hugging Face Local", self._extract_with_hf, "huggingface_local"))

            # Try each method in sequence until one produces a valid result.
            for method_name, method_func, method_id in extraction_methods:
                try:
                    logger.info(f"Attempting {method_name} extraction...")
                    result = method_func(text)
                    if self._validate_extraction_quality(result):
                        logger.info(f"✅ {method_name} extraction successful")
                        self.last_method_used = method_id
                        return result
                    else:
                        # An all-empty result usually means the API call itself
                        # failed (bad/missing key) rather than a parsing miss.
                        if not any(result.values()):
                            logger.warning(f"⚠️ {method_name} failed (likely API key issue), trying next method...")
                        else:
                            logger.warning(f"⚠️ {method_name} extraction quality insufficient, trying next method...")
                except Exception as e:
                    logger.warning(f"⚠️ {method_name} extraction failed: {e}, trying next method...")

        # Fall back to regex extraction (also the direct path when
        # prefer_ai is False).
        try:
            logger.info("Using regex extraction...")
            result = self._extract_with_regex(text)
            self.last_method_used = "regex"
            logger.info("✅ Regex extraction completed")
            return result
        except Exception as e:
            logger.error(f"❌ Both extraction methods failed: {e}")
            # Return a minimal structure to prevent crashes downstream.
            return self._get_empty_structure()

    def _extract_with_openai(self, text: str) -> Dict[str, Any]:
        """Extract using OpenAI GPT-4o."""
        from utils.openai_extractor import extract_sections_openai
        return extract_sections_openai(text, api_key=self.api_key)

    def _extract_with_ai(self, text: str) -> Dict[str, Any]:
        """Extract using Hugging Face AI models."""
        from utils.ai_extractor import extract_sections_ai
        return extract_sections_ai(text)

    def _extract_with_hf(self, text: str) -> Dict[str, Any]:
        """Extract using Hugging Face models (simplified approach)."""
        from utils.hf_extractor_simple import extract_sections_hf_simple
        return extract_sections_hf_simple(text)

    def _extract_with_hf_cloud(self, text: str) -> Dict[str, Any]:
        """Extract using Hugging Face Cloud API."""
        from utils.hf_cloud_extractor import extract_sections_hf_cloud
        return extract_sections_hf_cloud(text)

    def _extract_with_regex(self, text: str) -> Dict[str, Any]:
        """Extract using the regex/spaCy approach."""
        from utils.extractor_fixed import extract_sections_spacy_fixed
        return extract_sections_spacy_fixed(text)

    def _validate_extraction_quality(self, result: Dict[str, Any]) -> bool:
        """
        Validate the quality of extraction results.

        Args:
            result: Extraction result to validate

        Returns:
            True if quality is acceptable, False otherwise
        """
        # A name is the minimum signal that extraction found anything.
        if not result.get("Name"):
            return False
        # Coalesce None values: dict.get's default only applies to MISSING
        # keys, so a present-but-null field (common in AI JSON output) would
        # otherwise raise AttributeError/TypeError here.
        summary_text = (result.get("Summary") or "").strip()
        experiences = result.get("StructuredExperiences") or []
        # Require at least a summary or some structured experience.
        if not (summary_text or experiences):
            return False
        # For professional resumes we expect structured work experience; a
        # summary that mentions experience with nothing structured alongside
        # it suggests the extraction failed.
        summary = summary_text.lower()
        if ("years of experience" in summary or "experience in" in summary) and not experiences:
            return False
        # Too many skills suggests noise rather than a real skills section.
        skills = result.get("Skills") or []
        if len(skills) > 100:
            return False
        # Each experience entry must carry at least a title and a company.
        for exp in experiences:
            if not exp.get("title") or not exp.get("company"):
                return False
        return True

    def _get_empty_structure(self) -> Dict[str, Any]:
        """Return an all-empty result structure as the last resort."""
        return {
            "Name": "",
            "Summary": "",
            "Skills": [],
            "StructuredExperiences": [],
            "Education": [],
            "Training": []
        }

    def get_extraction_stats(self) -> Dict[str, Any]:
        """Get statistics about the last extraction."""
        return {
            "method_used": self.last_method_used,
            "ai_available": bool(self.api_key) or self.use_huggingface or self.use_hf_cloud,
            "prefer_ai": self.prefer_ai,
            "use_huggingface": self.use_huggingface,
            "use_hf_cloud": self.use_hf_cloud
        }
# Convenience function for easy usage
def extract_resume_sections(text: str, prefer_ai: bool = True, use_openai: bool = True, use_huggingface: bool = False, use_hf_cloud: bool = False) -> Dict[str, Any]:
    """
    Extract resume sections via a one-shot HybridResumeExtractor.

    Args:
        text: Raw resume text
        prefer_ai: Whether to prefer AI extraction over regex
        use_openai: Whether to use OpenAI GPT-4 (recommended for best results)
        use_huggingface: Whether to use Hugging Face models locally
        use_hf_cloud: Whether to use Hugging Face cloud API

    Returns:
        Structured resume data
    """
    return HybridResumeExtractor(
        prefer_ai=prefer_ai,
        use_openai=use_openai,
        use_huggingface=use_huggingface,
        use_hf_cloud=use_hf_cloud,
    ).extract_sections(text)
# Test function
def test_hybrid_extraction():
    """Smoke-test the hybrid extraction pipeline against a sample resume."""
    # Sample input: Jonathan's resume (kept verbatim).
    sample_resume = '''Jonathan Edward Nguyen
📍San Diego, CA | 858-900-5036 | 📧 jonatngu@icloud.com
Summary
Sun Diego-based Software Engineer, and Developer Hackathon 2025 winner who loves building scalable
automation solutions, AI development, and optimizing workflows.
Technical Skills
● Programming Languages: Python, Java, SQL, Apex, Bash
● Frameworks & Libraries: TensorFlow, PyTorch, Scikit-learn, NumPy, Pandas
Professional Experience
TalentLens.AI | Remote | AI Developer | Feb 2025 – Present
● Built an automated test suite for LLM prompts that export reports with performance metrics
● Architected and developed an AI-powered resume screening application using Streamlit
Education
● California State San Marcos (May 2012): Bachelor of Arts, Literature and Writing'''

    print("🧪 TESTING HYBRID EXTRACTION")
    print("=" * 50)

    # Run with AI preferred, then report what happened.
    hybrid = HybridResumeExtractor(prefer_ai=True)
    parsed = hybrid.extract_sections(sample_resume)
    stats = hybrid.get_extraction_stats()

    print(f"Method used: {stats['method_used']}")
    print(f"Name: {parsed.get('Name')}")
    print(f"Skills count: {len(parsed.get('Skills', []))}")
    print(f"Experiences count: {len(parsed.get('StructuredExperiences', []))}")

    jobs = parsed.get('StructuredExperiences')
    if jobs:
        first_job = jobs[0]
        print(f"First job: {first_job.get('title')} at {first_job.get('company')}")
        print(f"Responsibilities: {len(first_job.get('responsibilities', []))}")

    return parsed
# Allow running this module directly as a smoke test.
if __name__ == "__main__":
    test_hybrid_extraction()