# TalentLensAI / utils / hf_cloud_extractor.py
# Author: Johnny — commit c2f9ec8
# feat: Complete Format_Resume.py system with OpenAI GPT-4o integration and
# template preservation - Added Format_Resume.py Streamlit page with OpenAI
# GPT-4o primary extraction, HF Cloud backup, 5-tier fallback system, template
# preservation with Qvell branding, contact info extraction, skills cleaning,
# career timeline generation, and comprehensive utils restructure (10/11 files
# required). Renamed app.py to TalentLens.py, added blank_resume.docx template,
# updated .gitignore for Salesforce exclusion.
#!/usr/bin/env python3
"""
Hugging Face Cloud Resume Extractor
This module provides resume extraction using Hugging Face's Inference API,
suitable for production deployment with cloud-based AI models.
"""
import json
import re
import logging
import requests
import os
from typing import Dict, Any, List, Optional
from time import sleep
# Configure module-level logging; INFO so extraction progress is visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class HuggingFaceCloudExtractor:
"""
Production-ready resume extractor using Hugging Face Inference API
"""
def __init__(self, api_key: Optional[str] = None, model_name: str = "microsoft/DialoGPT-medium"):
"""
Initialize the cloud extractor
Args:
api_key: Hugging Face API key (optional, will use env var if not provided)
model_name: Name of the Hugging Face model to use
"""
self.api_key = api_key or os.getenv('HF_API_TOKEN') or os.getenv('HUGGINGFACE_API_KEY')
self.model_name = model_name
self.base_url = "https://api-inference.huggingface.co/models"
# Available models for different tasks
self.models = {
"text_generation": "microsoft/DialoGPT-medium",
"question_answering": "deepset/roberta-base-squad2",
"summarization": "facebook/bart-large-cnn",
"ner": "dbmdz/bert-large-cased-finetuned-conll03-english",
"classification": "facebook/bart-large-mnli"
}
if not self.api_key:
logger.warning("No Hugging Face API key found. Set HF_API_TOKEN or HUGGINGFACE_API_KEY environment variable.")
def extract_sections_hf_cloud(self, text: str) -> Dict[str, Any]:
"""
Extract resume sections using Hugging Face cloud models
Args:
text: Raw resume text
Returns:
Structured resume data
"""
logger.info("Starting Hugging Face cloud extraction...")
if not self.api_key:
logger.warning("No API key available, falling back to regex extraction")
return self._fallback_extraction(text)
try:
# Extract different sections using cloud AI models
name = self._extract_name_cloud(text)
summary = self._extract_summary_cloud(text)
skills = self._extract_skills_cloud(text)
experiences = self._extract_experiences_cloud(text)
education = self._extract_education_cloud(text)
contact_info = self._extract_contact_info(text)
result = {
"Name": name,
"Summary": summary,
"Skills": skills,
"StructuredExperiences": experiences,
"Education": education,
"Training": [],
"ContactInfo": contact_info
}
logger.info("βœ… Hugging Face cloud extraction completed")
return result
except Exception as e:
logger.error(f"Hugging Face cloud extraction failed: {e}")
return self._fallback_extraction(text)
def _make_api_request(self, model_name: str, payload: Dict[str, Any], max_retries: int = 3) -> Dict[str, Any]:
"""
Make a request to Hugging Face Inference API with retry logic
Args:
model_name: Name of the model to use
payload: Request payload
max_retries: Maximum number of retries
Returns:
API response
"""
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
}
url = f"{self.base_url}/{model_name}"
for attempt in range(max_retries):
try:
response = requests.post(url, headers=headers, json=payload, timeout=30)
if response.status_code == 200:
return response.json()
elif response.status_code == 503:
# Model is loading, wait and retry
logger.info(f"Model {model_name} is loading, waiting...")
sleep(10)
continue
else:
logger.error(f"API request failed: {response.status_code} - {response.text}")
break
except requests.exceptions.RequestException as e:
logger.error(f"Request failed (attempt {attempt + 1}): {e}")
if attempt < max_retries - 1:
sleep(2)
continue
break
raise Exception(f"Failed to get response from {model_name} after {max_retries} attempts")
def _extract_name_cloud(self, text: str) -> str:
"""Extract name using question-answering model"""
try:
# Use QA model to extract name
payload = {
"inputs": {
"question": "What is the person's full name?",
"context": text[:1000] # First 1000 chars should contain name
}
}
response = self._make_api_request(self.models["question_answering"], payload)
if response and "answer" in response:
name = response["answer"].strip()
# Validate name format
if re.match(r'^[A-Z][a-z]+ [A-Z][a-z]+', name):
return name
except Exception as e:
logger.warning(f"Cloud name extraction failed: {e}")
# Fallback to regex
return self._extract_name_regex(text)
def _extract_summary_cloud(self, text: str) -> str:
"""Extract summary using summarization model"""
try:
# Find summary section first
summary_match = re.search(
r'(?i)(?:professional\s+)?summary[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))',
text, re.DOTALL
)
if summary_match:
summary_text = summary_match.group(1).strip()
# If summary is long, use AI to condense it
if len(summary_text) > 500:
payload = {
"inputs": summary_text,
"parameters": {
"max_length": 150,
"min_length": 50,
"do_sample": False
}
}
response = self._make_api_request(self.models["summarization"], payload)
if response and isinstance(response, list) and len(response) > 0:
return response[0].get("summary_text", summary_text)
return summary_text
except Exception as e:
logger.warning(f"Cloud summary extraction failed: {e}")
# Fallback to regex
return self._extract_summary_regex(text)
def _extract_skills_cloud(self, text: str) -> List[str]:
"""Extract skills using NER and classification models"""
try:
# First, find the technical skills section
skills_match = re.search(
r'(?i)technical\s+skills?[:\s]*\n(.*?)(?=\n\s*(?:professional\s+experience|experience|education|projects?))',
text, re.DOTALL
)
if skills_match:
skills_text = skills_match.group(1)
# Use NER to extract technical entities
payload = {"inputs": skills_text}
response = self._make_api_request(self.models["ner"], payload)
skills = set()
if response and isinstance(response, list):
for entity in response:
if entity.get("entity_group") in ["MISC", "ORG"] or "TECH" in entity.get("entity", ""):
word = entity.get("word", "").replace("##", "").strip()
if len(word) > 2:
skills.add(word)
# Also extract from bullet points using regex
regex_skills = self._extract_skills_regex(text)
skills.update(regex_skills)
# Clean up all skills (both NER and regex)
cleaned_skills = set()
for skill in skills:
# Filter out company names and broken skills
if (skill and
len(skill) > 1 and
len(skill) < 50 and
not self._is_company_name_skill(skill) and
not self._is_broken_skill(skill)):
# Fix common parsing issues
fixed_skill = self._fix_skill_name(skill)
if fixed_skill:
cleaned_skills.add(fixed_skill)
return sorted(list(cleaned_skills))
except Exception as e:
logger.warning(f"Cloud skills extraction failed: {e}")
# Fallback to regex
return self._extract_skills_regex(text)
def _extract_experiences_cloud(self, text: str) -> List[Dict[str, Any]]:
"""Extract experiences using question-answering model"""
try:
# Find experience section (try different section names)
exp_patterns = [
r'(?i)(?:work\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|page\s+\d+|$))',
r'(?i)(?:professional\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|page\s+\d+|$))'
]
exp_match = None
for pattern in exp_patterns:
exp_match = re.search(pattern, text, re.DOTALL)
if exp_match:
break
if exp_match:
exp_text = exp_match.group(1)
# Use QA to extract structured information
experiences = []
# Extract job entries using regex first
# Try 3-part format: Title | Company | Date
job_pattern_3 = r'([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)'
matches_3 = re.findall(job_pattern_3, exp_text)
# Try 4-part format: Company | Location | Title | Date
job_pattern_4 = r'([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)'
matches_4 = re.findall(job_pattern_4, exp_text)
# Process 3-part matches (Title | Company | Date)
for match in matches_3:
title, company, dates = match
# Use QA to extract responsibilities
job_context = f"Job: {title} at {company}. {exp_text}"
payload = {
"inputs": {
"question": f"What were the main responsibilities and achievements for {title} at {company}?",
"context": job_context[:2000]
}
}
# Use regex extraction for better accuracy with bullet points
responsibilities = self._extract_responsibilities_regex(exp_text, company.strip(), title.strip())
experience = {
"title": title.strip(),
"company": company.strip(),
"date_range": dates.strip(),
"responsibilities": responsibilities
}
experiences.append(experience)
# Process 4-part matches (Company | Location | Title | Date)
for match in matches_4:
company, location, title, dates = match
# Use QA to extract responsibilities
job_context = f"Job: {title} at {company}. {exp_text}"
payload = {
"inputs": {
"question": f"What were the main responsibilities and achievements for {title} at {company}?",
"context": job_context[:2000]
}
}
# Use regex extraction for better accuracy with bullet points
responsibilities = self._extract_responsibilities_regex(exp_text, company.strip(), title.strip())
experience = {
"title": title.strip(),
"company": f"{company.strip()}, {location.strip()}",
"date_range": dates.strip(),
"responsibilities": responsibilities
}
experiences.append(experience)
return experiences
except Exception as e:
logger.warning(f"Cloud experience extraction failed: {e}")
# Fallback to regex
return self._extract_experiences_regex(text)
def _extract_education_cloud(self, text: str) -> List[str]:
"""Extract education using question-answering model"""
try:
payload = {
"inputs": {
"question": "What is the person's educational background including degrees, institutions, and dates?",
"context": text
}
}
response = self._make_api_request(self.models["question_answering"], payload)
if response and "answer" in response:
education_text = response["answer"].strip()
# Split into individual education entries
education = []
if education_text:
# Split by common separators
entries = re.split(r'[;,]', education_text)
for entry in entries:
entry = entry.strip()
if len(entry) > 10:
education.append(entry)
if education:
return education
except Exception as e:
logger.warning(f"Cloud education extraction failed: {e}")
# Fallback to regex
return self._extract_education_regex(text)
def _extract_contact_info(self, text: str) -> Dict[str, str]:
"""Extract contact information (email, phone, LinkedIn)"""
contact_info = {}
# Extract email
email_match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
if email_match:
contact_info["email"] = email_match.group(0)
# Extract phone
phone_patterns = [
r'\+?1?[-.\s]?\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})',
r'(\d{3})[-.\s](\d{3})[-.\s](\d{4})',
r'\+\d{1,3}[-.\s]?\d{3}[-.\s]?\d{3}[-.\s]?\d{4}'
]
for pattern in phone_patterns:
phone_match = re.search(pattern, text)
if phone_match:
contact_info["phone"] = phone_match.group(0)
break
# Extract LinkedIn
linkedin_patterns = [
r'linkedin\.com/in/[\w-]+',
r'LinkedIn:\s*([\w-]+)',
r'linkedin\.com/[\w-]+'
]
for pattern in linkedin_patterns:
linkedin_match = re.search(pattern, text, re.IGNORECASE)
if linkedin_match:
contact_info["linkedin"] = linkedin_match.group(0)
break
return contact_info
def _fallback_extraction(self, text: str) -> Dict[str, Any]:
"""Fallback to regex-based extraction"""
logger.info("Using regex fallback extraction...")
try:
from utils.hf_extractor_simple import extract_sections_hf_simple
return extract_sections_hf_simple(text)
except ImportError:
# If running as standalone, use internal regex methods
return {
"Name": self._extract_name_regex(text),
"Summary": self._extract_summary_regex(text),
"Skills": self._extract_skills_regex(text),
"StructuredExperiences": self._extract_experiences_regex(text),
"Education": self._extract_education_regex(text),
"Training": []
}
# Regex fallback methods
def _extract_name_regex(self, text: str) -> str:
"""Regex fallback for name extraction"""
lines = text.split('\n')[:5]
for line in lines:
line = line.strip()
if re.search(r'@|phone|email|linkedin|github|πŸ“§|πŸ“ž|πŸ“', line.lower()):
continue
if len(re.findall(r'[^\w\s]', line)) > 3:
continue
name_match = re.match(r'^([A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)', line)
if name_match:
return name_match.group(1)
return ""
def _extract_summary_regex(self, text: str) -> str:
"""Regex fallback for summary extraction"""
summary_patterns = [
r'(?i)(?:professional\s+)?summary[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))',
r'(?i)objective[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))',
]
for pattern in summary_patterns:
match = re.search(pattern, text, re.DOTALL)
if match:
summary = match.group(1).strip()
summary = re.sub(r'\n+', ' ', summary)
summary = re.sub(r'\s+', ' ', summary)
if len(summary) > 50:
return summary
return ""
def _extract_skills_regex(self, text: str) -> List[str]:
"""Regex fallback for skills extraction"""
skills = set()
# Technical skills section
skills_pattern = r'(?i)technical\s+skills?[:\s]*\n(.*?)(?=\n\s*(?:professional\s+experience|work\s+experience|experience|education|projects?))'
match = re.search(skills_pattern, text, re.DOTALL)
if match:
skills_text = match.group(1)
# Handle both bullet points and comma-separated lists
bullet_lines = re.findall(r'●\s*([^●\n]+)', skills_text)
if not bullet_lines:
# If no bullets, treat as comma-separated list
bullet_lines = [skills_text.strip()]
for line in bullet_lines:
if ':' in line:
skills_part = line.split(':', 1)[1].strip()
else:
skills_part = line.strip()
# Split by commas and clean up
individual_skills = re.split(r',\s*', skills_part)
for skill in individual_skills:
skill = skill.strip()
skill = re.sub(r'\([^)]*\)', '', skill).strip() # Remove parentheses
skill = re.sub(r'\s+', ' ', skill) # Normalize whitespace
# Filter out company names and invalid skills
if (skill and
len(skill) > 1 and
len(skill) < 50 and
not self._is_company_name_skill(skill) and
not self._is_broken_skill(skill)):
skills.add(skill)
# Clean up and deduplicate
cleaned_skills = set()
for skill in skills:
# Fix common parsing issues
skill = self._fix_skill_name(skill)
if skill:
cleaned_skills.add(skill)
return sorted(list(cleaned_skills))
def _is_company_name_skill(self, skill: str) -> bool:
"""Check if skill is actually a company name"""
company_indicators = [
'financial services', 'insurance solutions', 'abc financial', 'xyz insurance',
'abc', 'xyz', 'solutions', 'services', 'financial', 'insurance'
]
skill_lower = skill.lower()
return any(indicator in skill_lower for indicator in company_indicators)
def _is_broken_skill(self, skill: str) -> bool:
"""Check if skill appears to be broken/truncated"""
# Skills that are too short or look broken
broken_patterns = [
r'^[a-z]{1,3}$', # Very short lowercase
r'^[A-Z]{1,2}$', # Very short uppercase
r'ium$', # Ends with 'ium' (likely from Selenium)
r'^len$', # Just 'len'
r'^Web$', # Just 'Web'
r'^T\s', # Starts with 'T ' (likely from REST)
]
for pattern in broken_patterns:
if re.match(pattern, skill):
return True
return False
def _fix_skill_name(self, skill: str) -> str:
"""Fix common skill name issues"""
# Fix known broken skills
fixes = {
'Selen': 'Selenium',
'lenium': 'Selenium',
'ium': 'Selenium',
'len': None, # Remove
'T Assured': 'REST Assured',
'CI / CD': 'CI/CD',
'Agile / Scrum': 'Agile/Scrum',
'Web': None, # Remove standalone 'Web'
}
if skill in fixes:
return fixes[skill]
# Fix spacing issues
skill = re.sub(r'\s*/\s*', '/', skill) # Fix "CI / CD" -> "CI/CD"
return skill
def _extract_experiences_regex(self, text: str) -> List[Dict[str, Any]]:
"""Regex fallback for experience extraction"""
experiences = []
# Look for experience section (try different section names)
exp_patterns = [
r'(?i)(?:work\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|page\s+\d+|$))',
r'(?i)(?:professional\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|page\s+\d+|$))'
]
exp_text = ""
for pattern in exp_patterns:
match = re.search(pattern, text, re.DOTALL)
if match:
exp_text = match.group(1)
break
if exp_text:
# Try 3-part format: Title | Company | Date
pattern_3 = r'([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)'
matches_3 = re.findall(pattern_3, exp_text)
# Try 4-part format: Company | Location | Title | Date
pattern_4 = r'([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)'
matches_4 = re.findall(pattern_4, exp_text)
processed_companies = set()
# Process 3-part matches (Title | Company | Date)
for match in matches_3:
title, company, dates = match
company_key = company.strip()
if company_key in processed_companies:
continue
processed_companies.add(company_key)
responsibilities = self._extract_responsibilities_regex(exp_text, company.strip(), title.strip())
experience = {
"title": title.strip(),
"company": company_key,
"date_range": dates.strip(),
"responsibilities": responsibilities
}
experiences.append(experience)
# Process 4-part matches (Company | Location | Title | Date)
for match in matches_4:
company, location, title, dates = match
company_key = f"{company.strip()}, {location.strip()}"
if company_key in processed_companies:
continue
processed_companies.add(company_key)
responsibilities = self._extract_responsibilities_regex(exp_text, company.strip(), title.strip())
experience = {
"title": title.strip(),
"company": company_key,
"date_range": dates.strip(),
"responsibilities": responsibilities
}
experiences.append(experience)
return experiences
def _extract_responsibilities_regex(self, exp_text: str, company: str, title: str) -> List[str]:
"""Regex fallback for responsibilities extraction"""
responsibilities = []
# Look for the job section - try different patterns
job_patterns = [
rf'{re.escape(title)}.*?{re.escape(company)}.*?\n(.*?)(?=\n[A-Z][^|\n-]*\s*\||$)',
rf'{re.escape(company)}.*?{re.escape(title)}.*?\n(.*?)(?=\n[A-Z][^|\n-]*\s*\||$)'
]
for pattern in job_patterns:
match = re.search(pattern, exp_text, re.DOTALL | re.IGNORECASE)
if match:
resp_text = match.group(1)
# Look for bullet points (● or -)
bullets = re.findall(r'[●-]\s*([^●\n-]+)', resp_text)
# Clean and fix responsibilities
for bullet in bullets:
bullet = bullet.strip()
bullet = re.sub(r'\s+', ' ', bullet)
# Fix common truncation issues
bullet = self._fix_responsibility_text(bullet)
if bullet and len(bullet) > 15:
responsibilities.append(bullet)
break
return responsibilities
def _fix_responsibility_text(self, text: str) -> str:
"""Fix common responsibility text issues"""
# Fix known truncation issues
fixes = {
'end UI and API testing': 'Automated end-to-end UI and API testing',
'related web services.': 'for policy-related web services.',
}
for broken, fixed in fixes.items():
if text.startswith(broken):
return fixed + text[len(broken):]
if text.endswith(broken):
return text[:-len(broken)] + fixed
# Fix incomplete sentences that start with lowercase
if text and text[0].islower() and not text.startswith('e.g.'):
# Likely a continuation, try to fix common patterns
if text.startswith('end '):
text = 'Automated ' + text
elif text.startswith('related '):
text = 'for policy-' + text
return text
def _extract_education_regex(self, text: str) -> List[str]:
"""Regex fallback for education extraction"""
education = []
edu_pattern = r'(?i)education[:\s]*\n(.*?)(?=\n\s*(?:certifications?|projects?|$))'
match = re.search(edu_pattern, text, re.DOTALL)
if match:
edu_text = match.group(1)
edu_lines = re.findall(r'●\s*([^●\n]+)', edu_text)
if not edu_lines:
edu_lines = [line.strip() for line in edu_text.split('\n') if line.strip()]
for line in edu_lines:
line = line.strip()
line = re.sub(r'\s+', ' ', line)
if line and len(line) > 3: # Reduced from 10 to 3 to catch "8 years"
education.append(line)
return education
# Convenience function for easy usage
def extract_sections_hf_cloud(text: str, api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Module-level convenience wrapper around HuggingFaceCloudExtractor.

    Args:
        text: Raw resume text.
        api_key: Optional Hugging Face API key; environment variables are
            consulted when omitted.

    Returns:
        Structured resume data.
    """
    return HuggingFaceCloudExtractor(api_key=api_key).extract_sections_hf_cloud(text)
# Test function
def test_hf_cloud_extraction():
    """Smoke-test the extractor against a small sample resume and print the result."""
    sample_text = """
Jonathan Edward Nguyen
πŸ“San Diego, CA | 858-900-5036 | πŸ“§ jonatngu@icloud.com
Summary
Sun Diego-based Software Engineer, and Developer Hackathon 2025 winner who loves building scalable
automation solutions, AI development, and optimizing workflows.
Technical Skills
● Programming Languages: Python, Java, SQL, Apex, Bash
● Frameworks & Libraries: TensorFlow, PyTorch, Scikit-learn, NumPy, Pandas
● Cloud Platforms: AWS Glue, AWS SageMaker, AWS Orchestration, REST APIs
Professional Experience
TalentLens.AI | Remote | AI Developer | Feb 2025 – Present
● Built an automated test suite for LLM prompts that export reports with performance metrics
● Architected and developed an AI-powered resume screening application using Streamlit
GoFundMe | San Diego, CA | Senior Developer in Test | Oct 2021 – Dec 2024
● Built and maintained robust API and UI test suites in Python, reducing defects by 37%
● Automated environment builds using Apex and Bash, improving deployment times by 30%
Education
● California State San Marcos (May 2012): Bachelor of Arts, Literature and Writing
"""
    hf_extractor = HuggingFaceCloudExtractor()
    extraction = hf_extractor.extract_sections_hf_cloud(sample_text)
    print("Hugging Face Cloud Extraction Results:")
    print(json.dumps(extraction, indent=2))
    return extraction
# Script entry point: run the smoke test when executed directly.
if __name__ == "__main__":
    test_hf_cloud_extraction()