feat: Complete Format_Resume.py system with OpenAI GPT-4o integration and template preservation - Added Format_Resume.py Streamlit page with OpenAI GPT-4o primary extraction, HF Cloud backup, 5-tier fallback system, template preservation with Qvell branding, contact info extraction, skills cleaning, career timeline generation, and comprehensive utils restructure (10/11 files required). Renamed app.py to TalentLens.py, added blank_resume.docx template, updated .gitignore for Salesforce exclusion.
c2f9ec8
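The commit's primary extraction tier is the OpenAI module below. A minimal usage sketch of its convenience function (the import path utils/openai_extractor.py is an assumption about the repository layout, and OPENAI_API_KEY is assumed to be set in the environment):

    # Hypothetical import path; adjust to wherever the module lives in the repo.
    from utils.openai_extractor import extract_sections_openai

    with open("resume.txt", "r", encoding="utf-8") as f:
        raw_text = f.read()

    # Returns a dict with Name, Summary, Skills, StructuredExperiences,
    # Education, Training, and ContactInfo keys.
    sections = extract_sections_openai(raw_text)
    print(sections["Name"], sections["ContactInfo"].get("email", ""))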
#!/usr/bin/env python3
"""
OpenAI GPT-4o Resume Extractor

This module provides resume extraction using OpenAI's GPT-4o model,
a strong general-purpose model suited to complex resume parsing.
"""

import json
import re
import logging
import os
from typing import Dict, Any, List, Optional

from openai import OpenAI

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class OpenAIResumeExtractor:
    """
    Production-ready resume extractor using OpenAI GPT-4o.
    """

    def __init__(self, api_key: Optional[str] = None, model: str = "gpt-4o"):
        """
        Initialize the OpenAI extractor.

        Args:
            api_key: OpenAI API key (optional; falls back to the OPENAI_API_KEY env var)
            model: OpenAI model to use (defaults to gpt-4o)
        """
        self.api_key = api_key or os.getenv('OPENAI_API_KEY')
        self.model = model

        if not self.api_key:
            raise ValueError("No OpenAI API key found. Set the OPENAI_API_KEY environment variable.")

        self.client = OpenAI(api_key=self.api_key)
    def extract_sections_openai(self, text: str) -> Dict[str, Any]:
        """
        Extract resume sections using OpenAI GPT-4o.

        Args:
            text: Raw resume text

        Returns:
            Structured resume data
        """
        logger.info("Starting OpenAI GPT-4o extraction...")

        try:
            # Create a comprehensive prompt for structured extraction
            prompt = self._create_extraction_prompt(text)

            # Make the API call to OpenAI
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "system",
                        "content": "You are an expert resume parser. Extract information accurately and return valid JSON only."
                    },
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                temperature=0.1,  # Low temperature for consistent results
                max_tokens=2000
            )

            # Parse the response
            result_text = response.choices[0].message.content.strip()

            # Clean up the response to extract the JSON payload
            if "```json" in result_text:
                result_text = result_text.split("```json")[1].split("```")[0]
            elif "```" in result_text:
                result_text = result_text.split("```")[1]

            # Parse JSON
            result = json.loads(result_text)

            # Validate and clean the result
            result = self._validate_and_clean_result(result)

            # Extract contact info from the original text
            contact_info = self._extract_contact_info(text)
            result["ContactInfo"] = contact_info

            logger.info("✅ OpenAI extraction completed successfully")
            return result

        except Exception as e:
            logger.error(f"OpenAI extraction failed: {e}")

            # Check whether it is an API key issue
            if "401" in str(e) or "invalid_api_key" in str(e):
                logger.error("❌ Invalid OpenAI API key - please check your OPENAI_API_KEY environment variable")
                # Return an empty result to force the hybrid system to try other methods
                return self._get_empty_result()

            # For other errors, fall back to regex extraction
            return self._fallback_extraction(text)
    def _create_extraction_prompt(self, text: str) -> str:
        """Create a comprehensive prompt for resume extraction."""
        prompt = f"""
Extract the following information from this resume text and return it as valid JSON:

RESUME TEXT:
{text}

Extract and return ONLY a JSON object with this exact structure:
{{
    "Name": "Full name of the person",
    "Summary": "Professional summary or objective (full text)",
    "Skills": ["skill1", "skill2", "skill3"],
    "StructuredExperiences": [
        {{
            "title": "Job title",
            "company": "Company name",
            "date_range": "Date range (e.g., Jan 2021 - Present)",
            "responsibilities": ["responsibility 1", "responsibility 2"]
        }}
    ],
    "Education": ["degree | institution | year"],
    "Training": []
}}

EXTRACTION RULES:
1. Name: Extract the full name from the top of the resume
2. Summary: Extract the complete professional summary/objective section
3. Skills: Extract technical skills only (programming languages, tools, frameworks)
4. StructuredExperiences: For each job, extract:
   - title: The job title/position
   - company: Company name (include location if provided)
   - date_range: Employment dates
   - responsibilities: List of bullet points describing what they did
5. Education: Extract degrees, institutions, and graduation years
6. Training: Extract certifications, courses, training programs

IMPORTANT:
- Return ONLY valid JSON, no explanations
- If a section is not found, use an empty string or empty array
- For skills, exclude company names and focus on technical skills
- For experiences, look for patterns like "Title | Company | Dates" or similar
- Extract ALL job experiences found in the resume
- Include ALL bullet points under each job as responsibilities
"""
        return prompt
    def _validate_and_clean_result(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Validate and clean the extraction result."""
        # Ensure all required keys exist
        required_keys = ["Name", "Summary", "Skills", "StructuredExperiences", "Education", "Training"]
        for key in required_keys:
            if key not in result:
                result[key] = [] if key in ["Skills", "StructuredExperiences", "Education", "Training"] else ""

        # Clean skills - remove company names and duplicates
        if result.get("Skills"):
            cleaned_skills = []
            for skill in result["Skills"]:
                skill = skill.strip()
                # Skip if it looks like a company name or is too short
                if len(skill) > 1 and not self._is_company_name(skill):
                    cleaned_skills.append(skill)
            result["Skills"] = list(set(cleaned_skills))  # Remove duplicates

        # Validate experience structure
        if result.get("StructuredExperiences"):
            cleaned_experiences = []
            for exp in result["StructuredExperiences"]:
                if isinstance(exp, dict) and exp.get("title") and exp.get("company"):
                    # Ensure responsibilities is a list
                    if not isinstance(exp.get("responsibilities"), list):
                        exp["responsibilities"] = []
                    cleaned_experiences.append(exp)
            result["StructuredExperiences"] = cleaned_experiences

        return result
    def _get_empty_result(self) -> Dict[str, Any]:
        """Return an empty result structure for API failures."""
        return {
            "Name": "",
            "Summary": "",
            "Skills": [],
            "StructuredExperiences": [],
            "Education": [],
            "Training": [],
            "ContactInfo": {}
        }

    def _is_company_name(self, text: str) -> bool:
        """Check if text looks like a company name rather than a skill."""
        company_indicators = [
            "inc", "llc", "corp", "ltd", "company", "solutions", "services",
            "systems", "technologies", "financial", "insurance", "abc", "xyz"
        ]
        text_lower = text.lower()
        return any(indicator in text_lower for indicator in company_indicators)
    def _fallback_extraction(self, text: str) -> Dict[str, Any]:
        """Fall back to regex-based extraction if OpenAI fails."""
        logger.info("Using regex fallback extraction...")

        try:
            from utils.hf_extractor_simple import extract_sections_hf_simple
            return extract_sections_hf_simple(text)
        except ImportError:
            # Basic regex fallback
            return {
                "Name": self._extract_name_regex(text),
                "Summary": self._extract_summary_regex(text),
                "Skills": self._extract_skills_regex(text),
                "StructuredExperiences": self._extract_experiences_regex(text),
                "Education": self._extract_education_regex(text),
                "Training": [],
                "ContactInfo": self._extract_contact_info(text)
            }
    def _extract_name_regex(self, text: str) -> str:
        """Regex fallback for name extraction."""
        lines = text.split('\n')[:5]
        for line in lines:
            line = line.strip()
            if re.search(r'@|phone|email|linkedin|github', line.lower()):
                continue
            name_match = re.match(r'^([A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)', line)
            if name_match:
                return name_match.group(1)
        return ""

    def _extract_summary_regex(self, text: str) -> str:
        """Regex fallback for summary extraction."""
        summary_pattern = r'(?i)(?:professional\s+)?summary[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))'
        match = re.search(summary_pattern, text, re.DOTALL)
        if match:
            summary = match.group(1).strip()
            summary = re.sub(r'\n+', ' ', summary)
            summary = re.sub(r'\s+', ' ', summary)
            return summary
        return ""
    def _extract_skills_regex(self, text: str) -> List[str]:
        """Regex fallback for skills extraction."""
        skills = set()

        # Look for a technical skills section
        skills_pattern = r'(?i)technical\s+skills?[:\s]*\n(.*?)(?=\n\s*(?:experience|education|projects?))'
        match = re.search(skills_pattern, text, re.DOTALL)

        if match:
            skills_text = match.group(1)
            # Split by common separators
            skill_items = re.split(r'[,;]\s*', skills_text.replace('\n', ' '))
            for item in skill_items:
                item = item.strip()
                if item and len(item) > 1 and len(item) < 30:
                    skills.add(item)

        return sorted(list(skills))
    def _extract_experiences_regex(self, text: str) -> List[Dict[str, Any]]:
        """Regex fallback for experience extraction."""
        experiences = []

        # Look for the work experience section
        exp_pattern = r'(?i)(?:work\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|$))'
        match = re.search(exp_pattern, text, re.DOTALL)

        if match:
            exp_text = match.group(1)

            # Look for job entries with | separators, keeping match positions
            job_pattern = r'([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)'
            job_matches = list(re.finditer(job_pattern, exp_text))

            for i, job_match in enumerate(job_matches):
                title, company, dates = job_match.groups()

                # Bullet points for this job run from its header line to the next
                # job header (or to the end of the experience section)
                section_start = job_match.end()
                section_end = job_matches[i + 1].start() if i + 1 < len(job_matches) else len(exp_text)
                job_section = exp_text[section_start:section_end]

                bullets = re.findall(r'[-•]\s*([^-•\n]+)', job_section)
                responsibilities = [bullet.strip() for bullet in bullets if len(bullet.strip()) > 10]

                experiences.append({
                    "title": title.strip(),
                    "company": company.strip(),
                    "date_range": dates.strip(),
                    "responsibilities": responsibilities
                })

        return experiences
    def _extract_education_regex(self, text: str) -> List[str]:
        """Regex fallback for education extraction."""
        education = []

        edu_pattern = r'(?i)education[:\s]*\n(.*?)(?=\n\s*(?:certifications?|projects?|$))'
        match = re.search(edu_pattern, text, re.DOTALL)

        if match:
            edu_text = match.group(1)
            edu_lines = [line.strip() for line in edu_text.split('\n') if line.strip()]
            for line in edu_lines:
                if len(line) > 10:  # Filter out short lines
                    education.append(line)

        return education

    def _extract_contact_info(self, text: str) -> Dict[str, str]:
        """Extract contact information (email, phone, LinkedIn)."""
        contact_info = {}

        # Extract email
        email_match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
        if email_match:
            contact_info["email"] = email_match.group(0)

        # Extract phone
        phone_patterns = [
            r'\+?1?[-.\s]?\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})',
            r'(\d{3})[-.\s](\d{3})[-.\s](\d{4})',
            r'\+\d{1,3}[-.\s]?\d{3}[-.\s]?\d{3}[-.\s]?\d{4}'
        ]
        for pattern in phone_patterns:
            phone_match = re.search(pattern, text)
            if phone_match:
                contact_info["phone"] = phone_match.group(0)
                break

        # Extract LinkedIn
        linkedin_patterns = [
            r'linkedin\.com/in/[\w-]+',
            r'linkedin\.com/[\w-]+',
            r'(?i)linkedin[:\s]+[\w.-]+',
        ]
        for pattern in linkedin_patterns:
            linkedin_match = re.search(pattern, text)
            if linkedin_match:
                linkedin_url = linkedin_match.group(0)
                if not linkedin_url.startswith('http'):
                    linkedin_url = f"https://{linkedin_url}"
                contact_info["linkedin"] = linkedin_url
                break

        return contact_info
# Convenience function for easy usage
def extract_sections_openai(text: str, api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Extract resume sections using OpenAI GPT-4o.

    Args:
        text: Raw resume text
        api_key: OpenAI API key (optional)

    Returns:
        Structured resume data
    """
    extractor = OpenAIResumeExtractor(api_key=api_key)
    return extractor.extract_sections_openai(text)
# Test function
def test_openai_extraction():
    """Test the OpenAI extraction with a sample resume."""
    sample_text = """
    John Doe
    Selenium Java Automation Engineer
    Email: johndoe@example.com | Phone: +1-123-456-7890

    Professional Summary
    Results-driven Automation Test Engineer with 8 years of experience in Selenium and Java,
    specializing in automation frameworks for financial and insurance domains.

    Technical Skills
    Selenium WebDriver, Java, TestNG, Cucumber, Jenkins, Maven, Git, REST Assured, Postman,
    JIRA, Agile/Scrum, CI/CD

    Work Experience

    Senior Automation Test Engineer | ABC Financial Services | Jan 2021 - Present
    - Led automation framework enhancements using Selenium and Java, improving test efficiency.
    - Automated end-to-end UI and API testing for financial applications, reducing manual effort by 40%.

    Automation Test Engineer | XYZ Insurance Solutions | Jun 2017 - Dec 2020
    - Designed and implemented Selenium automation framework using Java and TestNG.
    - Developed automated test scripts for insurance policy management applications.

    Education
    Bachelor of Technology in Computer Science | ABC University | 2015
    """

    extractor = OpenAIResumeExtractor()
    result = extractor.extract_sections_openai(sample_text)

    print("OpenAI Extraction Results:")
    print(json.dumps(result, indent=2))

    return result


if __name__ == "__main__":
    test_openai_extraction()