TalentLensAI / utils /parser.py
Johnny
feat: Complete Format_Resume.py system with OpenAI GPT-4o integration and template preservation - Added Format_Resume.py Streamlit page with OpenAI GPT-4o primary extraction, HF Cloud backup, 5-tier fallback system, template preservation with Qvell branding, contact info extraction, skills cleaning, career timeline generation, and comprehensive utils restructure (10/11 files required). Renamed app.py to TalentLens.py, added blank_resume.docx template, updated .gitignore for Salesforce exclusion.
c2f9ec8
raw
history blame
2.82 kB
# parser.py
import fitz # PyMuPDF
import re
from io import BytesIO
from docx import Document
from config import supabase, embedding_model, client, query
def extract_name(resume_text: str) -> str:
    """
    Guess the candidate's full name from raw resume text.

    The first five non-blank lines are checked for a line made up solely of
    two or three capitalized words (e.g. "Jane Doe"). If none qualifies, the
    first run of two or more capitalized words found on a single line anywhere
    in the text is used instead.

    Args:
        resume_text: Plain text extracted from a resume.

    Returns:
        The detected name, or the placeholder "Candidate Name" when nothing
        name-like is found (including empty or missing text).
    """
    if not resume_text:
        return "Candidate Name"

    # Drop blank lines before taking the top five so that leading empty lines
    # in the extracted text do not push the name out of the window.
    top_lines = [ln.strip() for ln in resume_text.splitlines() if ln.strip()][:5]
    for candidate in top_lines:
        if re.fullmatch(r"[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2}", candidate):
            return candidate

    # Last-ditch: first multi-word Title Case run anywhere. Only spaces/tabs
    # may separate the words, so the result never spans a line break.
    m = re.search(r"[A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)+", resume_text)
    return m.group(0) if m else "Candidate Name"
def parse_resume(file_obj, file_type=None):
    """
    Extract raw text from a PDF or DOCX resume.

    Args:
        file_obj: File-like object holding the document. For PDFs its
            ``read()`` result is passed to PyMuPDF as a byte stream; for DOCX
            the object itself is handed to ``docx.Document``.
        file_type: ``"pdf"`` or ``"docx"``. Case, surrounding whitespace and a
            leading dot are ignored. When omitted, the extension of
            ``file_obj.name`` is used if that attribute exists.

    Returns:
        The extracted text, one entry per line: page texts for a PDF;
        non-blank paragraphs followed by non-blank table cells for a DOCX.

    Raises:
        ValueError: If the type is missing or is neither PDF nor DOCX.
    """
    if file_type is None and hasattr(file_obj, 'name'):
        file_type = str(file_obj.name).rsplit('.', 1)[-1]

    # Normalize so that "PDF", ".pdf" and " pdf " are all accepted.
    kind = str(file_type or "").strip().lstrip('.').lower()

    if kind == 'pdf':
        # The context manager closes the PyMuPDF document once the text has
        # been joined; previously the handle was left open.
        with fitz.open(stream=file_obj.read(), filetype='pdf') as pdf:
            return "\n".join(page.get_text('text') for page in pdf)

    if kind == 'docx':
        document = Document(file_obj)
        # Paragraph text is kept as-is; table cell text is stripped.
        chunks = [para.text for para in document.paragraphs if para.text.strip()]
        chunks.extend(
            cell.text.strip()
            for table in document.tables
            for row in table.rows
            for cell in row.cells
            if cell.text.strip()
        )
        return "\n".join(chunks)

    raise ValueError("Unsupported file format")
def extract_email(resume_text):
    """
    Return the first email address found in the text.

    An address is a local part (word characters, ``.``, ``+``, ``-``), an
    ``@``, and a dotted domain ending in an alphabetic top-level label of at
    least two letters. Requiring that final label keeps trailing sentence
    punctuation (e.g. the period in "mail me at a@b.com.") out of the result.

    Args:
        resume_text: Text to search; may be empty or None.

    Returns:
        The matched address as a string, or None when no address is found.
    """
    if not resume_text:
        return None
    match = re.search(
        r"[\w.+-]+@[\w-]+(?:\.[\w-]+)*\.[A-Za-z]{2,}",
        resume_text,
    )
    return match.group(0) if match else None
def summarize_resume(resume_text):
    """
    Ask the chat model for a recruiter-style summary paragraph of a resume.

    A single user message containing the instructions and the resume text is
    sent through ``client.chat.completions.create`` (model "tgi", temperature
    0.5, at most 300 tokens). A leading "here is the summary:"-style preamble
    is then stripped from the reply.

    Args:
        resume_text: Resume text to embed in the prompt.

    Returns:
        The cleaned summary, or a fixed "unavailable" message if anything in
        the request or post-processing raises (the error is also printed).
    """
    instructions = (
        "You are an expert technical recruiter. Extract a professional summary for this candidate based on their resume text. "
        "Include: full name (if found), job title, years of experience, key technologies/tools, industries worked in, and certifications. "
        "Format it as a professional summary paragraph.\n\n"
    )
    prompt = f"{instructions}Resume:\n{resume_text}\n\nSummary:"

    # Matches generic lead-ins the model tends to prepend before the summary.
    lead_in_pattern = r"^(Sure,|Certainly,)?\s*(here is|here’s|this is)?\s*(the)?\s*(extracted)?\s*(professional)?\s*summary.*?:\s*"

    try:
        completion = client.chat.completions.create(
            model="tgi",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5,
            max_tokens=300,
        )
        raw_summary = completion.choices[0].message.content.strip()
        return re.sub(lead_in_pattern, "", raw_summary, flags=re.IGNORECASE).strip()
    except Exception as e:
        print(f"❌ Error generating structured summary: {e}")
        return "Summary unavailable due to API issues."