Spaces:
Running
Running
Johnny
feat: Complete Format_Resume.py system with OpenAI GPT-4o integration and template preservation - Added Format_Resume.py Streamlit page with OpenAI GPT-4o primary extraction, HF Cloud backup, 5-tier fallback system, template preservation with Qvell branding, contact info extraction, skills cleaning, career timeline generation, and comprehensive utils restructure (10/11 files required). Renamed app.py to TalentLens.py, added blank_resume.docx template, updated .gitignore for Salesforce exclusion.
c2f9ec8
# parser.py | |
import fitz # PyMuPDF | |
import re | |
from io import BytesIO | |
from docx import Document | |
from config import supabase, embedding_model, client, query | |
def extract_name(resume_text: str) -> str:
    """Guess the candidate's full name from raw resume text.

    Strategy, in order:
      1. Return the first of the top five lines that consists solely of two
         or three capitalized words (e.g. "Jane Doe").
      2. Otherwise return the first run of two or more capitalized words
         found on any single line of the text.
      3. Otherwise return the placeholder "Candidate Name".

    Args:
        resume_text: Plain text extracted from a resume.

    Returns:
        The detected name, or "Candidate Name" when nothing matches or the
        input is empty.
    """
    if not resume_text:
        return "Candidate Name"

    lines = resume_text.splitlines()

    # look at the very top lines for a capitalized full name
    for line in lines[:5]:
        candidate = line.strip()
        if re.match(r"^[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2}$", candidate):
            return candidate

    # last-ditch: pull the first multiword "Title Case" run anywhere.
    # Search one line at a time: `\s+` also matches newlines, so scanning the
    # whole text at once could glue unrelated lines together and return a
    # "name" containing a line break.
    for line in lines:
        m = re.search(r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)", line)
        if m:
            return m.group(1)

    return "Candidate Name"
def parse_resume(file_obj, file_type=None):
    """
    Extract raw text from a PDF or DOCX resume.

    Args:
        file_obj: File-like object holding the document. For PDFs its
            ``read()`` result is handed to PyMuPDF; for DOCX the object
            itself is passed to ``docx.Document``.
        file_type: ``"pdf"`` or ``"docx"``. Matching is case-insensitive and
            tolerates a leading dot. When omitted, the type is inferred from
            the extension of ``file_obj.name`` if that attribute exists.

    Returns:
        The extracted text with pieces joined by newlines. For DOCX this is
        every non-blank paragraph followed by every non-blank table cell.

    Raises:
        ValueError: If the type is neither PDF nor DOCX or cannot be inferred.
    """
    if file_type is None and hasattr(file_obj, 'name'):
        file_type = str(file_obj.name).rsplit('.', 1)[-1]

    # Normalize so explicit values such as "PDF" or ".docx" are accepted too.
    kind = (
        file_type.strip().lstrip('.').lower()
        if isinstance(file_type, str)
        else None
    )

    if kind == 'pdf':
        # Context manager closes the PyMuPDF document once the text is read.
        with fitz.open(stream=file_obj.read(), filetype='pdf') as doc:
            return "\n".join(page.get_text('text') for page in doc)

    if kind == 'docx':
        doc = Document(file_obj)
        text = [para.text for para in doc.paragraphs if para.text.strip()]
        # Table content is not part of doc.paragraphs, so collect it as well.
        text.extend(
            cell.text.strip()
            for table in doc.tables
            for row in table.rows
            for cell in row.cells
            if cell.text.strip()
        )
        return "\n".join(text)

    raise ValueError("Unsupported file format")
def extract_email(resume_text):
    """
    Extract the first email address found in the text.

    The local part may contain word characters, dots, plus signs and
    hyphens. The domain must contain at least one dot, and each label must
    be made of word characters or hyphens, so trailing sentence punctuation
    ("me@site.com.") is not captured and bare "user@host" tokens are ignored.

    Args:
        resume_text: Plain text to search; may be empty or None.

    Returns:
        The matched address as a string, or None when no address is found.
    """
    if not resume_text:
        return None
    match = re.search(r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+", resume_text)
    return match.group(0) if match else None
def summarize_resume(resume_text):
    """
    Ask the chat model for a recruiter-style professional summary.

    Builds a prompt around the resume text, sends it through the
    module-level ``client`` chat-completions interface, and strips generic
    lead-ins such as "Sure, here is the professional summary:" from the
    reply.

    Args:
        resume_text: Plain text of the resume to summarize.

    Returns:
        The cleaned summary paragraph, or the fixed string
        "Summary unavailable due to API issues." if the request or the
        response handling fails for any reason.
    """
    prompt = (
        "You are an expert technical recruiter. Extract a professional summary for this candidate based on their resume text. "
        "Include: full name (if found), job title, years of experience, key technologies/tools, industries worked in, and certifications. "
        "Format it as a professional summary paragraph.\n\n"
        f"Resume:\n{resume_text}\n\n"
        "Summary:"
    )
    try:
        response = client.chat.completions.create(
            model="tgi",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5,
            max_tokens=300,
        )
        content = response.choices[0].message.content
        if content is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError("model returned no content")
        result = content.strip()
        # Clean up generic lead-ins from the model. Accept both the straight
        # (') and the typographic (’) apostrophe in "here's", since models
        # commonly emit the straight one.
        cleaned = re.sub(
            r"^(Sure,|Certainly,)?\s*(here is|here['’]s|this is)?\s*(the)?\s*(extracted)?\s*(professional)?\s*summary.*?:\s*",
            "", result, flags=re.IGNORECASE
        ).strip()
        return cleaned
    except Exception as e:
        # Best-effort boundary: callers always get a displayable string back.
        print(f"❌ Error generating structured summary: {e}")
        return "Summary unavailable due to API issues."