Spaces:
Running
Running
File size: 2,821 Bytes
c2f9ec8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
# parser.py
import fitz # PyMuPDF
import re
from io import BytesIO
from docx import Document
from config import supabase, embedding_model, client, query
def extract_name(resume_text: str) -> str:
    """Guess the candidate's full name from raw resume text.

    Two heuristics are tried in order:

    1. One of the first five lines consists solely of two or three
       capitalized words (e.g. ``Jane Doe`` or ``Mary Ann Smith``).
    2. Otherwise, the first run of two or more capitalized words found
       anywhere in the text, restricted to a single line.

    Args:
        resume_text: Plain text extracted from a resume.

    Returns:
        The matched name, or the placeholder ``"Candidate Name"`` when
        neither heuristic finds anything (including empty input).
    """
    if not resume_text:
        return "Candidate Name"

    # Look at the very top lines for a capitalized full name.
    for line in resume_text.splitlines()[:5]:
        candidate = line.strip()
        if re.fullmatch(r"[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2}", candidate):
            return candidate

    # Last-ditch: pull the first multiword "Title Case" run anywhere.
    # Only spaces/tabs may separate the words: with ``\s+`` the match could
    # span a line break and glue the last word of one line onto the name.
    m = re.search(r"[A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)+", resume_text)
    return m.group(0) if m else "Candidate Name"
def parse_resume(file_obj, file_type=None):
    """Extract raw text from a PDF or DOCX resume.

    Args:
        file_obj: File-like object holding the document. For PDFs its
            ``read()`` result is handed to PyMuPDF as a byte stream; for
            DOCX the object itself is passed to ``docx.Document``.
        file_type: ``"pdf"`` or ``"docx"`` (case-insensitive, a leading dot
            is tolerated). When ``None``, the type is inferred from the
            extension of ``file_obj.name`` if that attribute exists.

    Returns:
        The extracted text, with pages / paragraphs / table cells joined by
        newlines.

    Raises:
        ValueError: If the file type is missing or is neither PDF nor DOCX.
    """
    if file_type is None and hasattr(file_obj, 'name'):
        # ``name`` may be a non-string (e.g. an int file descriptor).
        file_type = str(file_obj.name).split('.')[-1]

    # Normalize explicit values such as "PDF" or ".docx" the same way as
    # inferred ones, so callers are not rejected over letter case.
    if file_type is not None:
        file_type = str(file_type).strip().lstrip('.').lower()

    if file_type == 'pdf':
        # The context manager closes the document once the text is read,
        # so the underlying MuPDF resources are not leaked.
        with fitz.open(stream=file_obj.read(), filetype='pdf') as doc:
            return "\n".join(page.get_text('text') for page in doc)
    elif file_type == 'docx':
        doc = Document(file_obj)
        text = []
        # Body paragraphs first; blank ones are skipped.
        for para in doc.paragraphs:
            if para.text.strip():
                text.append(para.text)
        # Then the text of every non-empty table cell.
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    if cell.text.strip():
                        text.append(cell.text.strip())
        return "\n".join(text)
    else:
        raise ValueError("Unsupported file format")
def extract_email(resume_text):
    """Return the first email address found in the text.

    The pattern requires a dotted domain (``name@host.tld``) and does not
    let the match end on a dot, so sentence punctuation after an address is
    not captured. Plus-addressing in the local part (``jane+jobs@...``) is
    kept intact.

    Args:
        resume_text: Plain text to search.

    Returns:
        The matched address as a string, or ``None`` when the text is empty
        or contains no address.
    """
    if not resume_text:
        return None
    match = re.search(
        r"[\w+-]+(?:\.[\w+-]+)*@[\w-]+(?:\.[\w-]+)+",
        resume_text,
    )
    return match.group(0) if match else None
def summarize_resume(resume_text):
    """Ask the chat model for a recruiter-style professional summary.

    The resume text is embedded in a fixed prompt and sent through the
    module-level ``client``. Generic lead-ins the model may put in front of
    the answer (e.g. "Sure, here's the professional summary:") are removed
    from the start of the reply.

    Args:
        resume_text: Plain text of the resume to summarize.

    Returns:
        The cleaned summary, or the fixed string
        ``"Summary unavailable due to API issues."`` when the request or
        the reading of its response raises.
    """
    prompt = (
        "You are an expert technical recruiter. Extract a professional summary for this candidate based on their resume text. "
        "Include: full name (if found), job title, years of experience, key technologies/tools, industries worked in, and certifications. "
        "Format it as a professional summary paragraph.\n\n"
        f"Resume:\n{resume_text}\n\n"
        "Summary:"
    )
    try:
        response = client.chat.completions.create(
            model="tgi",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5,
            max_tokens=300,
        )
        result = response.choices[0].message.content.strip()
    except Exception as e:
        # Deliberate best-effort boundary: any failure degrades to a
        # placeholder instead of propagating to the caller.
        print(f"❌ Error generating structured summary: {e}")
        return "Summary unavailable due to API issues."

    # Clean up generic lead-ins from the model. Both the ASCII and the
    # typographic apostrophe are accepted in "here's", and "a" is accepted
    # alongside "the", since models emit all of these variants.
    cleaned = re.sub(
        r"^(?:(?:Sure|Certainly)[,!.]?)?\s*(?:here is|here[’']s|this is)?\s*(?:the|a)?\s*(?:extracted)?\s*(?:professional)?\s*summary.*?:\s*",
        "", result, count=1, flags=re.IGNORECASE
    ).strip()
    return cleaned