Spaces:

gauravbox
/

TalentLensAI

Running

File size: 2,821 Bytes

c2f9ec8

# parser.py
import fitz  # PyMuPDF
import re
from io import BytesIO
from docx import Document
from config import supabase, embedding_model, client, query

def extract_name(resume_text: str) -> str:
    """Guess the candidate's full name from raw resume text.

    The first five lines are checked for a line consisting solely of two or
    three capitalized words. Failing that, the first run of two or more
    capitalized words found on any single line is used.

    Args:
        resume_text: Plain text of the resume.

    Returns:
        The matched name, or the placeholder ``"Candidate Name"`` when
        nothing name-like is found.
    """
    lines = resume_text.splitlines()

    # look at the very top lines for a capitalized full name
    for line in lines[:5]:
        candidate = line.strip()
        if re.match(r"^[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2}$", candidate):
            return candidate

    # last-ditch: pull the first multiword "Title Case" run anywhere.
    # Search one line at a time: `\s+` also matches line breaks, so scanning
    # the whole text could glue a heading onto the name on the next line
    # (e.g. "Skills\nJohn Smith").
    title_case_run = re.compile(r"[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+")
    for line in lines:
        m = title_case_run.search(line)
        if m:
            return m.group(0)
    return "Candidate Name"

def parse_resume(file_obj, file_type=None):
    """
    Extract raw text from PDF or DOCX resume.

    Args:
        file_obj: File-like object holding the resume. For PDFs its
            ``read()`` result is handed to PyMuPDF; for DOCX the object
            itself is passed to ``Document``.
        file_type: ``'pdf'`` or ``'docx'`` (case-insensitive, a leading dot
            is tolerated). When omitted, the extension of ``file_obj.name``
            is used if that attribute exists.

    Returns:
        The extracted text, with pages (PDF) or non-empty paragraphs and
        table cells (DOCX) joined by newlines.

    Raises:
        ValueError: If the resolved file type is neither PDF nor DOCX.
    """
    if file_type is None and hasattr(file_obj, 'name'):
        file_type = file_obj.name.split('.')[-1]
    # Normalise explicit and inferred types the same way so that "PDF" or
    # ".docx" passed by a caller is treated like the lower-case extension.
    if isinstance(file_type, str):
        file_type = file_type.strip().lower().lstrip('.')

    if file_type == 'pdf':
        doc = fitz.open(stream=file_obj.read(), filetype='pdf')
        try:
            return "\n".join(page.get_text('text') for page in doc)
        finally:
            # Release the PyMuPDF document even if text extraction fails.
            doc.close()

    if file_type == 'docx':
        doc = Document(file_obj)
        # Paragraph text is kept as-is; only blank paragraphs are skipped.
        text = [para.text for para in doc.paragraphs if para.text.strip()]
        # Table content is not part of doc.paragraphs, so collect it separately.
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    cell_text = cell.text.strip()
                    if cell_text:
                        text.append(cell_text)
        return "\n".join(text)

    raise ValueError("Unsupported file format")
    
def extract_email(resume_text):
    """
    Extracts the first valid email found in text.

    The local part may contain word characters, dots, ``%``, ``+`` and ``-``;
    the domain must contain at least one dot-separated label after the first,
    so trailing sentence punctuation ("mail me at a@b.com.") is not captured
    and bare "user@host" tokens are ignored.

    Args:
        resume_text: Text to search. ``None`` or an empty string yields ``None``.

    Returns:
        The first matching address as a string, or ``None`` if there is none.
    """
    if not resume_text:
        return None
    match = re.search(r"[\w.%+-]+@[\w-]+(?:\.[\w-]+)+", resume_text)
    return match.group(0) if match else None

def summarize_resume(resume_text):
    """
    Ask the chat-completion client for a professional summary of a resume.

    The resume text is embedded in a recruiter-style prompt and sent as a
    single user message. Generic lead-ins such as "Sure, here is the
    summary:" are stripped from the start of the reply.

    Returns the cleaned summary, or a fixed fallback sentence if anything
    goes wrong while calling the model or processing its reply.
    """
    instructions = "".join([
        "You are an expert technical recruiter. Extract a professional summary for this candidate based on their resume text. ",
        "Include: full name (if found), job title, years of experience, key technologies/tools, industries worked in, and certifications. ",
        "Format it as a professional summary paragraph.\n\n",
    ])
    prompt = f"{instructions}Resume:\n{resume_text}\n\nSummary:"

    # Matches an optional polite opener followed by "... summary ...:" at the
    # very beginning of the model output.
    lead_in = r"^(Sure,|Certainly,)?\s*(here is|here’s|this is)?\s*(the)?\s*(extracted)?\s*(professional)?\s*summary.*?:\s*"

    try:
        completion = client.chat.completions.create(
            model="tgi",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5,
            max_tokens=300,
        )
        raw_summary = completion.choices[0].message.content.strip()
        return re.sub(lead_in, "", raw_summary, flags=re.IGNORECASE).strip()
    except Exception as e:
        # Best-effort: report the failure and hand back a placeholder.
        print(f"❌ Error generating structured summary: {e}")
        return "Summary unavailable due to API issues."