# parser.py
"""Resume parsing helpers: text extraction, name/email detection, LLM summary."""

import re
from io import BytesIO

import fitz  # PyMuPDF
from docx import Document

from config import supabase, embedding_model, client, query

# A line consisting solely of two or three capitalised words ("Jane Doe").
_NAME_LINE_RE = re.compile(r"^[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2}$")
# Any run of two or more capitalised words, anywhere in the text.
_NAME_ANYWHERE_RE = re.compile(r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)")
# local-part@label(.label)+ -- every dot in the domain must be followed by
# another label, so trailing sentence punctuation ("a@b.com.") is excluded.
_EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+")
# Generic lead-ins the model tends to prepend ("Sure, here is the summary:").
_LEAD_IN_RE = re.compile(
    r"^(Sure,|Certainly,)?\s*(here is|here’s|this is)?\s*(the)?\s*(extracted)?\s*(professional)?\s*summary.*?:\s*",
    flags=re.IGNORECASE,
)


def extract_name(resume_text: str) -> str:
    """Guess the candidate's name from resume text.

    The first five lines are checked for a line that is nothing but a
    two- or three-word Title Case name. Failing that, the first multiword
    Title Case phrase anywhere in the text is used.

    Args:
        resume_text: Raw text of the resume.

    Returns:
        The detected name, or the placeholder ``"Candidate Name"`` when
        nothing matches.
    """
    # Look at the very top lines for a capitalized full name.
    for line in resume_text.splitlines()[:5]:
        candidate = line.strip()
        if _NAME_LINE_RE.match(candidate):
            return candidate

    # Last-ditch: pull the first multiword "Title Case" phrase anywhere.
    found = _NAME_ANYWHERE_RE.search(resume_text)
    return found.group(1) if found else "Candidate Name"


def parse_resume(file_obj, file_type=None):
    """Extract raw text from a PDF or DOCX resume.

    Args:
        file_obj: File-like object holding the document. For PDFs it must
            provide ``read()``; for DOCX it is passed straight to
            ``docx.Document``.
        file_type: ``"pdf"`` or ``"docx"``. Case and a leading dot are
            ignored. When omitted, the extension of ``file_obj.name`` is
            used if that attribute exists.

    Returns:
        The extracted text, with pages / paragraphs / table cells joined
        by newlines.

    Raises:
        ValueError: If the file type is missing or not PDF/DOCX.
    """
    if file_type is None and hasattr(file_obj, "name"):
        file_type = file_obj.name.split(".")[-1]

    # Normalise so explicitly passed values like "PDF" or ".docx" also work.
    kind = (file_type or "").lower().lstrip(".")

    if kind == "pdf":
        pdf = fitz.open(stream=file_obj.read(), filetype="pdf")
        try:
            return "\n".join(page.get_text("text") for page in pdf)
        finally:
            # Release the document's resources even if extraction fails.
            pdf.close()

    if kind == "docx":
        docx_file = Document(file_obj)
        # Body paragraphs first, then table cell text; blank entries skipped.
        chunks = [para.text for para in docx_file.paragraphs if para.text.strip()]
        for table in docx_file.tables:
            for row in table.rows:
                for cell in row.cells:
                    cell_text = cell.text.strip()
                    if cell_text:
                        chunks.append(cell_text)
        return "\n".join(chunks)

    raise ValueError("Unsupported file format")


def extract_email(resume_text):
    """Return the first email address found in the text.

    The domain must contain at least one dot-separated label, and trailing
    punctuation such as a sentence-ending period is not part of the match.

    Args:
        resume_text: Raw text of the resume.

    Returns:
        The matched address as a string, or ``None`` if none is found.
    """
    match = _EMAIL_RE.search(resume_text)
    return match.group(0) if match else None


def summarize_resume(resume_text):
    """Ask the chat model for a professional-summary paragraph.

    Sends a recruiter-style prompt containing the resume text to
    ``client.chat.completions.create`` and strips a generic lead-in
    ("Sure, here is the summary:") from the reply.

    Args:
        resume_text: Raw text of the resume.

    Returns:
        The cleaned summary text, or the fallback message
        ``"Summary unavailable due to API issues."`` if anything in the
        request or response handling raises.
    """
    # NOTE(review): the source as found has a line break after
    # "certifications. " inside the literal; it is reproduced here as "\n".
    prompt = (
        "You are an expert technical recruiter. Extract a professional summary for this candidate based on their resume text. "
        "Include: full name (if found), job title, years of experience, key technologies/tools, industries worked in, and certifications. \n"
        "Format it as a professional summary paragraph.\n\n"
        f"Resume:\n{resume_text}\n\n"
        "Summary:"
    )

    try:
        response = client.chat.completions.create(
            model="tgi",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5,
            max_tokens=300,
        )
        result = response.choices[0].message.content.strip()

        # Clean up generic lead-ins from the model.
        return _LEAD_IN_RE.sub("", result).strip()
    except Exception as e:
        # Deliberate best-effort: callers get a placeholder, not a crash.
        print(f"❌ Error generating structured summary: {e}")
        return "Summary unavailable due to API issues."