# utils/screening.py
import re

from .parser import parse_resume, extract_email, summarize_resume
from .hybrid_extractor import extract_resume_sections
from .spacy_loader import get_nlp, is_spacy_available
from config import supabase, embedding_model, client
from fuzzywuzzy import fuzz
from sentence_transformers import util
from sklearn.feature_extraction.text import TfidfVectorizer
import streamlit as st

# Load spaCy model for keyword extraction, with a graceful fallback if unavailable
nlp = get_nlp()


def extract_keywords(text, top_n=10):
    """
    Extracts the top keywords from the job description using spaCy
    (for POS filtering) and TF-IDF (for ranking).
    """
    if not text.strip():
        return []

    # Use spaCy for POS-based keyword extraction if available
    if nlp and is_spacy_available():
        doc = nlp(text.lower())
        keywords = [
            t.text
            for t in doc
            if t.pos_ in {"NOUN", "PROPN", "VERB", "ADJ"} and not t.is_stop
        ]
    else:
        # Fallback: simple word extraction without POS tagging
        words = re.findall(r"\b[a-zA-Z]{3,}\b", text.lower())
        # Filter out common stop words manually
        stop_words = {
            "the", "and", "for", "are", "but", "not", "you", "all", "can", "had",
            "her", "was", "one", "our", "out", "day", "get", "has", "him", "his",
            "how", "man", "new", "now", "old", "see", "two", "way", "who", "its",
            "did", "yes", "she", "may", "say", "use", "any", "top", "own", "too",
            "off", "far", "set", "why", "ask", "men", "run", "end", "put", "lot",
            "big", "eye", "try", "yet", "car", "eat", "job", "sit", "cut", "let",
            "got", "buy", "win", "box", "hit", "add", "oil", "six", "war", "age",
            "boy", "due", "bed", "hot", "cup", "gun", "kid", "red", "sea", "art",
            "air", "low", "pay", "act", "bit", "bad", "law", "dog", "key", "arm",
            "tax", "gas",
        }
        keywords = [word for word in words if word not in stop_words]

    if not keywords:
        return []

    try:
        tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
        matrix = tfidf.fit_transform([" ".join(keywords)])
        scores = matrix.toarray()[0]
        features = tfidf.get_feature_names_out()
        ranked = sorted(zip(features, scores), key=lambda x: x[1], reverse=True)
        return [kw for kw, _ in ranked[:top_n]]
    except ValueError:
        # TfidfVectorizer raises ValueError when the vocabulary ends up empty
        return []


def filter_resumes_by_keywords(resumes, job_description, min_keyword_match=2):
    """
    Filters resumes by fuzzy keyword matching against the job description.
    Returns (filtered_resumes, removed_candidate_names).
    """
    job_keywords = extract_keywords(job_description)
    if len(job_keywords) < min_keyword_match:
        st.warning("⚠️ Job description too short or missing for keyword filtering.")
        return resumes, []

    filtered, removed = [], []
    for resume in resumes:
        matched = {
            keyword
            for keyword in job_keywords
            if any(
                fuzz.partial_ratio(keyword, word) > 80
                for word in resume["resume"].lower().split()
            )
        }
        if len(matched) >= min_keyword_match:
            filtered.append(resume)
        else:
            removed.append(resume["name"])
    return filtered, removed
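
# A minimal usage sketch for the two helpers above (illustrative values only;
# the exact keywords returned depend on the loaded spaCy model and the TF-IDF
# scores, and the resume dict shown here is hypothetical):
#
#     jd = "Senior Python developer with AWS, Docker, and CI/CD experience."
#     keywords = extract_keywords(jd, top_n=5)
#     resumes = [{"name": "Jane Doe", "resume": "Python engineer; AWS, Docker."}]
#     kept, removed = filter_resumes_by_keywords(resumes, jd, min_keyword_match=2)
#     # `kept` holds the resume dicts matching >= 2 keywords; `removed` holds
#     # the names of everyone else.
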

def create_enhanced_summary(extracted_data, resume_text):
    """
    Create an enhanced summary from structured extraction data.
    Falls back to the basic summarizer if extraction fails.
    """
    try:
        name = extracted_data.get("Name", "Candidate")
        summary_text = extracted_data.get("Summary", "")
        skills = extracted_data.get("Skills", [])
        experiences = extracted_data.get("StructuredExperiences", [])
        education = extracted_data.get("Education", [])

        # Build enhanced summary
        parts = []

        # Add name and current title (the first entry is the most recent job)
        if experiences:
            current_job = experiences[0]
            parts.append(f"{name} - {current_job.get('title', 'Professional')}")
        else:
            parts.append(f"{name} - Professional")

        # Add experience summary, truncated to 200 characters
        if summary_text:
            parts.append(
                summary_text[:200] + "..." if len(summary_text) > 200 else summary_text
            )

        # Add key skills (top 5)
        if skills:
            top_skills = skills[:5]
            parts.append(f"Key Skills: {', '.join(top_skills)}")

        # Add experience count
        if experiences:
            parts.append(f"Experience: {len(experiences)} positions")

        # Add education
        if education:
            parts.append(f"Education: {education[0]}")

        return " | ".join(parts)

    except Exception as e:
        print(f"❌ Error creating enhanced summary: {e}")
        # Fall back to the basic summarizer imported at module level
        return summarize_resume(resume_text)


def score_candidate(resume_text, job_description):
    """
    Computes cosine similarity between resume and job description embeddings.
    """
    try:
        resume_vec = embedding_model.encode(resume_text, convert_to_tensor=True)
        job_vec = embedding_model.encode(job_description, convert_to_tensor=True)
        score = util.pytorch_cos_sim(resume_vec, job_vec).item()
        return round(score, 4)
    except Exception as e:
        print(f"Error computing similarity: {e}")
        return 0


def evaluate_resumes(uploaded_files, job_description, min_keyword_match=2):
    """
    Evaluate uploaded resumes and return shortlisted candidates with scores
    and summaries. Uses the hybrid extraction system with OpenAI as primary
    and HF Cloud as backup.
    """
    candidates, removed_candidates = [], []

    for pdf_file in uploaded_files:
        try:
            # Extract raw text
            resume_text = parse_resume(pdf_file)

            # Use the hybrid extraction system (OpenAI primary, HF Cloud backup)
            extracted_data = extract_resume_sections(
                resume_text,
                prefer_ai=True,
                use_openai=True,    # Try OpenAI first
                use_hf_cloud=True,  # Fall back to HF Cloud
            )

            # Get structured data
            candidate_name = extracted_data.get("Name") or pdf_file.name.replace(".pdf", "")
            email = extract_email(resume_text)  # Keep existing email extraction

            # Create enhanced summary from structured data
            summary = create_enhanced_summary(extracted_data, resume_text)

            # Score the candidate
            score = score_candidate(resume_text, job_description)
            if score < 0.20:
                removed_candidates.append({
                    "name": candidate_name,
                    "reason": "Low confidence score (< 0.20)"
                })
                continue

            candidates.append({
                "name": candidate_name,
                "resume": resume_text,
                "score": score,
                "email": email,
                "summary": summary,
                "structured_data": extracted_data  # Structured data for downstream processing
            })

        except Exception as e:
            st.error(f"❌ Error processing {pdf_file.name}: {e}")
            removed_candidates.append({
                "name": pdf_file.name,
                "reason": f"Processing error: {str(e)}"
            })
            continue

    # 🔹 Step 2: Filter candidates based on keyword matches
    filtered_candidates, keyword_removed = filter_resumes_by_keywords(
        candidates, job_description, min_keyword_match
    )

    # 🔹 Step 3: Log removed candidates
    for name in keyword_removed:
        removed_candidates.append({"name": name, "reason": "Insufficient keyword matches"})

    # 🔹 Step 4: Sort by score and keep the top 5 candidates
    shortlisted_candidates = sorted(
        filtered_candidates, key=lambda x: x["score"], reverse=True
    )[:5]

    # 🔹 Step 4.5: Store shortlisted candidates in Supabase
    for candidate in shortlisted_candidates:
        try:
            store_in_supabase(
                resume_text=candidate["resume"],
                score=candidate["score"],
                candidate_name=candidate["name"],
                email=candidate["email"],
                summary=candidate["summary"]
            )
        except Exception as e:
            print(f"❌ Failed to store {candidate['name']} in Supabase: {e}")

    # 🔹 Step 5: Ensure the return value is always a list
    if not isinstance(shortlisted_candidates, list):
        print("⚠️ ERROR: shortlisted_candidates is not a list! Returning empty list.")
        return [], removed_candidates

    return shortlisted_candidates, removed_candidates
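
# Sketch of how this pipeline is typically driven from a Streamlit page. This
# is hypothetical caller code, not part of this module; it assumes a multi-file
# PDF uploader and a job-description text area:
#
#     uploaded = st.file_uploader("Resumes", type="pdf", accept_multiple_files=True)
#     jd = st.text_area("Job description")
#     if uploaded and jd:
#         shortlisted, removed = evaluate_resumes(uploaded, jd)
#         for c in shortlisted:
#             st.write(f"{c['name']} ({c['score']:.2f}): {c['summary']}")
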

def store_in_supabase(resume_text, score, candidate_name, email, summary):
    """
    Saves candidate data to the Supabase "candidates" table. Assumes the table
    has columns matching the keys of `data` below.
    """
    data = {
        "name": candidate_name,
        "resume": resume_text,
        "score": score or 0,
        "email": email,
        "summary": summary
    }
    return supabase.table("candidates").insert(data).execute()
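

if __name__ == "__main__":
    # Minimal smoke test for keyword extraction: a sketch, assuming the
    # project's `config` module (and its Supabase/embedding clients) is
    # importable from the working directory. The job description is made up.
    sample_jd = (
        "We are hiring a senior Python developer with AWS, Docker, "
        "and CI/CD experience to build and maintain data pipelines."
    )
    print(extract_keywords(sample_jd, top_n=5))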