Spaces:

gauravbox
/

TalentLensAI

Running

File size: 9,121 Bytes

c2f9ec8
 
 
102e49d
c2f9ec8
 
 
 
 
102e49d
 
c2f9ec8
 
 
 
 
 
 
 
 
102e49d
 
 
 
 
 
 
 
 
 
 
c2f9ec8

# utils/screening.py
from .parser     import parse_resume, extract_email, summarize_resume
from .hybrid_extractor import extract_resume_sections
from .spacy_loader import get_nlp, is_spacy_available
from config      import supabase, embedding_model, client
from fuzzywuzzy import fuzz
from sentence_transformers import util
import streamlit as st

# Load the spaCy pipeline once at import time so every call to
# extract_keywords() reuses the same object. extract_keywords() treats a falsy
# `nlp` as "no pipeline" and switches to regex tokenization; presumably
# get_nlp() returns such a value when spaCy or its model is not installed --
# TODO confirm in spacy_loader.
nlp = get_nlp()
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_keywords(text, top_n=10):
    """
    Extract the top-ranked keywords from a job description.

    Candidate terms are collected with spaCy (nouns, proper nouns, verbs and
    adjectives that are not stop words) when a pipeline is available, or with a
    plain regex tokenizer plus a small built-in stop-word list otherwise. The
    surviving terms are then ranked with a single-document TF-IDF over
    unigrams and bigrams.

    Args:
        text: Job description text. ``None``, empty and whitespace-only values
            are all treated as "no text".
        top_n: Maximum number of keywords to return.

    Returns:
        A list of at most ``top_n`` keyword strings, highest score first, or
        an empty list when nothing usable could be extracted.
    """
    # Guard against None as well as blank strings; calling .strip() on None
    # would raise AttributeError.
    if not text or not text.strip():
        return []

    lowered = text.lower()

    # Use spaCy for keyword extraction if available, otherwise use simple word extraction
    if nlp and is_spacy_available():
        doc = nlp(lowered)
        keywords = [
            token.text
            for token in doc
            if token.pos_ in {"NOUN", "PROPN", "VERB", "ADJ"} and not token.is_stop
        ]
    else:
        # Fallback to simple word extraction without POS tagging
        import re

        words = re.findall(r'\b[a-zA-Z]{3,}\b', lowered)
        # Filter out common stop words manually
        stop_words = {
            'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can',
            'had', 'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has',
            'him', 'his', 'how', 'man', 'new', 'now', 'old', 'see', 'two',
            'way', 'who', 'its', 'did', 'yes', 'she', 'may', 'say', 'use',
            'any', 'top', 'own', 'too', 'off', 'far', 'set', 'why', 'ask',
            'men', 'run', 'end', 'put', 'lot', 'big', 'eye', 'try', 'yet',
            'car', 'eat', 'job', 'sit', 'cut', 'let', 'got', 'buy', 'win',
            'box', 'hit', 'add', 'oil', 'six', 'war', 'age', 'boy', 'due',
            'bed', 'hot', 'cup', 'gun', 'kid', 'red', 'sea', 'art', 'air',
            'low', 'pay', 'act', 'bit', 'bad', 'law', 'dog', 'key', 'arm',
            'tax', 'gas',
        }
        keywords = [word for word in words if word not in stop_words]

    if not keywords:
        return []

    try:
        tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
        matrix = tfidf.fit_transform([" ".join(keywords)])
        scores = matrix.toarray()[0]
        features = tfidf.get_feature_names_out()
        ranked = sorted(zip(features, scores), key=lambda pair: pair[1], reverse=True)
        return [keyword for keyword, _ in ranked[:top_n]]
    except ValueError:
        # Raised by TfidfVectorizer when its own tokenization/stop-word
        # filtering leaves an empty vocabulary.
        return []


def filter_resumes_by_keywords(resumes, job_description, min_keyword_match=2):
    """
    Split resumes into those that match enough job keywords and those that do not.

    Keywords come from ``extract_keywords(job_description)``. A keyword counts
    as matched when ``fuzz.partial_ratio`` against at least one
    whitespace-separated, lowercased resume word exceeds 80.

    Args:
        resumes: Iterable of dicts with at least ``"resume"`` (text) and
            ``"name"`` keys.
        job_description: Text the keywords are extracted from.
        min_keyword_match: Minimum number of distinct keywords a resume must
            match to be kept.

    Returns:
        ``(filtered, removed)`` where ``filtered`` is the list of resume dicts
        that passed and ``removed`` is the list of names that did not. If fewer
        than ``min_keyword_match`` keywords can be extracted, a Streamlit
        warning is shown and ``(resumes, [])`` is returned unfiltered.
    """
    job_keywords = extract_keywords(job_description)
    if len(job_keywords) < min_keyword_match:
        st.warning("⚠️ Job description too short or missing for keyword filtering.")
        return resumes, []

    filtered, removed = [], []

    for resume in resumes:
        # Tokenize once per resume instead of once per keyword. Deduplicating
        # the words cannot change the outcome of any() and avoids repeated
        # fuzzy comparisons. Missing text is treated as an empty resume.
        resume_words = set((resume["resume"] or "").lower().split())

        matched = {
            keyword
            for keyword in job_keywords
            if any(fuzz.partial_ratio(keyword, word) > 80 for word in resume_words)
        }

        if len(matched) >= min_keyword_match:
            filtered.append(resume)
        else:
            removed.append(resume["name"])

    return filtered, removed


def create_enhanced_summary(extracted_data, resume_text):
    """
    Build a one-line candidate summary from structured extraction data.

    The summary is a ``" | "``-joined string made of, in order: a
    ``"<name> - <title>"`` header, the extracted summary (truncated to 200
    characters plus ``"..."``), up to five skills, the number of structured
    experiences, and the first education entry. Sections with no data are
    omitted.

    Args:
        extracted_data: Mapping that may contain ``Name``, ``Summary``,
            ``Skills``, ``StructuredExperiences`` and ``Education``. Keys that
            are missing, ``None`` or empty all fall back to the defaults.
        resume_text: Raw resume text, used only by the fallback path.

    Returns:
        The summary string. If building it raises for any reason, the error
        is printed and ``summarize_resume(resume_text)`` is returned instead.
    """
    try:
        # Use `or` rather than a .get() default: extractors can return a key
        # whose value is None/"" and that must not surface as the text "None".
        name = extracted_data.get('Name') or 'Candidate'
        summary_text = extracted_data.get('Summary') or ''
        skills = extracted_data.get('Skills') or []
        experiences = extracted_data.get('StructuredExperiences') or []
        education = extracted_data.get('Education') or []

        # Header: name and current title (first experience is taken as the
        # most recent job).
        title = 'Professional'
        if experiences:
            title = experiences[0].get('title') or 'Professional'
        parts = [f"{name} - {title}"]

        # Extracted summary, capped at 200 characters.
        if summary_text:
            if len(summary_text) > 200:
                parts.append(summary_text[:200] + "...")
            else:
                parts.append(summary_text)

        # Key skills (top 5).
        if skills:
            parts.append(f"Key Skills: {', '.join(skills[:5])}")

        # Experience count.
        if experiences:
            parts.append(f"Experience: {len(experiences)} positions")

        # First education entry.
        if education:
            parts.append(f"Education: {education[0]}")

        return " | ".join(parts)

    except Exception as e:
        # Deliberate best-effort: malformed extraction output must not break
        # screening, so fall back to the legacy summarizer.
        print(f"❌ Error creating enhanced summary: {e}")
        return summarize_resume(resume_text)

def score_candidate(resume_text, job_description):
    """
    Score how closely a resume matches a job description.

    Both texts are encoded with ``embedding_model`` and compared with cosine
    similarity; the result is rounded to four decimal places. Any failure
    while encoding or comparing is printed and reported as a score of 0.
    """
    try:
        # Encode the resume first, then the job description.
        resume_vec, job_vec = (
            embedding_model.encode(text, convert_to_tensor=True)
            for text in (resume_text, job_description)
        )
        similarity = util.pytorch_cos_sim(resume_vec, job_vec).item()
    except Exception as e:
        print(f"Error computing similarity: {e}")
        return 0
    else:
        return round(similarity, 4)
    
def evaluate_resumes(uploaded_files, job_description, min_keyword_match=2,
                     min_score=0.20, top_n=5):
    """
    Evaluate uploaded resumes and return the shortlisted candidates.

    Pipeline, per file: parse the text, run ``extract_resume_sections`` (called
    with the OpenAI and HF Cloud flags enabled), extract the email, build a
    summary and compute a similarity score. Candidates scoring below
    ``min_score`` are dropped. The remainder is filtered by keyword match,
    sorted by score (descending), cut to ``top_n`` and stored in Supabase.

    Args:
        uploaded_files: Iterable of uploaded file objects. Each must be
            accepted by ``parse_resume`` and expose a ``.name`` attribute.
        job_description: Job description text to score and filter against.
        min_keyword_match: Minimum number of job keywords a resume must match.
        min_score: Minimum similarity score a resume needs to stay in the
            running.
        top_n: Maximum number of candidates to shortlist.

    Returns:
        ``(shortlisted_candidates, removed_candidates)``. The first is a list
        of dicts with ``name``, ``resume``, ``score``, ``email``, ``summary``
        and ``structured_data``; the second is a list of
        ``{"name": ..., "reason": ...}`` dicts for every rejected file.

    Side effects:
        Shows a Streamlit error for each file that fails to process and calls
        ``store_in_supabase`` for each shortlisted candidate. Storage failures
        are printed and do not affect the return value.
    """
    candidates, removed_candidates = [], []

    # 🔹 Step 1: Parse, extract, summarize and score every uploaded file
    for pdf_file in uploaded_files:
        try:
            # Extract raw text
            resume_text = parse_resume(pdf_file)

            # Hybrid extraction; flags request OpenAI and HF Cloud
            extracted_data = extract_resume_sections(
                resume_text,
                prefer_ai=True,
                use_openai=True,
                use_hf_cloud=True,
            )

            # Prefer the extracted name; fall back to the file name
            candidate_name = extracted_data.get('Name') or pdf_file.name.replace('.pdf', '')
            email = extract_email(resume_text)
            summary = create_enhanced_summary(extracted_data, resume_text)
            score = score_candidate(resume_text, job_description)
        except Exception as e:
            # One unreadable file must not abort the whole batch.
            st.error(f"❌ Error processing {pdf_file.name}: {e}")
            removed_candidates.append({
                "name": pdf_file.name,
                "reason": f"Processing error: {e}"
            })
            continue

        if score < min_score:
            removed_candidates.append({
                "name": candidate_name,
                "reason": f"Low confidence score (< {min_score:.2f})"
            })
            continue

        candidates.append({
            "name": candidate_name,
            "resume": resume_text,
            "score": score,
            "email": email,
            "summary": summary,
            "structured_data": extracted_data  # Include structured data for better processing
        })

    # 🔹 Step 2: Filter candidates based on keyword matches
    filtered_candidates, keyword_removed = filter_resumes_by_keywords(
        candidates, job_description, min_keyword_match
    )

    # 🔹 Step 3: Log removed candidates
    removed_candidates.extend(
        {"name": name, "reason": "Insufficient keyword matches"}
        for name in keyword_removed
    )

    # 🔹 Step 4: Sort by score and keep the best `top_n`.
    # sorted() always returns a list, so no further type check is needed.
    shortlisted_candidates = sorted(
        filtered_candidates, key=lambda candidate: candidate["score"], reverse=True
    )[:top_n]

    # 🔹 Step 5: Store shortlisted candidates in Supabase (best effort)
    for candidate in shortlisted_candidates:
        try:
            store_in_supabase(
                resume_text=candidate["resume"],
                score=candidate["score"],
                candidate_name=candidate["name"],
                email=candidate["email"],
                summary=candidate["summary"]
            )
        except Exception as e:
            print(f"❌ Failed to store {candidate['name']} in Supabase: {e}")

    return shortlisted_candidates, removed_candidates

def store_in_supabase(resume_text, score, candidate_name, email, summary):
    """
    Insert one candidate row into the Supabase ``candidates`` table.

    A falsy ``score`` (e.g. ``None``) is stored as 0. Returns whatever the
    Supabase client's ``execute()`` call returns.
    """
    record = dict(
        name=candidate_name,
        resume=resume_text,
        score=score if score else 0,
        email=email,
        summary=summary,
    )
    candidates_table = supabase.table("candidates")
    return candidates_table.insert(record).execute()