# utils/screening.py
from .parser import parse_resume, extract_email, summarize_resume
from .hybrid_extractor import extract_resume_sections
from config import supabase, embedding_model, client

import spacy
import streamlit as st
from fuzzywuzzy import fuzz
from sentence_transformers import util
from sklearn.feature_extraction.text import TfidfVectorizer

# Load spaCy model for keyword extraction
nlp = spacy.load("en_core_web_sm")

def extract_keywords(text, top_n=10):
    """
    Extract the top keywords from the job description using spaCy and TF-IDF.
    """
    if not text.strip():
        return []

    doc = nlp(text.lower())
    keywords = [t.text for t in doc if t.pos_ in {"NOUN", "PROPN", "VERB", "ADJ"} and not t.is_stop]
    if not keywords:
        return []

    try:
        tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
        matrix = tfidf.fit_transform([" ".join(keywords)])
        scores = matrix.toarray()[0]
        features = tfidf.get_feature_names_out()
        ranked = sorted(zip(features, scores), key=lambda x: x[1], reverse=True)
        return [kw for kw, _ in ranked[:top_n]]
    except ValueError:
        # TfidfVectorizer raises ValueError when nothing remains after stop-word removal
        return []

def filter_resumes_by_keywords(resumes, job_description, min_keyword_match=2):
    """
    Filter resumes by keyword match against the job description using fuzzy matching.
    """
    job_keywords = extract_keywords(job_description)
    if len(job_keywords) < min_keyword_match:
        st.warning("⚠️ Job description too short or missing for keyword filtering.")
        return resumes, []

    filtered, removed = [], []
    for resume in resumes:
        matched = {
            keyword for keyword in job_keywords
            if any(fuzz.partial_ratio(keyword, word) > 80 for word in resume["resume"].lower().split())
        }
        if len(matched) >= min_keyword_match:
            filtered.append(resume)
        else:
            removed.append(resume["name"])

    return filtered, removed

def create_enhanced_summary(extracted_data, resume_text):
    """
    Create an enhanced summary from structured extraction data.
    Falls back to the old summarization if extraction fails.
    """
    try:
        name = extracted_data.get('Name', 'Candidate')
        summary_text = extracted_data.get('Summary', '')
        skills = extracted_data.get('Skills', [])
        experiences = extracted_data.get('StructuredExperiences', [])
        education = extracted_data.get('Education', [])

        # Build the enhanced summary piece by piece
        parts = []

        # Name and current title
        if experiences:
            current_job = experiences[0]  # Most recent job
            parts.append(f"{name} - {current_job.get('title', 'Professional')}")
        else:
            parts.append(f"{name} - Professional")

        # Experience summary, truncated to 200 characters
        if summary_text:
            truncated = summary_text[:200] + "..." if len(summary_text) > 200 else summary_text
            parts.append(truncated)

        # Key skills (top 5)
        if skills:
            top_skills = skills[:5]
            parts.append(f"Key Skills: {', '.join(top_skills)}")

        # Experience count
        if experiences:
            parts.append(f"Experience: {len(experiences)} positions")

        # Education
        if education:
            parts.append(f"Education: {education[0]}")

        return " | ".join(parts)
    except Exception as e:
        print(f"❌ Error creating enhanced summary: {e}")
        # Fall back to the old summarization (imported at module level)
        return summarize_resume(resume_text)

def score_candidate(resume_text, job_description):
    """
    Computes cosine similarity between resume and job description embeddings.
    """
    try:
        resume_vec = embedding_model.encode(resume_text, convert_to_tensor=True)
        job_vec = embedding_model.encode(job_description, convert_to_tensor=True)
        score = util.pytorch_cos_sim(resume_vec, job_vec).item()
        return round(score, 4)
    except Exception as e:
        print(f"Error computing similarity: {e}")
        return 0
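
# Note: util.pytorch_cos_sim returns standard cosine similarity,
#     cos(r, j) = (r . j) / (||r|| * ||j||),
# so scores lie in [-1, 1]; evaluate_resumes below drops any resume scoring
# under 0.20 as too weak a match to shortlist.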

def evaluate_resumes(uploaded_files, job_description, min_keyword_match=2):
    """
    Evaluate uploaded resumes and return shortlisted candidates with scores and summaries.
    Uses the hybrid extraction system with OpenAI as primary and HF Cloud as backup.
    """
    candidates, removed_candidates = [], []

    # 🔹 Step 1: Parse, extract, summarize, and score each uploaded resume
    for pdf_file in uploaded_files:
        try:
            # Extract raw text
            resume_text = parse_resume(pdf_file)

            # Hybrid extraction system (OpenAI primary, HF Cloud backup)
            extracted_data = extract_resume_sections(
                resume_text,
                prefer_ai=True,
                use_openai=True,   # Try OpenAI first
                use_hf_cloud=True  # Fall back to HF Cloud
            )

            # Structured data
            candidate_name = extracted_data.get('Name') or pdf_file.name.replace('.pdf', '')
            email = extract_email(resume_text)  # Keep existing email extraction

            # Enhanced summary from structured data
            summary = create_enhanced_summary(extracted_data, resume_text)

            # Score the candidate
            score = score_candidate(resume_text, job_description)
            if score < 0.20:
                removed_candidates.append({
                    "name": candidate_name,
                    "reason": "Low confidence score (< 0.20)"
                })
                continue

            candidates.append({
                "name": candidate_name,
                "resume": resume_text,
                "score": score,
                "email": email,
                "summary": summary,
                "structured_data": extracted_data  # Structured data for downstream processing
            })

        except Exception as e:
            st.error(f"❌ Error processing {pdf_file.name}: {e}")
            removed_candidates.append({
                "name": pdf_file.name,
                "reason": f"Processing error: {str(e)}"
            })
            continue

    # 🔹 Step 2: Filter candidates based on keyword matches
    filtered_candidates, keyword_removed = filter_resumes_by_keywords(
        candidates, job_description, min_keyword_match
    )

    # 🔹 Step 3: Log candidates removed by keyword filtering
    for name in keyword_removed:
        removed_candidates.append({"name": name, "reason": "Insufficient keyword matches"})

    # 🔹 Step 4: Sort by score and keep the top 5 candidates
    shortlisted_candidates = sorted(filtered_candidates, key=lambda x: x["score"], reverse=True)[:5]

    # 🔹 Step 4.5: Store shortlisted candidates in Supabase
    for candidate in shortlisted_candidates:
        try:
            store_in_supabase(
                resume_text=candidate["resume"],
                score=candidate["score"],
                candidate_name=candidate["name"],
                email=candidate["email"],
                summary=candidate["summary"]
            )
        except Exception as e:
            print(f"❌ Failed to store {candidate['name']} in Supabase: {e}")

    # 🔹 Step 5: Ensure the return value is always a list
    if not isinstance(shortlisted_candidates, list):
        print("⚠️ ERROR: shortlisted_candidates is not a list! Returning empty list.")
        return [], removed_candidates

    return shortlisted_candidates, removed_candidates

def store_in_supabase(resume_text, score, candidate_name, email, summary):
    """
    Saves candidate data to the Supabase "candidates" table.
    """
    data = {
        "name": candidate_name,
        "resume": resume_text,
        "score": score or 0,
        "email": email,
        "summary": summary
    }
    return supabase.table("candidates").insert(data).execute()
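
# Usage sketch (illustrative only): one way a Streamlit page might call
# evaluate_resumes(). The widget labels and wiring below are assumptions,
# not code from this module.
#
#   uploaded = st.file_uploader("Upload resumes (PDF)", type=["pdf"], accept_multiple_files=True)
#   job_description = st.text_area("Job description")
#   if uploaded and job_description:
#       shortlisted, removed = evaluate_resumes(uploaded, job_description, min_keyword_match=2)
#       for candidate in shortlisted:
#           st.write(f"{candidate['name']}: {candidate['score']}")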