# utils/screening.py

from sklearn.feature_extraction.text import TfidfVectorizer
from fuzzywuzzy import fuzz
from sentence_transformers import util
import spacy
import streamlit as st

from config import supabase, embedding_model, client
from .parser import parse_resume, extract_email, summarize_resume
from .hybrid_extractor import extract_resume_sections

# Load the spaCy model once at import time; reused by extract_keywords()
nlp = spacy.load("en_core_web_sm")


def extract_keywords(text, top_n=10):
    """
    Extract the top keywords from a job description using spaCy POS
    filtering followed by TF-IDF ranking.
    """
    if not text.strip():
        return []
    doc = nlp(text.lower())
    # Keep content-bearing tokens (nouns, proper nouns, verbs, adjectives)
    # and drop stop words before ranking.
    keywords = [t.text for t in doc if t.pos_ in {"NOUN", "PROPN", "VERB", "ADJ"} and not t.is_stop]
    if not keywords:
        return []
    try:
        # Note: fitting TF-IDF on a single document makes IDF a constant,
        # so this effectively ranks terms by frequency.
        tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
        matrix = tfidf.fit_transform([" ".join(keywords)])
        scores = matrix.toarray()[0]
        features = tfidf.get_feature_names_out()
        ranked = sorted(zip(features, scores), key=lambda x: x[1], reverse=True)
        return [kw for kw, _ in ranked[:top_n]]
    except ValueError:
        # TfidfVectorizer raises ValueError when every token is a stop word.
        return []
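
# Illustrative call (input and output are hypothetical, not from the app):
#   extract_keywords("Senior Python engineer with AWS, Docker, and CI/CD")
#   could return something like ["python", "aws", "docker", "engineer", "senior"].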


def filter_resumes_by_keywords(resumes, job_description, min_keyword_match=2):
    """
    Filter resumes by keyword match using fuzzy string matching.
    Returns (kept_resumes, removed_candidate_names).
    """
    job_keywords = extract_keywords(job_description)
    if len(job_keywords) < min_keyword_match:
        st.warning("⚠️ Job description too short or missing for keyword filtering.")
        return resumes, []

    filtered, removed = [], []
    for resume in resumes:
        # A keyword counts as matched if it is a close fuzzy match
        # (partial ratio > 80) to any word in the resume text.
        matched = {
            keyword for keyword in job_keywords
            if any(fuzz.partial_ratio(keyword, word) > 80 for word in resume["resume"].lower().split())
        }
        if len(matched) >= min_keyword_match:
            filtered.append(resume)
        else:
            removed.append(resume["name"])
    return filtered, removed
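
# Illustrative call (hypothetical data; each resume dict needs "name" and
# "resume" keys, matching what evaluate_resumes() builds below):
#   resumes = [{"name": "Jane Doe", "resume": "Python developer with AWS..."}]
#   kept, removed_names = filter_resumes_by_keywords(resumes, "Python and AWS role")
#   # kept -> the matching dicts; removed_names -> names with < 2 keyword hits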


def create_enhanced_summary(extracted_data, resume_text):
    """
    Create an enhanced summary from structured extraction data.
    Falls back to the plain summarizer if structured data is unusable.
    """
    try:
        name = extracted_data.get('Name', 'Candidate')
        summary_text = extracted_data.get('Summary', '')
        skills = extracted_data.get('Skills', [])
        experiences = extracted_data.get('StructuredExperiences', [])
        education = extracted_data.get('Education', [])

        # Build the summary as pipe-separated parts.
        parts = []

        # Name and current title (first entry is assumed most recent).
        if experiences:
            current_job = experiences[0]
            parts.append(f"{name} - {current_job.get('title', 'Professional')}")
        else:
            parts.append(f"{name} - Professional")

        # Professional summary, truncated to 200 characters.
        if summary_text:
            parts.append((summary_text[:200] + "...") if len(summary_text) > 200 else summary_text)

        # Top five skills.
        if skills:
            parts.append(f"Key Skills: {', '.join(skills[:5])}")

        # Number of positions held.
        if experiences:
            parts.append(f"Experience: {len(experiences)} positions")

        # Most recent education entry.
        if education:
            parts.append(f"Education: {education[0]}")

        return " | ".join(parts)
    except Exception as e:
        print(f"❌ Error creating enhanced summary: {e}")
        # Fall back to the plain summarizer imported at module level.
        return summarize_resume(resume_text)
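
# Illustrative output (hypothetical candidate; shows the pipe-separated shape):
#   "Jane Doe - Data Engineer | Built ETL pipelines for... | Key Skills:
#    Python, SQL, Spark, AWS, Airflow | Experience: 3 positions | Education: BS CS"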


def score_candidate(resume_text, job_description):
    """
    Compute cosine similarity between the resume and the job description
    using sentence-transformer embeddings. Returns a float in [-1, 1].
    """
    try:
        resume_vec = embedding_model.encode(resume_text, convert_to_tensor=True)
        job_vec = embedding_model.encode(job_description, convert_to_tensor=True)
        score = util.pytorch_cos_sim(resume_vec, job_vec).item()
        return round(score, 4)
    except Exception as e:
        print(f"Error computing similarity: {e}")
        return 0.0
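
# Illustrative call (hypothetical texts and score):
#   score_candidate("Python developer resume text...", "Python developer role...")
#   might return e.g. 0.6432; evaluate_resumes() below treats anything under
#   0.20 as too weak a match to shortlist.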


def evaluate_resumes(uploaded_files, job_description, min_keyword_match=2):
    """
    Evaluate uploaded resumes and return shortlisted candidates with scores
    and summaries. Uses the hybrid extraction system with OpenAI as the
    primary extractor and HF Cloud as the backup.
    """
    candidates, removed_candidates = [], []

    # 🔹 Step 1: Parse, extract, summarize, and score each resume
    for pdf_file in uploaded_files:
        try:
            # Extract raw text
            resume_text = parse_resume(pdf_file)

            # Hybrid extraction (OpenAI primary, HF Cloud backup)
            extracted_data = extract_resume_sections(
                resume_text,
                prefer_ai=True,
                use_openai=True,   # Try OpenAI first
                use_hf_cloud=True  # Fall back to HF Cloud
            )

            # Structured fields, with the filename as a fallback name
            candidate_name = extracted_data.get('Name') or pdf_file.name.replace('.pdf', '')
            email = extract_email(resume_text)  # Keep existing email extraction

            # Enhanced summary from structured data
            summary = create_enhanced_summary(extracted_data, resume_text)

            # Embedding-based similarity score against the job description
            score = score_candidate(resume_text, job_description)
            if score < 0.20:
                removed_candidates.append({
                    "name": candidate_name,
                    "reason": "Low confidence score (< 0.20)"
                })
                continue

            candidates.append({
                "name": candidate_name,
                "resume": resume_text,
                "score": score,
                "email": email,
                "summary": summary,
                "structured_data": extracted_data  # Structured data for downstream processing
            })
        except Exception as e:
            st.error(f"❌ Error processing {pdf_file.name}: {e}")
            removed_candidates.append({
                "name": pdf_file.name,
                "reason": f"Processing error: {str(e)}"
            })
            continue

    # 🔹 Step 2: Filter candidates based on keyword matches
    filtered_candidates, keyword_removed = filter_resumes_by_keywords(
        candidates, job_description, min_keyword_match
    )

    # 🔹 Step 3: Log candidates removed by the keyword filter
    for name in keyword_removed:
        removed_candidates.append({"name": name, "reason": "Insufficient keyword matches"})

    # 🔹 Step 4: Sort by score and keep the top 5 candidates
    shortlisted_candidates = sorted(filtered_candidates, key=lambda x: x["score"], reverse=True)[:5]

    # 🔹 Step 5: Store shortlisted candidates in Supabase
    for candidate in shortlisted_candidates:
        try:
            store_in_supabase(
                resume_text=candidate["resume"],
                score=candidate["score"],
                candidate_name=candidate["name"],
                email=candidate["email"],
                summary=candidate["summary"]
            )
        except Exception as e:
            print(f"❌ Failed to store {candidate['name']} in Supabase: {e}")

    # 🔹 Step 6: Ensure the return value is always a list
    if not isinstance(shortlisted_candidates, list):
        print("⚠️ ERROR: shortlisted_candidates is not a list! Returning empty list.")
        return [], removed_candidates

    return shortlisted_candidates, removed_candidates
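
# Minimal usage sketch from a Streamlit page (widget labels are illustrative,
# not taken from the app's actual pages):
#   uploads = st.file_uploader("Upload resumes", type="pdf", accept_multiple_files=True)
#   jd = st.text_area("Job description")
#   if uploads and jd:
#       shortlisted, removed = evaluate_resumes(uploads, jd, min_keyword_match=2)
#       for c in shortlisted:
#           st.write(f"{c['name']}: {c['score']}")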


def store_in_supabase(resume_text, score, candidate_name, email, summary):
    """
    Save candidate data to the Supabase "candidates" table.
    """
    data = {
        "name": candidate_name,
        "resume": resume_text,
        "score": score or 0,
        "email": email,
        "summary": summary
    }
    return supabase.table("candidates").insert(data).execute()
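
# The insert assumes a Supabase "candidates" table whose columns match the
# keys above. Illustrative call (all values are hypothetical):
#   store_in_supabase(
#       resume_text="...full parsed resume text...",
#       score=0.7312,
#       candidate_name="Jane Doe",
#       email="jane@example.com",
#       summary="Jane Doe - Data Engineer | ...",
#   )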