TalentLensAI / utils /extractor_fixed.py
Johnny
updated resume_format > template, hide sidebar, download Spacy model with spacy_loader.py
102e49d
import os, re, json, subprocess
from utils.parser import extract_name # <= your helper
from utils.spacy_loader import get_nlp, is_spacy_available
from datetime import datetime
from dateutil.parser import parse as date_parse
# Load spaCy model with fallback.  The truthiness check below treats a falsy
# result from get_nlp() as "spaCy is not usable".
nlp = get_nlp()


def _load_json_list(path, default):
    """Return the parsed JSON stored at *path*, or *default* if the file is absent.

    The file is opened in a ``with`` block so the handle is always closed, and
    it is decoded as UTF-8 instead of the platform default encoding.
    """
    if not os.path.exists(path):
        return default
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


# Initialize spaCy matchers only if spaCy is available
if nlp and is_spacy_available():
    from spacy.matcher import PhraseMatcher, Matcher

    # ----------------------------- data lists -----------------------------
    # Optional JSON word lists living next to this module; built-in defaults
    # are used when a file is missing.
    BASE = os.path.dirname(__file__)
    SKILLS = _load_json_list(
        os.path.join(BASE, "data/skills.json"),
        ["python", "sql", "aws", "selenium"],
    )
    JOB_TITLES = _load_json_list(os.path.join(BASE, "data/job_titles.json"), [])

    # Case-insensitive phrase matcher over the skill list.
    skill_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    skill_matcher.add("SKILL", [nlp.make_doc(s) for s in SKILLS])

    # Token patterns: "bachelor of <Title-cased tokens>" and
    # "certified <Title-cased tokens>".
    edu_matcher = Matcher(nlp.vocab)
    edu_matcher.add("EDU" , [[{"LOWER":"bachelor"},{"LOWER":"of"},{"IS_TITLE":True,"OP":"+"}]])
    edu_matcher.add("CERT", [[{"LOWER":"certified"},{"IS_TITLE":True,"OP":"+"}]])
else:
    # Fallback: set matchers to None when spaCy is not available
    skill_matcher = None
    edu_matcher = None
    SKILLS = ["python", "sql", "aws", "selenium"]
    JOB_TITLES = []
# ----------------------------- regex helpers --------------------------
# All role/date patterns are compiled case-insensitively and in verbose mode,
# so the line breaks inside the pattern strings are ignored by the regex engine.
# A date is "<month name or abbreviation> <4-digit year>", optionally followed
# by a hyphen or en dash and either "Present" or "<word> <4-digit year>".

# Four pipe-separated fields (Jonathan's format): Company | Location | Title | Date
ROLE_FOUR_PARTS = re.compile(
    r"""^(?P<company>.+?)\s*\|\s*(?P<location>.+?)\s*\|\s*(?P<title>.+?)\s*\|\s*
(?P<dates>(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}
(?:\s*[–-]\s*(?:Present|\w+\s+\d{4}))?)\s*$""",
    flags=re.IGNORECASE | re.VERBOSE,
)

# Three pipe-separated fields (original format): Title | Company | Date
ROLE_ONE = re.compile(
    r"""^(?P<title>.+?)\s*\|\s*(?P<company>.+?)\s*\|\s*
(?P<dates>(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}
(?:\s*[–-]\s*(?:Present|\w+\s+\d{4}))?)\s*$""",
    flags=re.IGNORECASE | re.VERBOSE,
)

# Backward-compatible layout: "Company, Title Date" or "Company @ Title Date"
ROLE_ONE_COMMA = re.compile(
    r"""^(?P<company>.+?)\s*[,@]\s*(?P<title>[^,@]+?)\s+
(?P<dates>(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}
(?:\s*[–-]\s*(?:Present|\w+\s+\d{4}))?)\s*$""",
    flags=re.IGNORECASE | re.VERBOSE,
)

# A line that consists of nothing but a date or date range
DATE_LINE = re.compile(
    r"""^(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}
(?:\s*[–-]\s*(?:Present|\w+\s+\d{4}))?\s*$""",
    flags=re.IGNORECASE | re.VERBOSE,
)
# Leading list marker: hyphen, bullet (•), middle dot (·), asterisk or black
# circle (●), optionally indented and followed by at least one whitespace
# character.  The class previously held mis-decoded bytes instead of "•·", so
# "•" bullets were never recognised.
BULLET = re.compile(r"^\s*(?:[-•·]|\*|●)\s+")
# A line that is only a section heading (any letter case), optionally followed
# by colons and/or whitespace.
HEAD = re.compile(
    r"^\s*(summary|skills?|technical\s+skills?|education|training|projects?|work\s+experience|experience|professional\s+experience|certifications?)[:\s]*$",
    flags=re.IGNORECASE,
)
# ----------------------------- helpers --------------------------------
# Heading lines that open the sections parsed below (matched on stripped lines).
_SKILLS_HEADING = re.compile(r"^\s*technical\s+skills?\s*$", re.I)
_SUMMARY_HEADING = re.compile(r"^\s*(professional\s+)?summary\s*$", re.I)
_CERT_HEADING = re.compile(r"^\s*certifications?\s*$", re.I)

# Looks like contact info: something@x, or two 3-digit groups (phone-like).
_CONTACT = re.compile(r"@\w|\d{3}[-.\s]?\d{3}")

# Bullet glyph that may prefix a comma-separated skill entry.  The original
# character class contained mis-decoded bytes instead of "•" and "·".
_SKILL_BULLET_PREFIX = re.compile(r"^\s*[•·\-*●]\s*")

# Regex fallback used for education when spaCy is not available.
_EDU_FALLBACK_PATTERNS = (
    r"(?i)\b(?:bachelor|master|phd|doctorate|associate).*(?:degree|of|in)\s+([^,\n]+)",
    r"(?i)\b(?:bs|ba|ms|ma|mba|phd)\s+(?:in\s+)?([^,\n]+)",
    r"(?i)\b(?:university|college|institute).*\n?.*(?:bachelor|master|phd|degree)",
)


def _is_contact(s):
    """Return True when *s* contains an e-mail-like or phone-like fragment."""
    return bool(_CONTACT.search(s))


def _section_body(lines, heading_index, stop_on_contact):
    """Return the stripped, non-empty lines that follow a heading line.

    Collection stops at the next line matching HEAD and, when
    *stop_on_contact* is true, at the first line that looks like contact info.
    """
    body = []
    for raw in lines[heading_index + 1:]:
        entry = raw.strip()
        if not entry:
            continue
        if HEAD.match(entry):
            break
        if stop_on_contact and _is_contact(entry):
            break
        body.append(entry)
    return body


def _extract_skills(lines):
    """Return the sorted, de-duplicated skills of the first "Technical Skills" section."""
    for index, line in enumerate(lines):
        if not _SKILLS_HEADING.match(line.strip()):
            continue
        found = set()
        for entry in _section_body(lines, index, stop_on_contact=True):
            if entry.startswith('●'):
                # "● Category: a, b, c" -> keep only what follows the colon.
                # A "●" line without a colon contributes nothing.
                category_line = entry[1:].strip()
                if ':' not in category_line:
                    continue
                skills_part = category_line.split(':', 1)[1].strip()
                for skill in re.split(r',\s*', skills_part):
                    skill = skill.strip()
                    # A trailing ")" marks a fragment of a parenthesised list
                    # that was split on its commas; skip such entries.
                    if len(skill) > 1 and not skill.endswith(')'):
                        found.add(skill)
            else:
                # Plain comma-separated line, possibly prefixed by a bullet.
                for skill in re.split(r',\s*', entry):
                    skill = _SKILL_BULLET_PREFIX.sub('', skill.strip())
                    if len(skill) > 1:  # avoid single characters
                        found.add(skill)
        # Only the first matching heading is considered.
        return sorted(found)
    return []


def _extract_summary(text, lines):
    """Return the summary text, or "" when none can be found.

    Preferred source is the body of the first "Summary" / "Professional
    Summary" heading; otherwise the first paragraph that is neither a heading
    nor contact info is used.
    """
    for index, line in enumerate(lines):
        if _SUMMARY_HEADING.match(line.strip()):
            body = _section_body(lines, index, stop_on_contact=True)
            if body:
                return " ".join(body)
            break  # heading present but empty -> use the paragraph fallback

    for para in re.split(r"\n\s*\n", text):
        p = para.strip()
        if p and not HEAD.match(p) and not _is_contact(p):
            return re.sub(r"^(professional\s+)?summary[:,\s]+", "", p, flags=re.I)
    return ""


def _match_role_header(lines, i):
    """Try to read a role header starting at ``lines[i]``.

    Returns ``(title, company, dates, lines_consumed)`` or ``None``.  Formats
    are tried in this order: four-part pipe line, three-part pipe line,
    comma/@ line, and finally a two-line header whose second line is a date.
    """
    ln = lines[i].strip()

    m = ROLE_FOUR_PARTS.match(ln)
    if m:
        company, location, title, dates = m.group("company", "location", "title", "dates")
        return title, f"{company}, {location}", dates, 1

    m = ROLE_ONE.match(ln)
    if m:
        title, company, dates = m.group("title", "company", "dates")
        return title, company, dates, 1

    m = ROLE_ONE_COMMA.match(ln)
    if m:
        company, title, dates = m.group("company", "title", "dates")
        return title, company, dates, 1

    if i + 1 < len(lines) and DATE_LINE.match(lines[i + 1].strip()):
        # "Title, Company" / "Title @ Company" / "Title | Company" + date line
        parts = re.split(r"[,@|\|]\s*", ln, maxsplit=1)
        if len(parts) == 2:
            title, company = parts[0].strip(), parts[1].strip()
        else:
            title, company = ln, ""
        return title, company, lines[i + 1].strip(), 2

    return None


def _extract_experiences(lines):
    """Return a list of experience dicts parsed from role headers and bullets."""
    experiences = []
    i = 0
    while i < len(lines):
        header = _match_role_header(lines, i)
        if header is None:
            i += 1
            continue
        title, company, dates, consumed = header
        i += consumed

        exp = {
            "title"           : title,
            "company"         : company,
            "date_range"      : dates,
            "responsibilities": []
        }
        # Collect bullet lines until a blank line, a heading, or the next role.
        while i < len(lines):
            nxt = lines[i].strip()
            if (not nxt or HEAD.match(nxt) or ROLE_FOUR_PARTS.match(nxt)
                    or ROLE_ONE.match(nxt) or ROLE_ONE_COMMA.match(nxt)
                    or DATE_LINE.match(nxt)):
                break
            if BULLET.match(nxt):
                responsibility = BULLET.sub("", nxt).strip()
                if responsibility:  # only add non-empty responsibilities
                    exp["responsibilities"].append(responsibility)
            elif i + 1 < len(lines) and DATE_LINE.match(lines[i + 1].strip()):
                # A non-bullet line directly followed by a date line is the
                # header of the next two-line role; leave it for the outer
                # loop instead of silently skipping it.
                break
            i += 1
        experiences.append(exp)
    return experiences


def _extract_education_and_training(text, doc):
    """Return ``(education, training)`` lists from spaCy matches or regex fallback."""
    education, training = [], []
    if doc is not None and edu_matcher is not None and is_spacy_available():
        for match_id, start, end in edu_matcher(doc):
            # "EDU" matches are education; any other label goes to training.
            bucket = education if nlp.vocab.strings[match_id] == "EDU" else training
            bucket.append(doc[start:end].text)
    else:
        for pattern in _EDU_FALLBACK_PATTERNS:
            for found in re.findall(pattern, text):
                if isinstance(found, str) and len(found.strip()) > 3:
                    education.append(found.strip())
    return education, training


def _extract_certifications(lines):
    """Return comma-split entries of the first "Certifications" section."""
    for index, line in enumerate(lines):
        if _CERT_HEADING.match(line.strip()):
            certs = []
            for entry in _section_body(lines, index, stop_on_contact=False):
                # Several certifications may share one line.
                for cert in re.split(r',\s*', entry):
                    cert = cert.strip()
                    if cert and not _is_contact(cert):
                        certs.append(cert)
            return certs
    return []


# ----------------------------- main -----------------------------------
def extract_sections_spacy_fixed(text: str) -> dict:
    """Parse plain resume text into a dictionary of sections.

    Args:
        text: Full resume text, one logical entry per line.

    Returns:
        A dict with the keys ``"Name"`` (result of ``extract_name``),
        ``"Summary"`` (str), ``"Skills"`` (sorted list of str),
        ``"StructuredExperiences"`` (list of dicts with ``title``,
        ``company``, ``date_range`` and ``responsibilities``),
        ``"Education"`` (list of str) and ``"Training"`` (list of str).
    """
    lines = [ln.rstrip() for ln in text.splitlines()]

    # Only create a spaCy doc if the pipeline is available.
    doc = nlp(text) if nlp and is_spacy_available() else None

    out = {
        "Name"                 : extract_name(text),
        "Summary"              : _extract_summary(text, lines),
        # Skills come only from the Technical Skills section to avoid noise.
        "Skills"               : _extract_skills(lines),
        "StructuredExperiences": _extract_experiences(lines),
        "Education"            : [],
        "Training"             : []
    }

    education, training = _extract_education_and_training(text, doc)
    out["Education"].extend(education)
    out["Training"].extend(training)
    # The certifications section is read in addition to any matcher results.
    out["Training"].extend(_extract_certifications(lines))
    return out