import os, re, json, subprocess
from utils.parser import extract_name  # <= your helper
from utils.spacy_loader import get_nlp, is_spacy_available
from datetime import datetime
from dateutil.parser import parse as date_parse

# Load spaCy model with fallback
nlp = get_nlp()
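# Note: get_nlp() (see utils/spacy_loader.py) is expected to return a loaded spaCy
# Language object, or None if the model could not be downloaded/loaded; in that case
# the code below falls back to pure-regex parsing.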

# Initialize spaCy matchers only if spaCy is available
if nlp and is_spacy_available():
    from spacy.matcher import PhraseMatcher, Matcher

    # ----------------------------- data lists -----------------------------
    BASE = os.path.dirname(__file__)
    SKILLS = json.load(open(os.path.join(BASE, "data/skills.json"))) \
        if os.path.exists(os.path.join(BASE, "data/skills.json")) \
        else ["python", "sql", "aws", "selenium"]
    JOB_TITLES = json.load(open(os.path.join(BASE, "data/job_titles.json"))) \
        if os.path.exists(os.path.join(BASE, "data/job_titles.json")) \
        else []

    skill_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    skill_matcher.add("SKILL", [nlp.make_doc(s) for s in SKILLS])

    edu_matcher = Matcher(nlp.vocab)
    edu_matcher.add("EDU", [[{"LOWER": "bachelor"}, {"LOWER": "of"}, {"IS_TITLE": True, "OP": "+"}]])
    edu_matcher.add("CERT", [[{"LOWER": "certified"}, {"IS_TITLE": True, "OP": "+"}]])
else:
    # Fallback: set matchers to None when spaCy is not available
    skill_matcher = None
    edu_matcher = None
    SKILLS = ["python", "sql", "aws", "selenium"]
    JOB_TITLES = []

# ----------------------------- regex helpers --------------------------
# Jonathan's format: Company | Location | Title | Date
ROLE_FOUR_PARTS = re.compile(
    r"""^(?P<company>.+?)\s*\|\s*(?P<location>.+?)\s*\|\s*(?P<title>.+?)\s*\|\s*
        (?P<dates>(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}
        (?:\s*[–-]\s*(?:Present|\w+\s+\d{4}))?)\s*$""", re.I | re.X)

# Original format: Title | Company | Date
ROLE_ONE = re.compile(
    r"""^(?P<title>.+?)\s*\|\s*(?P<company>.+?)\s*\|\s*
        (?P<dates>(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}
        (?:\s*[–-]\s*(?:Present|\w+\s+\d{4}))?)\s*$""", re.I | re.X)

# Also support the original comma/@ format for backward compatibility
ROLE_ONE_COMMA = re.compile(
    r"""^(?P<company>.+?)\s*[,@]\s*(?P<title>[^,@]+?)\s+
        (?P<dates>(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}
        (?:\s*[–-]\s*(?:Present|\w+\s+\d{4}))?)\s*$""", re.I | re.X)

DATE_LINE = re.compile(
    r"""^(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}
        (?:\s*[–-]\s*(?:Present|\w+\s+\d{4}))?\s*$""", re.I | re.X)

BULLET = re.compile(r"^\s*(?:[-•·]|\*|●)\s+")
HEAD = re.compile(r"^\s*(summary|skills?|technical\s+skills?|education|training|projects?|work\s+experience|experience|professional\s+experience|certifications?)[:\s]*$", re.I)
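
# Illustrative (made-up) resume lines the patterns above are written for:
#   ROLE_FOUR_PARTS: "Acme Corp | Austin, TX | QA Engineer | Jan 2020 – Present"
#   ROLE_ONE:        "QA Engineer | Acme Corp | Jan 2020 – Dec 2022"
#   ROLE_ONE_COMMA:  "Acme Corp, QA Engineer Jan 2020 – Dec 2022"
#   DATE_LINE:       "Jan 2020 – Present"
#   BULLET:          "- Automated regression tests with Selenium"
#   HEAD:            "Professional Experience" / "Technical Skills:"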

# ----------------------------- main -----------------------------------
def extract_sections_spacy_fixed(text: str) -> dict:
    lines = [ln.rstrip() for ln in text.splitlines()]
    # Only create a spaCy doc if nlp is available
    doc = nlp(text) if nlp and is_spacy_available() else None

    # Helper function for contact detection
    def is_contact(s): return bool(re.search(r"@\w|\d{3}[-.\s]?\d{3}", s))

    out = {
        "Name": extract_name(text),
        "Summary": "",
        "Skills": [],
        "StructuredExperiences": [],
        "Education": [],
        "Training": []
    }

    # ---------- skills extraction (FIXED) ------
    # Extract ONLY from the Technical Skills section to avoid noise
    skills_from_section = set()
    for i, line in enumerate(lines):
        if re.match(r"^\s*technical\s+skills?\s*$", line.strip(), re.I):
            # Found the heading, now collect the skills content
            for j in range(i + 1, len(lines)):
                next_line = lines[j].strip()
                if not next_line:  # Empty line
                    continue
                if HEAD.match(next_line):  # Next section heading
                    break
                if is_contact(next_line):  # Contact info
                    break
                # Handle bullet point format like "● Programming Languages: Python, Java, SQL, Apex, Bash"
                if next_line.startswith('●'):
                    # Remove the bullet and extract the part after the colon
                    clean_line = next_line[1:].strip()  # Remove ●
                    if ':' in clean_line:
                        # Split on the colon and take the part after it
                        skills_part = clean_line.split(':', 1)[1].strip()
                        # Split skills by comma
                        skills_in_line = re.split(r',\s*', skills_part)
                        for skill in skills_in_line:
                            skill = skill.strip()
                            if skill and len(skill) > 1 and not skill.endswith(')'):  # Avoid incomplete entries
                                skills_from_section.add(skill)
                else:
                    # Handle non-bullet format
                    skills_in_line = re.split(r',\s*', next_line)
                    for skill in skills_in_line:
                        skill = skill.strip()
                        # Remove bullet points and clean up
                        skill = re.sub(r'^\s*[•·\-\*●]\s*', '', skill)
                        if skill and len(skill) > 1:  # Avoid single characters
                            skills_from_section.add(skill)
            break

    # Use only section-extracted skills to avoid spaCy noise
    out["Skills"] = sorted(skills_from_section)

    # ---------- summary (improved extraction) ------
    # First try: look for content after a "Summary" or "Professional Summary" heading
    summary_found = False
    for i, line in enumerate(lines):
        if re.match(r"^\s*(professional\s+)?summary\s*$", line.strip(), re.I):
            # Found the heading, now collect the summary content
            summary_lines = []
            for j in range(i + 1, len(lines)):
                next_line = lines[j].strip()
                if not next_line:  # Empty line
                    continue
                if HEAD.match(next_line):  # Next section heading
                    break
                if is_contact(next_line):  # Contact info
                    break
                summary_lines.append(next_line)
            if summary_lines:
                out["Summary"] = " ".join(summary_lines)
                summary_found = True
            break

    # Fallback: original method (first non-heading/non-contact paragraph)
    if not summary_found:
        for para in re.split(r"\n\s*\n", text):
            p = para.strip()
            if p and not HEAD.match(p) and not is_contact(p):
                out["Summary"] = re.sub(r"^(professional\s+)?summary[:,\s]+", "", p, flags=re.I)
                break

    # ---------- experiences (FIXED) -------------------------------------------
    i = 0
    while i < len(lines):
        ln = lines[i].strip()

        # Try the four-part format first (Company | Location | Title | Date)
        m4 = ROLE_FOUR_PARTS.match(ln)
        if m4:
            company, location, title, dates = m4.group("company", "location", "title", "dates")
            company = f"{company}, {location}"  # Combine company and location
            i += 1
        # Try the pipe-separated format (Title | Company | Date)
        elif ROLE_ONE.match(ln):
            m1 = ROLE_ONE.match(ln)
            title, company, dates = m1.group("title", "company", "dates")
            i += 1
        # Try the comma-separated format (Company, Title Date)
        elif ROLE_ONE_COMMA.match(ln):
            m2 = ROLE_ONE_COMMA.match(ln)
            company, title, dates = m2.group("company", "title", "dates")
            i += 1
        # Try the two-liner format
        elif i + 1 < len(lines) and DATE_LINE.match(lines[i + 1].strip()):
            first = lines[i].strip()
            parts = re.split(r"[,@|]\s*", first, 1)  # Support both comma and pipe
            if len(parts) == 2:
                title = parts[0].strip()
                company = parts[1].strip()
            else:
                title = first
                company = ""
            dates = lines[i + 1].strip()
            i += 2
        else:
            i += 1
            continue

        exp = {
            "title": title,
            "company": company,
            "date_range": dates,
            "responsibilities": []
        }

        # FIXED: Collect responsibilities properly
        while i < len(lines):
            nxt = lines[i].strip()
            if not nxt or HEAD.match(nxt) or ROLE_FOUR_PARTS.match(nxt) or ROLE_ONE.match(nxt) or ROLE_ONE_COMMA.match(nxt) or DATE_LINE.match(nxt):
                break
            if BULLET.match(nxt):
                responsibility = BULLET.sub("", nxt).strip()
                if responsibility:  # Only add non-empty responsibilities
                    exp["responsibilities"].append(responsibility)
            i += 1

        out["StructuredExperiences"].append(exp)

    # ---------- education / training / certifications -----------------------------------
    # Use spaCy matchers if available, otherwise use a regex fallback
    if doc and edu_matcher and is_spacy_available():
        for mid, s, e in edu_matcher(doc):
            bucket = "Education" if nlp.vocab.strings[mid] == "EDU" else "Training"
            out[bucket].append(doc[s:e].text)
    else:
        # Regex fallback for education extraction
        edu_patterns = [
            r"(?i)\b(?:bachelor|master|phd|doctorate|associate).*(?:degree|of|in)\s+([^,\n]+)",
            r"(?i)\b(?:bs|ba|ms|ma|mba|phd)\s+(?:in\s+)?([^,\n]+)",
            r"(?i)\b(?:university|college|institute).*\n?.*(?:bachelor|master|phd|degree)",
        ]
        for pattern in edu_patterns:
            matches = re.findall(pattern, text)
            for match in matches:
                if isinstance(match, str) and len(match.strip()) > 3:
                    out["Education"].append(match.strip())

    # Also extract the certifications section manually
    cert_section_found = False
    for i, line in enumerate(lines):
        if re.match(r"^\s*certifications?\s*$", line.strip(), re.I):
            cert_section_found = True
            # Collect certification lines
            for j in range(i + 1, len(lines)):
                next_line = lines[j].strip()
                if not next_line:  # Empty line
                    continue
                if HEAD.match(next_line):  # Next section heading
                    break
                # Split multiple certifications on the same line
                certs = re.split(r',\s*', next_line)
                for cert in certs:
                    cert = cert.strip()
                    if cert and not is_contact(cert):
                        out["Training"].append(cert)
            break

    return out
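
# Minimal usage sketch; the sample resume text below is made up purely to show the
# expected input shape and the dict this parser returns.
if __name__ == "__main__":
    sample = "\n".join([
        "Jane Doe",
        "jane.doe@example.com | 555-123-4567",
        "",
        "Professional Summary",
        "QA engineer with a focus on test automation.",
        "",
        "Technical Skills",
        "● Programming Languages: Python, SQL",
        "",
        "Professional Experience",
        "Acme Corp | Austin, TX | QA Engineer | Jan 2020 – Present",
        "- Built Selenium test suites for the web UI",
        "",
        "Certifications",
        "ISTQB Foundation Level",
    ])
    print(json.dumps(extract_sections_spacy_fixed(sample), indent=2))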