import os
import re
import json
import subprocess
from utils.parser import extract_name  # <= your helper
from utils.spacy_loader import get_nlp, is_spacy_available
from datetime import datetime
from dateutil.parser import parse as date_parse

# Load spaCy model with fallback
nlp = get_nlp()

# ----------------------------- data lists -----------------------------
BASE = os.path.dirname(__file__)
_DEFAULT_SKILLS = ["python", "sql", "aws", "selenium"]


def _load_json_or_default(relative_path, default):
    """Return the parsed JSON file at BASE/relative_path, or *default* if it does not exist."""
    path = os.path.join(BASE, relative_path)
    if not os.path.exists(path):
        return default
    # Context manager so the handle is closed even if parsing fails.
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


# Initialize spaCy matchers only if spaCy is available
if nlp and is_spacy_available():
    from spacy.matcher import PhraseMatcher, Matcher

    SKILLS = _load_json_or_default("data/skills.json", list(_DEFAULT_SKILLS))
    JOB_TITLES = _load_json_or_default("data/job_titles.json", [])

    skill_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    skill_matcher.add("SKILL", [nlp.make_doc(s) for s in SKILLS])

    edu_matcher = Matcher(nlp.vocab)
    edu_matcher.add(
        "EDU",
        [[{"LOWER": "bachelor"}, {"LOWER": "of"}, {"IS_TITLE": True, "OP": "+"}]],
    )
    edu_matcher.add(
        "CERT",
        [[{"LOWER": "certified"}, {"IS_TITLE": True, "OP": "+"}]],
    )
else:
    # Fallback: set matchers to None when spaCy is not available
    skill_matcher = None
    edu_matcher = None
    SKILLS = list(_DEFAULT_SKILLS)
    JOB_TITLES = []

# ----------------------------- regex helpers --------------------------
# "Mon YYYY" optionally followed by "- Present" or "- Mon YYYY" (hyphen or en dash).
_MONTH_YEAR = r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}"
_DATE_RANGE = _MONTH_YEAR + r"(?:\s*[–-]\s*(?:Present|\w+\s+\d{4}))?"

# Jonathan's format: Company | Location | Title | Date
ROLE_FOUR_PARTS = re.compile(
    r"^(?P<company>.+?)\s*\|\s*(?P<location>.+?)\s*\|\s*(?P<title>.+?)\s*\|\s*"
    r"(?P<dates>" + _DATE_RANGE + r")\s*$",
    re.I,
)

# Original format: Title | Company | Date
ROLE_ONE = re.compile(
    r"^(?P<title>.+?)\s*\|\s*(?P<company>.+?)\s*\|\s*"
    r"(?P<dates>" + _DATE_RANGE + r")\s*$",
    re.I,
)

# Also support the original comma/@ format for backward compatibility
ROLE_ONE_COMMA = re.compile(
    r"^(?P<company>.+?)\s*[,@]\s*(?P<title>[^,@]+?)\s+"
    r"(?P<dates>" + _DATE_RANGE + r")\s*$",
    re.I,
)

# A line that contains nothing but a date or date range.
DATE_LINE = re.compile(r"^" + _DATE_RANGE + r"\s*$", re.I)

BULLET = re.compile(r"^\s*(?:[-•·]|\*|●)\s+")
HEAD = re.compile(r"^\s*(summary|skills?|technical\s+skills?|education|training|projects?|work\s+experience|experience|professional\s+experience|certifications?)[:\s]*$",re.I)

# Patterns used inside the extractor, compiled once at import time.
_CONTACT = re.compile(r"@\w|\d{3}[-.\s]?\d{3}")
_COMMA = re.compile(r",\s*")
_LEADING_BULLET = re.compile(r"^\s*[•·\-\*●]\s*")
_PARAGRAPH_BREAK = re.compile(r"\n\s*\n")
_TWO_LINE_SPLIT = re.compile(r"[,@|]\s*")
_TECH_SKILLS_HEADING = re.compile(r"^\s*technical\s+skills?\s*$", re.I)
_SUMMARY_HEADING = re.compile(r"^\s*(professional\s+)?summary\s*$", re.I)
_SUMMARY_PREFIX = re.compile(r"^(professional\s+)?summary[:,\s]+", re.I)
_CERT_HEADING = re.compile(r"^\s*certifications?\s*$", re.I)
_EDU_FALLBACK_PATTERNS = (
    re.compile(r"\b(?:bachelor|master|phd|doctorate|associate).*(?:degree|of|in)\s+([^,\n]+)", re.I),
    re.compile(r"\b(?:bs|ba|ms|ma|mba|phd)\s+(?:in\s+)?([^,\n]+)", re.I),
    re.compile(r"\b(?:university|college|institute).*\n?.*(?:bachelor|master|phd|degree)", re.I),
)


# ----------------------------- helpers --------------------------------
def _is_contact(s):
    """Return True if *s* contains an '@' followed by a word character or a phone-like digit run."""
    return bool(_CONTACT.search(s))


def _section_body(lines, heading, stop_at_contact=True):
    """Return the stripped, non-empty lines that follow the first line matching *heading*.

    Collection stops at the next line matching HEAD and, when
    *stop_at_contact* is true, at the first contact-looking line.
    Only the first matching heading is considered. Returns an empty list
    when no heading matches or the section has no content.
    """
    for idx, line in enumerate(lines):
        if not heading.match(line.strip()):
            continue
        body = []
        for raw in lines[idx + 1:]:
            entry = raw.strip()
            if not entry:
                continue
            if HEAD.match(entry) or (stop_at_contact and _is_contact(entry)):
                break
            body.append(entry)
        return body
    return []


def _extract_skills(lines):
    """Return the sorted, de-duplicated skills listed under the 'Technical Skills' heading."""
    found = set()
    for entry in _section_body(lines, _TECH_SKILLS_HEADING):
        if entry.startswith("●"):
            # "● Programming Languages: Python, Java" -> keep only what follows the colon.
            _, colon, listed = entry[1:].strip().partition(":")
            if not colon:
                continue
            for skill in _COMMA.split(listed.strip()):
                skill = skill.strip()
                # A trailing ')' marks a fragment of a parenthesised list split on commas.
                if len(skill) > 1 and not skill.endswith(")"):
                    found.add(skill)
        else:
            for skill in _COMMA.split(entry):
                skill = _LEADING_BULLET.sub("", skill.strip())
                if len(skill) > 1:  # Avoid single characters
                    found.add(skill)
    return sorted(found)


def _extract_summary(text, lines):
    """Return the summary text, or '' if none can be found.

    Prefers the content under a 'Summary' / 'Professional Summary' heading;
    otherwise falls back to the first paragraph that is neither a section
    heading nor contact information.
    """
    body = _section_body(lines, _SUMMARY_HEADING)
    if body:
        return " ".join(body)
    for para in _PARAGRAPH_BREAK.split(text):
        p = para.strip()
        if p and not HEAD.match(p) and not _is_contact(p):
            return _SUMMARY_PREFIX.sub("", p)
    return ""


def _match_role_header(lines, i):
    """Try to read a role header starting at lines[i].

    Returns (title, company, dates, lines_consumed), or None when
    lines[i] does not start a role. Formats are tried in this order:
    four-part pipe, three-part pipe, comma/@, then a free-form line
    followed by a date-only line.
    """
    ln = lines[i].strip()

    m = ROLE_FOUR_PARTS.match(ln)
    if m:
        company, location, title, dates = m.group("company", "location", "title", "dates")
        return title, f"{company}, {location}", dates, 1  # Combine company and location

    m = ROLE_ONE.match(ln)
    if m:
        return m.group("title"), m.group("company"), m.group("dates"), 1

    m = ROLE_ONE_COMMA.match(ln)
    if m:
        return m.group("title"), m.group("company"), m.group("dates"), 1

    if i + 1 < len(lines) and DATE_LINE.match(lines[i + 1].strip()):
        parts = _TWO_LINE_SPLIT.split(ln, maxsplit=1)  # Support comma, @ and pipe
        if len(parts) == 2:
            title, company = parts[0].strip(), parts[1].strip()
        else:
            title, company = ln, ""
        return title, company, lines[i + 1].strip(), 2

    return None


def _extract_experiences(lines):
    """Return a list of role dicts (title, company, date_range, responsibilities)."""
    experiences = []
    total = len(lines)
    i = 0
    while i < total:
        header = _match_role_header(lines, i)
        if header is None:
            i += 1
            continue
        title, company, dates, consumed = header
        i += consumed

        responsibilities = []
        while i < total:
            nxt = lines[i].strip()
            if (
                not nxt
                or HEAD.match(nxt)
                or ROLE_FOUR_PARTS.match(nxt)
                or ROLE_ONE.match(nxt)
                or ROLE_ONE_COMMA.match(nxt)
                or DATE_LINE.match(nxt)
            ):
                break
            if BULLET.match(nxt):
                item = BULLET.sub("", nxt).strip()
                if item:  # Only add non-empty responsibilities
                    responsibilities.append(item)
            elif i + 1 < total and DATE_LINE.match(lines[i + 1].strip()):
                # A non-bullet line directly above a date line is the two-line
                # header of the next role; leave it for the outer loop instead
                # of swallowing it and losing that role.
                break
            i += 1

        experiences.append({
            "title": title,
            "company": company,
            "date_range": dates,
            "responsibilities": responsibilities,
        })
    return experiences


def _extract_education_and_training(text, lines, doc):
    """Return (education, training) lists.

    Uses the spaCy matcher when a doc and matcher are available, otherwise
    regex patterns over the raw text. Entries under a 'Certifications'
    heading are appended to training in both cases.
    """
    education, training = [], []
    if doc and edu_matcher and is_spacy_available():
        for match_id, start, end in edu_matcher(doc):
            label = nlp.vocab.strings[match_id]
            bucket = education if label == "EDU" else training
            bucket.append(doc[start:end].text)
    else:
        # Regex fallback; every pattern has at most one group, so findall yields strings.
        for pattern in _EDU_FALLBACK_PATTERNS:
            for hit in pattern.findall(text):
                hit = hit.strip()
                if len(hit) > 3:
                    education.append(hit)

    # Certifications are split per comma; contact-looking items are dropped
    # individually rather than ending the section.
    for entry in _section_body(lines, _CERT_HEADING, stop_at_contact=False):
        for cert in _COMMA.split(entry):
            cert = cert.strip()
            if cert and not _is_contact(cert):
                training.append(cert)
    return education, training


# ----------------------------- main -----------------------------------
def extract_sections_spacy_fixed(text:str)->dict:
    """Split resume *text* into structured sections.

    Returns a dict with the keys:
      "Name"                  - result of extract_name(text)
      "Summary"               - summary string ('' if none found)
      "Skills"                - sorted list of skills from the Technical Skills section
      "StructuredExperiences" - list of dicts with title, company, date_range,
                                responsibilities
      "Education"             - list of education strings
      "Training"              - list of certification / training strings
    """
    lines = [ln.rstrip() for ln in text.splitlines()]

    # Only create spaCy doc if nlp is available
    doc = nlp(text) if nlp and is_spacy_available() else None

    out = {
        "Name": extract_name(text),
        "Summary": _extract_summary(text, lines),
        # Use only section-extracted skills to avoid spaCy noise
        "Skills": _extract_skills(lines),
        "StructuredExperiences": _extract_experiences(lines),
        "Education": [],
        "Training": [],
    }
    out["Education"], out["Training"] = _extract_education_and_training(text, lines, doc)
    return out