import os, re, json
from utils.parser import extract_name        # <= your helper
from utils.spacy_loader import get_nlp, is_spacy_available

# Load spaCy model with fallback
nlp = get_nlp()

# Initialize spaCy matchers only if spaCy is available
if nlp and is_spacy_available():
    from spacy.matcher import PhraseMatcher, Matcher
    
    # ----------------------------- data lists -----------------------------
    BASE = os.path.dirname(__file__)

    def _load_json_list(path, default):
        # Load a JSON list from disk, falling back to a default when the file is missing
        if os.path.exists(path):
            with open(path) as fh:
                return json.load(fh)
        return default

    SKILLS     = _load_json_list(os.path.join(BASE, "data/skills.json"), ["python","sql","aws","selenium"])
    JOB_TITLES = _load_json_list(os.path.join(BASE, "data/job_titles.json"), [])

    skill_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    skill_matcher.add("SKILL", [nlp.make_doc(s) for s in SKILLS])

    edu_matcher = Matcher(nlp.vocab)
    edu_matcher.add("EDU" , [[{"LOWER":"bachelor"},{"LOWER":"of"},{"IS_TITLE":True,"OP":"+"}]])
    edu_matcher.add("CERT", [[{"LOWER":"certified"},{"IS_TITLE":True,"OP":"+"}]])
else:
    # Fallback: set matchers to None when spaCy is not available
    skill_matcher = None
    edu_matcher = None
    SKILLS = ["python","sql","aws","selenium"]
    JOB_TITLES = []

# ----------------------------- regex helpers --------------------------
# Jonathan's format: Company | Location | Title | Date
ROLE_FOUR_PARTS = re.compile(
    r"""^(?P<company>.+?)\s*\|\s*(?P<location>.+?)\s*\|\s*(?P<title>.+?)\s*\|\s*
        (?P<dates>(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}
        (?:\s*[–-]\s*(?:Present|\w+\s+\d{4}))?)\s*$""", re.I|re.X)
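# Illustrative line this pattern is intended to match (hypothetical example, not
# taken from a real resume):
#   "Acme Corp | Austin, TX | Senior QA Engineer | Jan 2020 – Present"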

# Original format: Title | Company | Date
ROLE_ONE   = re.compile(
    r"""^(?P<title>.+?)\s*\|\s*(?P<company>.+?)\s*\|\s*
        (?P<dates>(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}
        (?:\s*[–-]\s*(?:Present|\w+\s+\d{4}))?)\s*$""", re.I|re.X)
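# Illustrative (hypothetical) line this pattern is intended to match:
#   "Senior QA Engineer | Acme Corp | Mar 2018 – Dec 2019"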

# Also support the original comma/@ format for backward compatibility
ROLE_ONE_COMMA = re.compile(
    r"""^(?P<company>.+?)\s*[,@]\s*(?P<title>[^,@]+?)\s+
        (?P<dates>(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}
        (?:\s*[–-]\s*(?:Present|\w+\s+\d{4}))?)\s*$""", re.I|re.X)
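# Illustrative (hypothetical) line this pattern is intended to match:
#   "Acme Corp, Senior QA Engineer Jan 2016 – Feb 2018"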

DATE_LINE  = re.compile(
    r"""^(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}
        (?:\s*[–-]\s*(?:Present|\w+\s+\d{4}))?\s*$""", re.I|re.X)
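# Illustrative (hypothetical) standalone date line this pattern is intended to match,
# used as the second line of a two-line job header: "Jan 2016 – Feb 2018"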

BULLET     = re.compile(r"^\s*(?:[-β€’Β·]|\*|●)\s+")
HEAD       = re.compile(r"^\s*(summary|skills?|technical\s+skills?|education|training|projects?|work\s+experience|experience|professional\s+experience|certifications?)[:\s]*$",re.I)

# ----------------------------- main -----------------------------------
def extract_sections_spacy_fixed(text: str) -> dict:
    """Split raw resume text into Name, Summary, Skills, StructuredExperiences,
    Education and Training."""
    lines = [ln.rstrip() for ln in text.splitlines()]
    
    # Only create spaCy doc if nlp is available
    doc = nlp(text) if nlp and is_spacy_available() else None

    # Helper function for contact detection
    def is_contact(s): return bool(re.search(r"@\w|\d{3}[-.\s]?\d{3}",s))
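    # Illustrative: is_contact() flags lines containing an email-like "@" or a phone
    # fragment, e.g. the hypothetical line "jane.doe@example.com | 555-123-4567".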

    out = {
        "Name"                 : extract_name(text),
        "Summary"              : "",
        "Skills"               : [],
        "StructuredExperiences": [],
        "Education"            : [],
        "Training"             : []
    }

    # ---------- skills extraction (FIXED) ------
    # Extract ONLY from Technical Skills section to avoid noise
    skills_from_section = set()
    for i, line in enumerate(lines):
        if re.match(r"^\s*technical\s+skills?\s*$", line.strip(), re.I):
            # Found the heading, now collect the skills content
            for j in range(i + 1, len(lines)):
                next_line = lines[j].strip()
                if not next_line:  # Empty line
                    continue
                if HEAD.match(next_line):  # Next section heading
                    break
                if is_contact(next_line):  # Contact info
                    break
                
                # Handle bullet point format like "● Programming Languages: Python, Java, SQL, Apex, Bash"
                if next_line.startswith('●'):
                    # Remove bullet and extract the part after the colon
                    clean_line = next_line[1:].strip()  # Remove ●
                    if ':' in clean_line:
                        # Split on colon and take the part after it
                        skills_part = clean_line.split(':', 1)[1].strip()
                        # Split skills by comma
                        skills_in_line = re.split(r',\s*', skills_part)
                        for skill in skills_in_line:
                            skill = skill.strip()
                            if skill and len(skill) > 1 and not skill.endswith(')'):  # Avoid incomplete entries
                                skills_from_section.add(skill)
                else:
                    # Handle non-bullet format
                    skills_in_line = re.split(r',\s*', next_line)
                    for skill in skills_in_line:
                        skill = skill.strip()
                        # Remove bullet points and clean up
                        skill = re.sub(r'^\s*[β€’Β·\-\*●]\s*', '', skill)
                        if skill and len(skill) > 1:  # Avoid single characters
                            skills_from_section.add(skill)
            break
    
    # Use only section-extracted skills to avoid spaCy noise
    out["Skills"] = sorted(skills_from_section)

    # ---------- summary (improved extraction) ------
    # First try: look for content after "Summary" or "Professional Summary" heading
    summary_found = False
    for i, line in enumerate(lines):
        if re.match(r"^\s*(professional\s+)?summary\s*$", line.strip(), re.I):
            # Found the heading, now collect the summary content
            summary_lines = []
            for j in range(i + 1, len(lines)):
                next_line = lines[j].strip()
                if not next_line:  # Empty line
                    continue
                if HEAD.match(next_line):  # Next section heading
                    break
                if is_contact(next_line):  # Contact info
                    break
                summary_lines.append(next_line)
            if summary_lines:
                out["Summary"] = " ".join(summary_lines)
                summary_found = True
            break
    
    # Fallback: original method (first non-heading/non-contact paragraph)
    if not summary_found:
        for para in re.split(r"\n\s*\n", text):
            p = para.strip()
            if p and not HEAD.match(p) and not is_contact(p):
                out["Summary"] = re.sub(r"^(professional\s+)?summary[:,\s]+", "", p, flags=re.I)
                break

    # ---------- experiences (FIXED) -------------------------------------------
    i=0
    while i < len(lines):
        ln = lines[i].strip()
        
        # Try four-part format first (Company | Location | Title | Date)
        m4 = ROLE_FOUR_PARTS.match(ln)
        if m4:
            company, location, title, dates = m4.group("company","location","title","dates")
            company = f"{company}, {location}"  # Combine company and location
            i += 1
        # Try pipe-separated format (Title | Company | Date)
        elif ROLE_ONE.match(ln):
            m1 = ROLE_ONE.match(ln)
            title, company, dates = m1.group("title","company","dates")
            i += 1
        # Try comma-separated format (Company, Title Date)
        elif ROLE_ONE_COMMA.match(ln):
            m2 = ROLE_ONE_COMMA.match(ln)
            company, title, dates = m2.group("company","title","dates")
            i += 1
        # Try two-liner format
        elif i+1 < len(lines) and DATE_LINE.match(lines[i+1].strip()):
            first = lines[i].strip()
            parts = re.split(r"[,@|]\s*", first, 1)  # Split on the first comma, "@", or pipe
            if len(parts) == 2:
                title = parts[0].strip()
                company = parts[1].strip()
            else:
                title = first
                company = ""
            dates = lines[i+1].strip()
            i += 2
        else:
            i += 1
            continue

        exp = {
            "title"          : title,
            "company"        : company,
            "date_range"     : dates,
            "responsibilities": []
        }

        # FIXED: Collect responsibilities properly
        while i < len(lines):
            nxt = lines[i].strip()
            if not nxt or HEAD.match(nxt) or ROLE_FOUR_PARTS.match(nxt) or ROLE_ONE.match(nxt) or ROLE_ONE_COMMA.match(nxt) or DATE_LINE.match(nxt):
                break
            if BULLET.match(nxt):
                responsibility = BULLET.sub("",nxt).strip()
                if responsibility:  # Only add non-empty responsibilities
                    exp["responsibilities"].append(responsibility)
            i += 1

        out["StructuredExperiences"].append(exp)

    # ---------- education / training / certifications -----------------------------------
    # Use spaCy matchers if available, otherwise use regex fallback
    if doc and edu_matcher and is_spacy_available():
        for mid, s, e in edu_matcher(doc):
            bucket = "Education" if nlp.vocab.strings[mid]=="EDU" else "Training"
            out[bucket].append(doc[s:e].text)
    else:
        # Regex fallback for education extraction
        edu_patterns = [
            r"(?i)\b(?:bachelor|master|phd|doctorate|associate).*(?:degree|of|in)\s+([^,\n]+)",
            r"(?i)\b(?:bs|ba|ms|ma|mba|phd)\s+(?:in\s+)?([^,\n]+)",
            r"(?i)\b(?:university|college|institute).*\n?.*(?:bachelor|master|phd|degree)",
        ]
        
        for pattern in edu_patterns:
            matches = re.findall(pattern, text)
            for match in matches:
                if isinstance(match, str) and len(match.strip()) > 3:
                    out["Education"].append(match.strip())
    
    # Also extract certifications section manually
    for i, line in enumerate(lines):
        if re.match(r"^\s*certifications?\s*$", line.strip(), re.I):
            # Collect certification lines
            for j in range(i + 1, len(lines)):
                next_line = lines[j].strip()
                if not next_line:  # Empty line
                    continue
                if HEAD.match(next_line):  # Next section heading
                    break
                # Split multiple certifications on the same line
                certs = re.split(r',\s*', next_line)
                for cert in certs:
                    cert = cert.strip()
                    if cert and not is_contact(cert):
                        out["Training"].append(cert)
            break

    return out
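
# ----------------------------- usage sketch ----------------------------
# Minimal smoke test, assuming the utils.parser / utils.spacy_loader helpers imported
# above are available on the path. The resume below is hypothetical sample data.
if __name__ == "__main__":
    sample_resume = "\n".join([
        "Jane Doe",
        "jane.doe@example.com | 555-123-4567",
        "",
        "Professional Summary",
        "QA engineer with 6 years of experience in test automation.",
        "",
        "Technical Skills",
        "● Programming Languages: Python, Java, SQL",
        "",
        "Work Experience",
        "Acme Corp | Austin, TX | Senior QA Engineer | Jan 2020 – Present",
        "● Built Selenium test suites for web applications",
        "",
        "Certifications",
        "Certified Scrum Master",
    ])
    print(json.dumps(extract_sections_spacy_fixed(sample_resume), indent=2))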