import os, re, json
from utils.parser import extract_name        # <= your helper
from utils.spacy_loader import get_nlp, is_spacy_available

# Load spaCy model with fallback
nlp = get_nlp()

# Initialize spaCy matchers only if spaCy is available
if nlp and is_spacy_available():
    from spacy.matcher import PhraseMatcher, Matcher
    
    # ----------------------------- data lists -----------------------------
    BASE = os.path.dirname(__file__)

    def _load_json_list(path, default):
        # Load a JSON list from disk, falling back to a default when the file is missing
        if os.path.exists(path):
            with open(path) as fh:
                return json.load(fh)
        return default

    SKILLS     = _load_json_list(os.path.join(BASE, "data/skills.json"), ["python","sql","aws","selenium"])
    JOB_TITLES = _load_json_list(os.path.join(BASE, "data/job_titles.json"), [])

    skill_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    skill_matcher.add("SKILL", [nlp.make_doc(s) for s in SKILLS])

    edu_matcher = Matcher(nlp.vocab)
    edu_matcher.add("EDU" , [[{"LOWER":"bachelor"},{"LOWER":"of"},{"IS_TITLE":True,"OP":"+"}]])
    edu_matcher.add("CERT", [[{"LOWER":"certified"},{"IS_TITLE":True,"OP":"+"}]])
else:
    # Fallback: set matchers to None when spaCy is not available
    skill_matcher = None
    edu_matcher = None
    SKILLS = ["python","sql","aws","selenium"]
    JOB_TITLES = []

# ----------------------------- regex helpers --------------------------
# Jonathan's format: Company | Location | Title | Date
ROLE_FOUR_PARTS = re.compile(
    r"""^(?P<company>.+?)\s*\|\s*(?P<location>.+?)\s*\|\s*(?P<title>.+?)\s*\|\s*
        (?P<dates>(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}
        (?:\s*[–-]\s*(?:Present|\w+\s+\d{4}))?)\s*$""", re.I|re.X)
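# Illustrative line this pattern is intended to match (hypothetical example, not
# taken from a real resume):
#   "Acme Corp | Austin, TX | Senior QA Engineer | Jan 2020 – Present"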

# Original format: Title | Company | Date
ROLE_ONE   = re.compile(
    r"""^(?P<title>.+?)\s*\|\s*(?P<company>.+?)\s*\|\s*
        (?P<dates>(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}
        (?:\s*[–-]\s*(?:Present|\w+\s+\d{4}))?)\s*$""", re.I|re.X)
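# Illustrative (hypothetical) line this pattern is intended to match:
#   "Senior QA Engineer | Acme Corp | Mar 2018 – Dec 2019"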

# Also support the original comma/@ format for backward compatibility
ROLE_ONE_COMMA = re.compile(
    r"""^(?P<company>.+?)\s*[,@]\s*(?P<title>[^,@]+?)\s+
        (?P<dates>(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}
        (?:\s*[–-]\s*(?:Present|\w+\s+\d{4}))?)\s*$""", re.I|re.X)
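# Illustrative (hypothetical) line this pattern is intended to match:
#   "Acme Corp, Senior QA Engineer Jan 2016 – Feb 2018"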

DATE_LINE  = re.compile(
    r"""^(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}
        (?:\s*[–-]\s*(?:Present|\w+\s+\d{4}))?\s*$""", re.I|re.X)
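# Illustrative (hypothetical) standalone date line this pattern is intended to match,
# used as the second line of a two-line job header: "Jan 2016 – Feb 2018"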

BULLET     = re.compile(r"^\s*(?:[-β€’Β·]|\*|●)\s+")
HEAD       = re.compile(r"^\s*(summary|skills?|technical\s+skills?|education|training|projects?|work\s+experience|experience|professional\s+experience|certifications?)[:\s]*$",re.I)

# ----------------------------- main -----------------------------------
def extract_sections_spacy_fixed(text: str) -> dict:
    """Split raw resume text into Name, Summary, Skills, StructuredExperiences,
    Education and Training."""
    lines = [ln.rstrip() for ln in text.splitlines()]
    
    # Only create spaCy doc if nlp is available
    doc = nlp(text) if nlp and is_spacy_available() else None

    # Helper function for contact detection
    def is_contact(s): return bool(re.search(r"@\w|\d{3}[-.\s]?\d{3}",s))
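    # Illustrative: is_contact() flags lines containing an email-like "@" or a phone
    # fragment, e.g. the hypothetical line "jane.doe@example.com | 555-123-4567".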

    out = {
        "Name"                 : extract_name(text),
        "Summary"              : "",
        "Skills"               : [],
        "StructuredExperiences": [],
        "Education"            : [],
        "Training"             : []
    }

    # ---------- skills extraction (FIXED) ------
    # Extract ONLY from Technical Skills section to avoid noise
    skills_from_section = set()
    for i, line in enumerate(lines):
        if re.match(r"^\s*technical\s+skills?\s*$", line.strip(), re.I):
            # Found the heading, now collect the skills content
            for j in range(i + 1, len(lines)):
                next_line = lines[j].strip()
                if not next_line:  # Empty line
                    continue
                if HEAD.match(next_line):  # Next section heading
                    break
                if is_contact(next_line):  # Contact info
                    break
                
                # Handle bullet point format like "● Programming Languages: Python, Java, SQL, Apex, Bash"
                if next_line.startswith('●'):
                    # Remove bullet and extract the part after the colon
                    clean_line = next_line[1:].strip()  # Remove ●
                    if ':' in clean_line:
                        # Split on colon and take the part after it
                        skills_part = clean_line.split(':', 1)[1].strip()
                        # Split skills by comma
                        skills_in_line = re.split(r',\s*', skills_part)
                        for skill in skills_in_line:
                            skill = skill.strip()
                            if skill and len(skill) > 1 and not skill.endswith(')'):  # Avoid incomplete entries
                                skills_from_section.add(skill)
                else:
                    # Handle non-bullet format
                    skills_in_line = re.split(r',\s*', next_line)
                    for skill in skills_in_line:
                        skill = skill.strip()
                        # Remove bullet points and clean up
                        skill = re.sub(r'^\s*[β€’Β·\-\*●]\s*', '', skill)
                        if skill and len(skill) > 1:  # Avoid single characters
                            skills_from_section.add(skill)
            break
    
    # Use only section-extracted skills to avoid spaCy noise
    out["Skills"] = sorted(skills_from_section)

    # ---------- summary (improved extraction) ------
    # First try: look for content after "Summary" or "Professional Summary" heading
    summary_found = False
    for i, line in enumerate(lines):
        if re.match(r"^\s*(professional\s+)?summary\s*$", line.strip(), re.I):
            # Found the heading, now collect the summary content
            summary_lines = []
            for j in range(i + 1, len(lines)):
                next_line = lines[j].strip()
                if not next_line:  # Empty line
                    continue
                if HEAD.match(next_line):  # Next section heading
                    break
                if is_contact(next_line):  # Contact info
                    break
                summary_lines.append(next_line)
            if summary_lines:
                out["Summary"] = " ".join(summary_lines)
                summary_found = True
            break
    
    # Fallback: original method (first non-heading/non-contact paragraph)
    if not summary_found:
        for para in re.split(r"\n\s*\n", text):
            p = para.strip()
            if p and not HEAD.match(p) and not is_contact(p):
                out["Summary"] = re.sub(r"^(professional\s+)?summary[:,\s]+", "", p, flags=re.I)
                break

    # ---------- experiences (FIXED) -------------------------------------------
    i=0
    while i < len(lines):
        ln = lines[i].strip()
        
        # Try four-part format first (Company | Location | Title | Date)
        m4 = ROLE_FOUR_PARTS.match(ln)
        if m4:
            company, location, title, dates = m4.group("company","location","title","dates")
            company = f"{company}, {location}"  # Combine company and location
            i += 1
        # Try pipe-separated format (Title | Company | Date)
        elif ROLE_ONE.match(ln):
            m1 = ROLE_ONE.match(ln)
            title, company, dates = m1.group("title","company","dates")
            i += 1
        # Try comma-separated format (Company, Title Date)
        elif ROLE_ONE_COMMA.match(ln):
            m2 = ROLE_ONE_COMMA.match(ln)
            company, title, dates = m2.group("company","title","dates")
            i += 1
        # Try two-liner format
        elif i+1 < len(lines) and DATE_LINE.match(lines[i+1].strip()):
            first = lines[i].strip()
            parts = re.split(r"[,@|]\s*", first, 1)  # Split on the first comma, "@", or pipe
            if len(parts) == 2:
                title = parts[0].strip()
                company = parts[1].strip()
            else:
                title = first
                company = ""
            dates = lines[i+1].strip()
            i += 2
        else:
            i += 1
            continue

        exp = {
            "title"          : title,
            "company"        : company,
            "date_range"     : dates,
            "responsibilities": []
        }

        # FIXED: Collect responsibilities properly
        while i < len(lines):
            nxt = lines[i].strip()
            if not nxt or HEAD.match(nxt) or ROLE_FOUR_PARTS.match(nxt) or ROLE_ONE.match(nxt) or ROLE_ONE_COMMA.match(nxt) or DATE_LINE.match(nxt):
                break
            if BULLET.match(nxt):
                responsibility = BULLET.sub("",nxt).strip()
                if responsibility:  # Only add non-empty responsibilities
                    exp["responsibilities"].append(responsibility)
            i += 1

        out["StructuredExperiences"].append(exp)

    # ---------- education / training / certifications -----------------------------------
    # Use spaCy matchers if available, otherwise use regex fallback
    if doc and edu_matcher and is_spacy_available():
        for mid, s, e in edu_matcher(doc):
            bucket = "Education" if nlp.vocab.strings[mid]=="EDU" else "Training"
            out[bucket].append(doc[s:e].text)
    else:
        # Regex fallback for education extraction
        edu_patterns = [
            r"(?i)\b(?:bachelor|master|phd|doctorate|associate).*(?:degree|of|in)\s+([^,\n]+)",
            r"(?i)\b(?:bs|ba|ms|ma|mba|phd)\s+(?:in\s+)?([^,\n]+)",
            r"(?i)\b(?:university|college|institute).*\n?.*(?:bachelor|master|phd|degree)",
        ]
        
        for pattern in edu_patterns:
            matches = re.findall(pattern, text)
            for match in matches:
                if isinstance(match, str) and len(match.strip()) > 3:
                    out["Education"].append(match.strip())
    
    # Also extract certifications section manually
    for i, line in enumerate(lines):
        if re.match(r"^\s*certifications?\s*$", line.strip(), re.I):
            # Collect certification lines
            for j in range(i + 1, len(lines)):
                next_line = lines[j].strip()
                if not next_line:  # Empty line
                    continue
                if HEAD.match(next_line):  # Next section heading
                    break
                # Split multiple certifications on the same line
                certs = re.split(r',\s*', next_line)
                for cert in certs:
                    cert = cert.strip()
                    if cert and not is_contact(cert):
                        out["Training"].append(cert)
            break

    return out
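
# ----------------------------- usage sketch ----------------------------
# Minimal smoke test, assuming the utils.parser / utils.spacy_loader helpers imported
# above are available on the path. The resume below is hypothetical sample data.
if __name__ == "__main__":
    sample_resume = "\n".join([
        "Jane Doe",
        "jane.doe@example.com | 555-123-4567",
        "",
        "Professional Summary",
        "QA engineer with 6 years of experience in test automation.",
        "",
        "Technical Skills",
        "● Programming Languages: Python, Java, SQL",
        "",
        "Work Experience",
        "Acme Corp | Austin, TX | Senior QA Engineer | Jan 2020 – Present",
        "● Built Selenium test suites for web applications",
        "",
        "Certifications",
        "Certified Scrum Master",
    ])
    print(json.dumps(extract_sections_spacy_fixed(sample_resume), indent=2))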