TalentLensAI / utils /builder.py
Johnny
feat: Complete Format_Resume.py system with OpenAI GPT-4o integration and template preservation - Added Format_Resume.py Streamlit page with OpenAI GPT-4o primary extraction, HF Cloud backup, 5-tier fallback system, template preservation with Qvell branding, contact info extraction, skills cleaning, career timeline generation, and comprehensive utils restructure (10/11 files required). Renamed app.py to TalentLens.py, added blank_resume.docx template, updated .gitignore for Salesforce exclusion.
c2f9ec8
raw
history blame
13.1 kB
from datetime import datetime
from dateutil.parser import parse as date_parse
import re, math
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_ALIGN_PARAGRAPH
import logging
logger = logging.getLogger(__name__)
# ---------- helpers ---------------------------------------------------
def _date(dt_str: str) -> datetime:
    """Parse *dt_str* into a datetime, never raising on bad input.

    Fields missing from the text are filled from 1900-01-01 (the ``default``
    handed to the parser), so a bare "2020" becomes 2020-01-01.

    Args:
        dt_str: Free-form date text such as "Jan 2020" or "2020-01".

    Returns:
        The parsed datetime, or ``datetime(1900, 1, 1)`` when *dt_str* is
        empty, not a string, or cannot be parsed.
    """
    fallback = datetime(1900, 1, 1)
    # Nothing to parse: skip the parser entirely instead of relying on it to fail.
    if not isinstance(dt_str, str) or not dt_str.strip():
        return fallback
    try:
        return date_parse(dt_str, default=fallback)
    except (ValueError, OverflowError, TypeError):
        # Only the parser's documented failures are swallowed (its ParserError
        # is a ValueError); KeyboardInterrupt/SystemExit now propagate.
        return fallback
def fmt_range(raw: str) -> str:
    """Normalise a date range such as "Jan 2020 - Present" to "January 2020 – Present".

    Each side of the range is reformatted as "<Month name> <year>".  The word
    "present" (any case) becomes "Present".  A side that cannot be read as a
    date is kept verbatim instead of being replaced by a made-up date.

    Args:
        raw: Range text; sides may be separated by an en/em dash or a hyphen.

    Returns:
        The formatted sides joined with " – ", or "" when *raw* is empty.
    """
    if not raw:
        return ""
    text = str(raw).strip()

    # Try unambiguous separators first so hyphenated dates ("2020-01",
    # "Jan-2020") survive as one piece; fall back to a bare hyphen last.
    for separator in (r"\s*[–—]\s*", r"\s+-\s+"):
        parts = re.split(separator, text)
        if len(parts) > 1:
            break
    else:
        parts = re.split(r"\s*-\s*", text)

    formatted_parts = []
    for part in (piece.strip() for piece in parts):
        if not part:
            # A dangling separator ("2020 –") leaves an empty side; skip it.
            continue
        if part.lower() == "present":
            formatted_parts.append("Present")
            continue
        if not any(ch.isdigit() for ch in part):
            # No digits means no year: formatting it would invent "1900".
            formatted_parts.append(part)
            continue
        try:
            parsed = date_parse(part, default=datetime(1900, 1, 1))
        except (ValueError, OverflowError):
            formatted_parts.append(part)  # fallback to original text
        else:
            formatted_parts.append(parsed.strftime("%B %Y"))
    return " – ".join(formatted_parts)
# ---------- main ------------------------------------------------------
# Glyph written in front of every bullet line and skill cell.
_BULLET = "\u2022"
# Mis-decoded form of the bullet glyph; stripped from education entries
# together with the real glyph.
_MOJIBAKE_BULLET = "β€’"
# Role words searched for, in this order, when guessing a title from the summary.
_TITLE_KEYWORDS = (
    "engineer", "developer", "analyst", "manager", "specialist",
    "consultant", "architect", "lead", "director", "coordinator",
)
# Leading qualifier removed from a guessed title.
_TITLE_PREFIX_RE = re.compile(r'^(results-driven|experienced|senior|junior|lead)\s+', re.I)
# Words that mark a summary sentence as describing a responsibility.
_RESPONSIBILITY_HINTS = (
    'expert', 'specializing', 'experience', 'developing',
    'designing', 'implementing', 'managing', 'leading',
)


def _as_list(value) -> list:
    """Return *value* as a list: empty for falsy input, wrapped when it is a scalar."""
    if not value:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def _range_start(entry: dict) -> datetime:
    """Return the parsed start date of an experience entry's ``date_range``."""
    raw = str(entry.get("date_range") or "")
    for separator in ("–", "-"):
        if separator in raw:
            raw = raw.split(separator)[0]
            break
    # Drop any timezone so every key is naive and mutually comparable.
    return _date(raw).replace(tzinfo=None)


def _guess_title_from_summary(summary: str) -> str:
    """Return a title-cased job title guessed from *summary*, or "" if none fits."""
    if not isinstance(summary, str):
        return ""
    for keyword in _TITLE_KEYWORDS:
        match = re.search(rf"(?i)(.*?{keyword})", summary)
        if not match:
            continue
        candidate = _TITLE_PREFIX_RE.sub("", match.group(1).strip())
        if 3 < len(candidate) < 50:
            return candidate.title()
    return ""


def _log_header_footer_state(doc, header_msg: str, footer_msg: str) -> None:
    """Log the header/footer paragraph count of every section of *doc*."""
    for i, section_obj in enumerate(doc.sections):
        if section_obj.header:
            logger.info(header_msg, i, len(section_obj.header.paragraphs))
        if section_obj.footer:
            logger.info(footer_msg, i, len(section_obj.footer.paragraphs))


def build_resume_from_data(tmpl: str, sections: dict) -> Document:
    """Fill the .docx template at *tmpl* with resume content from *sections*.

    The template's body paragraphs are emptied (not removed) and its body
    tables are removed; headers and footers are not touched.  Content is then
    added in this order: name and current title, professional summary, skills
    (three-column table), professional experience, career timeline,
    education, training.

    Args:
        tmpl: Path of the .docx template to load.
        sections: Extracted resume data.  Keys read: "Name", "Summary",
            "Skills", "StructuredExperiences", "Education", "Training".
            Experience entries are dicts with "title", "company",
            "date_range" and "responsibilities"; plain strings are rendered
            as a single bullet and other types are skipped.

    Returns:
        The populated document; the caller is responsible for saving it.
    """
    logger.info("BUILDER: Attempting to load document template from: %s", tmpl)
    doc = Document(tmpl)
    logger.info("BUILDER: Template %s loaded successfully.", tmpl)

    # Log the template state
    logger.info("BUILDER: Template has %s sections", len(doc.sections))
    _log_header_footer_state(
        doc,
        "BUILDER: Section %s header has %s paragraphs",
        "BUILDER: Section %s footer has %s paragraphs",
    )

    # MOST CONSERVATIVE APPROACH: empty the paragraphs but keep the elements,
    # so the document structure (including sections) is preserved.
    logger.info(
        "BUILDER: Before clearing - Document has %s paragraphs and %s tables",
        len(doc.paragraphs), len(doc.tables),
    )
    for paragraph in doc.paragraphs:
        # Assigning text replaces every run, so no per-run clearing is needed.
        paragraph.text = ""
    # Remove tables (these are less likely to affect sections); iterate a
    # copy because the body is mutated while looping.
    for table in list(doc.tables):
        element = table._element
        element.getparent().remove(element)
    logger.info(
        "BUILDER: After clearing - Document has %s paragraphs and %s tables",
        len(doc.paragraphs), len(doc.tables),
    )

    # Verify headers/footers are still intact
    logger.info("BUILDER: After clearing - Document still has %s sections", len(doc.sections))
    _log_header_footer_state(
        doc,
        "BUILDER: Section %s header still has %s paragraphs",
        "BUILDER: Section %s footer still has %s paragraphs",
    )
    logger.info("BUILDER: Template preserved with original headers and footers")

    # --- easy builders ---
    def heading(txt):
        run = doc.add_paragraph().add_run(txt)
        run.bold = True
        run.font.size = Pt(12)

    def bullet(txt, lvl=0):
        para = doc.add_paragraph()
        para.paragraph_format.left_indent = Pt(lvl * 12)
        para.add_run(f"{_BULLET} {txt}").font.size = Pt(11)

    def two_col(left, right):
        tbl = doc.add_table(rows=1, cols=2)
        tbl.autofit = True
        tbl.cell(0, 0).paragraphs[0].add_run(left).bold = True
        right_para = tbl.cell(0, 1).paragraphs[0]
        right_para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
        right_para.add_run(right).italic = True

    # --- normalise inputs so None / scalar values cannot crash the build ---
    exps = _as_list(sections.get("StructuredExperiences"))
    dict_exps = [e for e in exps if isinstance(e, dict)]
    summary = sections.get("Summary") or ""
    if not isinstance(summary, str):
        summary = " ".join(map(str, summary)) if isinstance(summary, (list, tuple)) else str(summary)

    # --- header (name + current role) ---
    if exps:
        cur_title = ""
        if dict_exps:
            try:
                # Current role = the entry with the latest start date.
                cur_title = max(dict_exps, key=_range_start).get("title") or ""
            except (TypeError, ValueError):
                logger.warning("BUILDER: Could not order experiences by date; using first titled entry")
                cur_title = next((e["title"] for e in dict_exps if e.get("title")), "")
    else:
        # Try to extract job title from summary if no structured experiences
        cur_title = _guess_title_from_summary(summary)

    name = sections.get("Name")
    if name:
        para = doc.add_paragraph()
        para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        run = para.add_run(str(name))
        run.bold = True
        run.font.size = Pt(16)
    if cur_title:
        para = doc.add_paragraph()
        para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        para.add_run(str(cur_title)).font.size = Pt(12)

    # --- summary ---
    if summary:
        heading("Professional Summary:")
        para = doc.add_paragraph()
        para.paragraph_format.first_line_indent = Pt(12)
        para.add_run(summary).font.size = Pt(11)

    # --- skills ---
    raw_skills = sections.get("Skills")
    if isinstance(raw_skills, str):
        # A single delimited string would otherwise be split into characters.
        raw_skills = re.split(r"[,;\n]", raw_skills)
    skills = sorted({s.strip() for s in _as_list(raw_skills) if isinstance(s, str) and s.strip()})
    if skills:
        heading("Skills:")
        cols = 3
        tbl = doc.add_table(rows=math.ceil(len(skills) / cols), cols=cols)
        tbl.autofit = True
        # Fill row by row, left to right.
        for idx, skill in enumerate(skills):
            cell = tbl.cell(idx // cols, idx % cols)
            cell.paragraphs[0].add_run(f"{_BULLET} {skill}").font.size = Pt(11)

    # --- experience ---
    if exps:
        heading("Professional Experience:")
        for entry in exps:
            if isinstance(entry, str):
                # A plain string becomes a basic one-line experience entry.
                bullet(entry, 0)
                continue
            if not isinstance(entry, dict):
                continue
            title = str(entry.get("title") or "")
            company = str(entry.get("company") or "")
            date_range = str(entry.get("date_range") or "")
            responsibilities = entry.get("responsibilities", [])

            # Job header: "Title | Company" on the left, dates on the right.
            two_col(" | ".join(filter(None, [title, company])), fmt_range(date_range))

            if isinstance(responsibilities, list):
                for resp in responsibilities:
                    if isinstance(resp, str) and resp.strip():
                        bullet(resp, 1)
            elif isinstance(responsibilities, str) and responsibilities.strip():
                bullet(responsibilities, 1)
    else:
        # If no structured experiences found, try to extract from summary
        heading("Professional Experience:")
        if summary and cur_title:
            years_match = re.search(r'(\d+)\s+years?\s+of\s+experience', summary, re.I)
            years_text = (
                f"{years_match.group(1)} years of experience"
                if years_match else "Multiple years of experience"
            )
            two_col(cur_title, years_text)

            # Keep longer sentences that mention a responsibility-like word.
            sentences = (s.strip() for s in re.split(r'[.!]', summary))
            responsibilities = [
                s for s in sentences
                if len(s) > 30 and any(hint in s.lower() for hint in _RESPONSIBILITY_HINTS)
            ]
            for resp in responsibilities[:5]:  # Limit to 5 key points
                bullet(resp.strip(), 1)
        else:
            # Fallback message
            para = doc.add_paragraph()
            para.add_run("Experience details are included in the Professional Summary above.").font.size = Pt(11)
            para.add_run(" For specific job titles, companies, and dates, please refer to the original resume.").font.size = Pt(11)

    # --- job history timeline (chronological list) ---
    dated_exps = [e for e in dict_exps if e.get("title") and e.get("date_range")]
    if dated_exps:
        try:
            # Most recent start date first.
            sorted_exps = sorted(dated_exps, key=_range_start, reverse=True)
        except (TypeError, ValueError):
            logger.warning("BUILDER: Could not sort experiences by date; keeping original order")
            sorted_exps = dated_exps

        heading("Career Timeline:")
        for exp in sorted_exps:
            title = str(exp.get("title") or "")
            company = str(exp.get("company") or "")
            date_range = str(exp.get("date_range") or "")
            # Format: "Job Title at Company (Dates)"
            timeline_entry = f"{title} at {company}" if company else title
            if date_range:
                timeline_entry += f" ({fmt_range(date_range)})"
            bullet(timeline_entry, 0)

    # --- education / training ---
    processed_education = []
    experience_years = None
    for ed in _as_list(sections.get("Education")):
        if not isinstance(ed, str):
            continue
        # Remove bullet glyphs (real or mis-decoded) carried over from extraction.
        clean_ed = ed.replace(_MOJIBAKE_BULLET, '').replace(_BULLET, '').strip()
        if not clean_ed:
            continue
        if re.match(r'^\d+\s+years?$', clean_ed, re.I):
            # This is experience duration, not education
            experience_years = clean_ed
        else:
            processed_education.append(clean_ed)

    if processed_education:
        heading("Education:")
        for ed in processed_education:
            bullet(ed)
    elif experience_years:
        # If only experience years found, show it as a note
        heading("Education:")
        para = doc.add_paragraph()
        para.add_run(f"Professional experience: {experience_years}").font.size = Pt(11)

    training = _as_list(sections.get("Training"))
    if training:
        heading("Training:")
        for tr in training:
            if isinstance(tr, str) and tr.strip():
                bullet(tr)

    # Final diagnostic before returning
    logger.info("BUILDER: FINAL STATE - Document has %s sections", len(doc.sections))
    _log_header_footer_state(
        doc,
        "BUILDER: FINAL - Section %s header has %s paragraphs",
        "BUILDER: FINAL - Section %s footer has %s paragraphs",
    )
    return doc