Spaces:

gauravbox
/

TalentLensAI

Running

Johnny

feat: Complete Format_Resume.py system with OpenAI GPT-4o integration and template preservation - Added Format_Resume.py Streamlit page with OpenAI GPT-4o primary extraction, HF Cloud backup, 5-tier fallback system, template preservation with Qvell branding, contact info extraction, skills cleaning, career timeline generation, and comprehensive utils restructure (10/11 files required). Renamed app.py to TalentLens.py, added blank_resume.docx template, updated .gitignore for Salesforce exclusion.

c2f9ec8 about 1 month ago

raw

history blame

13.1 kB

	from datetime import datetime
	from dateutil.parser import parse as date_parse
	import re, math
	from docx import Document
	from docx.shared import Pt
	from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_ALIGN_PARAGRAPH
	import logging

	# Module-level logger named after this module; used for the "BUILDER:" diagnostics below.
	logger = logging.getLogger(__name__)

	# ---------- helpers ---------------------------------------------------
	def _date(dt_str: str) -> datetime:
	    """Parse *dt_str* into a datetime, never raising on bad input.

	    Args:
	        dt_str: Free-form date text (e.g. a month and a year).

	    Returns:
	        The parsed datetime, or the sentinel ``datetime(1900, 1, 1)`` when
	        the text cannot be parsed. The same sentinel is passed to dateutil
	        as ``default``, so fields absent from *dt_str* are filled from it.
	    """
	    fallback = datetime(1900, 1, 1)
	    try:
	        return date_parse(dt_str, default=fallback)
	    except Exception:
	        # Deliberately best-effort: any parsing problem maps to the sentinel.
	        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
	        # SystemExit propagate instead of being silently swallowed.
	        return fallback

	def fmt_range(raw: str) -> str:
	    """Format a date range such as ``"Jan 2020 - Present"`` for display.

	    The text is split on a hyphen, en dash or em dash surrounded by
	    whitespace. Each part is rendered as ``"<Month name> <year>"``; the word
	    "present" (any case) becomes ``"Present"``. A part that cannot be parsed
	    as a date is kept verbatim.

	    Args:
	        raw: The raw date-range text; empty or ``None`` yields ``""``.

	    Returns:
	        The formatted parts joined with ``" – "`` (en dash).
	    """
	    if not raw:
	        return ""

	    # `_date` signals "could not parse" by returning this sentinel instead of
	    # raising, so it has to be checked explicitly; otherwise unparseable text
	    # would be rendered as "January 1900". The trade-off is that a genuine
	    # January-1900 date is also kept in its original spelling.
	    unparsed = datetime(1900, 1, 1)

	    formatted_parts = []
	    for part in re.split(r"\s[–—-]\s", raw):
	        part = part.strip()
	        if part.lower() == "present":
	            formatted_parts.append("Present")
	            continue
	        try:
	            parsed = _date(part)
	            label = part if parsed == unparsed else parsed.strftime("%B %Y")
	        except Exception:
	            # Best-effort formatting: fall back to the original text.
	            label = part
	        formatted_parts.append(label)

	    return " – ".join(formatted_parts)

	# ---------- main ------------------------------------------------------
	def _as_list(value):
	    """Normalize a section value to a list (None -> [], lone string -> [string])."""
	    if value is None:
	        return []
	    if isinstance(value, str):
	        # A bare string must not be iterated character by character.
	        return [value] if value.strip() else []
	    if isinstance(value, (list, tuple, set, frozenset)):
	        return list(value)
	    return []

	def _range_start(exp):
	    """Return the parsed start date of an experience dict's ``date_range``."""
	    date_range = exp.get("date_range", "")
	    # An en dash takes precedence over a hyphen as the range separator.
	    for dash in ("–", "-"):
	        if dash in date_range:
	            return _date(date_range.split(dash)[0])
	    return _date(date_range)

	def _title_from_summary(summary):
	    """Guess a job title from the free-text summary; return "" if none fits."""
	    keywords = (
	        "engineer", "developer", "analyst", "manager", "specialist",
	        "consultant", "architect", "lead", "director", "coordinator",
	    )
	    # Keywords are tried in this order; the first one yielding a title of
	    # plausible length wins.
	    for keyword in keywords:
	        match = re.search(rf"(?i)(.*?{keyword})", summary)
	        if not match:
	            continue
	        candidate = match.group(1).strip()
	        # Drop one leading qualifier such as "Experienced" or "Senior".
	        candidate = re.sub(
	            r"^(results-driven|experienced|senior|junior|lead)\s+",
	            "",
	            candidate,
	            flags=re.I,
	        )
	        if 3 < len(candidate) < 50:
	            return candidate.title()
	    return ""

	def _log_header_footer_counts(doc, prefix="", verb="has"):
	    """Log the paragraph count of every section's header and footer."""
	    for i, section_obj in enumerate(doc.sections):
	        if section_obj.header:
	            logger.info(
	                "BUILDER: %sSection %d header %s %d paragraphs",
	                prefix, i, verb, len(section_obj.header.paragraphs),
	            )
	        if section_obj.footer:
	            logger.info(
	                "BUILDER: %sSection %d footer %s %d paragraphs",
	                prefix, i, verb, len(section_obj.footer.paragraphs),
	            )

	def build_resume_from_data(tmpl: str, sections: dict) -> Document:
	    """Build a resume document from parsed sections on top of a template.

	    The template is opened, its body paragraphs are blanked (the paragraph
	    elements themselves are kept) and its body tables are removed. Section
	    headers and footers are not modified; their paragraph counts are only
	    logged. The resume content is then appended to the body.

	    Args:
	        tmpl: Path of the .docx template to load.
	        sections: Parsed resume data. Keys read here: "Name", "Summary",
	            "Skills", "StructuredExperiences" (dicts with "title",
	            "company", "date_range", "responsibilities", or plain
	            strings), "Education" and "Training".

	    Returns:
	        The populated document object (not saved to disk here).
	    """
	    logger.info("BUILDER: Attempting to load document template from: %s", tmpl)
	    doc = Document(tmpl)
	    logger.info("BUILDER: Template %s loaded successfully.", tmpl)

	    # Log the template state
	    logger.info("BUILDER: Template has %d sections", len(doc.sections))
	    _log_header_footer_counts(doc)

	    # Conservative clearing: blank the paragraph text but keep the paragraph
	    # elements, so the document structure is left in place.
	    logger.info(
	        "BUILDER: Before clearing - Document has %d paragraphs and %d tables",
	        len(doc.paragraphs), len(doc.tables),
	    )
	    for paragraph in doc.paragraphs:
	        for run in paragraph.runs:
	            run.text = ""
	        paragraph.text = ""

	    # Iterate over a copy because tables are detached while looping.
	    for table in list(doc.tables):
	        element = table._element
	        element.getparent().remove(element)

	    logger.info(
	        "BUILDER: After clearing - Document has %d paragraphs and %d tables",
	        len(doc.paragraphs), len(doc.tables),
	    )

	    # Verify headers/footers are still intact
	    logger.info("BUILDER: After clearing - Document still has %d sections", len(doc.sections))
	    _log_header_footer_counts(doc, verb="still has")
	    logger.info("BUILDER: Template preserved with original headers and footers")

	    # --- easy builders ---
	    def heading(txt):
	        run = doc.add_paragraph().add_run(txt)
	        run.bold = True
	        run.font.size = Pt(12)

	    def bullet(txt, lvl=0):
	        para = doc.add_paragraph()
	        para.paragraph_format.left_indent = Pt(lvl * 12)
	        para.add_run(f"• {txt}").font.size = Pt(11)

	    def two_col(left, right):
	        tbl = doc.add_table(rows=1, cols=2)
	        tbl.autofit = True
	        tbl.cell(0, 0).paragraphs[0].add_run(left).bold = True
	        right_para = tbl.cell(0, 1).paragraphs[0]
	        right_para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
	        right_para.add_run(right).italic = True

	    # --- header (name + current role) ---
	    exps = sections.get("StructuredExperiences", [])
	    summary = sections.get("Summary", "")
	    cur_title = ""
	    if exps:
	        dict_exps = [e for e in exps if isinstance(e, dict)]
	        if dict_exps:
	            try:
	                # Current role = experience with the latest start date.
	                cur_title = max(dict_exps, key=_range_start).get("title", "")
	            except (AttributeError, TypeError, ValueError):
	                # Malformed date_range values: fall back to the first
	                # experience that has a title.
	                cur_title = next(
	                    (e["title"] for e in dict_exps if e.get("title")), ""
	                )
	    elif summary:
	        # No structured experiences: try to extract a title from the summary.
	        cur_title = _title_from_summary(summary)

	    if sections.get("Name"):
	        para = doc.add_paragraph()
	        para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
	        run = para.add_run(sections["Name"])
	        run.bold = True
	        run.font.size = Pt(16)
	    if cur_title:
	        para = doc.add_paragraph()
	        para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
	        para.add_run(cur_title).font.size = Pt(12)

	    # --- summary ---
	    if summary:
	        heading("Professional Summary:")
	        para = doc.add_paragraph()
	        para.paragraph_format.first_line_indent = Pt(12)
	        para.add_run(summary).font.size = Pt(11)

	    # --- skills ---
	    # De-duplicated, sorted, laid out row by row in a three-column table.
	    skills = sorted({
	        s.strip()
	        for s in _as_list(sections.get("Skills"))
	        if isinstance(s, str) and s.strip()
	    })
	    if skills:
	        heading("Skills:")
	        cols = 3
	        rows = math.ceil(len(skills) / cols)
	        tbl = doc.add_table(rows=rows, cols=cols)
	        tbl.autofit = True
	        for idx, skill in enumerate(skills):
	            r, c = divmod(idx, cols)
	            tbl.cell(r, c).paragraphs[0].add_run(f"• {skill}").font.size = Pt(11)

	    # --- experience ---
	    if exps:
	        heading("Professional Experience:")
	        for e in exps:
	            if isinstance(e, str):
	                # A plain-string experience becomes a single bullet.
	                bullet(e, 0)
	                continue
	            if not isinstance(e, dict):
	                continue

	            # Job header: "Title | Company" on the left, dates on the right.
	            job_header = " | ".join(
	                filter(None, [e.get("title", ""), e.get("company", "")])
	            )
	            two_col(job_header, fmt_range(e.get("date_range", "")))

	            responsibilities = e.get("responsibilities", [])
	            if isinstance(responsibilities, str):
	                responsibilities = [responsibilities]
	            if isinstance(responsibilities, list):
	                for resp in responsibilities:
	                    if isinstance(resp, str) and resp.strip():
	                        bullet(resp, 1)
	    else:
	        # No structured experiences: derive a basic entry from the summary.
	        heading("Professional Experience:")
	        if summary and cur_title:
	            years_match = re.search(r"(\d+)\s+years?\s+of\s+experience", summary, re.I)
	            if years_match:
	                years_text = f"{years_match.group(1)} years of experience"
	            else:
	                years_text = "Multiple years of experience"
	            two_col(cur_title, years_text)

	            # Keep longer sentences that mention an experience-related keyword.
	            keywords = (
	                "expert", "specializing", "experience", "developing",
	                "designing", "implementing", "managing", "leading",
	            )
	            highlights = []
	            for sentence in re.split(r"[.!]", summary):
	                sentence = sentence.strip()
	                if len(sentence) > 30 and any(k in sentence.lower() for k in keywords):
	                    highlights.append(sentence)

	            for resp in highlights[:5]:  # Limit to 5 key points
	                bullet(resp, 1)
	        else:
	            # Fallback message
	            para = doc.add_paragraph()
	            para.add_run(
	                "Experience details are included in the Professional Summary above."
	            ).font.size = Pt(11)
	            para.add_run(
	                " For specific job titles, companies, and dates, please refer to the original resume."
	            ).font.size = Pt(11)

	    # --- job history timeline (chronological list) ---
	    if exps:
	        dated_exps = [
	            e for e in exps
	            if isinstance(e, dict) and e.get("title") and e.get("date_range")
	        ]
	        if dated_exps:
	            try:
	                # Most recent start date first.
	                timeline = sorted(dated_exps, key=_range_start, reverse=True)
	            except (AttributeError, TypeError, ValueError):
	                # If sorting fails, use original order
	                timeline = dated_exps

	            heading("Career Timeline:")
	            for exp in timeline:
	                title = exp.get("title", "")
	                company = exp.get("company", "")
	                # Format: "Job Title at Company (Dates)"
	                entry = f"{title} at {company}" if company else title
	                # date_range is non-empty here (guaranteed by the filter above).
	                dates = fmt_range(exp.get("date_range", ""))
	                entry += f" ({dates})"
	                bullet(entry, 0)

	    # --- education / training ---
	    processed_education = []
	    experience_years = None
	    for ed in _as_list(sections.get("Education")):
	        if not isinstance(ed, str):
	            continue
	        clean_ed = ed.replace("•", "").strip()
	        if not clean_ed:
	            continue
	        if re.match(r"^\d+\s+years?$", clean_ed, re.I):
	            # This is experience duration, not education
	            experience_years = clean_ed
	        else:
	            processed_education.append(clean_ed)

	    if processed_education:
	        heading("Education:")
	        for ed in processed_education:
	            bullet(ed)
	    elif experience_years:
	        # Only an experience duration was found: show it as a note.
	        heading("Education:")
	        para = doc.add_paragraph()
	        para.add_run(f"Professional experience: {experience_years}").font.size = Pt(11)

	    training_items = [
	        tr for tr in _as_list(sections.get("Training"))
	        if isinstance(tr, str) and tr.strip()
	    ]
	    if training_items:
	        heading("Training:")
	        for tr in training_items:
	            bullet(tr)

	    # Final diagnostic before returning
	    logger.info("BUILDER: FINAL STATE - Document has %d sections", len(doc.sections))
	    _log_header_footer_counts(doc, prefix="FINAL - ")

	    return doc