Commit 79b5c9c by Johnny
Parent: 102e49d

feat: Update resume builder with LFS-tracked assets
- Add header and footer images using Git LFS
- Update configuration and dependencies
- Improve resume builder and OpenAI extractor
- Update app components and utility functions
- Remove unused blank resume template
- .gitattributes +2 -0
- .gitignore +7 -0
- .streamlit/config.toml +0 -1
- app.py +57 -4
- config.py +105 -27
- footer.png +3 -0
- header.png +3 -0
- pages/Template.py +64 -13
- requirements.txt +2 -1
- templates/blank_resume.docx +0 -0
- utils/builder.py +192 -280
- utils/openai_extractor.py +142 -219
- utils/parser.py +1 -1
- utils/reporting.py +2 -2
- utils/screening.py +1 -1
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.docx filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
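
For context, everything matched by these patterns is stored in git as a small pointer file whose binary payload lives on the LFS server. A minimal sketch (hypothetical helper, not part of this commit) of how such a pointer can be recognized, based on the published LFS pointer format:

# Sketch: detect whether a checked-out file is an un-smudged Git LFS pointer.
# LFS pointer files begin with "version https://git-lfs.github.com/spec/v1";
# this helper is illustrative only and does not ship with the repo.
def is_lfs_pointer(path: str) -> bool:
    try:
        with open(path, "rb") as f:
            first_line = f.readline()
    except OSError:
        return False
    return first_line.startswith(b"version https://git-lfs.github.com/spec/v1")

print(is_lfs_pointer("header.png"))  # True if the file was never smudged locally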
.gitignore CHANGED
@@ -37,3 +37,10 @@ debug_*.docx
 .sfdx/
 *.cls
 apex.db
+
+.DS_Store
+utils/.DS_Store
+utils/cursor-updates
+utils/prompt-updates
+Youlin Joseph Li qvell.docx
+Template.py
.streamlit/config.toml CHANGED
@@ -7,7 +7,6 @@ font="sans serif"
 
 [ui]
 hideTopBar = false
-hideSidebarNav = true
 
 [server]
 headless = true
app.py CHANGED
@@ -1,6 +1,7 @@
 # TalentLens
 
 import os
+import time  # Add time module import
 from io import BytesIO
 
 import streamlit as st
@@ -8,13 +9,40 @@ import fitz  # PyMuPDF
 import requests
 from dotenv import load_dotenv
 
-from config import supabase, HF_API_TOKEN, HF_HEADERS,
+from config import supabase, HF_API_TOKEN, HF_HEADERS, HF_ENDPOINTS
 from utils.parser import parse_resume, extract_email, summarize_resume
 from utils.hybrid_extractor import extract_resume_sections
 from utils.builder import build_resume_from_data
 from utils.screening import evaluate_resumes
 from utils.reporting import generate_pdf_report, generate_interview_questions_from_summaries
 
+def toggle_endpoint(endpoint_name, action):
+    """Start or stop an endpoint"""
+    try:
+        from config import HF_HEADERS, HF_ENDPOINTS
+        # Use the health endpoint
+        endpoint_info = HF_ENDPOINTS[endpoint_name]
+        url = f"{endpoint_info['url']}/health"
+
+        # Use HEAD request to start the endpoint
+        response = requests.head(url, headers=HF_HEADERS)
+
+        if response.status_code == 503:
+            st.info("🚀 Starting endpoint... This may take 5-6 minutes. Click on 'Start' again to refresh status.")
+            time.sleep(2)  # Wait briefly before refreshing status
+            from config import check_endpoint_status
+            new_status = check_endpoint_status(endpoint_name)
+            st.session_state['endpoint_status'] = {endpoint_name: new_status}
+        elif response.status_code == 200:
+            st.success("✅ Endpoint is running")
+            time.sleep(2)  # Wait briefly before refreshing status
+            from config import check_endpoint_status
+            new_status = check_endpoint_status(endpoint_name)
+            st.session_state['endpoint_status'] = {endpoint_name: new_status}
+        else:
+            st.error(f"❌ Failed to {action} endpoint: {response.text}")
+    except Exception as e:
+        st.error(f"❌ Failed to {action} endpoint: {str(e)}")
 
 # ------------------------- Main App Function -------------------------
 def main():
@@ -61,11 +89,11 @@ def main():
 
     with col1:
         # Evaluation trigger
-        evaluate_clicked = st.button("
+        evaluate_clicked = st.button("\U0001F4CA Evaluate Resumes", type="primary", use_container_width=True)
 
     with col2:
         # Format Resume redirect button
-        format_clicked = st.button("
+        format_clicked = st.button("\U0001F4C4 Format Resume", use_container_width=True)
 
     # Handle Format Resume redirect
     if format_clicked:
@@ -81,7 +109,7 @@ def main():
         st.error("⚠️ Please upload at least one resume.")
         return
 
-    st.write("###
+    st.write("### �� Evaluating Resumes...")
 
     # Resume Evaluation
     shortlisted, removed_candidates = evaluate_resumes(uploaded_files, job_description)
@@ -109,6 +137,31 @@ def main():
     for removed in removed_candidates:
         st.write(f"**{removed['name']}** - {removed['reason']}")
 
+
+    # Get current status using DNS resolution
+    from config import check_endpoint_status
+    endpoint_name = "vzwjawyxvu030jsw"  # Updated to match endpoint ID
+    current_status = check_endpoint_status(endpoint_name)
+    state = current_status.get('status', 'unknown')
+
+    # Update session state with current status
+    st.session_state['endpoint_status'] = {endpoint_name: current_status}
+
+    # Show Start button and status
+    start_button = st.empty()  # Placeholder for Start button
+    if state in ['stopped', 'error']:
+        if start_button.button("▶️ Start", key=f"start_{endpoint_name}", use_container_width=True):
+            toggle_endpoint(endpoint_name, "start")
+            # Refresh status after starting
+            new_status = check_endpoint_status(endpoint_name)
+            st.session_state['endpoint_status'] = {endpoint_name: new_status}
+            if new_status.get('status') == 'running':
+                st.success("✅ Endpoint is running")
+            elif new_status.get('status') == 'starting':
+                st.info("🚀 Starting endpoint... This may take 5-6 minutes. Click on 'Start' again to refresh status.")
+            elif new_status.get('status') == 'error':
+                st.error(f"❌ Error: {new_status.get('error', 'Unknown error')}")
+
 # ------------------------- Run the App -------------------------
 if __name__ == "__main__":
     main()
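
Stripped of the Streamlit UI, the wake-up logic in toggle_endpoint above reduces to: probe the scaled-to-zero endpoint, treat 503 as a cold start in progress, and retry until 200. A minimal sketch under those assumptions (the URL, token, and timing values are placeholders, not from this commit):

import time
import requests

def wait_until_up(url: str, token: str, tries: int = 10, delay: float = 30.0) -> bool:
    """Ping an endpoint until it answers 200, treating 503 as 'still starting'."""
    headers = {"Authorization": f"Bearer {token}"}
    for _ in range(tries):
        resp = requests.head(url, headers=headers, timeout=10)
        if resp.status_code == 200:
            return True   # endpoint is serving
        if resp.status_code != 503:
            return False  # some other failure; don't keep polling
        time.sleep(delay)  # cold start in progress: wait and retry
    return False

The /health path used in the app follows the same pattern; the first request both wakes the endpoint and reports whether it is already warm.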
config.py CHANGED
@@ -2,6 +2,7 @@
 import os
 import time
 import requests
+import socket
 from dotenv import load_dotenv
 from supabase import create_client
 from sentence_transformers import SentenceTransformer
@@ -20,44 +21,121 @@ supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
 # === Embedding Model for Scoring ===
 embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 
-# === Hugging Face API Configuration
+# === Hugging Face API Configuration ===
 HF_API_TOKEN = os.getenv("HF_API_TOKEN")
 if not HF_API_TOKEN:
     raise ValueError("Missing Hugging Face API key. Check your .env file.")
+
+# Headers for API requests
 HF_HEADERS = {"Authorization": f"Bearer {HF_API_TOKEN}"}
 
 # === Hugging Face Model Endpoints ===
-[old endpoint definitions not captured in the page extraction]
+HF_ENDPOINTS = {
+    "bart-large-cnn-ovt": {
+        "url": "https://hedemwou4oqkk65c.us-east-1.aws.endpoints.huggingface.cloud",
+        "task": "summarization",
+        "model_id": "facebook/bart-large-cnn"
+    },
+    "vzwjawyxvu030jsw": {  # Updated endpoint name to match URL
+        "url": "https://vzwjawyxvu030jsw.us-east-1.aws.endpoints.huggingface.cloud",
+        "task": "text-generation",
+        "model_id": "google/gemma-7b"
+    }
 }
 
+def check_endpoint_status(endpoint_name: str) -> dict:
+    """
+    Check the status of a private Hugging Face endpoint using DNS resolution
+    """
+    if endpoint_name not in HF_ENDPOINTS:
+        return {
+            "status": "error",
+            "error": f"Unknown endpoint: {endpoint_name}"
+        }
+
+    try:
+        endpoint_info = HF_ENDPOINTS[endpoint_name]
+        hostname = endpoint_info['url'].replace('https://', '').split('/')[0]
+
+        # Try DNS resolution
+        try:
+            socket.gethostbyname(hostname)
+            # If DNS resolves, endpoint exists but may be stopped
+            return {
+                "status": "stopped",
+                "scaled": True,
+                "pending": 0,
+                "error": None
+            }
+        except socket.gaierror:
+            # If DNS fails, endpoint doesn't exist
+            return {
+                "status": "error",
+                "error": "Endpoint not found"
+            }
+    except Exception as e:
+        return {
+            "status": "error",
+            "error": str(e)
+        }
+
+def toggle_endpoint(endpoint_name: str, action: str) -> dict:
+    """
+    Start or stop a private Hugging Face endpoint
+    """
+    try:
+        # For private endpoints, use the Endpoints API
+        api_base = "https://api.endpoints.huggingface.cloud"
+        action_url = f"{api_base}/v2/endpoint/{endpoint_name}/{action}"
+
+        response = requests.post(
+            action_url,
+            headers=HF_HEADERS,
+            timeout=10
+        )
+
+        if response.status_code in [200, 202]:
+            return {
+                "success": True,
+                "message": f"Successfully {action}ed endpoint"
+            }
+        else:
+            return {
+                "error": f"Failed to {action} endpoint: {response.text}"
+            }
+    except Exception as e:
+        return {
+            "error": f"Failed to {action} endpoint: {str(e)}"
+        }
 
-# ===
-def query(payload
+# === Query Helper ===
+def query(payload: dict, endpoint_name: str) -> dict:
     """
-    [old docstring line not captured]
+    Send a query to a Hugging Face endpoint
     """
-    if
-    [old query body largely not captured in the page extraction]
+    if endpoint_name not in HF_ENDPOINTS:
+        return {
+            "error": f"Unknown endpoint: {endpoint_name}"
+        }
+
+    endpoint_info = HF_ENDPOINTS[endpoint_name]
+    url = endpoint_info['url']
+
+    try:
+        response = requests.post(
+            url,
+            headers=HF_HEADERS,
+            json=payload,
+            timeout=30
+        )
+
+        if response.status_code == 200:
             return response.json()
+        else:
+            return {
+                "error": f"Query failed with status {response.status_code}: {response.text}"
+            }
+    except Exception as e:
+        return {
+            "error": str(e)
+        }
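
For reference, the new query() helper passes the payload dict straight through to the endpoint; for the summarization endpoint this would typically be the standard {"inputs": ...} request shape. A hedged usage sketch (the input text and the printed response shape are illustrative, not part of this commit):

from config import query

# Summarization endpoints conventionally accept {"inputs": <text>} bodies
result = query({"inputs": "Long resume text to condense..."}, "bart-large-cnn-ovt")
if "error" in result:
    print("Request failed:", result["error"])
else:
    print(result)  # typically [{"summary_text": "..."}] for summarization models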
footer.png
ADDED
Binary image (stored via Git LFS)

header.png
ADDED
Binary image (stored via Git LFS)
pages/Template.py CHANGED
@@ -1,8 +1,10 @@
-# pages/
+# pages/Format_Resume.py
 
 import os, sys, streamlit as st
 import json
 from io import BytesIO
+import time  # Added for API status check
+import requests  # Added for endpoint control
 
 # Add parent directory to path so we can import utils
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -11,17 +13,13 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from dotenv import load_dotenv
 load_dotenv(override=True)
 
+from config import HF_ENDPOINTS  # Update import
 from utils.hybrid_extractor import extract_resume_sections
 from utils.builder import build_resume_from_data
-from utils.parser import parse_resume
-
-# Path to your blank template (header/footer only)
-template_path = os.path.join(
-    os.path.dirname(__file__), '..', 'templates', 'blank_resume.docx'
-)
+from utils.parser import parse_resume
 
 st.set_page_config(
-    page_title='Resume
+    page_title='Resume Formatter',
     layout='centered',
     initial_sidebar_state="collapsed"
 )
@@ -40,17 +38,70 @@ st.markdown("""
 </style>
 """, unsafe_allow_html=True)
 
+def toggle_endpoint(endpoint_name, action):
+    """Start or stop an endpoint"""
+    try:
+        from config import HF_HEADERS, HF_ENDPOINTS
+        # Use the health endpoint
+        endpoint_info = HF_ENDPOINTS[endpoint_name]
+        url = f"{endpoint_info['url']}/health"
+
+        # Use HEAD request to start the endpoint
+        response = requests.head(url, headers=HF_HEADERS)
+
+        if response.status_code == 503:
+            st.info("🚀 Starting endpoint... This may take 3-4 minutes. Click on 'Start' again to refresh status.")
+            time.sleep(2)  # Wait briefly before refreshing status
+            from config import check_endpoint_status
+            new_status = check_endpoint_status(endpoint_name)
+            st.session_state['endpoint_status'] = {endpoint_name: new_status}
+        elif response.status_code == 200:
+            st.success("✅ Endpoint is running")
+            time.sleep(2)  # Wait briefly before refreshing status
+            from config import check_endpoint_status
+            new_status = check_endpoint_status(endpoint_name)
+            st.session_state['endpoint_status'] = {endpoint_name: new_status}
+        else:
+            st.error(f"❌ Failed to {action} endpoint: {response.text}")
+    except Exception as e:
+        st.error(f"❌ Failed to {action} endpoint: {str(e)}")
+
 # Home button at the top
-if st.button("
+if st.button("\U0001F3E0 Home", help="Return to main TalentLens.AI page"):
     st.switch_page("app.py")
 
-st.title('📄 Resume
+st.title('📄 Resume Formatter')
 st.markdown("---")
 
 uploaded = st.file_uploader('Upload Resume (PDF or DOCX)', type=['pdf','docx'])
 if not uploaded:
     st.info("Please upload a resume to get started.")
-    [old line not captured in the page extraction]
+
+    # Get current status using DNS resolution
+    from config import check_endpoint_status
+    endpoint_name = "bart-large-cnn-ovt"
+    current_status = check_endpoint_status(endpoint_name)
+    state = current_status.get('status', 'unknown')
+
+    # Update session state with current status
+    st.session_state['endpoint_status'] = {endpoint_name: current_status}
+
+    # Show Start button and status
+    start_button = st.empty()  # Placeholder for Start button
+    if state in ['stopped', 'error']:
+        if start_button.button("▶️ Start", key=f"start_{endpoint_name}", use_container_width=True):
+            toggle_endpoint(endpoint_name, "start")
+            # Refresh status after starting
+            new_status = check_endpoint_status(endpoint_name)
+            st.session_state['endpoint_status'] = {endpoint_name: new_status}
+            if new_status.get('status') == 'running':
+                st.success("✅ Endpoint is running")
+            elif new_status.get('status') == 'starting':
+                st.info("🚀 Starting endpoint... This may take 3-4 minutes. Click on 'Start' again to refresh status.")
+            elif new_status.get('status') == 'error':
+                st.error(f"❌ Error: {new_status.get('error', 'Unknown error')}")
+
+    st.stop()  # Stop here if no file is uploaded
 
 st.success(f'Uploaded: {uploaded.name}')
 
@@ -239,7 +290,7 @@ if st.button('📄 Generate Formatted Resume', type='primary'):
     try:
         with st.spinner('Building formatted resume...'):
             # Build the resume document
-            doc = build_resume_from_data(
+            doc = build_resume_from_data(tmpl="", sections=data)
 
             # Save to buffer
             buf = BytesIO()
@@ -329,4 +380,4 @@ st.markdown(
     "🚀 <strong>TalentLens.AI</strong> - Powered by AI for intelligent resume processing"
     "</div>",
     unsafe_allow_html=True
-)
+)
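
The added status block relies on Streamlit's top-to-bottom rerun model: st.stop() halts the script, so nothing after the uploader executes until a file is present. A stripped-down sketch of that gating pattern:

import streamlit as st

uploaded = st.file_uploader("Upload Resume (PDF or DOCX)", type=["pdf", "docx"])
if not uploaded:
    st.info("Please upload a resume to get started.")
    # ...status widgets can be rendered here while waiting...
    st.stop()  # nothing below this line runs on this rerun

st.success(f"Uploaded: {uploaded.name}")  # reached only once a file exists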
requirements.txt CHANGED
@@ -10,4 +10,5 @@ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1
 openai
 fuzzywuzzy
 python-docx
-numpy<2.0
+numpy<2.0
+from torch._C import * # noqa: F403
templates/blank_resume.docx
DELETED
Binary file (48.2 kB)
utils/builder.py CHANGED
@@ -1,20 +1,19 @@
+import logging
+import os
+import re
 from datetime import datetime
 from dateutil.parser import parse as date_parse
-import re, math
 from docx import Document
-from docx.
-from docx.
-import logging
+from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_TAB_ALIGNMENT
+from docx.shared import Inches, Pt
 
 logger = logging.getLogger(__name__)
 
-# ---------- helpers ---------------------------------------------------
-def _date(dt_str:str)->datetime:
-    try: return date_parse(dt_str, default=datetime(1900,1,1))
-    except: return datetime(1900,1,1)
-
-def fmt_range(raw:str)->str:
-    [old line not captured in the page extraction]
+def fmt_range(raw: str) -> str:
+    """Formats a date range string nicely."""
+    if not raw:
+        return ""
     parts = [p.strip() for p in re.split(r"\s*[–-]\s*", raw)]
 
     formatted_parts = []
@@ -23,284 +22,197 @@ def fmt_range(raw:str)->str:
         formatted_parts.append("Present")
     else:
         try:
-            date_obj =
-            [old lines not captured in the page extraction]
+            date_obj = date_parse(part, fuzzy=True, default=datetime(1900, 1, 1))
+            if date_obj.year == 1900:
+                formatted_parts.append(part)
+            else:
+                formatted_parts.append(date_obj.strftime("%B %Y"))
+        except (ValueError, TypeError):
+            formatted_parts.append(part)
 
     return " – ".join(formatted_parts)
 
-# ---------- main ------------------------------------------------------
-def build_resume_from_data(tmpl:str, sections:dict)->Document:
-    logger.info(f"BUILDER: Attempting to load document template from: {tmpl}")
-    doc = Document(tmpl)
-    logger.info(f"BUILDER: Template {tmpl} loaded successfully.")
 
-    [old lines not captured in the page extraction]
-    tables_to_remove = list(doc.tables)  # Create a copy of the list
-    for table in tables_to_remove:
-        tbl = table._element
-        tbl.getparent().remove(tbl)
-
-    logger.info(f"BUILDER: After clearing - Document has {len(doc.paragraphs)} paragraphs and {len(doc.tables)} tables")
-
-    # Verify headers/footers are still intact
-    logger.info(f"BUILDER: After clearing - Document still has {len(doc.sections)} sections")
-    for i, section_obj in enumerate(doc.sections):
-        if section_obj.header:
-            logger.info(f"BUILDER: Section {i} header still has {len(section_obj.header.paragraphs)} paragraphs")
-        if section_obj.footer:
-            logger.info(f"BUILDER: Section {i} footer still has {len(section_obj.footer.paragraphs)} paragraphs")
-
-    logger.info(f"BUILDER: Template preserved with original headers and footers")
-
-    # --- easy builders ---
-    def heading(txt): pg=doc.add_paragraph(); r=pg.add_run(txt); r.bold=True; r.font.size=Pt(12)
-    def bullet(txt,lvl=0): p=doc.add_paragraph(); p.paragraph_format.left_indent=Pt(lvl*12); p.add_run(f"• {txt}").font.size=Pt(11)
-    def two_col(l,r):
-        tbl=doc.add_table(rows=1,cols=2); tbl.autofit=True
-        tbl.cell(0,0).paragraphs[0].add_run(l).bold=True
-        rp = tbl.cell(0,1).paragraphs[0]; rp.alignment=WD_ALIGN_PARAGRAPH.RIGHT
-        rr = rp.add_run(r); rr.italic=True
-
-    # --- header (name + current role) ---
-    exps = sections.get("StructuredExperiences",[])
-    if exps:
-        try:
-            # Filter to only dictionary experiences
-            dict_exps = [e for e in exps if isinstance(e, dict)]
-            if dict_exps:
-                newest = max(dict_exps, key=lambda e: _date(e.get("date_range","").split("–")[0] if "–" in e.get("date_range","") else e.get("date_range","").split("-")[0] if "-" in e.get("date_range","") else e.get("date_range","")))
-                cur_title = newest.get("title","")
-            else:
-                cur_title = ""
-        except:
-            # Fallback: try to get title from first dictionary experience
-            for exp in exps:
-                if isinstance(exp, dict) and exp.get("title"):
-                    cur_title = exp.get("title","")
-                    break
-            else:
-                cur_title = ""
-    else:
-        # Try to extract job title from summary if no structured experiences
-        cur_title = ""
-        summary = sections.get("Summary", "")
-        if summary:
-            # Look for job titles in the summary
-            title_patterns = [
-                r'(?i)(.*?engineer)',
-                r'(?i)(.*?developer)',
-                r'(?i)(.*?analyst)',
-                r'(?i)(.*?manager)',
-                r'(?i)(.*?specialist)',
-                r'(?i)(.*?consultant)',
-                r'(?i)(.*?architect)',
-                r'(?i)(.*?lead)',
-                r'(?i)(.*?director)',
-                r'(?i)(.*?coordinator)'
-            ]
-
-            for pattern in title_patterns:
-                match = re.search(pattern, summary)
-                if match:
-                    potential_title = match.group(1).strip()
-                    # Clean up the title
-                    potential_title = re.sub(r'^(results-driven|experienced|senior|junior|lead)\s+', '', potential_title, flags=re.I)
-                    if len(potential_title) > 3 and len(potential_title) < 50:
-                        cur_title = potential_title.title()
-                        break
-
-    if sections.get("Name"):
-        p=doc.add_paragraph(); p.alignment=WD_PARAGRAPH_ALIGNMENT.CENTER
-        run=p.add_run(sections["Name"]); run.bold=True; run.font.size=Pt(16)
-    if cur_title:
-        p=doc.add_paragraph(); p.alignment=WD_PARAGRAPH_ALIGNMENT.CENTER
-        p.add_run(cur_title).font.size=Pt(12)
-
-    # --- summary ---
-    if sections.get("Summary"):
-        heading("Professional Summary:")
-        pg=doc.add_paragraph(); pg.paragraph_format.first_line_indent=Pt(12)
-        pg.add_run(sections["Summary"]).font.size=Pt(11)
-
-    # --- skills ---
-    if sections.get("Skills"):
-        heading("Skills:")
-        skills = sorted(set(sections["Skills"]))
-        cols = 3
-        rows = math.ceil(len(skills)/cols)
-        tbl = doc.add_table(rows=rows, cols=cols); tbl.autofit=True
-        k=0
-        for r in range(rows):
-            for c in range(cols):
-                if k < len(skills):
-                    tbl.cell(r,c).paragraphs[0].add_run(f"• {skills[k]}").font.size=Pt(11)
-                    k+=1
-
-    # --- experience ---
-    if exps:
-        heading("Professional Experience:")
-        for e in exps:
-            # Ensure e is a dictionary, not a string
-            if isinstance(e, str):
-                # If it's a string, create a basic experience entry
-                bullet(e, 0)
-                continue
-            elif not isinstance(e, dict):
-                # Skip if it's neither string nor dict
-                continue
-
-            # Process dictionary experience entry
-            title = e.get("title", "")
-            company = e.get("company", "")
-            date_range = e.get("date_range", "")
-            responsibilities = e.get("responsibilities", [])
-
-            # Create the job header
-            two_col(" | ".join(filter(None, [title, company])),
-                    fmt_range(date_range))
-
-            # Add responsibilities
-            if isinstance(responsibilities, list):
-                for resp in responsibilities:
-                    if isinstance(resp, str) and resp.strip():
-                        bullet(resp, 1)
-            elif isinstance(responsibilities, str) and responsibilities.strip():
-                bullet(responsibilities, 1)
-    else:
-        # If no structured experiences found, try to extract from summary
-        heading("Professional Experience:")
-        summary = sections.get("Summary", "")
-
-    [old lines not captured in the page extraction]
-
-    # Filter to only dictionary experiences and sort by date (most recent first)
-    dict_exps = [e for e in exps if isinstance(e, dict) and e.get("title") and e.get("date_range")]
-
-    [old education-processing lines partially lost in the page extraction]
-    # Clean up the education entry (remove bullets)
-    clean_ed = ed.replace('•', '').strip()
-    if re.match(r'^\d+\s+years?$', clean_ed, re.I):
-        # This is experience duration, not education
-        experience_years = clean_ed
-    else:
-        processed_education.append(clean_ed)
-        has_real_education = True
-
-    # Show education section
-    if has_real_education:
-        heading("Education:")
-        for ed in processed_education:
-            bullet(ed)
-    elif experience_years:
-        # If only experience years found, show it as a note
-        heading("Education:")
-        pg = doc.add_paragraph()
-        pg.add_run(f"Professional experience: {experience_years}").font.size = Pt(11)
-
-    # Ensure tr is a string
-    if isinstance(tr, str) and tr.strip():
-        bullet(tr)
-
-    # Final diagnostic before returning
-    logger.info(f"BUILDER: FINAL STATE - Document has {len(doc.sections)} sections")
-    for i, section_obj in enumerate(doc.sections):
-        if section_obj.header:
-            logger.info(f"BUILDER: FINAL - Section {i} header has {len(section_obj.header.paragraphs)} paragraphs")
-        if section_obj.footer:
-            logger.info(f"BUILDER: FINAL - Section {i} footer has {len(section_obj.footer.paragraphs)} paragraphs")
-
-    return doc
+def add_section_heading(doc, text):
+    """Adds a centered section heading."""
+    p = doc.add_paragraph()
+    run = p.add_run(text.upper())
+    run.bold = True
+    font = run.font
+    font.size = Pt(12)
+    font.name = 'Arial'
+    p.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
+    p.paragraph_format.space_after = Pt(6)
+
+
+def build_resume_from_data(tmpl: str, sections: dict, remove_blank_pages_enabled: bool = True) -> Document:
+    """
+    Builds a formatted resume from structured data, inserting header/footer images and logging the process.
+    """
+    logger.info("BUILDER: Starting image-based resume build process.")
+    try:
+        # 1. Create a new blank document, ignoring the template file
+        doc = Document()
+        logger.info("BUILDER: Successfully created a new blank document.")
+
+        # Get section and enable different first page header/footer
+        section = doc.sections[0]
+        section.different_first_page = True
+
+        # Move header and footer to the very edge of the page
+        section.header_distance = Pt(0)
+        section.footer_distance = Pt(0)
+        logger.info("BUILDER: Set header/footer distance to 0 to remove whitespace.")
+
+        # 2. Define image paths relative to the project root
+        script_dir = os.path.dirname(os.path.abspath(__file__))
+        project_root = os.path.dirname(script_dir)
+        header_path = os.path.join(project_root, 'header.png')
+        footer_path = os.path.join(project_root, 'footer.png')
+
+        logger.info(f"BUILDER: Attempting to use header image from: {header_path}")
+        logger.info(f"BUILDER: Attempting to use footer image from: {footer_path}")
+
+        if not os.path.exists(header_path):
+            logger.error(f"BUILDER FATAL: Header image not found at '{header_path}'. Cannot proceed.")
+            return doc  # Return empty doc
+        if not os.path.exists(footer_path):
+            logger.error(f"BUILDER FATAL: Footer image not found at '{footer_path}'. Cannot proceed.")
+            return doc  # Return empty doc
+
+        # 3. Setup Headers
+        candidate_name = sections.get("Name", "Candidate Name Not Found")
+        experiences = sections.get("StructuredExperiences", [])
+        job_title = experiences[0].get("title", "") if experiences else ""
+
+        # -- First Page Header (Image + Name + Title) --
+        first_page_header = section.first_page_header
+        first_page_header.is_linked_to_previous = False
+
+        # Safely get or create a paragraph for the image
+        p_header_img_first = first_page_header.paragraphs[0] if first_page_header.paragraphs else first_page_header.add_paragraph()
+        p_header_img_first.clear()
+
+        p_header_img_first.paragraph_format.space_before = Pt(0)
+        p_header_img_first.paragraph_format.space_after = Pt(0)
+        p_header_img_first.paragraph_format.left_indent = -section.left_margin
+        p_header_img_first.add_run().add_picture(header_path, width=section.page_width)
+        logger.info("BUILDER: Inserted header.png into FIRST PAGE header.")
+
+        # Add Name
+        p_name = first_page_header.add_paragraph()
+        run_name = p_name.add_run(candidate_name.upper())
+        run_name.font.name = 'Arial'
+        run_name.font.size = Pt(14)
+        run_name.bold = True
+        p_name.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
+        p_name.paragraph_format.space_before = Pt(6)
+        p_name.paragraph_format.space_after = Pt(0)
+        logger.info(f"BUILDER: Added candidate name '{candidate_name}' to FIRST PAGE header.")
+
+        # Add Job Title
+        if job_title:
+            p_title = first_page_header.add_paragraph()
+            run_title = p_title.add_run(job_title)
+            run_title.font.name = 'Arial'
+            run_title.font.size = Pt(11)
+            p_title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
+            p_title.paragraph_format.space_before = Pt(0)
+            logger.info(f"BUILDER: Added job title '{job_title}' to FIRST PAGE header.")
+
+        # -- Primary Header for subsequent pages (Image Only) --
+        primary_header = section.header
+        primary_header.is_linked_to_previous = False
+
+        # Safely get or create a paragraph for the image
+        p_header_img_primary = primary_header.paragraphs[0] if primary_header.paragraphs else primary_header.add_paragraph()
+        p_header_img_primary.clear()
+
+        p_header_img_primary.paragraph_format.space_before = Pt(0)
+        p_header_img_primary.paragraph_format.space_after = Pt(0)
+        p_header_img_primary.paragraph_format.left_indent = -section.left_margin
+        p_header_img_primary.add_run().add_picture(header_path, width=section.page_width)
+        logger.info("BUILDER: Inserted header.png into PRIMARY header for subsequent pages.")
+
+        # 4. Insert Footer Image (same for all pages)
+        footer = section.footer
+        footer.is_linked_to_previous = False
+
+        # Safely get or create a paragraph for the image
+        p_footer_img = footer.paragraphs[0] if footer.paragraphs else footer.add_paragraph()
+        p_footer_img.clear()
+
+        p_footer_img.paragraph_format.space_before = Pt(0)
+        p_footer_img.paragraph_format.space_after = Pt(0)
+        p_footer_img.paragraph_format.left_indent = -section.left_margin
+        p_footer_img.add_run().add_picture(footer_path, width=section.page_width)
+
+        # Link the first page footer to the primary footer so we only define it once.
+        section.first_page_footer.is_linked_to_previous = True
+        logger.info("BUILDER: Inserted footer.png and configured for all pages.")
+
+        # 5. Build Resume Body
+        logger.info("BUILDER: Proceeding to add structured resume content to document body.")
+
+        # --- Professional Summary ---
+        if sections.get("Summary"):
+            add_section_heading(doc, "Professional Summary")
+            doc.add_paragraph(sections["Summary"]).paragraph_format.space_after = Pt(12)
+
+        # --- Skills ---
+        if sections.get("Skills"):
+            add_section_heading(doc, "Skills")
+            skills_text = ", ".join(sections["Skills"])
+            p = doc.add_paragraph(skills_text)
+            p.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
+            p.paragraph_format.space_after = Pt(12)
+
+        # --- Professional Experience ---
+        if experiences:
+            add_section_heading(doc, "Professional Experience")
+            for exp in experiences:
+                if not isinstance(exp, dict):
+                    continue
+
+                p = doc.add_paragraph()
+                p.add_run(exp.get("title", "N/A")).bold = True
+                p.add_run(" | ").bold = True
+                p.add_run(exp.get("company", "N/A")).italic = True
+                p.add_run(f'\t{fmt_range(exp.get("date_range", ""))}')
+
+                tab_stops = p.paragraph_format.tab_stops
+                tab_stops.add_tab_stop(Inches(6.5), WD_TAB_ALIGNMENT.RIGHT)
+
+                responsibilities = exp.get("responsibilities", [])
+                if responsibilities and isinstance(responsibilities, list):
+                    for resp in responsibilities:
+                        if resp.strip():
+                            try:
+                                p_resp = doc.add_paragraph(resp, style='List Bullet')
+                            except KeyError:
+                                p_resp = doc.add_paragraph(f"• {resp}")
+
+                            p_resp.paragraph_format.left_indent = Inches(0.25)
+                            p_resp.paragraph_format.space_before = Pt(0)
+                            p_resp.paragraph_format.space_after = Pt(3)
+
+                doc.add_paragraph().paragraph_format.space_after = Pt(6)
+
+        # --- Education ---
+        if sections.get("Education"):
+            add_section_heading(doc, "Education")
+            for edu in sections.get("Education", []):
+                if edu.strip():
+                    try:
+                        p_edu = doc.add_paragraph(edu, style='List Bullet')
+                    except KeyError:
+                        p_edu = doc.add_paragraph(f"• {edu}")
+
+                    p_edu.paragraph_format.left_indent = Inches(0.25)
+
+        logger.info("BUILDER: Resume build process completed successfully.")
+        return doc
+
+    except Exception:
+        logger.error("BUILDER: An unexpected error occurred during resume generation.", exc_info=True)
+        return Document()
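
Given the fmt_range() rewrite above, recognizable dates render as "Month Year" and anything dateutil cannot parse falls back to the raw token. Expected behavior, following directly from the code:

from utils.builder import fmt_range

print(fmt_range("Jan 2021 - Present"))   # January 2021 – Present
print(fmt_range("Jun 2017 - Dec 2020"))  # June 2017 – December 2020
print(fmt_range(""))                     # "" (guarded by the new early return)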
utils/openai_extractor.py CHANGED
@@ -1,165 +1,175 @@
-#!/usr/bin/env python3
 """
-OpenAI
-
-This module provides resume extraction using OpenAI's GPT-4o model (GPT-4.1),
-which is the latest and most capable model for complex resume parsing.
+OpenAI-based resume data extraction.
+Uses GPT models to extract structured information from resume text.
 """
 
 import json
 import re
 import logging
-import os
 from typing import Dict, Any, List, Optional
+
+import openai
 from openai import OpenAI
 
-#
-logging.basicConfig(level=logging.INFO)
+# Set up logging
 logger = logging.getLogger(__name__)
 
+
 class OpenAIResumeExtractor:
     """
-    [old class docstring not captured in the page extraction]
+    Resume data extractor using OpenAI's GPT models.
     """
 
     def __init__(self, api_key: Optional[str] = None, model: str = "gpt-4o"):
-        """
-        Args:
-            api_key: OpenAI API key (optional, will use env var if not provided)
-            model: OpenAI model to use (gpt-4o is the latest and most capable GPT-4 model)
-        """
-        self.api_key = api_key or os.getenv('OPENAI_API_KEY')
+        """Initialize with OpenAI API key and model."""
+        self.client = OpenAI(api_key=api_key) if api_key else OpenAI()
         self.model = model
-
-        if not self.api_key:
-            raise ValueError("No OpenAI API key found. Set OPENAI_API_KEY environment variable.")
-
-        self.client = OpenAI(api_key=self.api_key)
+        logger.info(f"OpenAI extractor initialized with model: {model}")
 
     def extract_sections_openai(self, text: str) -> Dict[str, Any]:
         """
-        Extract resume sections using OpenAI
+        Extract resume sections using OpenAI API.
 
         Args:
             text: Raw resume text
 
         Returns:
-            [old line not captured in the page extraction]
+            Dict containing extracted sections
         """
-        logger.info("Starting OpenAI
+        logger.info("Starting OpenAI extraction...")
 
         try:
-            # Create
+            # Create extraction prompt
             prompt = self._create_extraction_prompt(text)
 
-            #
+            # Call OpenAI API
             response = self.client.chat.completions.create(
                 model=self.model,
                 messages=[
-                    {
-                        "content": "You are an expert resume parser. Extract information accurately and return valid JSON only."
-                    },
-                    {
-                        "role": "user",
-                        "content": prompt
-                    }
+                    {"role": "system", "content": "You are an expert resume parser. Extract information and return ONLY valid JSON."},
+                    {"role": "user", "content": prompt}
                 ],
-            temperature=0.1,
+                temperature=0.1,
                 max_tokens=2000
             )
 
-            # Parse
+            # Parse response
+            content = response.choices[0].message.content.strip()
+            logger.debug(f"OpenAI response: {content[:200]}...")
 
-            # Clean
-            elif "```" in result_text:
-                result_text = result_text.split("```")[1]
+            # Clean and parse JSON
+            content = self._clean_json_response(content)
+            result = json.loads(content)
 
-            #
-            result = json.loads(result_text)
-
-            # Validate and clean the result
+            # Validate and enhance result
             result = self._validate_and_clean_result(result)
 
-            #
+            # Add contact info extraction
             contact_info = self._extract_contact_info(text)
             result["ContactInfo"] = contact_info
 
             logger.info("✅ OpenAI extraction completed successfully")
             return result
 
+        except json.JSONDecodeError as e:
+            logger.error(f"JSON parsing error: {e}")
+            logger.debug(f"Response content: {content}")
+            return self._fallback_extraction(text)
+
         except Exception as e:
             logger.error(f"OpenAI extraction failed: {e}")
-            return self._get_empty_result()
-
-            return self._fallback_extraction(text)
+            return self._fallback_extraction(text)
+
+    def _clean_json_response(self, content: str) -> str:
+        """Clean JSON response from OpenAI."""
+        # Remove markdown code blocks
+        content = re.sub(r'```json\s*', '', content)
+        content = re.sub(r'```\s*$', '', content)
+
+        # Remove any text before first {
+        start = content.find('{')
+        if start > 0:
+            content = content[start:]
+
+        # Remove any text after last }
+        end = content.rfind('}')
+        if end > 0 and end < len(content) - 1:
+            content = content[:end + 1]
+
+        return content.strip()
 
     def _create_extraction_prompt(self, text: str) -> str:
-        """Create
+        """Create prompt for OpenAI extraction."""
         prompt = f"""
-Extract
-
-RESUME TEXT:
-{text}
-
-Extract and return ONLY a JSON object with this exact structure:
+Extract information from this resume and return ONLY valid JSON in this exact format:
 
 {{
-    [old JSON schema not captured in the page extraction]
+    "Name": "Full Name with credentials (PhD, MBA, etc.)",
+    "Summary": "Professional summary or objective",
+    "Skills": ["skill1", "skill2", "skill3"],
+    "StructuredExperiences": [
+        {{
+            "title": "Job Title",
+            "company": "Company Name",
+            "date_range": "Start Date - End Date",
+            "responsibilities": ["responsibility1", "responsibility2"]
+        }}
+    ],
+    "Education": ["degree info", "school info"],
+    "Training": ["certification1", "training1"],
+    "Address": "Full address if available"
}}
 
-2. Summary: Extract the complete professional summary/objective section
-3. Skills: Extract technical skills only (programming languages, tools, frameworks)
-4. StructuredExperiences: For each job, extract:
-   - title: The job title/position
-   - company: Company name (include location if provided)
-   - date_range: Employment dates
-   - responsibilities: List of bullet points describing what they did
-5. Education: Extract degrees, institutions, and graduation years
-6. Training: Extract certifications, courses, training programs
+Resume text:
+{text}
 
+CRITICAL INSTRUCTIONS:
+- For NAME: Include ALL credentials (PhD, MBA, M.S., B.S., etc.) - example: "John Doe, PhD, MBA"
+- Read the ENTIRE resume text carefully, don't miss content
+- Extract ALL work experiences with full details
 - Return ONLY valid JSON, no explanations
 - If a section is not found, use empty string or empty array
-- For experiences, look for patterns like "Title | Company | Dates" or similar
-- Extract ALL job experiences found in the resume
-- Include ALL bullet points under each job as responsibilities
+- Extract actual technical skills, not company names
 """
         return prompt
 
+    def _extract_contact_info(self, text: str) -> Dict[str, str]:
+        """Extract contact information from resume text."""
+        contact_info = {}
+
+        # Extract email
+        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
+        email_match = re.search(email_pattern, text)
+        if email_match:
+            contact_info['email'] = email_match.group()
+
+        # Extract phone number
+        phone_patterns = [
+            r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
+            r'\+1[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
+            r'\d{3}[-.\s]?\d{3}[-.\s]?\d{4}'
+        ]
+
+        for pattern in phone_patterns:
+            phone_match = re.search(pattern, text)
+            if phone_match:
+                contact_info['phone'] = phone_match.group().strip()
+                break
+
+        # Extract LinkedIn
+        linkedin_pattern = r'linkedin\.com/in/[A-Za-z0-9-]+'
+        linkedin_match = re.search(linkedin_pattern, text)
+        if linkedin_match:
+            contact_info['linkedin'] = linkedin_match.group()
+
+        logger.info(f"OPENAI: Extracted ContactInfo as dict: {contact_info}")
+        return contact_info
+
     def _validate_and_clean_result(self, result: Dict[str, Any]) -> Dict[str, Any]:
-        """Validate and clean the extraction result"""
+        """Validate and clean the extraction result."""
 
         # Ensure all required keys exist
-        required_keys = ["Name", "Summary", "Skills", "StructuredExperiences", "Education", "Training"]
+        required_keys = ["Name", "Summary", "Skills", "StructuredExperiences", "Education", "Training", "Address"]
        for key in required_keys:
             if key not in result:
                 result[key] = [] if key in ["Skills", "StructuredExperiences", "Education", "Training"] else ""
@@ -187,59 +197,45 @@ IMPORTANT:
 
         return result
 
-    def _get_empty_result(self) -> Dict[str, Any]:
-        """Return empty result structure for API failures"""
-        return {
-            "Name": "",
-            "Summary": "",
-            "Skills": [],
-            "StructuredExperiences": [],
-            "Education": [],
-            "Training": [],
-            "ContactInfo": {}
-        }
-
     def _is_company_name(self, text: str) -> bool:
-        """Check if text looks like a company name rather than a skill"""
+        """Check if text looks like a company name rather than a skill."""
         company_indicators = [
             "inc", "llc", "corp", "ltd", "company", "solutions", "services",
             "systems", "technologies", "financial", "insurance"
         ]
         text_lower = text.lower()
         return any(indicator in text_lower for indicator in company_indicators)
 
     def _fallback_extraction(self, text: str) -> Dict[str, Any]:
-        """Fallback to regex-based extraction if OpenAI fails"""
+        """Fallback to regex-based extraction if OpenAI fails."""
         logger.info("Using regex fallback extraction...")
-        [old return dict partially lost in the page extraction]
-            "Training": [],
-            "ContactInfo": self._extract_contact_info(text)
-        }
+
+        return {
+            "Name": self._extract_name_regex(text),
+            "Summary": self._extract_summary_regex(text),
+            "Skills": self._extract_skills_regex(text),
+            "StructuredExperiences": self._extract_experiences_regex(text),
+            "Education": self._extract_education_regex(text),
+            "Training": [],
+            "Address": self._extract_address_regex(text),
+            "ContactInfo": self._extract_contact_info(text)
+        }
 
     def _extract_name_regex(self, text: str) -> str:
-        """Regex fallback for name extraction"""
+        """Regex fallback for name extraction."""
         lines = text.split('\n')[:5]
         for line in lines:
             line = line.strip()
             if re.search(r'@|phone|email|linkedin|github', line.lower()):
                 continue
-            [old name pattern not captured in the page extraction]
+            # Match name with potential credentials (PhD, MBA, etc.)
+            name_match = re.match(r'^([A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?(?:,\s*[A-Z][a-z.]+(?:,\s*[A-Z][a-z.]+)?)?)', line)
             if name_match:
                 return name_match.group(1)
         return ""
 
     def _extract_summary_regex(self, text: str) -> str:
-        """Regex fallback for summary extraction"""
         summary_pattern = r'(?i)(?:professional\s+)?summary[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))'
         match = re.search(summary_pattern, text, re.DOTALL)
         if match:
@@ -250,7 +246,7 @@ IMPORTANT:
         return ""
 
     def _extract_skills_regex(self, text: str) -> List[str]:
-        """Regex fallback for skills extraction"""
         skills = set()
 
         # Look for technical skills section
@@ -269,7 +265,7 @@ IMPORTANT:
         return sorted(list(skills))
 
     def _extract_experiences_regex(self, text: str) -> List[Dict[str, Any]]:
-        """Regex fallback for experience extraction"""
         experiences = []
 
         # Look for work experience section
@@ -303,7 +299,7 @@ IMPORTANT:
         return experiences
 
     def _extract_education_regex(self, text: str) -> List[str]:
-        """Regex fallback for education extraction"""
         education = []
 
         edu_pattern = r'(?i)education[:\s]*\n(.*?)(?=\n\s*(?:certifications?|projects?|$))'
@@ -319,98 +315,25 @@ IMPORTANT:
 
         return education
 
-    def
-        """
-        [old _extract_contact_info body partially lost in the page extraction]
-        contact_info["email"] = email_match.group(0)
-
-        # Extract phone
-        phone_patterns = [
-            r'\+?1?[-.\s]?\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})',
-            r'(\d{3})[-.\s](\d{3})[-.\s](\d{4})',
-            r'\+\d{1,3}[-.\s]?\d{3}[-.\s]?\d{3}[-.\s]?\d{4}'
-        ]
-
-        for pattern in phone_patterns:
-            phone_match = re.search(pattern, text)
-            if phone_match:
-                contact_info["phone"] = phone_match.group(0)
-                break
-
-        # Extract LinkedIn
-        linkedin_patterns = [
-            r'linkedin\.com/in/[\w-]+',
-            r'linkedin\.com/[\w-]+',
-            r'(?i)linkedin[:\s]+[\w.-]+',
         ]
 
-        for pattern in
-            if
-                if not linkedin_url.startswith('http'):
-                    linkedin_url = f"https://{linkedin_url}"
-                contact_info["linkedin"] = linkedin_url
-                break
 
-        return
 
-#
 def extract_sections_openai(text: str, api_key: Optional[str] = None) -> Dict[str, Any]:
-    """
-    Extract resume sections using OpenAI GPT-4o (GPT-4.1)
-
-    Args:
-        text: Raw resume text
-        api_key: OpenAI API key (optional)
-
-    Returns:
-        Structured resume data
-    """
     extractor = OpenAIResumeExtractor(api_key=api_key)
-    return extractor.extract_sections_openai(text)
-
-# Test function
-def test_openai_extraction():
-    """Test the OpenAI extraction with sample resume"""
-
-    sample_text = """
-    John Doe
-    Selenium Java Automation Engineer
-    Email: johndoe@example.com | Phone: +1-123-456-7890
-
-    Professional Summary
-    Results-driven Automation Test Engineer with 8 years of experience in Selenium and Java,
-    specializing in automation frameworks for financial and insurance domains.
-
-    Technical Skills
-    Selenium WebDriver, Java, TestNG, Cucumber, Jenkins, Maven, Git, REST Assured, Postman,
-    JIRA, Agile/Scrum, CI/CD
-
-    Work Experience
-    Senior Automation Test Engineer | ABC Financial Services | Jan 2021 - Present
-    - Led automation framework enhancements using Selenium and Java, improving test efficiency.
-    - Automated end-to-end UI and API testing for financial applications, reducing manual effort by 40%.
-
-    Automation Test Engineer | XYZ Insurance Solutions | Jun 2017 - Dec 2020
-    - Designed and implemented Selenium automation framework using Java and TestNG.
-    - Developed automated test scripts for insurance policy management applications.
-
-    Education
-    Bachelor of Technology in Computer Science | ABC University | 2015
-    """
-
-    extractor = OpenAIResumeExtractor()
-    result = extractor.extract_sections_openai(sample_text)
-
-    print("OpenAI Extraction Results:")
-    print(json.dumps(result, indent=2))
-
-    return result
-
-if __name__ == "__main__":
-    test_openai_extraction()

[the new side of the remaining hunks was truncated in the page capture]
234 |
return name_match.group(1)
|
235 |
return ""
|
236 |
|
237 |
def _extract_summary_regex(self, text: str) -> str:
|
238 |
+
"""Regex fallback for summary extraction."""
|
239 |
summary_pattern = r'(?i)(?:professional\s+)?summary[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))'
|
240 |
match = re.search(summary_pattern, text, re.DOTALL)
|
241 |
if match:
|
|
|
246 |
return ""
|
247 |
|
248 |
def _extract_skills_regex(self, text: str) -> List[str]:
|
249 |
+
"""Regex fallback for skills extraction."""
|
250 |
skills = set()
|
251 |
|
252 |
# Look for technical skills section
|
|
|
265 |
return sorted(list(skills))
|
266 |
|
267 |
def _extract_experiences_regex(self, text: str) -> List[Dict[str, Any]]:
|
268 |
+
"""Regex fallback for experience extraction."""
|
269 |
experiences = []
|
270 |
|
271 |
# Look for work experience section
|
|
|
299 |
return experiences
|
300 |
|
301 |
def _extract_education_regex(self, text: str) -> List[str]:
|
302 |
+
"""Regex fallback for education extraction."""
|
303 |
education = []
|
304 |
|
305 |
edu_pattern = r'(?i)education[:\s]*\n(.*?)(?=\n\s*(?:certifications?|projects?|$))'
|
|
|
315 |
|
316 |
return education
|
317 |
|
318 |
+
def _extract_address_regex(self, text: str) -> str:
|
319 |
+
"""Regex fallback for address extraction."""
|
320 |
+
# Look for address patterns like "6001 Tain Dr. Suite 203, Dublin, OH, 43016"
|
321 |
+
address_patterns = [
|
322 |
+
r'(\d+\s+[A-Za-z\s\.]+(?:Suite|Apt|Unit)\s+\d+,?\s*[A-Za-z\s]+,\s*[A-Z]{2}\s*\d{5})',
|
323 |
+
r'(\d+\s+[A-Za-z\s\.]+,?\s*[A-Za-z\s]+,\s*[A-Z]{2}\s*\d{5})',
|
324 |
+
r'([A-Za-z\s\d\.]+,\s*[A-Za-z\s]+,\s*[A-Z]{2}\s*\d{5})'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
325 |
]
|
326 |
|
327 |
+
for pattern in address_patterns:
|
328 |
+
match = re.search(pattern, text)
|
329 |
+
if match:
|
330 |
+
return match.group(1).strip()
|
|
|
|
|
|
|
|
|
331 |
|
332 |
+
return ""
|
333 |
+
|
334 |
|
335 |
+
# Main extraction function for compatibility
|
336 |
def extract_sections_openai(text: str, api_key: Optional[str] = None) -> Dict[str, Any]:
|
337 |
+
"""Extract resume sections using OpenAI API."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
338 |
extractor = OpenAIResumeExtractor(api_key=api_key)
|
339 |
+
return extractor.extract_sections_openai(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
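For reference, here is a minimal usage sketch of the reworked extractor. It assumes OPENAI_API_KEY is set in the environment and that you run from the repo root; the sample resume text and the printed values are illustrative, not taken from the commit:

    from utils.openai_extractor import extract_sections_openai

    sample_resume = """Jane Roe, PhD
    jane.roe@example.com | (614) 555-0101 | linkedin.com/in/janeroe

    Professional Summary
    Data scientist with eight years of experience in NLP and MLOps.
    """

    # Falls back to the regex extractors automatically if the API call
    # or the JSON parse fails.
    sections = extract_sections_openai(sample_resume)
    print(sections["Name"])         # e.g. "Jane Roe, PhD" (credentials preserved)
    print(sections["ContactInfo"])  # e.g. {'email': 'jane.roe@example.com', 'phone': '(614) 555-0101', ...}

The _clean_json_response pass matters here because chat models routinely wrap their JSON in markdown fences; trimming everything outside the outermost braces before json.loads runs is what keeps the happy path from dropping into the regex fallback.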
utils/parser.py
CHANGED
@@ -3,7 +3,7 @@ import fitz  # PyMuPDF
3 |      import re
4 |      from io import BytesIO
5 |      from docx import Document
6 | -    from config import supabase, embedding_model,
6 | +    from config import supabase, embedding_model, HF_ENDPOINTS, query
7 |
8 |      def extract_name(resume_text: str) -> str:
9 |          # look at the very top lines for a capitalized full name
utils/reporting.py
CHANGED
@@ -2,7 +2,7 @@
 2 |      import re
 3 |      import fitz  # PyMuPDF
 4 |      from io import BytesIO
 5 | -    from config import supabase, embedding_model,
 5 | +    from config import supabase, embedding_model, query
 6 |      from .screening import evaluate_resumes
 7 |
 8 |      def generate_pdf_report(shortlisted_candidates, questions=None):
@@ -45,7 +45,7 @@ def generate_interview_questions_from_summaries(candidates):
45 |          )
46 |
47 |          try:
48 | -            response =
48 | +            response = supabase.ai.chat.completions.create(
49 |                  model="tgi",
50 |                  messages=[{"role": "user", "content": prompt}],
51 |                  temperature=0.7,
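The restored completion call uses the OpenAI-compatible chat interface that Hugging Face TGI endpoints expose, which is why the model is the literal string "tgi". A standalone sketch of that call pattern, assuming an OpenAI-style client and an illustrative endpoint URL (the client construction below is not part of this commit):

    from openai import OpenAI

    # Hypothetical TGI endpoint; in the app the configured client comes from config.py.
    client = OpenAI(base_url="https://my-tgi-endpoint.example.com/v1", api_key="hf_xxx")

    response = client.chat.completions.create(
        model="tgi",  # TGI's OpenAI-compatible route accepts this placeholder model name
        messages=[{"role": "user", "content": "Write one interview question for a data engineer."}],
        temperature=0.7,
    )
    print(response.choices[0].message.content)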
utils/screening.py
CHANGED
@@ -2,7 +2,7 @@
2 |      from .parser import parse_resume, extract_email, summarize_resume
3 |      from .hybrid_extractor import extract_resume_sections
4 |      from .spacy_loader import get_nlp, is_spacy_available
5 | -    from config import supabase, embedding_model
5 | +    from config import supabase, embedding_model
6 |      from fuzzywuzzy import fuzz
7 |      from sentence_transformers import util
8 |      import streamlit as st