Spaces:

gauravbox
/

TalentLensAI

Running

File size: 30,573 Bytes

c2f9ec8

#!/usr/bin/env python3
"""
Hugging Face Cloud Resume Extractor

This module provides resume extraction using Hugging Face's Inference API,
suitable for production deployment with cloud-based AI models.
"""

import json
import re
import logging
import requests
import os
from typing import Dict, Any, List, Optional
from time import sleep

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class HuggingFaceCloudExtractor:
    """
    Production-ready resume extractor using Hugging Face Inference API
    """
    
    def __init__(self, api_key: Optional[str] = None, model_name: str = "microsoft/DialoGPT-medium"):
        """
        Initialize the cloud extractor
        
        Args:
            api_key: Hugging Face API key (optional, will use env var if not provided)
            model_name: Name of the Hugging Face model to use
        """
        self.api_key = api_key or os.getenv('HF_API_TOKEN') or os.getenv('HUGGINGFACE_API_KEY')
        self.model_name = model_name
        self.base_url = "https://api-inference.huggingface.co/models"
        
        # Available models for different tasks
        self.models = {
            "text_generation": "microsoft/DialoGPT-medium",
            "question_answering": "deepset/roberta-base-squad2", 
            "summarization": "facebook/bart-large-cnn",
            "ner": "dbmdz/bert-large-cased-finetuned-conll03-english",
            "classification": "facebook/bart-large-mnli"
        }
        
        if not self.api_key:
            logger.warning("No Hugging Face API key found. Set HF_API_TOKEN or HUGGINGFACE_API_KEY environment variable.")
    
    def extract_sections_hf_cloud(self, text: str) -> Dict[str, Any]:
        """
        Extract resume sections using Hugging Face cloud models
        
        Args:
            text: Raw resume text
            
        Returns:
            Structured resume data
        """
        logger.info("Starting Hugging Face cloud extraction...")
        
        if not self.api_key:
            logger.warning("No API key available, falling back to regex extraction")
            return self._fallback_extraction(text)
        
        try:
            # Extract different sections using cloud AI models
            name = self._extract_name_cloud(text)
            summary = self._extract_summary_cloud(text)
            skills = self._extract_skills_cloud(text)
            experiences = self._extract_experiences_cloud(text)
            education = self._extract_education_cloud(text)
            contact_info = self._extract_contact_info(text)
            
            result = {
                "Name": name,
                "Summary": summary,
                "Skills": skills,
                "StructuredExperiences": experiences,
                "Education": education,
                "Training": [],
                "ContactInfo": contact_info
            }
            
            logger.info("✅ Hugging Face cloud extraction completed")
            return result
            
        except Exception as e:
            logger.error(f"Hugging Face cloud extraction failed: {e}")
            return self._fallback_extraction(text)
    
    def _make_api_request(self, model_name: str, payload: Dict[str, Any], max_retries: int = 3) -> Dict[str, Any]:
        """
        Make a request to Hugging Face Inference API with retry logic
        
        Args:
            model_name: Name of the model to use
            payload: Request payload
            max_retries: Maximum number of retries
            
        Returns:
            API response
        """
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        
        url = f"{self.base_url}/{model_name}"
        
        for attempt in range(max_retries):
            try:
                response = requests.post(url, headers=headers, json=payload, timeout=30)
                
                if response.status_code == 200:
                    return response.json()
                elif response.status_code == 503:
                    # Model is loading, wait and retry
                    logger.info(f"Model {model_name} is loading, waiting...")
                    sleep(10)
                    continue
                else:
                    logger.error(f"API request failed: {response.status_code} - {response.text}")
                    break
                    
            except requests.exceptions.RequestException as e:
                logger.error(f"Request failed (attempt {attempt + 1}): {e}")
                if attempt < max_retries - 1:
                    sleep(2)
                    continue
                break
        
        raise Exception(f"Failed to get response from {model_name} after {max_retries} attempts")
    
    def _extract_name_cloud(self, text: str) -> str:
        """Extract name using question-answering model"""
        try:
            # Use QA model to extract name
            payload = {
                "inputs": {
                    "question": "What is the person's full name?",
                    "context": text[:1000]  # First 1000 chars should contain name
                }
            }
            
            response = self._make_api_request(self.models["question_answering"], payload)
            
            if response and "answer" in response:
                name = response["answer"].strip()
                # Validate name format
                if re.match(r'^[A-Z][a-z]+ [A-Z][a-z]+', name):
                    return name
            
        except Exception as e:
            logger.warning(f"Cloud name extraction failed: {e}")
        
        # Fallback to regex
        return self._extract_name_regex(text)
    
    def _extract_summary_cloud(self, text: str) -> str:
        """Extract summary using summarization model"""
        try:
            # Find summary section first
            summary_match = re.search(
                r'(?i)(?:professional\s+)?summary[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))',
                text, re.DOTALL
            )
            
            if summary_match:
                summary_text = summary_match.group(1).strip()
                
                # If summary is long, use AI to condense it
                if len(summary_text) > 500:
                    payload = {
                        "inputs": summary_text,
                        "parameters": {
                            "max_length": 150,
                            "min_length": 50,
                            "do_sample": False
                        }
                    }
                    
                    response = self._make_api_request(self.models["summarization"], payload)
                    
                    if response and isinstance(response, list) and len(response) > 0:
                        return response[0].get("summary_text", summary_text)
                
                return summary_text
            
        except Exception as e:
            logger.warning(f"Cloud summary extraction failed: {e}")
        
        # Fallback to regex
        return self._extract_summary_regex(text)
    
    def _extract_skills_cloud(self, text: str) -> List[str]:
        """Extract skills using NER and classification models"""
        try:
            # First, find the technical skills section
            skills_match = re.search(
                r'(?i)technical\s+skills?[:\s]*\n(.*?)(?=\n\s*(?:professional\s+experience|experience|education|projects?))',
                text, re.DOTALL
            )
            
            if skills_match:
                skills_text = skills_match.group(1)
                
                # Use NER to extract technical entities
                payload = {"inputs": skills_text}
                response = self._make_api_request(self.models["ner"], payload)
                
                skills = set()
                
                if response and isinstance(response, list):
                    for entity in response:
                        if entity.get("entity_group") in ["MISC", "ORG"] or "TECH" in entity.get("entity", ""):
                            word = entity.get("word", "").replace("##", "").strip()
                            if len(word) > 2:
                                skills.add(word)
                
                # Also extract from bullet points using regex
                regex_skills = self._extract_skills_regex(text)
                skills.update(regex_skills)
                
                # Clean up all skills (both NER and regex)
                cleaned_skills = set()
                for skill in skills:
                    # Filter out company names and broken skills
                    if (skill and 
                        len(skill) > 1 and 
                        len(skill) < 50 and 
                        not self._is_company_name_skill(skill) and
                        not self._is_broken_skill(skill)):
                        
                        # Fix common parsing issues
                        fixed_skill = self._fix_skill_name(skill)
                        if fixed_skill:
                            cleaned_skills.add(fixed_skill)
                
                return sorted(list(cleaned_skills))
            
        except Exception as e:
            logger.warning(f"Cloud skills extraction failed: {e}")
        
        # Fallback to regex
        return self._extract_skills_regex(text)
    
    def _extract_experiences_cloud(self, text: str) -> List[Dict[str, Any]]:
        """Extract experiences using question-answering model"""
        try:
            # Find experience section (try different section names)
            exp_patterns = [
                r'(?i)(?:work\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|page\s+\d+|$))',
                r'(?i)(?:professional\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|page\s+\d+|$))'
            ]
            
            exp_match = None
            for pattern in exp_patterns:
                exp_match = re.search(pattern, text, re.DOTALL)
                if exp_match:
                    break
            
            if exp_match:
                exp_text = exp_match.group(1)
                
                # Use QA to extract structured information
                experiences = []
                
                # Extract job entries using regex first
                # Try 3-part format: Title | Company | Date
                job_pattern_3 = r'([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)'
                matches_3 = re.findall(job_pattern_3, exp_text)
                
                # Try 4-part format: Company | Location | Title | Date
                job_pattern_4 = r'([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)'
                matches_4 = re.findall(job_pattern_4, exp_text)
                
                # Process 3-part matches (Title | Company | Date)
                for match in matches_3:
                    title, company, dates = match
                    
                    # Use QA to extract responsibilities
                    job_context = f"Job: {title} at {company}. {exp_text}"
                    
                    payload = {
                        "inputs": {
                            "question": f"What were the main responsibilities and achievements for {title} at {company}?",
                            "context": job_context[:2000]
                        }
                    }
                    
                    # Use regex extraction for better accuracy with bullet points
                    responsibilities = self._extract_responsibilities_regex(exp_text, company.strip(), title.strip())
                    
                    experience = {
                        "title": title.strip(),
                        "company": company.strip(),
                        "date_range": dates.strip(),
                        "responsibilities": responsibilities
                    }
                    experiences.append(experience)
                
                # Process 4-part matches (Company | Location | Title | Date)
                for match in matches_4:
                    company, location, title, dates = match
                    
                    # Use QA to extract responsibilities
                    job_context = f"Job: {title} at {company}. {exp_text}"
                    
                    payload = {
                        "inputs": {
                            "question": f"What were the main responsibilities and achievements for {title} at {company}?",
                            "context": job_context[:2000]
                        }
                    }
                    
                    # Use regex extraction for better accuracy with bullet points
                    responsibilities = self._extract_responsibilities_regex(exp_text, company.strip(), title.strip())
                    
                    experience = {
                        "title": title.strip(),
                        "company": f"{company.strip()}, {location.strip()}",
                        "date_range": dates.strip(),
                        "responsibilities": responsibilities
                    }
                    experiences.append(experience)
                
                return experiences
            
        except Exception as e:
            logger.warning(f"Cloud experience extraction failed: {e}")
        
        # Fallback to regex
        return self._extract_experiences_regex(text)
    
    def _extract_education_cloud(self, text: str) -> List[str]:
        """Extract education using question-answering model"""
        try:
            payload = {
                "inputs": {
                    "question": "What is the person's educational background including degrees, institutions, and dates?",
                    "context": text
                }
            }
            
            response = self._make_api_request(self.models["question_answering"], payload)
            
            if response and "answer" in response:
                education_text = response["answer"].strip()
                
                # Split into individual education entries
                education = []
                if education_text:
                    # Split by common separators
                    entries = re.split(r'[;,]', education_text)
                    for entry in entries:
                        entry = entry.strip()
                        if len(entry) > 10:
                            education.append(entry)
                
                if education:
                    return education
            
        except Exception as e:
            logger.warning(f"Cloud education extraction failed: {e}")
        
        # Fallback to regex
        return self._extract_education_regex(text)
    
    def _extract_contact_info(self, text: str) -> Dict[str, str]:
        """Extract contact information (email, phone, LinkedIn)"""
        contact_info = {}
        
        # Extract email
        email_match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
        if email_match:
            contact_info["email"] = email_match.group(0)
        
        # Extract phone
        phone_patterns = [
            r'\+?1?[-.\s]?\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})',
            r'(\d{3})[-.\s](\d{3})[-.\s](\d{4})',
            r'\+\d{1,3}[-.\s]?\d{3}[-.\s]?\d{3}[-.\s]?\d{4}'
        ]
        
        for pattern in phone_patterns:
            phone_match = re.search(pattern, text)
            if phone_match:
                contact_info["phone"] = phone_match.group(0)
                break
        
        # Extract LinkedIn
        linkedin_patterns = [
            r'linkedin\.com/in/[\w-]+',
            r'LinkedIn:\s*([\w-]+)',
            r'linkedin\.com/[\w-]+'
        ]
        
        for pattern in linkedin_patterns:
            linkedin_match = re.search(pattern, text, re.IGNORECASE)
            if linkedin_match:
                contact_info["linkedin"] = linkedin_match.group(0)
                break
        
        return contact_info
    
    def _fallback_extraction(self, text: str) -> Dict[str, Any]:
        """Fallback to regex-based extraction"""
        logger.info("Using regex fallback extraction...")
        try:
            from utils.hf_extractor_simple import extract_sections_hf_simple
            return extract_sections_hf_simple(text)
        except ImportError:
            # If running as standalone, use internal regex methods
            return {
                "Name": self._extract_name_regex(text),
                "Summary": self._extract_summary_regex(text),
                "Skills": self._extract_skills_regex(text),
                "StructuredExperiences": self._extract_experiences_regex(text),
                "Education": self._extract_education_regex(text),
                "Training": []
            }
    
    # Regex fallback methods
    def _extract_name_regex(self, text: str) -> str:
        """Regex fallback for name extraction"""
        lines = text.split('\n')[:5]
        for line in lines:
            line = line.strip()
            if re.search(r'@|phone|email|linkedin|github|📧|📞|📍', line.lower()):
                continue
            if len(re.findall(r'[^\w\s]', line)) > 3:
                continue
            name_match = re.match(r'^([A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)', line)
            if name_match:
                return name_match.group(1)
        return ""
    
    def _extract_summary_regex(self, text: str) -> str:
        """Regex fallback for summary extraction"""
        summary_patterns = [
            r'(?i)(?:professional\s+)?summary[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))',
            r'(?i)objective[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))',
        ]
        
        for pattern in summary_patterns:
            match = re.search(pattern, text, re.DOTALL)
            if match:
                summary = match.group(1).strip()
                summary = re.sub(r'\n+', ' ', summary)
                summary = re.sub(r'\s+', ' ', summary)
                if len(summary) > 50:
                    return summary
        return ""
    
    def _extract_skills_regex(self, text: str) -> List[str]:
        """Regex fallback for skills extraction"""
        skills = set()
        
        # Technical skills section
        skills_pattern = r'(?i)technical\s+skills?[:\s]*\n(.*?)(?=\n\s*(?:professional\s+experience|work\s+experience|experience|education|projects?))'
        match = re.search(skills_pattern, text, re.DOTALL)
        
        if match:
            skills_text = match.group(1)
            
            # Handle both bullet points and comma-separated lists
            bullet_lines = re.findall(r'●\s*([^●\n]+)', skills_text)
            if not bullet_lines:
                # If no bullets, treat as comma-separated list
                bullet_lines = [skills_text.strip()]
            
            for line in bullet_lines:
                if ':' in line:
                    skills_part = line.split(':', 1)[1].strip()
                else:
                    skills_part = line.strip()
                
                # Split by commas and clean up
                individual_skills = re.split(r',\s*', skills_part)
                for skill in individual_skills:
                    skill = skill.strip()
                    skill = re.sub(r'\([^)]*\)', '', skill).strip()  # Remove parentheses
                    skill = re.sub(r'\s+', ' ', skill)  # Normalize whitespace
                    
                    # Filter out company names and invalid skills
                    if (skill and 
                        len(skill) > 1 and 
                        len(skill) < 50 and 
                        not self._is_company_name_skill(skill) and
                        not self._is_broken_skill(skill)):
                        skills.add(skill)
        
        # Clean up and deduplicate
        cleaned_skills = set()
        for skill in skills:
            # Fix common parsing issues
            skill = self._fix_skill_name(skill)
            if skill:
                cleaned_skills.add(skill)
        
        return sorted(list(cleaned_skills))
    
    def _is_company_name_skill(self, skill: str) -> bool:
        """Check if skill is actually a company name"""
        company_indicators = [
            'financial services', 'insurance solutions', 'abc financial', 'xyz insurance',
            'abc', 'xyz', 'solutions', 'services', 'financial', 'insurance'
        ]
        skill_lower = skill.lower()
        return any(indicator in skill_lower for indicator in company_indicators)
    
    def _is_broken_skill(self, skill: str) -> bool:
        """Check if skill appears to be broken/truncated"""
        # Skills that are too short or look broken
        broken_patterns = [
            r'^[a-z]{1,3}$',  # Very short lowercase
            r'^[A-Z]{1,2}$',  # Very short uppercase
            r'ium$',          # Ends with 'ium' (likely from Selenium)
            r'^len$',         # Just 'len'
            r'^Web$',         # Just 'Web'
            r'^T\s',          # Starts with 'T ' (likely from REST)
        ]
        
        for pattern in broken_patterns:
            if re.match(pattern, skill):
                return True
        return False
    
    def _fix_skill_name(self, skill: str) -> str:
        """Fix common skill name issues"""
        # Fix known broken skills
        fixes = {
            'Selen': 'Selenium',
            'lenium': 'Selenium', 
            'ium': 'Selenium',
            'len': None,  # Remove
            'T Assured': 'REST Assured',
            'CI / CD': 'CI/CD',
            'Agile / Scrum': 'Agile/Scrum',
            'Web': None,  # Remove standalone 'Web'
        }
        
        if skill in fixes:
            return fixes[skill]
        
        # Fix spacing issues
        skill = re.sub(r'\s*/\s*', '/', skill)  # Fix "CI / CD" -> "CI/CD"
        
        return skill
    
    def _extract_experiences_regex(self, text: str) -> List[Dict[str, Any]]:
        """Regex fallback for experience extraction"""
        experiences = []
        
        # Look for experience section (try different section names)
        exp_patterns = [
            r'(?i)(?:work\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|page\s+\d+|$))',
            r'(?i)(?:professional\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|page\s+\d+|$))'
        ]
        
        exp_text = ""
        for pattern in exp_patterns:
            match = re.search(pattern, text, re.DOTALL)
            if match:
                exp_text = match.group(1)
                break
        
        if exp_text:
            # Try 3-part format: Title | Company | Date
            pattern_3 = r'([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)'
            matches_3 = re.findall(pattern_3, exp_text)
            
            # Try 4-part format: Company | Location | Title | Date
            pattern_4 = r'([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)'
            matches_4 = re.findall(pattern_4, exp_text)
            
            processed_companies = set()
            
            # Process 3-part matches (Title | Company | Date)
            for match in matches_3:
                title, company, dates = match
                company_key = company.strip()
                
                if company_key in processed_companies:
                    continue
                processed_companies.add(company_key)
                
                responsibilities = self._extract_responsibilities_regex(exp_text, company.strip(), title.strip())
                
                experience = {
                    "title": title.strip(),
                    "company": company_key,
                    "date_range": dates.strip(),
                    "responsibilities": responsibilities
                }
                experiences.append(experience)
            
            # Process 4-part matches (Company | Location | Title | Date)
            for match in matches_4:
                company, location, title, dates = match
                company_key = f"{company.strip()}, {location.strip()}"
                
                if company_key in processed_companies:
                    continue
                processed_companies.add(company_key)
                
                responsibilities = self._extract_responsibilities_regex(exp_text, company.strip(), title.strip())
                
                experience = {
                    "title": title.strip(),
                    "company": company_key,
                    "date_range": dates.strip(),
                    "responsibilities": responsibilities
                }
                experiences.append(experience)
        
        return experiences
    
    def _extract_responsibilities_regex(self, exp_text: str, company: str, title: str) -> List[str]:
        """Regex fallback for responsibilities extraction"""
        responsibilities = []
        
        # Look for the job section - try different patterns
        job_patterns = [
            rf'{re.escape(title)}.*?{re.escape(company)}.*?\n(.*?)(?=\n[A-Z][^|\n-]*\s*\||$)',
            rf'{re.escape(company)}.*?{re.escape(title)}.*?\n(.*?)(?=\n[A-Z][^|\n-]*\s*\||$)'
        ]
        
        for pattern in job_patterns:
            match = re.search(pattern, exp_text, re.DOTALL | re.IGNORECASE)
            if match:
                resp_text = match.group(1)
                
                # Look for bullet points (● or -) 
                bullets = re.findall(r'[●-]\s*([^●\n-]+)', resp_text)
                
                # Clean and fix responsibilities
                for bullet in bullets:
                    bullet = bullet.strip()
                    bullet = re.sub(r'\s+', ' ', bullet)
                    
                    # Fix common truncation issues
                    bullet = self._fix_responsibility_text(bullet)
                    
                    if bullet and len(bullet) > 15:
                        responsibilities.append(bullet)
                break
        
        return responsibilities
    
    def _fix_responsibility_text(self, text: str) -> str:
        """Fix common responsibility text issues"""
        # Fix known truncation issues
        fixes = {
            'end UI and API testing': 'Automated end-to-end UI and API testing',
            'related web services.': 'for policy-related web services.',
        }
        
        for broken, fixed in fixes.items():
            if text.startswith(broken):
                return fixed + text[len(broken):]
            if text.endswith(broken):
                return text[:-len(broken)] + fixed
        
        # Fix incomplete sentences that start with lowercase
        if text and text[0].islower() and not text.startswith('e.g.'):
            # Likely a continuation, try to fix common patterns
            if text.startswith('end '):
                text = 'Automated ' + text
            elif text.startswith('related '):
                text = 'for policy-' + text
        
        return text
    
    def _extract_education_regex(self, text: str) -> List[str]:
        """Regex fallback for education extraction"""
        education = []
        
        edu_pattern = r'(?i)education[:\s]*\n(.*?)(?=\n\s*(?:certifications?|projects?|$))'
        match = re.search(edu_pattern, text, re.DOTALL)
        
        if match:
            edu_text = match.group(1)
            edu_lines = re.findall(r'●\s*([^●\n]+)', edu_text)
            if not edu_lines:
                edu_lines = [line.strip() for line in edu_text.split('\n') if line.strip()]
            
            for line in edu_lines:
                line = line.strip()
                line = re.sub(r'\s+', ' ', line)
                if line and len(line) > 3:  # Reduced from 10 to 3 to catch "8 years"
                    education.append(line)
        
        return education

# Convenience function for easy usage
def extract_sections_hf_cloud(text: str, api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Extract resume sections using Hugging Face cloud models
    
    Args:
        text: Raw resume text
        api_key: Hugging Face API key (optional)
        
    Returns:
        Structured resume data
    """
    extractor = HuggingFaceCloudExtractor(api_key=api_key)
    return extractor.extract_sections_hf_cloud(text)

# Test function
def test_hf_cloud_extraction():
    """Test the Hugging Face cloud extraction with sample resume"""
    
    sample_text = """
    Jonathan Edward Nguyen
    📍San Diego, CA | 858-900-5036 | 📧 jonatngu@icloud.com
    
    Summary
    Sun Diego-based Software Engineer, and Developer Hackathon 2025 winner who loves building scalable
    automation solutions, AI development, and optimizing workflows.
    
    Technical Skills
    ● Programming Languages: Python, Java, SQL, Apex, Bash
    ● Frameworks & Libraries: TensorFlow, PyTorch, Scikit-learn, NumPy, Pandas
    ● Cloud Platforms: AWS Glue, AWS SageMaker, AWS Orchestration, REST APIs
    
    Professional Experience
    TalentLens.AI | Remote | AI Developer | Feb 2025 – Present
    ● Built an automated test suite for LLM prompts that export reports with performance metrics
    ● Architected and developed an AI-powered resume screening application using Streamlit
    
    GoFundMe | San Diego, CA | Senior Developer in Test | Oct 2021 – Dec 2024
    ● Built and maintained robust API and UI test suites in Python, reducing defects by 37%
    ● Automated environment builds using Apex and Bash, improving deployment times by 30%
    
    Education
    ● California State San Marcos (May 2012): Bachelor of Arts, Literature and Writing
    """
    
    extractor = HuggingFaceCloudExtractor()
    result = extractor.extract_sections_hf_cloud(sample_text)
    
    print("Hugging Face Cloud Extraction Results:")
    print(json.dumps(result, indent=2))
    
    return result

if __name__ == "__main__":
    test_hf_cloud_extraction()