theaniketgiri committed
Commit 373e5ff · 1 Parent(s): 3f61e65
Files changed (4)
  1. README.md +63 -25
  2. app.py +129 -32
  3. pytest.ini +10 -0
  4. test_backend.py +209 -0
README.md CHANGED
@@ -7,42 +7,80 @@ sdk: docker
 app_port: 7860
 ---

-# Synthex Medical Text Generator
-
-A synthetic medical text generator built with FastAPI and Hugging Face Transformers.
-
-## Features
-
-- Generate synthetic medical text data
-- Multiple record types:
-  - Clinical Notes
-  - Discharge Summaries
-  - Lab Reports
-  - Prescriptions
-- HIPAA-compliant fictional data
-- RESTful API endpoints
-
-## API Endpoints
-
-- `GET /`: Get API information
-- `POST /generate`: Generate medical records
-- `GET /health`: Health check endpoint
-
-## Example Usage
-
+# Synthex Backend
+
+FastAPI backend for the Synthex medical text generation service.
+
+## Project Structure
+
+```
+backend/
+├── app/                  # Main application code
+│   ├── api/              # API endpoints
+│   ├── core/             # Core functionality
+│   ├── models/           # Database models
+│   └── services/         # Business logic
+├── tests/                # Test files
+├── Dockerfile            # Docker configuration
+├── requirements.txt      # Production dependencies
+├── requirements-dev.txt  # Development dependencies
+└── README.md             # This file
+```
+
+## Setup
+
+1. Create a virtual environment:
+```bash
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+```
+
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+pip install -r requirements-dev.txt  # For development
+```
+
+3. Run the application:
+```bash
+uvicorn app.main:app --reload
+```
+
+## Testing
+
+1. Run all tests:
 ```bash
-# Generate a clinical note
-curl -X POST "https://theaniketgiri-synthex.hf.space/generate" \
-     -H "Content-Type: application/json" \
-     -d '{"record_type": "clinical_note", "count": 1}'
+python run_tests.py
 ```

-## Technical Details
-
-- Built with FastAPI
-- Uses Bio_ClinicalBERT model from Hugging Face
-- Docker container with Python 3.9
-- Exposed on port 7860
-
+2. Run backend API tests:
+```bash
+python test_backend.py
+```
+
+3. Run linters:
+```bash
+python run_linters.py
+```
+
+## API Endpoints
+
+- `GET /health`: Health check endpoint
+- `POST /generate`: Generate medical records
+  - Parameters:
+    - `record_type`: Type of record to generate
+    - `count`: Number of records to generate
+
+## Development
+
+- Use `requirements-dev.txt` for development dependencies
+- Run linters before committing
+- Write tests for new features
+- Follow PEP 8 style guide
+
+## Deployment
+
+The backend is deployed on Hugging Face Spaces. The Dockerfile is configured for this deployment.
+
 ## License
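
The endpoints documented above can be exercised directly once the server is up; a minimal sketch using `requests`, with the Space URL taken from the previous README (use `http://127.0.0.1:8000` for a local `uvicorn` run):

```python
# Illustrative client call, not part of this commit.
import requests

BASE_URL = "https://theaniketgiri-synthex.hf.space"  # or "http://127.0.0.1:8000" locally

# Health check: reports whether the model has finished loading.
print(requests.get(f"{BASE_URL}/health").json())

# Generate one clinical note; the payload fields match the parameters listed above.
payload = {"record_type": "clinical_note", "count": 1}
response = requests.post(f"{BASE_URL}/generate", json=payload)
print(response.status_code, response.json())
```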
app.py CHANGED
@@ -1,11 +1,19 @@
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
-from typing import List, Optional
+from pydantic import BaseModel, Field, validator
+from typing import List, Optional, Literal
 from datetime import datetime
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 import json
+import os
+import logging
+import time
+from huggingface_hub import snapshot_download
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)

 app = FastAPI(
     title="Synthex Medical Text Generator",
@@ -25,18 +33,78 @@ app.add_middleware(
 # Initialize model and tokenizer
 model = None
 tokenizer = None
+MODEL_LOADED = False
+MODEL_LOADING = False
+
+def download_model_with_retry(model_name: str, max_retries: int = 3, retry_delay: int = 60):
+    """Download model with retry logic"""
+    for attempt in range(max_retries):
+        try:
+            logger.info(f"Downloading model (attempt {attempt + 1}/{max_retries})...")
+            # Download model files first
+            snapshot_download(
+                repo_id=model_name,
+                local_files_only=False,
+                resume_download=True
+            )
+            return True
+        except Exception as e:
+            logger.error(f"Download attempt {attempt + 1} failed: {str(e)}")
+            if attempt < max_retries - 1:
+                logger.info(f"Waiting {retry_delay} seconds before retrying...")
+                time.sleep(retry_delay)
+            else:
+                raise

 def load_model():
-    global model, tokenizer
-    if model is None or tokenizer is None:
-        model_name = "emilyalsentzer/Bio_ClinicalBERT"
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForCausalLM.from_pretrained(model_name)
-    return model, tokenizer
+    global model, tokenizer, MODEL_LOADED, MODEL_LOADING
+    try:
+        if not MODEL_LOADED and not MODEL_LOADING:
+            MODEL_LOADING = True
+            logger.info("Loading model and tokenizer...")
+            model_name = "emilyalsentzer/Bio_ClinicalBERT"
+
+            # Set environment variable to disable symlinks warning
+            os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
+
+            # Download model first
+            download_model_with_retry(model_name)
+
+            # Load tokenizer
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_name,
+                local_files_only=True
+            )
+            logger.info("Tokenizer loaded successfully")
+
+            # Load model
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                local_files_only=True
+            )
+            logger.info("Model loaded successfully")
+            MODEL_LOADED = True
+            MODEL_LOADING = False
+        return model, tokenizer
+    except Exception as e:
+        MODEL_LOADING = False
+        logger.error(f"Error loading model: {str(e)}")
+        raise HTTPException(
+            status_code=503,
+            detail="Model loading failed. Please try again later."
+        )

 class GenerateRequest(BaseModel):
-    record_type: str
-    count: int = 1
+    record_type: Literal["clinical_note", "discharge_summary", "lab_report", "prescription"]
+    count: int = Field(gt=0, le=10, default=1)
+
+    @validator('count')
+    def validate_count(cls, v):
+        if v <= 0:
+            raise ValueError("Count must be greater than 0")
+        if v > 10:
+            raise ValueError("Count cannot exceed 10")
+        return v

 class MedicalRecord(BaseModel):
     type: str
@@ -58,32 +126,61 @@ async def generate_records(request: GenerateRequest):

         records = []
         for i in range(request.count):
-            # Generate text using the model
-            input_text = f"Generate a {request.record_type}:"
-            inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
-            outputs = model.generate(
-                inputs["input_ids"],
-                max_length=200,
-                num_return_sequences=1,
-                temperature=0.7,
-                top_p=0.9,
-                do_sample=True
-            )
-            generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-            # Create record
-            record = MedicalRecord(
-                type=request.record_type,
-                content=generated_text,
-                generated_at=datetime.now().isoformat()
-            )
-            records.append(record)
+            try:
+                # Generate text using the model
+                input_text = f"Generate a {request.record_type}:"
+                inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
+                outputs = model.generate(
+                    inputs["input_ids"],
+                    max_length=200,
+                    num_return_sequences=1,
+                    temperature=0.7,
+                    top_p=0.9,
+                    do_sample=True
+                )
+                generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+                # Create record
+                record = MedicalRecord(
+                    type=request.record_type,
+                    content=generated_text,
+                    generated_at=datetime.now().isoformat()
+                )
+                records.append(record)
+            except Exception as e:
+                logger.error(f"Error generating record {i+1}: {str(e)}")
+                raise HTTPException(
+                    status_code=500,
+                    detail=f"Error generating record: {str(e)}"
+                )

         return records

     except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
+        logger.error(f"Error in generate_records: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Error generating records: {str(e)}"
+        )

 @app.get("/health")
 def health_check():
-    return {"status": "healthy"}
+    try:
+        # Try to load model if not loaded
+        if not MODEL_LOADED and not MODEL_LOADING:
+            load_model()
+        return {
+            "status": "healthy" if MODEL_LOADED else "loading",
+            "timestamp": datetime.now().isoformat(),
+            "model_loaded": MODEL_LOADED,
+            "model_loading": MODEL_LOADING
+        }
+    except Exception as e:
+        logger.error(f"Health check failed: {str(e)}")
+        return {
+            "status": "unhealthy",
+            "timestamp": datetime.now().isoformat(),
+            "model_loaded": MODEL_LOADED,
+            "model_loading": MODEL_LOADING,
+            "error": str(e)
+        }
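
The tightened `GenerateRequest` model above is what produces the 422 responses asserted in `test_backend.py`. A standalone sketch of that behaviour, assuming pydantic v1 (the version implied by the `@validator` decorator used here):

```python
# Illustrative only: shows how the new request constraints reject bad input.
from typing import Literal
from pydantic import BaseModel, Field, ValidationError

class GenerateRequest(BaseModel):
    record_type: Literal["clinical_note", "discharge_summary", "lab_report", "prescription"]
    count: int = Field(gt=0, le=10, default=1)

for bad in ({"record_type": "invalid_type", "count": 1},
            {"record_type": "clinical_note", "count": 0}):
    try:
        GenerateRequest(**bad)
    except ValidationError as exc:
        # FastAPI surfaces these errors as a 422 response under "detail".
        print(exc.errors())
```

Note that `Field(gt=0, le=10)` already enforces the same bounds the explicit `validate_count` validator checks, so the validator is effectively a belt-and-braces duplicate.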
pytest.ini ADDED
@@ -0,0 +1,10 @@
+[pytest]
+testpaths = .
+python_files = test_*.py
+python_classes = Test*
+python_functions = test_*
+addopts = -v --tb=short
+log_cli = true
+log_cli_level = INFO
+log_cli_format = %(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)
+log_cli_date_format = %Y-%m-%d %H:%M:%S
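
With this file at the project root, a bare `pytest` invocation (or `pytest.main()` from Python) picks up the verbosity, short-traceback, and live-log settings automatically; a minimal sketch:

```python
# Programmatic equivalent of running `pytest` next to pytest.ini;
# -v, --tb=short and the log_cli settings come from the ini file, so no flags are needed.
import sys
import pytest

sys.exit(pytest.main([]))
```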
test_backend.py ADDED
@@ -0,0 +1,209 @@
+import requests
+import json
+import pytest
+from typing import Dict, List
+import os
+from datetime import datetime
+import time
+
+# Base URLs for different environments
+LOCAL_URL = "http://127.0.0.1:8000"
+PROD_URL = "https://theaniketgiri-synthex.hf.space"
+
+# Use environment variable to determine which URL to use
+BASE_URL = os.getenv("API_URL", LOCAL_URL)
+
+def wait_for_model_loading(max_retries=10, delay=30):
+    """Wait for model to load before running tests"""
+    for i in range(max_retries):
+        try:
+            response = requests.get(f"{BASE_URL}/health")
+            data = response.json()
+            print(f"\nHealth check response: {json.dumps(data, indent=2)}")
+
+            if data.get("model_loaded", False):
+                return True
+            elif data.get("model_loading", False):
+                print(f"Model is still loading, attempt {i+1}/{max_retries}")
+            else:
+                print(f"Model not loaded yet, attempt {i+1}/{max_retries}")
+            time.sleep(delay)
+        except Exception as e:
+            print(f"Error checking health: {str(e)}")
+            time.sleep(delay)
+    return False
+
+class TestBackendAPI:
+    @classmethod
+    def setup_class(cls):
+        """Setup before running tests"""
+        if not wait_for_model_loading():
+            pytest.skip("Model failed to load within timeout")
+
+    def test_health(self):
+        """Test the health check endpoint"""
+        response = requests.get(f"{BASE_URL}/health")
+        assert response.status_code == 200
+        data = response.json()
+        assert "status" in data
+        assert data["status"] in ["healthy", "unhealthy"]
+        assert "timestamp" in data
+        assert "model_loaded" in data
+        print(f"\n=== Health Check ===")
+        print(f"Status: {data['status']}")
+        print(f"Model Loaded: {data['model_loaded']}")
+        print(f"Timestamp: {data['timestamp']}")
+
+    @pytest.mark.parametrize("record_type", [
+        "clinical_note",
+        "discharge_summary",
+        "lab_report",
+        "prescription"
+    ])
+    def test_generate_single_record(self, record_type: str):
+        """Test generating a single record of each type"""
+        url = f"{BASE_URL}/generate"
+        payload = {
+            "record_type": record_type,
+            "count": 1
+        }
+
+        print(f"\n=== Generating {record_type} ===")
+        response = requests.post(url, json=payload)
+
+        if response.status_code == 503:
+            pytest.skip("Model not loaded")
+        elif response.status_code == 500:
+            error = response.json()
+            pytest.fail(f"Generation failed: {error.get('detail', 'Unknown error')}")
+
+        assert response.status_code == 200
+
+        data = response.json()
+        assert isinstance(data, list)
+        assert len(data) == 1
+
+        record = data[0]
+        print(f"Generated Record:")
+        print(json.dumps(record, indent=2))
+
+        # Validate record structure
+        assert "type" in record
+        assert record["type"] == record_type
+        assert "content" in record
+        assert "generated_at" in record
+
+    def test_generate_multiple_records(self):
+        """Test generating multiple records"""
+        url = f"{BASE_URL}/generate"
+        payload = {
+            "record_type": "clinical_note",
+            "count": 3
+        }
+
+        print("\n=== Generating Multiple Records ===")
+        response = requests.post(url, json=payload)
+
+        if response.status_code == 503:
+            pytest.skip("Model not loaded")
+        elif response.status_code == 500:
+            error = response.json()
+            pytest.fail(f"Generation failed: {error.get('detail', 'Unknown error')}")
+
+        assert response.status_code == 200
+
+        data = response.json()
+        assert isinstance(data, list)
+        assert len(data) == 3
+
+        print(f"Generated {len(data)} records")
+        for i, record in enumerate(data, 1):
+            print(f"\nRecord {i}:")
+            print(json.dumps(record, indent=2))
+
+    def test_invalid_record_type(self):
+        """Test error handling for invalid record type"""
+        url = f"{BASE_URL}/generate"
+        payload = {
+            "record_type": "invalid_type",
+            "count": 1
+        }
+
+        print("\n=== Testing Invalid Record Type ===")
+        response = requests.post(url, json=payload)
+        assert response.status_code == 422  # FastAPI validation error
+
+        error = response.json()
+        assert "detail" in error
+        print(f"Error: {error['detail']}")
+
+    def test_invalid_count(self):
+        """Test error handling for invalid count"""
+        url = f"{BASE_URL}/generate"
+        payload = {
+            "record_type": "clinical_note",
+            "count": 0
+        }
+
+        print("\n=== Testing Invalid Count ===")
+        response = requests.post(url, json=payload)
+        assert response.status_code == 422  # FastAPI validation error
+
+        error = response.json()
+        assert "detail" in error
+        print(f"Error: {error['detail']}")
+
+    def test_record_content_quality(self):
+        """Test the quality of generated record content"""
+        url = f"{BASE_URL}/generate"
+        payload = {
+            "record_type": "clinical_note",
+            "count": 1
+        }
+
+        print("\n=== Testing Record Content Quality ===")
+        response = requests.post(url, json=payload)
+
+        if response.status_code == 503:
+            pytest.skip("Model not loaded")
+        elif response.status_code == 500:
+            error = response.json()
+            pytest.fail(f"Generation failed: {error.get('detail', 'Unknown error')}")
+
+        assert response.status_code == 200
+
+        data = response.json()
+        record = data[0]
+
+        # Check content length
+        assert len(record["content"]) > 100, "Content too short"
+
+        # Check for common medical terms
+        medical_terms = ["patient", "diagnosis", "treatment", "symptoms"]
+        content_lower = record["content"].lower()
+        assert any(term in content_lower for term in medical_terms), "Missing medical terminology"
+
+        print("Content Quality Checks Passed")
+        print(f"Content Length: {len(record['content'])} characters")
+
+def main():
+    """Run all tests"""
+    print("Starting API Tests...")
+    print(f"Testing against: {BASE_URL}")
+    print("=" * 50)
+
+    test_suite = TestBackendAPI()
+
+    # Run all tests
+    test_suite.test_health()
+    test_suite.test_generate_single_record("clinical_note")
+    test_suite.test_generate_multiple_records()
+    test_suite.test_invalid_record_type()
+    test_suite.test_invalid_count()
+    test_suite.test_record_content_quality()
+
+    print("\nAll tests completed successfully!")
+    print("=" * 50)
+
+if __name__ == "__main__":
+    main()
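
Because `BASE_URL` falls back to the local server, pointing the suite at the deployed Space only requires setting `API_URL` before pytest starts; an illustrative sketch reusing the `PROD_URL` value defined above:

```python
# Run test_backend.py against the deployed Space instead of a local uvicorn server.
# test_backend.py reads the target via os.getenv("API_URL", LOCAL_URL).
import os
import sys
import pytest

os.environ["API_URL"] = "https://theaniketgiri-synthex.hf.space"
sys.exit(pytest.main(["test_backend.py"]))
```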