namanpenguin committed
Commit ad944b3 · verified · 1 Parent(s): 8cc5725

Upload 15 files

BERT_model.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7050d02ac599ef72d7b0410a79a72537fb44d4ac66eb8a1dc719329c8c4b07b
3
+ size 438239057
Dockerfile ADDED
@@ -0,0 +1,54 @@
1
+ # Use Python 3.9 as base image
2
+ FROM python:3.9-slim
3
+
4
+ # Set working directory
5
+ WORKDIR /app
6
+
7
+ # Install system dependencies
8
+ RUN apt-get update && apt-get install -y \
9
+ build-essential \
10
+ curl \
11
+ software-properties-common \
12
+ git \
13
+ && rm -rf /var/lib/apt/lists/*
14
+
15
+ # Create a non-root user
16
+ RUN useradd -m -u 1000 appuser
17
+
18
+ # Copy requirements file
19
+ COPY requirements.txt .
20
+
21
+ # Install Python dependencies
22
+ RUN pip install --no-cache-dir -r requirements.txt
23
+
24
+ # Create necessary directories with proper permissions
25
+ RUN mkdir -p /app/uploads \
26
+ /app/saved_models/bert \
27
+ /app/predictions \
28
+ /app/tokenizer \
29
+ && chmod -R 777 /app/uploads \
30
+ /app/saved_models \
31
+ /app/predictions \
32
+ /app/tokenizer
33
+
34
+ # Switch to non-root user
35
+ USER appuser
36
+
37
+ # Copy the application code and shared utilities into the image.
38
+ # NOTE: Docker cannot COPY paths outside the build context (e.g. ../config.py),
39
+ # so dataset_utils.py, train_utils.py, config.py, models/bert_model.py and
40
+ # label_encoders.pkl must be placed inside the build context before building;
41
+ # the single COPY below then includes them along with the app code.
42
+ COPY . /app/
44
+
45
+ # Set environment variables
46
+ ENV PYTHONPATH=/app
47
+ ENV PYTHONUNBUFFERED=1
48
+ ENV PORT=7860
49
+
50
+ # Expose the port the app runs on
51
+ EXPOSE 7860
52
+
53
+ # Command to run the application
54
+ CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,504 @@
1
+ from fastapi import FastAPI, HTTPException, BackgroundTasks, UploadFile, File
2
+ from fastapi.responses import FileResponse
3
+ from pydantic import BaseModel
4
+ from typing import Optional, Dict, Any, List
5
+ import uvicorn
6
+ import torch
7
+ from transformers import BertTokenizer, BertForSequenceClassification
8
+ from torch.utils.data import DataLoader
9
+ import logging
10
+ import os
11
+ import asyncio
12
+ import pandas as pd
13
+ from datetime import datetime
14
+ import shutil
15
+ from pathlib import Path
16
+ from sklearn.model_selection import train_test_split
17
+ import zipfile
18
+ import io
19
+ import numpy as np
20
+ import sys
21
+
22
+
23
+ # Import existing utilities
24
+ from dataset_utils import (
25
+ ComplianceDataset,
26
+ ComplianceDatasetWithMetadata,
27
+ load_and_preprocess_data,
28
+ get_tokenizer,
29
+ save_label_encoders,
30
+ get_num_labels,
31
+ load_label_encoders
32
+ )
33
+ from train_utils import (
34
+ initialize_criterions,
35
+ train_model,
36
+ evaluate_model,
37
+ save_model,
38
+ summarize_metrics,
39
+ predict_probabilities
40
+ )
41
+ from models.bert_model import BertMultiOutputModel
42
+ from config import (
43
+ TEXT_COLUMN,
44
+ LABEL_COLUMNS,
45
+ DEVICE,
46
+ NUM_EPOCHS,
47
+ LEARNING_RATE,
48
+ MAX_LEN,
49
+ BATCH_SIZE,
50
+ METADATA_COLUMNS
51
+ )
52
+
53
+ # Configure logging
54
+ logging.basicConfig(level=logging.INFO)
55
+ logger = logging.getLogger(__name__)
56
+
57
+ app = FastAPI(title="BERT Compliance Predictor API")
58
+
59
+ # Create necessary directories
60
+ UPLOAD_DIR = Path("uploads")
61
+ MODEL_SAVE_DIR = Path("saved_models")
62
+ UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
63
+ MODEL_SAVE_DIR.mkdir(parents=True, exist_ok=True)
64
+
65
+ # Global variables to track training status
66
+ training_status = {
67
+ "is_training": False,
68
+ "current_epoch": 0,
69
+ "total_epochs": 0,
70
+ "current_loss": 0.0,
71
+ "start_time": None,
72
+ "end_time": None,
73
+ "status": "idle",
74
+ "metrics": None
75
+ }
76
+
77
+ # Load the model and tokenizer for prediction
78
+ model_path = "BERT_model.pth"
79
+ tokenizer = get_tokenizer('bert-base-uncased')
80
+ model = BertMultiOutputModel([len(load_label_encoders()[col].classes_) for col in LABEL_COLUMNS]).to(DEVICE)
81
+ if os.path.exists(model_path):
82
+ model.load_state_dict(torch.load(model_path, map_location=DEVICE))
83
+ model.eval()
84
+
85
+ class TrainingConfig(BaseModel):
86
+ model_name: str = "bert-base-uncased"
87
+ batch_size: int = 8
88
+ learning_rate: float = 2e-5
89
+ num_epochs: int = 2
90
+ max_length: int = 128
91
+ test_size: float = 0.2
92
+ random_state: int = 42
93
+
94
+ class TrainingResponse(BaseModel):
95
+ message: str
96
+ training_id: str
97
+ status: str
98
+ download_url: Optional[str] = None
99
+
100
+ class ValidationResponse(BaseModel):
101
+ message: str
102
+ metrics: Dict[str, Any]
103
+ predictions: List[Dict[str, Any]]
104
+
105
+ class TransactionData(BaseModel):
106
+ Transaction_Id: str
107
+ Hit_Seq: int
108
+ Hit_Id_List: str
109
+ Origin: str
110
+ Designation: str
111
+ Keywords: str
112
+ Name: str
113
+ SWIFT_Tag: str
114
+ Currency: str
115
+ Entity: str
116
+ Message: str
117
+ City: str
118
+ Country: str
119
+ State: str
120
+ Hit_Type: str
121
+ Record_Matching_String: str
122
+ WatchList_Match_String: str
123
+ Payment_Sender_Name: Optional[str] = ""
124
+ Payment_Reciever_Name: Optional[str] = ""
125
+ Swift_Message_Type: str
126
+ Text_Sanction_Data: str
127
+ Matched_Sanctioned_Entity: str
128
+ Is_Match: int
129
+ Red_Flag_Reason: str
130
+ Risk_Level: str
131
+ Risk_Score: float
132
+ Risk_Score_Description: str
133
+ CDD_Level: str
134
+ PEP_Status: str
135
+ Value_Date: str
136
+ Last_Review_Date: str
137
+ Next_Review_Date: str
138
+ Sanction_Description: str
139
+ Checker_Notes: str
140
+ Sanction_Context: str
141
+ Maker_Action: str
142
+ Customer_ID: int
143
+ Customer_Type: str
144
+ Industry: str
145
+ Transaction_Date_Time: str
146
+ Transaction_Type: str
147
+ Transaction_Channel: str
148
+ Originating_Bank: str
149
+ Beneficiary_Bank: str
150
+ Geographic_Origin: str
151
+ Geographic_Destination: str
152
+ Match_Score: float
153
+ Match_Type: str
154
+ Sanctions_List_Version: str
155
+ Screening_Date_Time: str
156
+ Risk_Category: str
157
+ Risk_Drivers: str
158
+ Alert_Status: str
159
+ Investigation_Outcome: str
160
+ Case_Owner_Analyst: str
161
+ Escalation_Level: str
162
+ Escalation_Date: str
163
+ Regulatory_Reporting_Flags: bool
164
+ Audit_Trail_Timestamp: str
165
+ Source_Of_Funds: str
166
+ Purpose_Of_Transaction: str
167
+ Beneficial_Owner: str
168
+ Sanctions_Exposure_History: bool
169
+
170
+ class PredictionRequest(BaseModel):
171
+ transaction_data: TransactionData
172
+
173
+ @app.get("/")
174
+ async def root():
175
+ return {"message": "BERT Compliance Predictor API"}
176
+
177
+ @app.get("/health")
178
+ async def health_check():
179
+ return {"status": "healthy"}
180
+
181
+ @app.get("/training-status")
182
+ async def get_training_status():
183
+ return training_status
184
+
185
+ @app.post("/upload")
186
+ async def upload_file(file: UploadFile = File(...)):
187
+ """Upload a CSV file for training or validation"""
188
+ if not file.filename.endswith('.csv'):
189
+ raise HTTPException(status_code=400, detail="Only CSV files are allowed")
190
+
191
+ file_path = UPLOAD_DIR / file.filename
192
+ with file_path.open("wb") as buffer:
193
+ shutil.copyfileobj(file.file, buffer)
194
+
195
+ return {"message": f"File {file.filename} uploaded successfully", "file_path": str(file_path)}
196
+
197
+ @app.post("/bert/train", response_model=TrainingResponse)
198
+ async def start_training(
199
+ config: TrainingConfig,
200
+ background_tasks: BackgroundTasks,
201
+ file_path: str
202
+ ):
203
+ if training_status["is_training"]:
204
+ raise HTTPException(status_code=400, detail="Training is already in progress")
205
+
206
+ if not os.path.exists(file_path):
207
+ raise HTTPException(status_code=404, detail="Training file not found")
208
+
209
+ training_id = datetime.now().strftime("%Y%m%d_%H%M%S")
210
+
211
+ training_status.update({
212
+ "is_training": True,
213
+ "current_epoch": 0,
214
+ "total_epochs": config.num_epochs,
215
+ "start_time": datetime.now().isoformat(),
216
+ "status": "starting"
217
+ })
218
+
219
+ background_tasks.add_task(train_model_task, config, file_path, training_id)
220
+
221
+ download_url = f"/bert/download-model/{training_id}"
222
+
223
+ return TrainingResponse(
224
+ message="Training started successfully",
225
+ training_id=training_id,
226
+ status="started",
227
+ download_url=download_url
228
+ )
229
+
230
+ @app.post("/bert/validate")
231
+ async def validate_model(
232
+ file: UploadFile = File(...),
233
+ model_name: str = "bert_model_latest"
234
+ ):
235
+ """Validate a BERT model on uploaded data"""
236
+ if not file.filename.endswith('.csv'):
237
+ raise HTTPException(status_code=400, detail="Only CSV files are allowed")
238
+
239
+ try:
240
+ file_path = UPLOAD_DIR / file.filename
241
+ with file_path.open("wb") as buffer:
242
+ shutil.copyfileobj(file.file, buffer)
243
+
244
+ data_df, label_encoders = load_and_preprocess_data(str(file_path))
245
+
246
+ model_path = MODEL_SAVE_DIR / f"{model_name}.pth"
247
+ if not model_path.exists():
248
+ raise HTTPException(status_code=404, detail="BERT model file not found")
249
+
250
+ num_labels_list = [len(label_encoders[col].classes_) for col in LABEL_COLUMNS]
251
+ metadata_df = data_df[METADATA_COLUMNS] if METADATA_COLUMNS and all(col in data_df.columns for col in METADATA_COLUMNS) else None
252
+
253
+ if metadata_df is not None:
254
+ metadata_dim = metadata_df.shape[1]
255
+ model = BertMultiOutputModel(num_labels_list, metadata_dim=metadata_dim).to(DEVICE)
256
+ else:
257
+ model = BertMultiOutputModel(num_labels_list).to(DEVICE)
258
+
259
+ model.load_state_dict(torch.load(model_path, map_location=DEVICE))
260
+ model.eval()
261
+
262
+ texts = data_df[TEXT_COLUMN]
263
+ labels_array = data_df[LABEL_COLUMNS].values
264
+ tokenizer = get_tokenizer("bert-base-uncased")
265
+
266
+ if metadata_df is not None:
267
+ dataset = ComplianceDatasetWithMetadata(
268
+ texts.tolist(),
269
+ metadata_df.values,
270
+ labels_array,
271
+ tokenizer,
272
+ MAX_LEN
273
+ )
274
+ else:
275
+ dataset = ComplianceDataset(
276
+ texts.tolist(),
277
+ labels_array,
278
+ tokenizer,
279
+ MAX_LEN
280
+ )
281
+
282
+ dataloader = DataLoader(dataset, batch_size=BATCH_SIZE)
283
+ metrics, y_true_list, y_pred_list = evaluate_model(model, dataloader)
284
+ summary_metrics = summarize_metrics(metrics).to_dict()
285
+
286
+ all_probs = predict_probabilities(model, dataloader)
287
+
288
+ predictions = []
289
+ for i, (true_labels, pred_labels) in enumerate(zip(y_true_list, y_pred_list)):
290
+ field = LABEL_COLUMNS[i]
291
+ label_encoder = label_encoders[field]
292
+ true_labels_orig = label_encoder.inverse_transform(true_labels)
293
+ pred_labels_orig = label_encoder.inverse_transform(pred_labels)
294
+
295
+ for true, pred, probs in zip(true_labels_orig, pred_labels_orig, all_probs[i]):
296
+ predictions.append({
297
+ "field": field,
298
+ "true_label": true,
299
+ "predicted_label": pred,
300
+ "probabilities": probs.tolist()
301
+ })
302
+
303
+ return ValidationResponse(
304
+ message="Validation completed successfully",
305
+ metrics=summary_metrics,
306
+ predictions=predictions
307
+ )
308
+
309
+ except Exception as e:
310
+ logger.error(f"Validation failed: {str(e)}")
311
+ raise HTTPException(status_code=500, detail=f"Validation failed: {str(e)}")
312
+ finally:
313
+ if os.path.exists(file_path):
314
+ os.remove(file_path)
315
+
316
+ @app.post("/bert/predict")
317
+ async def predict(request: PredictionRequest):
318
+ """Make predictions on a single transaction"""
319
+ try:
320
+ # Pydantic v2 (pinned in requirements.txt) prefers model_dump() over the deprecated dict()
+ input_data = pd.DataFrame([request.transaction_data.model_dump()])
321
+
322
+ text_input = f"""
323
+ Transaction ID: {input_data['Transaction_Id'].iloc[0]}
324
+ Origin: {input_data['Origin'].iloc[0]}
325
+ Designation: {input_data['Designation'].iloc[0]}
326
+ Keywords: {input_data['Keywords'].iloc[0]}
327
+ Name: {input_data['Name'].iloc[0]}
328
+ SWIFT Tag: {input_data['SWIFT_Tag'].iloc[0]}
329
+ Currency: {input_data['Currency'].iloc[0]}
330
+ Entity: {input_data['Entity'].iloc[0]}
331
+ Message: {input_data['Message'].iloc[0]}
332
+ City: {input_data['City'].iloc[0]}
333
+ Country: {input_data['Country'].iloc[0]}
334
+ State: {input_data['State'].iloc[0]}
335
+ Hit Type: {input_data['Hit_Type'].iloc[0]}
336
+ Record Matching String: {input_data['Record_Matching_String'].iloc[0]}
337
+ WatchList Match String: {input_data['WatchList_Match_String'].iloc[0]}
338
+ Payment Sender: {input_data['Payment_Sender_Name'].iloc[0]}
339
+ Payment Receiver: {input_data['Payment_Reciever_Name'].iloc[0]}
340
+ Swift Message Type: {input_data['Swift_Message_Type'].iloc[0]}
341
+ Text Sanction Data: {input_data['Text_Sanction_Data'].iloc[0]}
342
+ Matched Sanctioned Entity: {input_data['Matched_Sanctioned_Entity'].iloc[0]}
343
+ Red Flag Reason: {input_data['Red_Flag_Reason'].iloc[0]}
344
+ Risk Level: {input_data['Risk_Level'].iloc[0]}
345
+ Risk Score: {input_data['Risk_Score'].iloc[0]}
346
+ CDD Level: {input_data['CDD_Level'].iloc[0]}
347
+ PEP Status: {input_data['PEP_Status'].iloc[0]}
348
+ Sanction Description: {input_data['Sanction_Description'].iloc[0]}
349
+ Checker Notes: {input_data['Checker_Notes'].iloc[0]}
350
+ Sanction Context: {input_data['Sanction_Context'].iloc[0]}
351
+ Maker Action: {input_data['Maker_Action'].iloc[0]}
352
+ Customer Type: {input_data['Customer_Type'].iloc[0]}
353
+ Industry: {input_data['Industry'].iloc[0]}
354
+ Transaction Type: {input_data['Transaction_Type'].iloc[0]}
355
+ Transaction Channel: {input_data['Transaction_Channel'].iloc[0]}
356
+ Geographic Origin: {input_data['Geographic_Origin'].iloc[0]}
357
+ Geographic Destination: {input_data['Geographic_Destination'].iloc[0]}
358
+ Risk Category: {input_data['Risk_Category'].iloc[0]}
359
+ Risk Drivers: {input_data['Risk_Drivers'].iloc[0]}
360
+ Alert Status: {input_data['Alert_Status'].iloc[0]}
361
+ Investigation Outcome: {input_data['Investigation_Outcome'].iloc[0]}
362
+ Source of Funds: {input_data['Source_Of_Funds'].iloc[0]}
363
+ Purpose of Transaction: {input_data['Purpose_Of_Transaction'].iloc[0]}
364
+ Beneficial Owner: {input_data['Beneficial_Owner'].iloc[0]}
365
+ """
366
+
367
+ dataset = ComplianceDataset(
368
+ texts=[text_input],
369
+ labels=[[0] * len(LABEL_COLUMNS)],
370
+ tokenizer=tokenizer,
371
+ max_len=MAX_LEN
372
+ )
373
+
374
+ loader = DataLoader(dataset, batch_size=1, shuffle=False)
375
+ all_probabilities = predict_probabilities(model, loader)
376
+
377
+ label_encoders = load_label_encoders()
378
+
379
+ response = {}
380
+ for i, (col, probs) in enumerate(zip(LABEL_COLUMNS, all_probabilities)):
381
+ pred = np.argmax(probs[0])
382
+ decoded_pred = label_encoders[col].inverse_transform([pred])[0]
383
+
384
+ class_probs = {
385
+ label: float(probs[0][j])
386
+ for j, label in enumerate(label_encoders[col].classes_)
387
+ }
388
+
389
+ response[col] = {
390
+ "prediction": decoded_pred,
391
+ "probabilities": class_probs
392
+ }
393
+
394
+ return response
395
+
396
+ except Exception as e:
397
+ raise HTTPException(status_code=500, detail=str(e))
398
+
399
+ @app.get("/bert/download-model/{model_id}")
400
+ async def download_model(model_id: str):
401
+ """Download a trained model"""
402
+ model_path = MODEL_SAVE_DIR / f"{model_id}.pth"
403
+ if not model_path.exists():
404
+ raise HTTPException(status_code=404, detail="Model not found")
405
+
406
+ return FileResponse(
407
+ path=model_path,
408
+ filename=f"bert_model_{model_id}.pth",
409
+ media_type="application/octet-stream"
410
+ )
411
+
412
+ async def train_model_task(config: TrainingConfig, file_path: str, training_id: str):
413
+ try:
414
+ data_df_original, label_encoders = load_and_preprocess_data(file_path)
415
+ save_label_encoders(label_encoders)
416
+
417
+ train_df, val_df = train_test_split(
418
+ data_df_original,
419
+ test_size=config.test_size,
420
+ random_state=config.random_state,
421
+ stratify=data_df_original[LABEL_COLUMNS[0]]
422
+ )
423
+
424
+ train_texts = train_df[TEXT_COLUMN]
425
+ val_texts = val_df[TEXT_COLUMN]
426
+ train_labels_array = train_df[LABEL_COLUMNS].values
427
+ val_labels_array = val_df[LABEL_COLUMNS].values
428
+
429
+ train_metadata_df = train_df[METADATA_COLUMNS] if METADATA_COLUMNS and all(col in train_df.columns for col in METADATA_COLUMNS) else None
430
+ val_metadata_df = val_df[METADATA_COLUMNS] if METADATA_COLUMNS and all(col in val_df.columns for col in METADATA_COLUMNS) else None
431
+
432
+ num_labels_list = get_num_labels(label_encoders)
433
+ tokenizer = get_tokenizer(config.model_name)
434
+
435
+ if train_metadata_df is not None and val_metadata_df is not None:
436
+ metadata_dim = train_metadata_df.shape[1]
437
+ train_dataset = ComplianceDatasetWithMetadata(
438
+ train_texts.tolist(),
439
+ train_metadata_df.values,
440
+ train_labels_array,
441
+ tokenizer,
442
+ config.max_length
443
+ )
444
+ val_dataset = ComplianceDatasetWithMetadata(
445
+ val_texts.tolist(),
446
+ val_metadata_df.values,
447
+ val_labels_array,
448
+ tokenizer,
449
+ config.max_length
450
+ )
451
+ model = BertMultiOutputModel(num_labels_list, metadata_dim=metadata_dim).to(DEVICE)
452
+ else:
453
+ train_dataset = ComplianceDataset(
454
+ train_texts.tolist(),
455
+ train_labels_array,
456
+ tokenizer,
457
+ config.max_length
458
+ )
459
+ val_dataset = ComplianceDataset(
460
+ val_texts.tolist(),
461
+ val_labels_array,
462
+ tokenizer,
463
+ config.max_length
464
+ )
465
+ model = BertMultiOutputModel(num_labels_list).to(DEVICE)
466
+
467
+ train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
468
+ val_loader = DataLoader(val_dataset, batch_size=config.batch_size)
469
+
470
+ # initialize_criterions() expects unencoded labels plus the fitted encoders
471
+ # (see train_utils.py), so re-read the raw CSV for class-weight computation.
472
+ raw_df = pd.read_csv(file_path).fillna("Unknown")
473
+ criterions = initialize_criterions(raw_df, label_encoders)
474
+ optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)
475
+
476
+ # evaluate_model() returns per-field classification reports (no loss), so the
477
+ # best checkpoint is kept by mean weighted F1 across the label columns.
478
+ best_val_f1 = float('-inf')
479
+ for epoch in range(config.num_epochs):
480
+ training_status["current_epoch"] = epoch + 1
481
+
482
+ # train_model() signature: (model, loader, optimizer, field_criterions, epoch)
483
+ train_loss = train_model(model, train_loader, optimizer, criterions, epoch)
484
+ val_metrics, _, _ = evaluate_model(model, val_loader)
485
+ training_status["current_loss"] = train_loss
486
+
487
+ mean_f1 = summarize_metrics(val_metrics)["F1-Score"].mean()
488
+ if mean_f1 > best_val_f1:
489
+ best_val_f1 = mean_f1
490
+ save_model(model, training_id)
485
+
486
+ training_status.update({
487
+ "is_training": False,
488
+ "end_time": datetime.now().isoformat(),
489
+ "status": "completed",
490
+ "metrics": summarize_metrics(val_metrics).to_dict()
491
+ })
492
+
493
+ except Exception as e:
494
+ logger.error(f"Training failed: {str(e)}")
495
+ training_status.update({
496
+ "is_training": False,
497
+ "end_time": datetime.now().isoformat(),
498
+ "status": "failed",
499
+ "error": str(e)
500
+ })
501
+
502
+ if __name__ == "__main__":
503
+ port = int(os.environ.get("PORT", 7860))
504
+ uvicorn.run(app, host="0.0.0.0", port=port)
config.py ADDED
@@ -0,0 +1,69 @@
1
+ # config.py
2
+
3
+ import torch
4
+ import os
5
+
6
+ # --- Paths ---
7
+ # Adjust DATA_PATH to your actual data location
8
+ DATA_PATH = './data/synthetic_transactions_samples_5000.csv'
9
+ TOKENIZER_PATH = './tokenizer/'
10
+ LABEL_ENCODERS_PATH = './label_encoders.pkl'
11
+ MODEL_SAVE_DIR = './saved_models/'
12
+ PREDICTIONS_SAVE_DIR = './predictions/' # To save predictions for voting ensemble
13
+
14
+ # --- Data Columns ---
15
+ TEXT_COLUMN = "Sanction_Context"
16
+ # Define all your target label columns
17
+ LABEL_COLUMNS = [
18
+ "Red_Flag_Reason",
19
+ "Maker_Action",
20
+ "Escalation_Level",
21
+ "Risk_Category",
22
+ "Risk_Drivers",
23
+ "Investigation_Outcome"
24
+ ]
25
+ # Example metadata columns. Add actual numerical/categorical metadata if available in your CSV.
26
+ # For now, it's an empty list. If you add metadata, ensure these columns exist and are numeric or can be encoded.
27
+ METADATA_COLUMNS = [] # e.g., ["Risk_Score", "Transaction_Amount"]
28
+
29
+ # --- Model Hyperparameters ---
30
+ MAX_LEN = 128 # Maximum sequence length for transformer tokenizers
31
+ BATCH_SIZE = 16 # Batch size for training and evaluation
32
+ LEARNING_RATE = 2e-5 # Learning rate for AdamW optimizer
33
+ NUM_EPOCHS = 3 # Number of training epochs. Adjust based on convergence.
34
+ DROPOUT_RATE = 0.3 # Dropout rate for regularization
35
+
36
+ # --- Device Configuration ---
37
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
38
+
39
+ # --- Specific Model Configurations ---
40
+ BERT_MODEL_NAME = 'bert-base-uncased'
41
+ ROBERTA_MODEL_NAME = 'roberta-base'
42
+ DEBERTA_MODEL_NAME = 'microsoft/deberta-base'
43
+
44
+ # TF-IDF
45
+ TFIDF_MAX_FEATURES = 5000 # Max features for TF-IDF vectorizer
46
+
47
+ # --- Field-Specific Strategy (Conceptual) ---
48
+ # This dictionary provides conceptual strategies for enhancing specific fields.
49
+ # Actual implementation requires adapting the models (e.g., custom loss functions, metadata integration).
50
+ FIELD_STRATEGIES = {
51
+ "Maker_Action": {
52
+ "loss": "focal_loss", # Requires custom Focal Loss implementation
53
+ "enhancements": ["action_templates", "context_prompt_tuning"] # Advanced NLP concepts
54
+ },
55
+ "Risk_Category": {
56
+ "enhancements": ["numerical_metadata", "transaction_patterns"] # Integrate METADATA_COLUMNS
57
+ },
58
+ "Escalation_Level": {
59
+ "enhancements": ["class_balancing", "policy_keyword_patterns"] # Handled by class weights/metadata
60
+ },
61
+ "Investigation_Outcome": {
62
+ "type": "classification_or_generation" # If generation, T5/BART would be needed.
63
+ }
64
+ }
65
+
66
+ # Ensure model save and predictions directories exist
67
+ os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
68
+ os.makedirs(PREDICTIONS_SAVE_DIR, exist_ok=True)
69
+ os.makedirs(TOKENIZER_PATH, exist_ok=True)
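
FIELD_STRATEGIES above mentions focal loss for Maker_Action, but no implementation ships in this commit. A minimal sketch of what such a criterion could look like is shown below; the gamma value is an illustrative default, and wiring it in would mean returning it from initialize_criterions() in train_utils.py for the relevant field instead of CrossEntropyLoss.

import torch
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.Module):
    """Cross-entropy scaled by (1 - p_t)^gamma so easy examples contribute less."""
    def __init__(self, gamma=2.0, weight=None):
        super().__init__()
        self.gamma = gamma
        self.weight = weight  # optional per-class weights, as with CrossEntropyLoss

    def forward(self, logits, targets):
        ce = F.cross_entropy(logits, targets, weight=self.weight, reduction="none")
        p_t = torch.exp(-ce)  # probability assigned to the true class
        return ((1.0 - p_t) ** self.gamma * ce).mean()
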
dataset_utils.py ADDED
@@ -0,0 +1,165 @@
1
+ # dataset_utils.py
2
+
3
+ import pandas as pd
4
+ import torch
5
+ from torch.utils.data import Dataset, DataLoader
6
+ from sklearn.preprocessing import LabelEncoder
7
+ from transformers import BertTokenizer, RobertaTokenizer, DebertaTokenizer
8
+ import pickle
9
+ import os
10
+
11
+ from config import TEXT_COLUMN, LABEL_COLUMNS, MAX_LEN, TOKENIZER_PATH, LABEL_ENCODERS_PATH, METADATA_COLUMNS
12
+
13
+ class ComplianceDataset(Dataset):
14
+ """
15
+ Custom Dataset class for handling text and multi-output labels for PyTorch models.
16
+ """
17
+ def __init__(self, texts, labels, tokenizer, max_len):
18
+ self.texts = texts
19
+ self.labels = labels
20
+ self.tokenizer = tokenizer
21
+ self.max_len = max_len
22
+
23
+ def __len__(self):
24
+ """Returns the total number of samples in the dataset."""
25
+ return len(self.texts)
26
+
27
+ def __getitem__(self, idx):
28
+ """
29
+ Retrieves a sample from the dataset at the given index.
30
+ Tokenizes the text and converts labels to a PyTorch tensor.
31
+ """
32
+ text = str(self.texts[idx])
33
+ # Tokenize the text, padding to max_length and truncating if longer.
34
+ # return_tensors="pt" ensures PyTorch tensors are returned.
35
+ inputs = self.tokenizer(
36
+ text,
37
+ padding='max_length',
38
+ truncation=True,
39
+ max_length=self.max_len,
40
+ return_tensors="pt"
41
+ )
42
+ # Squeeze removes the batch dimension (which is 1 here because we process one sample at a time)
43
+ inputs = {key: val.squeeze(0) for key, val in inputs.items()}
44
+ # Convert labels to a PyTorch long tensor
45
+ labels = torch.tensor(self.labels[idx], dtype=torch.long)
46
+ return inputs, labels
47
+
48
+ class ComplianceDatasetWithMetadata(Dataset):
49
+ """
50
+ Custom Dataset class for handling text, additional numerical metadata, and multi-output labels.
51
+ Used for hybrid models combining text and tabular features.
52
+ """
53
+ def __init__(self, texts, metadata, labels, tokenizer, max_len):
54
+ self.texts = texts
55
+ self.metadata = metadata # Expects metadata as a NumPy array or list of lists
56
+ self.labels = labels
57
+ self.tokenizer = tokenizer
58
+ self.max_len = max_len
59
+
60
+ def __len__(self):
61
+ """Returns the total number of samples in the dataset."""
62
+ return len(self.texts)
63
+
64
+ def __getitem__(self, idx):
65
+ """
66
+ Retrieves a sample, its metadata, and labels from the dataset at the given index.
67
+ Tokenizes text, converts metadata and labels to PyTorch tensors.
68
+ """
69
+ text = str(self.texts[idx])
70
+ inputs = self.tokenizer(
71
+ text,
72
+ padding='max_length',
73
+ truncation=True,
74
+ max_length=self.max_len,
75
+ return_tensors="pt"
76
+ )
77
+ inputs = {key: val.squeeze(0) for key, val in inputs.items()}
78
+ # Convert metadata for the current sample to a float tensor
79
+ metadata = torch.tensor(self.metadata[idx], dtype=torch.float)
80
+ labels = torch.tensor(self.labels[idx], dtype=torch.long)
81
+ return inputs, metadata, labels
82
+
83
+ def load_and_preprocess_data(data_path):
84
+ """
85
+ Loads data from a CSV, fills missing values, and encodes categorical labels.
86
+ Also handles converting specified METADATA_COLUMNS to numeric.
87
+
88
+ Args:
89
+ data_path (str): Path to the CSV data file.
90
+
91
+ Returns:
92
+ tuple: A tuple containing:
93
+ - data (pd.DataFrame): The preprocessed DataFrame.
94
+ - label_encoders (dict): A dictionary of LabelEncoder objects for each label column.
95
+ """
96
+ data = pd.read_csv(data_path)
97
+ data.fillna("Unknown", inplace=True) # Fill any missing text values with "Unknown"
98
+
99
+ # Convert metadata columns to numeric, coercing errors and filling NaNs with 0
100
+ # This ensures metadata is suitable for neural networks.
101
+ for col in METADATA_COLUMNS:
102
+ if col in data.columns:
103
+ data[col] = pd.to_numeric(data[col], errors='coerce').fillna(0) # Fill NaN with 0 or a suitable value
104
+
105
+ label_encoders = {col: LabelEncoder() for col in LABEL_COLUMNS}
106
+ for col in LABEL_COLUMNS:
107
+ # Fit and transform each label column using its respective LabelEncoder
108
+ data[col] = label_encoders[col].fit_transform(data[col])
109
+ return data, label_encoders
110
+
111
+ def get_tokenizer(model_name):
112
+ """
113
+ Returns the appropriate Hugging Face tokenizer based on the model name.
114
+
115
+ Args:
116
+ model_name (str): The name of the pre-trained model (e.g., 'bert-base-uncased').
117
+
118
+ Returns:
119
+ transformers.PreTrainedTokenizer: The initialized tokenizer.
120
+ """
121
+ if "bert" in model_name.lower():
122
+ return BertTokenizer.from_pretrained(model_name)
123
+ elif "roberta" in model_name.lower():
124
+ return RobertaTokenizer.from_pretrained(model_name)
125
+ elif "deberta" in model_name.lower():
126
+ return DebertaTokenizer.from_pretrained(model_name)
127
+ else:
128
+ raise ValueError(f"Unsupported tokenizer for model: {model_name}")
129
+
130
+ def save_label_encoders(label_encoders):
131
+ """
132
+ Saves a dictionary of label encoders to a pickle file.
133
+ This is crucial for decoding predictions back to original labels.
134
+
135
+ Args:
136
+ label_encoders (dict): Dictionary of LabelEncoder objects.
137
+ """
138
+ with open(LABEL_ENCODERS_PATH, "wb") as f:
139
+ pickle.dump(label_encoders, f)
140
+ print(f"Label encoders saved to {LABEL_ENCODERS_PATH}")
141
+
142
+ def load_label_encoders():
143
+ """
144
+ Loads a dictionary of label encoders from a pickle file.
145
+
146
+ Returns:
147
+ dict: Loaded dictionary of LabelEncoder objects.
148
+ """
149
+ with open(LABEL_ENCODERS_PATH, "rb") as f:
150
+ return pickle.load(f)
151
+ print(f"Label encoders loaded from {LABEL_ENCODERS_PATH}")
152
+
153
+
154
+ def get_num_labels(label_encoders):
155
+ """
156
+ Returns a list containing the number of unique classes for each label column.
157
+ This list is used to define the output dimensions of the model's classification heads.
158
+
159
+ Args:
160
+ label_encoders (dict): Dictionary of LabelEncoder objects.
161
+
162
+ Returns:
163
+ list: A list of integers, where each integer is the number of classes for a label.
164
+ """
165
+ return [len(label_encoders[col].classes_) for col in LABEL_COLUMNS]
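
Taken together, these helpers are meant to be used roughly as follows. This is a sketch: the CSV path matches DATA_PATH from config.py and must exist locally, and the batch size and column names also come from config.py.

from torch.utils.data import DataLoader
from config import TEXT_COLUMN, LABEL_COLUMNS, MAX_LEN, BATCH_SIZE
from dataset_utils import (load_and_preprocess_data, get_tokenizer,
                           save_label_encoders, ComplianceDataset)

# Load the CSV, fill missing values and label-encode the target columns.
data, label_encoders = load_and_preprocess_data("data/synthetic_transactions_samples_5000.csv")
save_label_encoders(label_encoders)  # persisted so predictions can be decoded later

tokenizer = get_tokenizer("bert-base-uncased")
dataset = ComplianceDataset(
    texts=data[TEXT_COLUMN].tolist(),
    labels=data[LABEL_COLUMNS].values,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
inputs, labels = next(iter(loader))  # inputs: dict of tensors, labels: (batch, n_fields)
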
docker-compose.yml ADDED
@@ -0,0 +1,18 @@
1
+ version: '3.8'
2
+
3
+ services:
4
+ bert-api:
5
+ build: .
6
+ ports:
7
+ - "7860:7860"
8
+ volumes:
9
+ - ../saved_models:/app/saved_models
10
+ - ../tokenizer:/app/tokenizer
11
+ - ../predictions:/app/predictions
12
+ - ../label_encoders.pkl:/app/label_encoders.pkl
13
+ - ../.cache:/app/.cache
14
+ environment:
15
+ - PYTHONUNBUFFERED=1
16
+ - TRANSFORMERS_CACHE=/app/.cache
17
+ - PORT=7860
18
+ restart: unless-stopped
label_encoders.pkl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c336fd07858af76d40c7200de1a769099abeec25d4f48b999351318680d4e4d6
3
+ size 2047
models/__pycache__/bert_model.cpython-311.pyc ADDED
Binary file (3.29 kB).
 
models/__pycache__/deberta_model.cpython-311.pyc ADDED
Binary file (3.15 kB).
 
models/__pycache__/parallel_bert_deberta.cpython-311.pyc ADDED
Binary file (6.45 kB).
 
models/__pycache__/roberta_model.cpython-311.pyc ADDED
Binary file (3.18 kB).
 
models/__pycache__/text_and_metadata_model.cpython-311.pyc ADDED
Binary file (4.09 kB).
 
models/bert_model.py ADDED
@@ -0,0 +1,59 @@
1
+ # models/bert_model.py
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from transformers import BertModel
6
+ from config import DROPOUT_RATE, BERT_MODEL_NAME # Import BERT_MODEL_NAME from config
7
+
8
+ class BertMultiOutputModel(nn.Module):
9
+ """
10
+ BERT-based model for multi-output classification.
11
+ It uses a pre-trained BERT model as its backbone and adds a dropout layer
12
+ followed by separate linear classification heads for each target label.
13
+ """
14
+ # Statically set tokenizer name for easy access in main.py
15
+ tokenizer_name = BERT_MODEL_NAME
16
+
17
+ def __init__(self, num_labels, metadata_dim=0):
18
+ """
19
+ Initializes the BertMultiOutputModel.
20
+
21
+ Args:
22
+ num_labels (list): A list where each element is the number of classes
23
+ for a corresponding label column.
+ metadata_dim (int, optional): Width of extra numeric metadata features
+ concatenated to BERT's pooled output; 0 (default) disables the metadata branch.
24
+ """
25
+ super(BertMultiOutputModel, self).__init__()
26
+ # Load the pre-trained BERT model.
27
+ # BertModel provides contextual embeddings and a pooled output for classification.
28
+ self.bert = BertModel.from_pretrained(BERT_MODEL_NAME)
29
+ self.dropout = nn.Dropout(DROPOUT_RATE) # Dropout layer for regularization
+ self.metadata_dim = metadata_dim # >0 only for the hybrid text + metadata variant
30
+
31
+ # Create a list of classification heads, one for each label column.
32
+ # Each head is a linear layer mapping BERT's pooled output size to the number of classes for that label.
33
+ self.classifiers = nn.ModuleList([
34
+ nn.Linear(self.bert.config.hidden_size + metadata_dim, n_classes) for n_classes in num_labels
35
+ ])
36
+
37
+ def forward(self, input_ids, attention_mask, metadata=None):
38
+ """
39
+ Performs the forward pass of the model.
40
+
41
+ Args:
42
+ input_ids (torch.Tensor): Tensor of token IDs (from tokenizer).
43
+ attention_mask (torch.Tensor): Tensor indicating attention (from tokenizer).
+ metadata (torch.Tensor, optional): Numeric metadata features; only used
+ when the model was constructed with metadata_dim > 0.
44
+
45
+ Returns:
46
+ list: A list of logit tensors, one for each classification head.
47
+ Each tensor has shape (batch_size, num_classes_for_that_label).
48
+ """
49
+ # Pass input_ids and attention_mask through BERT.
50
+ # .pooler_output typically represents the hidden state of the [CLS] token,
51
+ # processed through a linear layer and tanh activation, often used for classification.
52
+ pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
53
+
54
+ # Apply dropout for regularization
55
+ pooled_output = self.dropout(pooled_output)
+
+ # For the hybrid variant (metadata_dim > 0, as requested by app.py and
+ # train_utils.load_model_state), append the metadata features.
+ if metadata is not None and self.metadata_dim > 0:
+ pooled_output = torch.cat([pooled_output, metadata.float()], dim=1)
56
+
57
+ # Pass the pooled output through each classification head.
58
+ # The result is a list of logits (raw scores before softmax/sigmoid) for each label.
59
+ return [classifier(pooled_output) for classifier in self.classifiers]
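
As a quick sanity check, the model can be instantiated and run on a dummy batch. The class counts below are made-up values, and downloading the pre-trained BERT weights requires network access (or a populated TRANSFORMERS_CACHE, as set in docker-compose.yml).

import torch
from dataset_utils import get_tokenizer
from models.bert_model import BertMultiOutputModel

num_labels_list = [4, 6, 3]  # illustrative class counts per label column
model = BertMultiOutputModel(num_labels_list)
tokenizer = get_tokenizer("bert-base-uncased")

enc = tokenizer(["Example sanction context text"], padding="max_length",
                truncation=True, max_length=128, return_tensors="pt")
with torch.no_grad():
    logits_per_field = model(enc["input_ids"], enc["attention_mask"])

# One logits tensor per label column, each of shape (batch, n_classes)
print([t.shape for t in logits_per_field])
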
requirements.txt ADDED
@@ -0,0 +1,13 @@
1
+ fastapi==0.104.1
2
+ uvicorn==0.24.0
3
+ pydantic==2.4.2
4
+ torch==2.1.0
5
+ transformers==4.35.0
6
+ pandas==2.1.2
7
+ numpy==1.24.3
8
+ scikit-learn==1.3.2
9
+ python-multipart==0.0.6
10
+ python-jose==3.3.0
11
+ passlib==1.7.4
12
+ bcrypt==4.0.1
13
+ python-dotenv==1.0.0
train_utils.py ADDED
@@ -0,0 +1,310 @@
1
+ # train_utils.py
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch.optim import AdamW
6
+ from sklearn.metrics import classification_report
7
+ from sklearn.utils.class_weight import compute_class_weight
8
+ import numpy as np
9
+ from tqdm import tqdm
10
+ import pandas as pd
11
+ import os
12
+ import joblib
13
+
14
+ from config import DEVICE, LABEL_COLUMNS, NUM_EPOCHS, LEARNING_RATE, MODEL_SAVE_DIR
15
+
16
+ def get_class_weights(data_df, field, label_encoder):
17
+ """
18
+ Computes balanced class weights for a given target field.
19
+ These weights can be used in the loss function to mitigate class imbalance.
20
+
21
+ Args:
22
+ data_df (pd.DataFrame): The DataFrame containing the original (unencoded) label data.
23
+ field (str): The name of the label column for which to compute weights.
24
+ label_encoder (sklearn.preprocessing.LabelEncoder): The label encoder fitted for this field.
25
+
26
+ Returns:
27
+ torch.Tensor: A tensor of class weights for the specified field.
28
+ """
29
+ # Get the original labels for the specified field
30
+ y = data_df[field].values
31
+ # Use label_encoder.transform directly - it will handle unseen labels
32
+ try:
33
+ y_encoded = label_encoder.transform(y)
34
+ except ValueError as e:
35
+ print(f"Warning: {e}")
36
+ print(f"Using only seen labels for class weights calculation")
37
+ # Filter out unseen labels
38
+ seen_labels = set(label_encoder.classes_)
39
+ y_filtered = [label for label in y if label in seen_labels]
40
+ y_encoded = label_encoder.transform(y_filtered)
41
+
42
+ # Ensure y_encoded is integer type
43
+ y_encoded = y_encoded.astype(int)
44
+
45
+ # Initialize counts for all possible classes
46
+ n_classes = len(label_encoder.classes_)
47
+ class_counts = np.zeros(n_classes, dtype=int)
48
+
49
+ # Count occurrences of each class
50
+ for i in range(n_classes):
51
+ class_counts[i] = np.sum(y_encoded == i)
52
+
53
+ # Calculate weights for all classes
54
+ total_samples = len(y_encoded)
55
+ class_weights = np.ones(n_classes) # Default weight of 1 for unseen classes
56
+ seen_classes = class_counts > 0
57
+ if np.any(seen_classes):
58
+ class_weights[seen_classes] = total_samples / (np.sum(seen_classes) * class_counts[seen_classes])
59
+
60
+ return torch.tensor(class_weights, dtype=torch.float)
61
+
62
+ def initialize_criterions(data_df, label_encoders):
63
+ """
64
+ Initializes CrossEntropyLoss criteria for each label column, applying class weights.
65
+
66
+ Args:
67
+ data_df (pd.DataFrame): The original (unencoded) DataFrame. Used to compute class weights.
68
+ label_encoders (dict): Dictionary of LabelEncoder objects.
69
+
70
+ Returns:
71
+ dict: A dictionary where keys are label column names and values are
72
+ initialized `torch.nn.CrossEntropyLoss` objects.
73
+ """
74
+ field_criterions = {}
75
+ for field in LABEL_COLUMNS:
76
+ # Get class weights for the current field
77
+ weights = get_class_weights(data_df, field, label_encoders[field])
78
+ # Initialize CrossEntropyLoss with the computed weights and move to the device
79
+ field_criterions[field] = torch.nn.CrossEntropyLoss(weight=weights.to(DEVICE))
80
+ return field_criterions
81
+
82
+ def train_model(model, loader, optimizer, field_criterions, epoch):
83
+ """
84
+ Trains the given PyTorch model for one epoch.
85
+
86
+ Args:
87
+ model (torch.nn.Module): The model to train.
88
+ loader (torch.utils.data.DataLoader): DataLoader for training data.
89
+ optimizer (torch.optim.Optimizer): Optimizer for model parameters.
90
+ field_criterions (dict): Dictionary of loss functions for each label.
91
+ epoch (int): Current epoch number (for progress bar description).
92
+
93
+ Returns:
94
+ float: Average training loss for the epoch.
95
+ """
96
+ model.train() # Set the model to training mode
97
+ total_loss = 0
98
+ # Use tqdm for a progress bar during training
99
+ tqdm_loader = tqdm(loader, desc=f"Epoch {epoch + 1} Training")
100
+
101
+ for batch in tqdm_loader:
102
+ # Unpack batch based on whether it contains metadata
103
+ if len(batch) == 2: # Text-only models (inputs, labels)
104
+ inputs, labels = batch
105
+ input_ids = inputs['input_ids'].to(DEVICE)
106
+ attention_mask = inputs['attention_mask'].to(DEVICE)
107
+ labels = labels.to(DEVICE)
108
+ # Forward pass through the model
109
+ outputs = model(input_ids, attention_mask)
110
+ elif len(batch) == 3: # Text + Metadata models (inputs, metadata, labels)
111
+ inputs, metadata, labels = batch
112
+ input_ids = inputs['input_ids'].to(DEVICE)
113
+ attention_mask = inputs['attention_mask'].to(DEVICE)
114
+ metadata = metadata.to(DEVICE)
115
+ labels = labels.to(DEVICE)
116
+ # Forward pass through the hybrid model
117
+ outputs = model(input_ids, attention_mask, metadata)
118
+ else:
119
+ raise ValueError("Unsupported batch format. Expected 2 or 3 items in batch.")
120
+
121
+ loss = 0
122
+ # Calculate total loss by summing loss for each label column
123
+ # `outputs` is a list of logits, one for each label column
124
+ for i, output_logits in enumerate(outputs):
125
+ # `labels[:, i]` gets the true labels for the i-th label column
126
+ # `field_criterions[LABEL_COLUMNS[i]]` selects the appropriate loss function
127
+ loss += field_criterions[LABEL_COLUMNS[i]](output_logits, labels[:, i])
128
+
129
+ optimizer.zero_grad() # Clear previous gradients
130
+ loss.backward() # Backpropagation
131
+ optimizer.step() # Update model parameters
132
+ total_loss += loss.item() # Accumulate loss
133
+ tqdm_loader.set_postfix(loss=loss.item()) # Update progress bar with current batch loss
134
+
135
+ return total_loss / len(loader) # Return average loss for the epoch
136
+
137
+ def evaluate_model(model, loader):
138
+ """
139
+ Evaluates the given PyTorch model on a validation/test set.
140
+
141
+ Args:
142
+ model (torch.nn.Module): The model to evaluate.
143
+ loader (torch.utils.data.DataLoader): DataLoader for evaluation data.
144
+
145
+ Returns:
146
+ tuple: A tuple containing:
147
+ - reports (dict): Classification reports (dict format) for each label column.
148
+ - truths (list): List of true label arrays for each label column.
149
+ - predictions (list): List of predicted label arrays for each label column.
150
+ """
151
+ model.eval() # Set the model to evaluation mode (disables dropout, batch norm updates, etc.)
152
+ # Initialize lists to store predictions and true labels for each output head
153
+ predictions = [[] for _ in range(len(LABEL_COLUMNS))]
154
+ truths = [[] for _ in range(len(LABEL_COLUMNS))]
155
+
156
+ with torch.no_grad(): # Disable gradient calculations during evaluation for efficiency
157
+ for batch in tqdm(loader, desc="Evaluation"):
158
+ if len(batch) == 2:
159
+ inputs, labels = batch
160
+ input_ids = inputs['input_ids'].to(DEVICE)
161
+ attention_mask = inputs['attention_mask'].to(DEVICE)
162
+ labels = labels.to(DEVICE)
163
+ outputs = model(input_ids, attention_mask)
164
+ elif len(batch) == 3:
165
+ inputs, metadata, labels = batch
166
+ input_ids = inputs['input_ids'].to(DEVICE)
167
+ attention_mask = inputs['attention_mask'].to(DEVICE)
168
+ metadata = metadata.to(DEVICE)
169
+ labels = labels.to(DEVICE)
170
+ outputs = model(input_ids, attention_mask, metadata)
171
+ else:
172
+ raise ValueError("Unsupported batch format.")
173
+
174
+ for i, output_logits in enumerate(outputs):
175
+ # Get the predicted class by taking the argmax of the logits
176
+ preds = torch.argmax(output_logits, dim=1).cpu().numpy()
177
+ predictions[i].extend(preds)
178
+ # Get the true labels for the current output head
179
+ truths[i].extend(labels[:, i].cpu().numpy())
180
+
181
+ reports = {}
182
+ # Generate classification report for each label column
183
+ for i, col in enumerate(LABEL_COLUMNS):
184
+ try:
185
+ # `zero_division=0` handles cases where a class might have no true or predicted samples
186
+ reports[col] = classification_report(truths[i], predictions[i], output_dict=True, zero_division=0)
187
+ except ValueError:
188
+ # Handle cases where a label might not appear in the validation set,
189
+ # which could cause classification_report to fail.
190
+ print(f"Warning: Could not generate classification report for {col}. Skipping.")
191
+ reports[col] = {'accuracy': 0, 'weighted avg': {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0}}
192
+ return reports, truths, predictions
193
+
194
+ def summarize_metrics(metrics):
195
+ """
196
+ Summarizes classification reports into a readable Pandas DataFrame.
197
+
198
+ Args:
199
+ metrics (dict): Dictionary of classification reports, as returned by `evaluate_model`.
200
+
201
+ Returns:
202
+ pd.DataFrame: A DataFrame summarizing precision, recall, f1-score, accuracy, and support for each field.
203
+ """
204
+ summary = []
205
+ for field, report in metrics.items():
206
+ # Safely get metrics, defaulting to 0 if not present (e.g., for empty reports)
207
+ precision = report['weighted avg']['precision'] if 'weighted avg' in report else 0
208
+ recall = report['weighted avg']['recall'] if 'weighted avg' in report else 0
209
+ f1 = report['weighted avg']['f1-score'] if 'weighted avg' in report else 0
210
+ support = report['weighted avg']['support'] if 'weighted avg' in report else 0
211
+ accuracy = report['accuracy'] if 'accuracy' in report else 0 # Accuracy is usually top-level
212
+ summary.append({
213
+ "Field": field,
214
+ "Precision": precision,
215
+ "Recall": recall,
216
+ "F1-Score": f1,
217
+ "Accuracy": accuracy,
218
+ "Support": support
219
+ })
220
+ return pd.DataFrame(summary)
221
+
222
+ def save_model(model, model_name, save_format='pth'):
223
+ """
224
+ Saves the state dictionary of a PyTorch model.
225
+
226
+ Args:
227
+ model (torch.nn.Module): The trained PyTorch model.
228
+ model_name (str): A descriptive name for the model (used for filename).
229
+ save_format (str): Format to save the model in ('pth' for PyTorch models, 'pickle' for traditional ML models).
230
+ """
231
+ # Construct the save path dynamically relative to the project root
232
+ if save_format == 'pth':
233
+ model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}_model.pth")
234
+ torch.save(model.state_dict(), model_path)
235
+ elif save_format == 'pickle':
236
+ model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
237
+ joblib.dump(model, model_path)
238
+ else:
239
+ raise ValueError(f"Unsupported save format: {save_format}")
240
+
241
+ print(f"Model saved to {model_path}")
242
+
243
+ def load_model_state(model, model_name, model_class, num_labels, metadata_dim=0):
244
+ """
245
+ Loads the state dictionary into a PyTorch model.
246
+
247
+ Args:
248
+ model (torch.nn.Module): An initialized model instance (architecture).
249
+ model_name (str): The name of the model to load.
250
+ model_class (class): The class of the model (e.g., BertMultiOutputModel).
251
+ num_labels (list): List of number of classes for each label.
252
+ metadata_dim (int): Dimensionality of metadata features, if applicable (default 0 for text-only).
253
+
254
+ Returns:
255
+ torch.nn.Module: The model with loaded state_dict, moved to the correct device, and set to eval mode.
256
+ """
257
+ model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}_model.pth")
258
+ if not os.path.exists(model_path):
259
+ print(f"Warning: Model file not found at {model_path}. Returning a newly initialized model instance.")
260
+ # Re-initialize the model if not found, to ensure it has the correct architecture
261
+ if metadata_dim > 0:
262
+ return model_class(num_labels, metadata_dim=metadata_dim).to(DEVICE)
263
+ else:
264
+ return model_class(num_labels).to(DEVICE)
265
+
266
+ model.load_state_dict(torch.load(model_path, map_location=DEVICE))
267
+ model.to(DEVICE)
268
+ model.eval() # Set to evaluation mode after loading
269
+ print(f"Model loaded from {model_path}")
270
+ return model
271
+
272
+ def predict_probabilities(model, loader):
273
+ """
274
+ Generates prediction probabilities for each label for a given model.
275
+ This is used for confidence scoring and feeding into a voting ensemble.
276
+
277
+ Args:
278
+ model (torch.nn.Module): The trained PyTorch model.
279
+ loader (torch.utils.data.DataLoader): DataLoader for the data to predict on.
280
+
281
+ Returns:
282
+ list: A list of lists of numpy arrays. Each inner list corresponds to a label column,
283
+ containing the softmax probabilities for each sample for that label.
284
+ """
285
+ model.eval() # Set to evaluation mode
286
+ # List to store probabilities for each output head
287
+ all_probabilities = [[] for _ in range(len(LABEL_COLUMNS))]
288
+
289
+ with torch.no_grad():
290
+ for batch in tqdm(loader, desc="Predicting Probabilities"):
291
+ # Unpack batch, ignoring labels as we only need inputs
292
+ if len(batch) == 2:
293
+ inputs, _ = batch
294
+ input_ids = inputs['input_ids'].to(DEVICE)
295
+ attention_mask = inputs['attention_mask'].to(DEVICE)
296
+ outputs = model(input_ids, attention_mask)
297
+ elif len(batch) == 3:
298
+ inputs, metadata, _ = batch
299
+ input_ids = inputs['input_ids'].to(DEVICE)
300
+ attention_mask = inputs['attention_mask'].to(DEVICE)
301
+ metadata = metadata.to(DEVICE)
302
+ outputs = model(input_ids, attention_mask, metadata)
303
+ else:
304
+ raise ValueError("Unsupported batch format.")
305
+
306
+ for i, out_logits in enumerate(outputs):
307
+ # Apply softmax to logits to get probabilities
308
+ probs = torch.softmax(out_logits, dim=1).cpu().numpy()
309
+ all_probabilities[i].extend(probs)
310
+ return all_probabilities
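
Outside the API, the same utilities can be driven from a short script. The sketch below mirrors what app.py's train_model_task does; it assumes DATA_PATH from config.py points at a real CSV and, for brevity, evaluates on the training loader rather than a held-out split.

import pandas as pd
import torch
from torch.utils.data import DataLoader
from config import (DATA_PATH, TEXT_COLUMN, LABEL_COLUMNS, MAX_LEN, BATCH_SIZE,
                    NUM_EPOCHS, LEARNING_RATE, DEVICE)
from dataset_utils import load_and_preprocess_data, get_tokenizer, get_num_labels, ComplianceDataset
from models.bert_model import BertMultiOutputModel
from train_utils import (initialize_criterions, train_model, evaluate_model,
                         summarize_metrics, save_model)

# Encoded frame for the dataset, raw frame for class-weight computation.
data, label_encoders = load_and_preprocess_data(DATA_PATH)
raw = pd.read_csv(DATA_PATH).fillna("Unknown")

tokenizer = get_tokenizer("bert-base-uncased")
dataset = ComplianceDataset(data[TEXT_COLUMN].tolist(), data[LABEL_COLUMNS].values, tokenizer, MAX_LEN)
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

model = BertMultiOutputModel(get_num_labels(label_encoders)).to(DEVICE)
criterions = initialize_criterions(raw, label_encoders)
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

for epoch in range(NUM_EPOCHS):
    avg_loss = train_model(model, loader, optimizer, criterions, epoch)
    print(f"epoch {epoch + 1}: train loss {avg_loss:.4f}")

reports, truths, preds = evaluate_model(model, loader)  # ideally a held-out split
print(summarize_metrics(reports))
save_model(model, "bert")  # writes saved_models/bert_model.pth
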