# config.py import torch import os # --- Paths --- # Adjust DATA_PATH to your actual data location DATA_PATH = './data/synthetic_transactions_samples_5000.csv' TOKENIZER_PATH = './tokenizer/' LABEL_ENCODERS_PATH = './label_encoders.pkl' MODEL_SAVE_DIR = './saved_models/' PREDICTIONS_SAVE_DIR = './predictions/' # To save predictions for voting ensemble # --- Data Columns --- TEXT_COLUMN = "Sanction_Context" # Define all your target label columns LABEL_COLUMNS = [ "Red_Flag_Reason", "Maker_Action", "Escalation_Level", "Risk_Category", "Risk_Drivers", "Investigation_Outcome" ] # Example metadata columns. Add actual numerical/categorical metadata if available in your CSV. # For now, it's an empty list. If you add metadata, ensure these columns exist and are numeric or can be encoded. METADATA_COLUMNS = [] # e.g., ["Risk_Score", "Transaction_Amount"] # --- Model Hyperparameters --- MAX_LEN = 128 # Maximum sequence length for transformer tokenizers BATCH_SIZE = 16 # Batch size for training and evaluation LEARNING_RATE = 2e-5 # Learning rate for AdamW optimizer NUM_EPOCHS = 3 # Number of training epochs. Adjust based on convergence. DROPOUT_RATE = 0.3 # Dropout rate for regularization # --- Device Configuration --- DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") # --- Specific Model Configurations --- ROBERTA_MODEL_NAME = 'roberta-base' BERT_MODEL_NAME = 'bert-base-uncased' DEBERTA_MODEL_NAME = 'microsoft/deberta-base' # TF-IDF TFIDF_MAX_FEATURES = 5000 # Max features for TF-IDF vectorizer # --- Field-Specific Strategy (Conceptual) --- # This dictionary provides conceptual strategies for enhancing specific fields. # Actual implementation requires adapting the models (e.g., custom loss functions, metadata integration). FIELD_STRATEGIES = { "Maker_Action": { "loss": "focal_loss", # Requires custom Focal Loss implementation "enhancements": ["action_templates", "context_prompt_tuning"] # Advanced NLP concepts }, "Risk_Category": { "enhancements": ["numerical_metadata", "transaction_patterns"] # Integrate METADATA_COLUMNS }, "Escalation_Level": { "enhancements": ["class_balancing", "policy_keyword_patterns"] # Handled by class weights/metadata }, "Investigation_Outcome": { "type": "classification_or_generation" # If generation, T5/BART would be needed. } } # Ensure model save and predictions directories exist os.makedirs(MODEL_SAVE_DIR, exist_ok=True) os.makedirs(PREDICTIONS_SAVE_DIR, exist_ok=True) os.makedirs(TOKENIZER_PATH, exist_ok=True)