namanpenguin committed on
Commit 64f09ee · verified · 1 Parent(s): 55fd36f

Update config.py

Files changed (1)
  1. config.py +68 -68
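The only substantive change: TOKENIZER_PATH, LABEL_ENCODERS_PATH, and MODEL_SAVE_DIR move from relative './' paths to absolute '/app/' paths. Since '/app' is a common working directory inside container images, this likely prepares the config for a containerized deployment; DATA_PATH and PREDICTIONS_SAVE_DIR stay relative.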
config.py CHANGED
@@ -1,69 +1,69 @@
- # config.py
-
- import torch
- import os
-
- # --- Paths ---
- # Adjust DATA_PATH to your actual data location
- DATA_PATH = './data/synthetic_transactions_samples_5000.csv'
- TOKENIZER_PATH = './tokenizer/'
- LABEL_ENCODERS_PATH = './label_encoders.pkl'
- MODEL_SAVE_DIR = './saved_models/'
- PREDICTIONS_SAVE_DIR = './predictions/' # To save predictions for voting ensemble
-
- # --- Data Columns ---
- TEXT_COLUMN = "Sanction_Context"
- # Define all your target label columns
- LABEL_COLUMNS = [
-     "Red_Flag_Reason",
-     "Maker_Action",
-     "Escalation_Level",
-     "Risk_Category",
-     "Risk_Drivers",
-     "Investigation_Outcome"
- ]
- # Example metadata columns. Add actual numerical/categorical metadata if available in your CSV.
- # For now, it's an empty list. If you add metadata, ensure these columns exist and are numeric or can be encoded.
- METADATA_COLUMNS = [] # e.g., ["Risk_Score", "Transaction_Amount"]
-
- # --- Model Hyperparameters ---
- MAX_LEN = 128 # Maximum sequence length for transformer tokenizers
- BATCH_SIZE = 16 # Batch size for training and evaluation
- LEARNING_RATE = 2e-5 # Learning rate for AdamW optimizer
- NUM_EPOCHS = 3 # Number of training epochs. Adjust based on convergence.
- DROPOUT_RATE = 0.3 # Dropout rate for regularization
-
- # --- Device Configuration ---
- DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
- # --- Specific Model Configurations ---
- BERT_MODEL_NAME = 'bert-base-uncased'
- ROBERTA_MODEL_NAME = 'roberta-base'
- DEBERTA_MODEL_NAME = 'microsoft/deberta-base'
-
- # TF-IDF
- TFIDF_MAX_FEATURES = 5000 # Max features for TF-IDF vectorizer
-
- # --- Field-Specific Strategy (Conceptual) ---
- # This dictionary provides conceptual strategies for enhancing specific fields.
- # Actual implementation requires adapting the models (e.g., custom loss functions, metadata integration).
- FIELD_STRATEGIES = {
-     "Maker_Action": {
-         "loss": "focal_loss", # Requires custom Focal Loss implementation
-         "enhancements": ["action_templates", "context_prompt_tuning"] # Advanced NLP concepts
-     },
-     "Risk_Category": {
-         "enhancements": ["numerical_metadata", "transaction_patterns"] # Integrate METADATA_COLUMNS
-     },
-     "Escalation_Level": {
-         "enhancements": ["class_balancing", "policy_keyword_patterns"] # Handled by class weights/metadata
-     },
-     "Investigation_Outcome": {
-         "type": "classification_or_generation" # If generation, T5/BART would be needed.
-     }
- }
-
- # Ensure model save and predictions directories exist
- os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
- os.makedirs(PREDICTIONS_SAVE_DIR, exist_ok=True)
+ # config.py
+
+ import torch
+ import os
+
+ # --- Paths ---
+ # Adjust DATA_PATH to your actual data location
+ DATA_PATH = './data/synthetic_transactions_samples_5000.csv'
+ TOKENIZER_PATH = '/app/tokenizer/'
+ LABEL_ENCODERS_PATH = '/app/label_encoders.pkl'
+ MODEL_SAVE_DIR = '/app/saved_models/'
+ PREDICTIONS_SAVE_DIR = './predictions/' # To save predictions for voting ensemble
+
+ # --- Data Columns ---
+ TEXT_COLUMN = "Sanction_Context"
+ # Define all your target label columns
+ LABEL_COLUMNS = [
+     "Red_Flag_Reason",
+     "Maker_Action",
+     "Escalation_Level",
+     "Risk_Category",
+     "Risk_Drivers",
+     "Investigation_Outcome"
+ ]
+ # Example metadata columns. Add actual numerical/categorical metadata if available in your CSV.
+ # For now, it's an empty list. If you add metadata, ensure these columns exist and are numeric or can be encoded.
+ METADATA_COLUMNS = [] # e.g., ["Risk_Score", "Transaction_Amount"]
+
+ # --- Model Hyperparameters ---
+ MAX_LEN = 128 # Maximum sequence length for transformer tokenizers
+ BATCH_SIZE = 16 # Batch size for training and evaluation
+ LEARNING_RATE = 2e-5 # Learning rate for AdamW optimizer
+ NUM_EPOCHS = 3 # Number of training epochs. Adjust based on convergence.
+ DROPOUT_RATE = 0.3 # Dropout rate for regularization
+
+ # --- Device Configuration ---
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # --- Specific Model Configurations ---
+ BERT_MODEL_NAME = 'bert-base-uncased'
+ ROBERTA_MODEL_NAME = 'roberta-base'
+ DEBERTA_MODEL_NAME = 'microsoft/deberta-base'
+
+ # TF-IDF
+ TFIDF_MAX_FEATURES = 5000 # Max features for TF-IDF vectorizer
+
+ # --- Field-Specific Strategy (Conceptual) ---
+ # This dictionary provides conceptual strategies for enhancing specific fields.
+ # Actual implementation requires adapting the models (e.g., custom loss functions, metadata integration).
+ FIELD_STRATEGIES = {
+     "Maker_Action": {
+         "loss": "focal_loss", # Requires custom Focal Loss implementation
+         "enhancements": ["action_templates", "context_prompt_tuning"] # Advanced NLP concepts
+     },
+     "Risk_Category": {
+         "enhancements": ["numerical_metadata", "transaction_patterns"] # Integrate METADATA_COLUMNS
+     },
+     "Escalation_Level": {
+         "enhancements": ["class_balancing", "policy_keyword_patterns"] # Handled by class weights/metadata
+     },
+     "Investigation_Outcome": {
+         "type": "classification_or_generation" # If generation, T5/BART would be needed.
+     }
+ }
+
+ # Ensure model save and predictions directories exist
+ os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
+ os.makedirs(PREDICTIONS_SAVE_DIR, exist_ok=True)
  os.makedirs(TOKENIZER_PATH, exist_ok=True)
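
The FIELD_STRATEGIES entry for Maker_Action flags "focal_loss" as requiring a custom implementation. A minimal PyTorch sketch of what that could look like is below; the class name and the gamma default are illustrative assumptions, not code from this repo:

import torch
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.Module):
    # Focal loss (Lin et al., 2017): scales per-sample cross-entropy by
    # (1 - p_t) ** gamma, so confident, easy examples contribute less and
    # training concentrates on hard or rare classes.
    def __init__(self, gamma=2.0):
        super().__init__()
        self.gamma = gamma  # focusing parameter; gamma=0 recovers plain cross-entropy

    def forward(self, logits, targets):
        # Per-sample cross-entropy, left unreduced so each sample can be reweighted
        ce = F.cross_entropy(logits, targets, reduction='none')
        pt = torch.exp(-ce)  # predicted probability of the true class
        return ((1.0 - pt) ** self.gamma * ce).mean()

Wiring it in would amount to swapping the Maker_Action head's nn.CrossEntropyLoss for FocalLoss(gamma=2.0) in the training loop. Likewise, the "class_balancing" note for Escalation_Level is typically realized as per-class weights in the loss; one common recipe, assuming scikit-learn is available and that the weight order matches the label encoder's (sorted) class order:

import numpy as np
import pandas as pd
import torch
from sklearn.utils.class_weight import compute_class_weight
from config import DATA_PATH

df = pd.read_csv(DATA_PATH)
classes = np.unique(df["Escalation_Level"])
# 'balanced' weights are inversely proportional to class frequency
weights = compute_class_weight(class_weight='balanced', classes=classes, y=df["Escalation_Level"])
escalation_loss = torch.nn.CrossEntropyLoss(weight=torch.tensor(weights, dtype=torch.float32))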