namanpenguin committed on
Commit 64f09ee · verified · 1 Parent(s): 55fd36f

Update config.py

Files changed (1)
  1. config.py +68 -68
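The only substantive change: TOKENIZER_PATH, LABEL_ENCODERS_PATH, and MODEL_SAVE_DIR move from relative './' paths to absolute '/app/' paths. Since '/app' is a common working directory inside container images, this likely prepares the config for a containerized deployment; DATA_PATH and PREDICTIONS_SAVE_DIR stay relative.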
config.py CHANGED
@@ -1,69 +1,69 @@
- # config.py
-
- import torch
- import os
-
- # --- Paths ---
- # Adjust DATA_PATH to your actual data location
- DATA_PATH = './data/synthetic_transactions_samples_5000.csv'
- TOKENIZER_PATH = './tokenizer/'
- LABEL_ENCODERS_PATH = './label_encoders.pkl'
- MODEL_SAVE_DIR = './saved_models/'
- PREDICTIONS_SAVE_DIR = './predictions/' # To save predictions for voting ensemble
-
- # --- Data Columns ---
- TEXT_COLUMN = "Sanction_Context"
- # Define all your target label columns
- LABEL_COLUMNS = [
-     "Red_Flag_Reason",
-     "Maker_Action",
-     "Escalation_Level",
-     "Risk_Category",
-     "Risk_Drivers",
-     "Investigation_Outcome"
- ]
- # Example metadata columns. Add actual numerical/categorical metadata if available in your CSV.
- # For now, it's an empty list. If you add metadata, ensure these columns exist and are numeric or can be encoded.
- METADATA_COLUMNS = [] # e.g., ["Risk_Score", "Transaction_Amount"]
-
- # --- Model Hyperparameters ---
- MAX_LEN = 128 # Maximum sequence length for transformer tokenizers
- BATCH_SIZE = 16 # Batch size for training and evaluation
- LEARNING_RATE = 2e-5 # Learning rate for AdamW optimizer
- NUM_EPOCHS = 3 # Number of training epochs. Adjust based on convergence.
- DROPOUT_RATE = 0.3 # Dropout rate for regularization
-
- # --- Device Configuration ---
- DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
- # --- Specific Model Configurations ---
- BERT_MODEL_NAME = 'bert-base-uncased'
- ROBERTA_MODEL_NAME = 'roberta-base'
- DEBERTA_MODEL_NAME = 'microsoft/deberta-base'
-
- # TF-IDF
- TFIDF_MAX_FEATURES = 5000 # Max features for TF-IDF vectorizer
-
- # --- Field-Specific Strategy (Conceptual) ---
- # This dictionary provides conceptual strategies for enhancing specific fields.
- # Actual implementation requires adapting the models (e.g., custom loss functions, metadata integration).
- FIELD_STRATEGIES = {
-     "Maker_Action": {
-         "loss": "focal_loss", # Requires custom Focal Loss implementation
-         "enhancements": ["action_templates", "context_prompt_tuning"] # Advanced NLP concepts
-     },
-     "Risk_Category": {
-         "enhancements": ["numerical_metadata", "transaction_patterns"] # Integrate METADATA_COLUMNS
-     },
-     "Escalation_Level": {
-         "enhancements": ["class_balancing", "policy_keyword_patterns"] # Handled by class weights/metadata
-     },
-     "Investigation_Outcome": {
-         "type": "classification_or_generation" # If generation, T5/BART would be needed.
-     }
- }
-
- # Ensure model save and predictions directories exist
- os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
- os.makedirs(PREDICTIONS_SAVE_DIR, exist_ok=True)
+ # config.py
+
+ import torch
+ import os
+
+ # --- Paths ---
+ # Adjust DATA_PATH to your actual data location
+ DATA_PATH = './data/synthetic_transactions_samples_5000.csv'
+ TOKENIZER_PATH = '/app/tokenizer/'
+ LABEL_ENCODERS_PATH = '/app/label_encoders.pkl'
+ MODEL_SAVE_DIR = '/app/saved_models/'
+ PREDICTIONS_SAVE_DIR = './predictions/' # To save predictions for voting ensemble
+
+ # --- Data Columns ---
+ TEXT_COLUMN = "Sanction_Context"
+ # Define all your target label columns
+ LABEL_COLUMNS = [
+     "Red_Flag_Reason",
+     "Maker_Action",
+     "Escalation_Level",
+     "Risk_Category",
+     "Risk_Drivers",
+     "Investigation_Outcome"
+ ]
+ # Example metadata columns. Add actual numerical/categorical metadata if available in your CSV.
+ # For now, it's an empty list. If you add metadata, ensure these columns exist and are numeric or can be encoded.
+ METADATA_COLUMNS = [] # e.g., ["Risk_Score", "Transaction_Amount"]
+
+ # --- Model Hyperparameters ---
+ MAX_LEN = 128 # Maximum sequence length for transformer tokenizers
+ BATCH_SIZE = 16 # Batch size for training and evaluation
+ LEARNING_RATE = 2e-5 # Learning rate for AdamW optimizer
+ NUM_EPOCHS = 3 # Number of training epochs. Adjust based on convergence.
+ DROPOUT_RATE = 0.3 # Dropout rate for regularization
+
+ # --- Device Configuration ---
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # --- Specific Model Configurations ---
+ BERT_MODEL_NAME = 'bert-base-uncased'
+ ROBERTA_MODEL_NAME = 'roberta-base'
+ DEBERTA_MODEL_NAME = 'microsoft/deberta-base'
+
+ # TF-IDF
+ TFIDF_MAX_FEATURES = 5000 # Max features for TF-IDF vectorizer
+
+ # --- Field-Specific Strategy (Conceptual) ---
+ # This dictionary provides conceptual strategies for enhancing specific fields.
+ # Actual implementation requires adapting the models (e.g., custom loss functions, metadata integration).
+ FIELD_STRATEGIES = {
+     "Maker_Action": {
+         "loss": "focal_loss", # Requires custom Focal Loss implementation
+         "enhancements": ["action_templates", "context_prompt_tuning"] # Advanced NLP concepts
+     },
+     "Risk_Category": {
+         "enhancements": ["numerical_metadata", "transaction_patterns"] # Integrate METADATA_COLUMNS
+     },
+     "Escalation_Level": {
+         "enhancements": ["class_balancing", "policy_keyword_patterns"] # Handled by class weights/metadata
+     },
+     "Investigation_Outcome": {
+         "type": "classification_or_generation" # If generation, T5/BART would be needed.
+     }
+ }
+
+ # Ensure model save and predictions directories exist
+ os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
+ os.makedirs(PREDICTIONS_SAVE_DIR, exist_ok=True)
  os.makedirs(TOKENIZER_PATH, exist_ok=True)
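
The FIELD_STRATEGIES entry for Maker_Action flags "focal_loss" as requiring a custom implementation. A minimal PyTorch sketch of what that could look like is below; the class name and the gamma default are illustrative assumptions, not code from this repo:

import torch
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.Module):
    # Focal loss (Lin et al., 2017): scales per-sample cross-entropy by
    # (1 - p_t) ** gamma, so confident, easy examples contribute less and
    # training concentrates on hard or rare classes.
    def __init__(self, gamma=2.0):
        super().__init__()
        self.gamma = gamma  # focusing parameter; gamma=0 recovers plain cross-entropy

    def forward(self, logits, targets):
        # Per-sample cross-entropy, left unreduced so each sample can be reweighted
        ce = F.cross_entropy(logits, targets, reduction='none')
        pt = torch.exp(-ce)  # predicted probability of the true class
        return ((1.0 - pt) ** self.gamma * ce).mean()

Wiring it in would amount to swapping the Maker_Action head's nn.CrossEntropyLoss for FocalLoss(gamma=2.0) in the training loop. Likewise, the "class_balancing" note for Escalation_Level is typically realized as per-class weights in the loss; one common recipe, assuming scikit-learn is available and that the weight order matches the label encoder's (sorted) class order:

import numpy as np
import pandas as pd
import torch
from sklearn.utils.class_weight import compute_class_weight
from config import DATA_PATH

df = pd.read_csv(DATA_PATH)
classes = np.unique(df["Escalation_Level"])
# 'balanced' weights are inversely proportional to class frequency
weights = compute_class_weight(class_weight='balanced', classes=classes, y=df["Escalation_Level"])
escalation_loss = torch.nn.CrossEntropyLoss(weight=torch.tensor(weights, dtype=torch.float32))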