In [1]:
!pip install scikit-learn
!pip install pandas
!pip install tqdm
!pip install sentencepiece
!pip install torch==2.0.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install --upgrade transformers

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m 

In [2]:
import os
import json
import pandas as pd
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.nn import CrossEntropyLoss
from transformers import CamembertTokenizer, CamembertForSequenceClassification, get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm

In [3]:
# ─────────────────────────────────────────────
# ⚙️ Config
# ─────────────────────────────────────────────
DEBUG = False
BATCH_SIZE = 64
EPOCHS = 3 if not DEBUG else 1
MAX_LEN = 128
LR = 2e-5
PATIENCE = 2 # pour l'early stopping

In [4]:
# ─────────────────────────────────────────────
# 📁 Chargement du dataset
# ─────────────────────────────────────────────
df = pd.read_csv("jigsaw-toxic-comment-train-google-fr-cleaned.csv")
df['comment_text'] = df['comment_text'].astype(str)
df.rename(columns={'comment_text': 'texts'}, inplace=True)

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
other_cols_to_drop = ['Unnamed: 0.1', 'Unnamed: 0', 'id']
cols_to_drop = label_cols + other_cols_to_drop

df['df_labels'] = df[label_cols].max(axis=1)
df = df.drop(columns=cols_to_drop)

# Debug : sous-échantillonnage équilibré
if DEBUG:
 df_0 = df[df["df_labels"] == 0].sample(500, random_state=42)
 df_1 = df[df["df_labels"] == 1].sample(500, random_state=42)
 df = pd.concat([df_0, df_1]).sample(frac=1, random_state=42)

print("Classes :", df['df_labels'].value_counts())

Classes : df_labels
0 189412
1 33982
Name: count, dtype: int64


In [5]:
# ─────────────────────────────────────────────
# 🔢 Dataset
# ─────────────────────────────────────────────
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

class CommentDataset(Dataset):
 def __init__(self, texts, labels, tokenizer, max_len):
 self.texts = texts
 self.labels = labels
 self.tokenizer = tokenizer
 self.max_len = max_len

 def __len__(self):
 return len(self.texts)

 def __getitem__(self, idx):
 encoding = self.tokenizer(
 self.texts[idx],
 padding="max_length",
 truncation=True,
 max_length=self.max_len,
 return_tensors="pt"
 )
 item = {key: val.squeeze() for key, val in encoding.items()}
 item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
 return item

# Split
X_train, X_val, y_train, y_val = train_test_split(df["texts"].tolist(), df["df_labels"].tolist(), test_size=0.2, random_state=42)

train_dataset = CommentDataset(X_train, y_train, tokenizer, MAX_LEN)
val_dataset = CommentDataset(X_val, y_val, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [6]:
# ─────────────────────────────────────────────
# 🧠 Modèle + loss pondérée
# ─────────────────────────────────────────────
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=2).to(device)

# pondération dynamique
if DEBUG:
 class_weights = torch.tensor([1.0, 1.0], dtype=torch.float)
else:
 count_0 = df[df["df_labels"] == 0].shape[0]
 count_1 = df[df["df_labels"] == 1].shape[0]
 class_weights = torch.tensor([1.0, count_0 / count_1], dtype=torch.float)

print(f"Poids pour la loss : {class_weights}")
loss_fn = CrossEntropyLoss(weight=class_weights.to(device))

# Optimiseur et scheduler
optimizer = AdamW(model.parameters(), lr=LR)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * EPOCHS)


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Poids pour la loss : tensor([1.0000, 5.5739])


In [7]:
best_f1 = 0
patience_counter = 0
os.makedirs("outputs/model", exist_ok=True)

for epoch in range(EPOCHS):
 print(f"\n🌟 Epoch {epoch + 1}/{EPOCHS}")
 model.train()
 total_loss = 0

 for batch in tqdm(train_loader, desc="Entraînement"):
 batch = {k: v.to(device) for k, v in batch.items()}
 logits = model(**batch).logits
 loss = loss_fn(logits, batch["labels"])
 loss.backward()
 optimizer.step()
 scheduler.step()
 optimizer.zero_grad()
 total_loss += loss.item()

 avg_loss = total_loss / len(train_loader)
 print(f"📉 Loss moyenne : {avg_loss:.4f}")

 # 🔍 Évaluation
 model.eval()
 y_true, y_pred = [], []
 with torch.no_grad():
 for batch in tqdm(val_loader, desc="Évaluation"):
 batch = {k: v.to(device) for k, v in batch.items()}
 logits = model(**batch).logits
 preds = torch.argmax(logits, dim=1)
 y_true.extend(batch["labels"].cpu().tolist())
 y_pred.extend(preds.cpu().tolist())

 report = classification_report(y_true, y_pred, target_names=["Non toxique", "Toxique"], output_dict=True)
 f1 = report["weighted avg"]["f1-score"]
 print(f"🎯 F1-score (weighted) : {f1:.4f}")

 if f1 > best_f1:
 best_f1 = f1
 patience_counter = 0
 print("✅ Nouveau meilleur modèle — sauvegarde manuelle...")

 import os

 # 📂 Dossier de sauvegarde
 save_dir = "outputs/model"
 os.makedirs(save_dir, exist_ok=True)

 # 💾 Sauvegarde manuelle des poids
 torch.save(model.state_dict(), os.path.join(save_dir, "pytorch_model.bin"))

 # 💾 Sauvegarde de la configuration du modèle
 model.config.to_json_file(os.path.join(save_dir, "config.json"))

 # 💾 Sauvegarde du tokenizer
 tokenizer.save_pretrained(save_dir)

 # 💾 Sauvegarde des métriques
 with open("outputs/metrics.json", "w") as f:
 json.dump(report, f, indent=4)

 else:
 patience_counter += 1
 print(f"⏳ EarlyStopping patience : {patience_counter}/{PATIENCE}")
 if patience_counter >= PATIENCE:
 print("🛑 Arrêt anticipé — pas d'amélioration")
 break


🌟 Epoch 1/3


Entraînement: 100%|██████████| 2793/2793 [18:23<00:00, 2.53it/s]


📉 Loss moyenne : 0.5043


Évaluation: 100%|██████████| 699/699 [01:50<00:00, 6.32it/s]


🎯 F1-score (weighted) : 0.8826
✅ Nouveau meilleur modèle — sauvegarde manuelle...

🌟 Epoch 2/3


Entraînement: 100%|██████████| 2793/2793 [18:26<00:00, 2.53it/s]


📉 Loss moyenne : 0.4711


Évaluation: 100%|██████████| 699/699 [01:49<00:00, 6.39it/s]


🎯 F1-score (weighted) : 0.8735
⏳ EarlyStopping patience : 1/2

🌟 Epoch 3/3


Entraînement: 100%|██████████| 2793/2793 [18:26<00:00, 2.52it/s]


📉 Loss moyenne : 0.4485


Évaluation: 100%|██████████| 699/699 [01:50<00:00, 6.35it/s]


🎯 F1-score (weighted) : 0.8816
⏳ EarlyStopping patience : 2/2
🛑 Arrêt anticipé — pas d'amélioration


In [8]:
import json
import os

# 📁 Chemin du fichier de métriques
metrics_path = "outputs/metrics.json"

# ✅ Vérifie l'existence du fichier
if os.path.exists(metrics_path):
 with open(metrics_path, "r") as f:
 metrics = json.load(f)

 print("📊 Métriques sauvegardées :\n")
 for label in ["Non toxique", "Toxique"]:
 print(f"🗂 Classe : {label}")
 print(f" 🔸 Précision : {metrics[label]['precision']:.4f}")
 print(f" 🔸 Rappel : {metrics[label]['recall']:.4f}")
 print(f" 🔸 F1-score : {metrics[label]['f1-score']:.4f}\n")

 print("🔄 Moyennes pondérées (weighted avg) :")
 print(f" ✅ Précision : {metrics['weighted avg']['precision']:.4f}")
 print(f" ✅ Rappel : {metrics['weighted avg']['recall']:.4f}")
 print(f" ✅ F1-score : {metrics['weighted avg']['f1-score']:.4f}")
else:
 print("❌ Aucune métrique trouvée dans outputs/metrics.json")

📊 Métriques sauvegardées :

🗂 Classe : Non toxique
 🔸 Précision : 0.9294
 🔸 Rappel : 0.9329
 🔸 F1-score : 0.9312

🗂 Classe : Toxique
 🔸 Précision : 0.6193
 🔸 Rappel : 0.6065
 🔸 F1-score : 0.6129

🔄 Moyennes pondérées (weighted avg) :
 ✅ Précision : 0.8821
 ✅ Rappel : 0.8831
 ✅ F1-score : 0.8826
