{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "e63f42f0-e971-4017-8550-21fdcfc2de11", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (1.7.0)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (2.3.0)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (4.67.1)\n", "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (0.2.0)\n", "Looking in indexes: https://download.pytorch.org/whl/cu118\n", "Collecting torch==2.0.1\n", "  Using cached https://download.pytorch.org/whl/cu118/torch-2.0.1%2Bcu118-cp310-cp310-linux_x86_64.whl (2267.3 MB)\n", "Installing collected packages: torch, torchvision, torchaudio\n", "  Attempting uninstall: torch\n", "    Found existing installation: torch 2.1.0+cu118\n", "  Rolling back uninstall of torch\n", "ERROR: Could not install packages due to an OSError: [Errno 28] No space left on device\n", "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.52.4)\n", "WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n" ] } ], "source": [ "!pip install scikit-learn\n", "!pip install pandas\n", "!pip install tqdm\n", "!pip install sentencepiece\n", "!pip install torch==2.0.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n", "!pip install --upgrade transformers" ] },
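{ "cell_type": "code", "execution_count": null, "id": "editor-env-check", "metadata": {}, "outputs": [], "source": [ "# ─────────────────────────────────────────────\n", "# 🔎 Optional environment check (editor addition, not executed in the original run):\n", "# the pinned torch==2.0.1 install above failed with \"No space left on device\",\n", "# so this sketch only confirms which torch build is actually active and whether\n", "# CUDA is visible before committing to the long training run below.\n", "# ─────────────────────────────────────────────\n", "import torch\n", "\n", "print(\"torch version :\", torch.__version__)\n", "print(\"CUDA available:\", torch.cuda.is_available())\n", "if torch.cuda.is_available():\n", "    print(\"GPU:\", torch.cuda.get_device_name(0))" ] },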
{ "cell_type": "code", "execution_count": 2, "id": "7db96f7b-0cd3-4710-93d8-391622e60c25", "metadata": {}, "outputs": [], "source": [ "import os\n", "import json\n", "import pandas as pd\n", "import torch\n", "from torch.optim import AdamW\n", "from torch.utils.data import Dataset, DataLoader\n", "from torch.nn import CrossEntropyLoss\n", "from transformers import CamembertTokenizer, CamembertForSequenceClassification, get_scheduler\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import classification_report\n", "from tqdm import tqdm" ] },
{ "cell_type": "code", "execution_count": 3, "id": "338ab5df-bc6d-4f2c-8a5e-bc9bb7b658f1", "metadata": {}, "outputs": [], "source": [ "# ─────────────────────────────────────────────\n", "# ⚙️ Config\n", "# ─────────────────────────────────────────────\n", "DEBUG = False\n", "BATCH_SIZE = 64\n", "EPOCHS = 3 if not DEBUG else 1\n", "MAX_LEN = 128\n", "LR = 2e-5\n", "PATIENCE = 2  # for early stopping" ] },
{ "cell_type": "code", "execution_count": 4, "id": "3ccb7df2-77fc-461b-ac1e-05f1d8be7ed0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Classes: df_labels\n", "0 189412\n", "1 33982\n", "Name: count, dtype: int64\n" ] } ], "source": [ "# ─────────────────────────────────────────────\n", "# 📁 Load the dataset\n", "# ─────────────────────────────────────────────\n", "df = pd.read_csv(\"jigsaw-toxic-comment-train-google-fr-cleaned.csv\")\n", "df['comment_text'] = df['comment_text'].astype(str)\n", "df.rename(columns={'comment_text': 'texts'}, inplace=True)\n", "\n", "label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']\n", "other_cols_to_drop = ['Unnamed: 0.1', 'Unnamed: 0', 'id']\n", "cols_to_drop = label_cols + other_cols_to_drop\n", "\n", "# Binary target: 1 if any of the six toxicity labels is set\n", "df['df_labels'] = df[label_cols].max(axis=1)\n", "df = df.drop(columns=cols_to_drop)\n", "\n", "# Debug: balanced subsample\n", "if DEBUG:\n", "    df_0 = df[df[\"df_labels\"] == 0].sample(500, random_state=42)\n", "    df_1 = df[df[\"df_labels\"] == 1].sample(500, random_state=42)\n", "    df = pd.concat([df_0, df_1]).sample(frac=1, random_state=42)\n", "\n", "print(\"Classes:\", df['df_labels'].value_counts())" ] },
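{ "cell_type": "code", "execution_count": null, "id": "editor-data-sanity-check", "metadata": {}, "outputs": [], "source": [ "# ─────────────────────────────────────────────\n", "# 🔍 Optional sanity check (editor addition, not executed in the original run):\n", "# inspect the class imbalance and the raw comment lengths to sanity-check the\n", "# MAX_LEN = 128 truncation choice. Assumes `df` from the previous cell with its\n", "# 'texts' and 'df_labels' columns.\n", "# ─────────────────────────────────────────────\n", "print(f\"Toxic share of the corpus: {df['df_labels'].mean():.2%}\")\n", "\n", "word_counts = df['texts'].str.split().str.len()\n", "print(word_counts.describe())\n", "print(f\"Comments longer than {MAX_LEN} words: {(word_counts > MAX_LEN).mean():.2%}\")" ] },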
{ "cell_type": "code", "execution_count": 5, "id": "83c4cf79-57d9-4d54-8d8f-0e4249b8a930", "metadata": {}, "outputs": [], "source": [ "# ─────────────────────────────────────────────\n", "# 🔢 Dataset\n", "# ─────────────────────────────────────────────\n", "tokenizer = CamembertTokenizer.from_pretrained(\"camembert-base\")\n", "\n", "class CommentDataset(Dataset):\n", "    def __init__(self, texts, labels, tokenizer, max_len):\n", "        self.texts = texts\n", "        self.labels = labels\n", "        self.tokenizer = tokenizer\n", "        self.max_len = max_len\n", "\n", "    def __len__(self):\n", "        return len(self.texts)\n", "\n", "    def __getitem__(self, idx):\n", "        encoding = self.tokenizer(\n", "            self.texts[idx],\n", "            padding=\"max_length\",\n", "            truncation=True,\n", "            max_length=self.max_len,\n", "            return_tensors=\"pt\"\n", "        )\n", "        item = {key: val.squeeze() for key, val in encoding.items()}\n", "        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)\n", "        return item\n", "\n", "# Train/validation split\n", "X_train, X_val, y_train, y_val = train_test_split(df[\"texts\"].tolist(), df[\"df_labels\"].tolist(), test_size=0.2, random_state=42)\n", "\n", "train_dataset = CommentDataset(X_train, y_train, tokenizer, MAX_LEN)\n", "val_dataset = CommentDataset(X_val, y_val, tokenizer, MAX_LEN)\n", "\n", "train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)\n", "val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)" ] },
{ "cell_type": "code", "execution_count": 6, "id": "05e8419e-dbeb-42ba-b9ed-ea099e96244a", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loss weights: tensor([1.0000, 5.5739])\n" ] } ], "source": [ "# ─────────────────────────────────────────────\n", "# 🧠 Model + weighted loss\n", "# ─────────────────────────────────────────────\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "model = CamembertForSequenceClassification.from_pretrained(\"camembert-base\", num_labels=2).to(device)\n", "\n", "# Dynamic class weighting: the minority (toxic) class is weighted up by the imbalance ratio\n", "if DEBUG:\n", "    class_weights = torch.tensor([1.0, 1.0], dtype=torch.float)\n", "else:\n", "    count_0 = df[df[\"df_labels\"] == 0].shape[0]\n", "    count_1 = df[df[\"df_labels\"] == 1].shape[0]\n", "    class_weights = torch.tensor([1.0, count_0 / count_1], dtype=torch.float)\n", "\n", "print(f\"Loss weights: {class_weights}\")\n", "loss_fn = CrossEntropyLoss(weight=class_weights.to(device))\n", "\n", "# Optimizer and scheduler\n", "optimizer = AdamW(model.parameters(), lr=LR)\n", "scheduler = get_scheduler(\"linear\", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * EPOCHS)\n" ] },
{ "cell_type": "code", "execution_count": 7, "id": "0b181b9f-64a6-479f-b585-221a598cded6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "🌟 Epoch 1/3\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Training: 100%|██████████| 2793/2793 [18:23<00:00, 2.53it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "📉 Average loss: 0.5043\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluation: 100%|██████████| 699/699 [01:50<00:00, 6.32it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "🎯 F1-score (weighted): 0.8826\n", "✅ New best model, saving manually...\n", "\n", "🌟 Epoch 2/3\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Training: 100%|██████████| 2793/2793 [18:26<00:00, 2.53it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "📉 Average loss: 0.4711\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluation: 100%|██████████| 699/699 [01:49<00:00, 6.39it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "🎯 F1-score (weighted): 0.8735\n", "⏳ EarlyStopping patience: 1/2\n", "\n", "🌟 Epoch 3/3\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Training: 100%|██████████| 2793/2793 [18:26<00:00, 2.52it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "📉 Average loss: 0.4485\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluation: 100%|██████████| 699/699 [01:50<00:00, 6.35it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "🎯 F1-score (weighted): 0.8816\n", "⏳ EarlyStopping patience: 2/2\n", "🛑 Early stop, no improvement\n" ] } ], "source": [ "best_f1 = 0\n", "patience_counter = 0\n", "os.makedirs(\"outputs/model\", exist_ok=True)\n", "\n", "for epoch in range(EPOCHS):\n", "    print(f\"\\n🌟 Epoch {epoch + 1}/{EPOCHS}\")\n", "    model.train()\n", "    total_loss = 0\n", "\n", "    for batch in tqdm(train_loader, desc=\"Training\"):\n", "        batch = {k: v.to(device) for k, v in batch.items()}\n", "        logits = model(**batch).logits\n", "        loss = loss_fn(logits, batch[\"labels\"])\n", "        loss.backward()\n", "        optimizer.step()\n", "        scheduler.step()\n", "        optimizer.zero_grad()\n", "        total_loss += loss.item()\n", "\n", "    avg_loss = total_loss / len(train_loader)\n", "    print(f\"📉 Average loss: {avg_loss:.4f}\")\n", "\n", "    # 🔍 Evaluation\n", "    model.eval()\n", "    y_true, y_pred = [], []\n", "    with torch.no_grad():\n", "        for batch in tqdm(val_loader, desc=\"Evaluation\"):\n", "            batch = {k: v.to(device) for k, v in batch.items()}\n", "            logits = model(**batch).logits\n", "            preds = torch.argmax(logits, dim=1)\n", "            y_true.extend(batch[\"labels\"].cpu().tolist())\n", "            y_pred.extend(preds.cpu().tolist())\n", "\n", "    report = classification_report(y_true, y_pred, target_names=[\"Non-toxic\", \"Toxic\"], output_dict=True)\n", "    f1 = report[\"weighted avg\"][\"f1-score\"]\n", "    print(f\"🎯 F1-score (weighted): {f1:.4f}\")\n", "\n", "    if f1 > best_f1:\n", "        best_f1 = f1\n", "        patience_counter = 0\n", "        print(\"✅ New best model, saving manually...\")\n", "\n", "        # 📂 Output directory (already created above)\n", "        save_dir = \"outputs/model\"\n", "\n", "        # 💾 Manual save of the weights\n", "        torch.save(model.state_dict(), os.path.join(save_dir, \"pytorch_model.bin\"))\n", "\n", "        # 💾 Save the model configuration\n", "        model.config.to_json_file(os.path.join(save_dir, \"config.json\"))\n", "\n", "        # 💾 Save the tokenizer\n", "        tokenizer.save_pretrained(save_dir)\n", "\n", "        # 💾 Save the metrics\n", "        with open(\"outputs/metrics.json\", \"w\") as f:\n", "            json.dump(report, f, indent=4)\n", "\n", "    else:\n", "        patience_counter += 1\n", "        print(f\"⏳ EarlyStopping patience: {patience_counter}/{PATIENCE}\")\n", "        if patience_counter >= PATIENCE:\n", "            print(\"🛑 Early stop, no improvement\")\n", "            break" ] },
{ "cell_type": "code", "execution_count": 8, "id": "ba6f2d7c-0daf-48db-96a2-33935dca1d9e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "📊 Saved metrics:\n", "\n", "🗂 Class: Non-toxic\n", "   🔸 Precision: 0.9294\n", "   🔸 Recall: 0.9329\n", "   🔸 F1-score: 0.9312\n", "\n", "🗂 Class: Toxic\n", "   🔸 Precision: 0.6193\n", "   🔸 Recall: 0.6065\n", "   🔸 F1-score: 0.6129\n", "\n", "🔄 Weighted averages (weighted avg):\n", "   ✅ Precision: 0.8821\n", "   ✅ Recall: 0.8831\n", "   ✅ F1-score: 0.8826\n" ] } ], "source": [ "import json\n", "import os\n", "\n", "# 📁 Path to the metrics file\n", "metrics_path = \"outputs/metrics.json\"\n", "\n", "# ✅ Check that the file exists\n", "if os.path.exists(metrics_path):\n", "    with open(metrics_path, \"r\") as f:\n", "        metrics = json.load(f)\n", "\n", "    print(\"📊 Saved metrics:\\n\")\n", "    for label in [\"Non-toxic\", \"Toxic\"]:\n", "        print(f\"🗂 Class: {label}\")\n", "        print(f\"   🔸 Precision: {metrics[label]['precision']:.4f}\")\n", "        print(f\"   🔸 Recall: {metrics[label]['recall']:.4f}\")\n", "        print(f\"   🔸 F1-score: {metrics[label]['f1-score']:.4f}\\n\")\n", "\n", "    print(\"🔄 Weighted averages (weighted avg):\")\n", "    print(f\"   ✅ Precision: {metrics['weighted avg']['precision']:.4f}\")\n", "    print(f\"   ✅ Recall: {metrics['weighted avg']['recall']:.4f}\")\n", "    print(f\"   ✅ F1-score: {metrics['weighted avg']['f1-score']:.4f}\")\n", "else:\n", "    print(\"❌ No metrics found in outputs/metrics.json\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 5 }