# Data Preparation

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd /content/drive/MyDrive/MisinformationTruthTell/

/content/drive/MyDrive/MisinformationTruthTell


In [None]:
!pip install -q kaggle

In [None]:
from google.colab import files
files.upload()

In [None]:
# Put the kaggle.json API token from the current directory where the kaggle CLI looks for it.
# Create the config directory (no error if it already exists).
!mkdir -p ~/.kaggle
# Copy the token file into the config directory.
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
%cd /content/drive/MyDrive/MisinformationTruthTell/

/content/drive/MyDrive/MisinformationTruthTell


In [None]:
! kaggle datasets download clmentbisaillon/fake-and-real-news-dataset

Dataset URL: https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset
License(s): CC-BY-NC-SA-4.0
fake-and-real-news-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
path = '/content/drive/MyDrive/MisinformationTruthTell'

In [None]:
! unzip fake-and-real-news-dataset.zip

Archive:  fake-and-real-news-dataset.zip
replace Fake.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: Fake.csv                
replace True.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: True.csv                


In [None]:
# Load Dataset
import pandas as pd
true_data = pd.read_csv('True.csv')
fake_data = pd.read_csv('Fake.csv')

In [None]:
# Add a 'Target' column holding the class name of each frame. Assigning a scalar
# broadcasts it to every row, so no per-row list has to be built.
true_data['Target'] = 'True'
fake_data['Target'] = 'Fake'



In [None]:
# Merge 'true_data' and 'fake_data' into a single shuffled df called 'data'.
# A fixed random_state makes the shuffled row order identical on every run, so
# the notebook is reproducible after a kernel restart.
data = (
    pd.concat([true_data, fake_data], ignore_index=True)
    .sample(frac=1, random_state=1337)
    .reset_index(drop=True)
)

# See what the data looks like
print(data.shape)
data.head()

(44898, 5)


Unnamed: 0,title,text,subject,date,Target
0,THE LOST VIDEO: Watch MSNBC’S Mika Shamelessly...,With all the anger and nasty comments coming f...,left-news,"Jun 30, 2017",Fake
1,Trump Is Now Threatening A Lawsuit Over Copyr...,The artist who painted a nude portrait of Dona...,News,"April 17, 2016",Fake
2,Oklahoma Gun Range Refused Service To Muslim ...,You d think in the year 2016 we d no longer ha...,News,"February 19, 2016",Fake
3,Republican debate without Trump draws 12.5 mil...,LOS ANGELES (Reuters) - The Republican preside...,politicsNews,"January 29, 2016",True
4,SHOCKING REPORT: 50% of Babies in 24 States Bo...,New Mexico led all states with 72 percent of t...,Government News,"Mar 24, 2017",Fake


In [None]:
# Build the model input as "<title>. <text>" and keep only that plus the label.
# The concatenation is done column-wise rather than with a row-wise
# apply(axis=1): this avoids creating one Series per row, and avoids attribute
# access such as `row.title`, which only works while the column name does not
# collide with a Series attribute. `.map(str)` converts each value with str(),
# exactly like the original per-row str(...) calls.
data = data.assign(
    text=data['title'].map(str) + '. ' + data['text'].map(str)
)[['text', 'Target']]

In [None]:

data.to_csv('./train.csv', index=False)

# Training

In [None]:
#!pip install transformers
import transformers

In [None]:
from transformers import Trainer, TrainingArguments, LineByLineTextDataset

In [None]:
import pandas as pd

In [None]:
!pip install datasets



In [None]:
from datasets import Dataset

In [None]:
df = pd.read_csv('./train.csv')

In [None]:
df

Unnamed: 0,text,Target
0,THE LOST VIDEO: Watch MSNBC’S Mika Shamelessly...,Fake
1,Trump Is Now Threatening A Lawsuit Over Copyr...,Fake
2,Oklahoma Gun Range Refused Service To Muslim ...,Fake
3,Republican debate without Trump draws 12.5 mil...,True
4,SHOCKING REPORT: 50% of Babies in 24 States Bo...,Fake
...,...,...
44893,Syrian Kurdish YPG accuses Turkey of Afrin agg...,True
44894,Turkey's military says two Turkish soldiers ki...,True
44895,"On Election Eve for five states, Trump rips Cr...",True
44896,EU's Juncker hails Macron speech as 'very Euro...,True


In [None]:
# Encode the Target column as integers: 'Fake' -> 1 and 'True' -> 0
# (i.e. the positive class, 1, means "fake news").
df['Target'] = df['Target'].map({'Fake': 1, 'True': 0})

# Verify the changes
print(df.head())

                                                text  Target
0  THE LOST VIDEO: Watch MSNBC’S Mika Shamelessly...       1
1   Trump Is Now Threatening A Lawsuit Over Copyr...       1
2   Oklahoma Gun Range Refused Service To Muslim ...       1
3  Republican debate without Trump draws 12.5 mil...       0
4  SHOCKING REPORT: 50% of Babies in 24 States Bo...       1


In [None]:
df

Unnamed: 0,text,Target
0,THE LOST VIDEO: Watch MSNBC’S Mika Shamelessly...,1
1,Trump Is Now Threatening A Lawsuit Over Copyr...,1
2,Oklahoma Gun Range Refused Service To Muslim ...,1
3,Republican debate without Trump draws 12.5 mil...,0
4,SHOCKING REPORT: 50% of Babies in 24 States Bo...,1
...,...,...
44893,Syrian Kurdish YPG accuses Turkey of Afrin agg...,0
44894,Turkey's military says two Turkish soldiers ki...,0
44895,"On Election Eve for five states, Trump rips Cr...",0
44896,EU's Juncker hails Macron speech as 'very Euro...,0


In [None]:
df['labels'] = df['Target']

In [None]:
df = df[['text', 'labels']]

In [None]:
dataset = Dataset.from_pandas(df)

In [None]:
dataset

Dataset({
    features: ['text', 'labels'],
    num_rows: 44898
})

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, pipeline

# Hub checkpoint used for both the tokenizer and (later) the classification model.
# NOTE(review): the name indicates a DistilBERT already fine-tuned on SST-2
# (presumably sentiment) — confirm that starting from this head, rather than
# plain distilbert-base-uncased, is intended for the fake/true news task.
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Intended to be used with ``dataset.map(..., batched=True)``.

    Args:
        examples: A batch mapping that contains a ``"text"`` entry holding the
            strings to tokenize.

    Returns:
        The tokenizer output for the batch (the fields it produces are added to
        the dataset as new columns by ``map``).
    """
    # padding="max_length" pads every example to the same fixed length.
    # padding=True would pad only to the longest example of the current map
    # batch, so examples coming from different batches could end up with
    # different lengths and could not be stacked into one tensor later on.
    return tokenizer(examples["text"], padding="max_length", truncation=True)

In [None]:
dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/44898 [00:00<?, ? examples/s]

In [None]:
# Hold out 10% of the rows for evaluation. train_test_split shuffles on its own,
# so it needs an explicit seed as well; otherwise the train/test membership
# changes on every run even though the preceding shuffle is seeded.
dataset_splitted = dataset.shuffle(seed=1337).train_test_split(test_size=0.1, seed=1337)

In [None]:
dataset_splitted

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 40408
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 4490
    })
})

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [None]:
# Leave only the parameters named 'classifier.weight' and 'classifier.bias'
# trainable; every other parameter of the model is frozen.
trainable_names = ('classifier.weight', 'classifier.bias')
for param_name, tensor in model.named_parameters():
    tensor.requires_grad = param_name in trainable_names

In [None]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Compute accuracy for an evaluation result.

    Args:
        pred: Object exposing ``predictions`` (per-class scores) and
            ``label_ids`` (reference labels).

    Returns:
        A dict of the form ``{'accuracy': <score>}``.
    """
    # The predicted class is the index of the highest score along the last axis.
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration, kept in its own variable so it can be inspected.
training_args = TrainingArguments(
    output_dir="./my_saved_model",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # Evaluate and checkpoint once per epoch. Both strategies are "epoch" so
    # that load_best_model_at_end can pair each checkpoint with an evaluation.
    # No save_steps is given: it has no effect when saving per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # Turn off logging integrations (e.g. Weights & Biases) explicitly instead
    # of relying on the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_splitted['train'],
    eval_dataset=dataset_splitted['test'],
    compute_metrics=compute_metrics,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy
1,1.123,0.627089,0.688196
2,0.6186,0.555046,0.838307
3,0.5672,0.5078,0.847216
4,0.5023,0.475175,0.86882
5,0.4817,0.451593,0.874833
6,0.4665,0.435129,0.879287
7,0.4567,0.42282,0.879733
8,0.4404,0.415625,0.883296
9,0.436,0.410988,0.883073
10,0.4309,0.409735,0.883296


TrainOutput(global_step=6320, training_loss=0.5362479149540769, metrics={'train_runtime': 7283.4335, 'train_samples_per_second': 55.479, 'train_steps_per_second': 0.868, 'total_flos': 5.352742644891648e+16, 'train_loss': 0.5362479149540769, 'epoch': 10.0})