# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import csv

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, is_torch_npu_available, is_torch_xpu_available

# toxicity measurement backed by the DaNLP/da-electra-hatespeech-detection classifier
toxicity = evaluate.load("ybelkada/toxicity", "DaNLP/da-electra-hatespeech-detection", module_type="measurement")
ds = load_dataset("OxAISH-AL-LLM/wiki_toxic", split="test")

parser = argparse.ArgumentParser(description="Evaluate de-toxified models")
parser.add_argument(
    "--model_type", default="all", type=str, help="Model group to evaluate ('all', 'gpt-neo', 'gpt-j') or a single model id"
)
parser.add_argument("--output_file", default="toxicity.csv", type=str, help="Relative path of the output CSV file")
parser.add_argument("--batch_size", default=64, type=int, help="Batch size")
parser.add_argument("--num_samples", default=400, type=int, help="Number of samples")
parser.add_argument("--context_length", default=2000, type=int, help="Maximum number of input tokens per prompt")
parser.add_argument("--max_new_tokens", default=30, type=int, help="Max new tokens for generation")
args = parser.parse_args()

if args.model_type == "all":
    MODELS_TO_TEST = [
        "ybelkada/gpt-neo-125m-detox",
        "EleutherAI/gpt-neo-125M",
        "EleutherAI/gpt-neo-2.7B",
        "ybelkada/gpt-neo-2.7B-detox",
        "ybelkada/gpt-j-6b-sharded-bf16",
        "ybelkada/gpt-j-6b-detox",
    ]
elif args.model_type == "gpt-neo":
    MODELS_TO_TEST = [
        "ybelkada/gpt-neo-125m-detox",
        "EleutherAI/gpt-neo-125M",
        "EleutherAI/gpt-neo-2.7B",
        "ybelkada/gpt-neo-2.7B-detox",
    ]
elif args.model_type == "gpt-j":
    MODELS_TO_TEST = [
        "ybelkada/gpt-j-6b-sharded-bf16",
        "ybelkada/gpt-j-6b-detox",
    ]
else:
    MODELS_TO_TEST = [args.model_type]

NUM_SAMPLES = args.num_samples
BATCH_SIZE = args.batch_size
output_file = args.output_file
max_new_tokens = args.max_new_tokens
context_length = args.context_length

if is_torch_xpu_available():
    device = torch.xpu.current_device()
elif is_torch_npu_available():
    device = torch.npu.current_device()
else:
    device = torch.cuda.current_device() if torch.cuda.is_available() else "cpu"

# consider only toxic prompts
ds = ds.filter(lambda x: x["label"] == 1)

toxicities = {}

# open a csv file
file = open(f"{output_file}", "w", newline="")
writer = csv.writer(file)
# write the header row
writer.writerow(["model_id", "mean_toxicity", "std_toxicity"])

for model_id in tqdm(MODELS_TO_TEST):
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map={"": device}, torch_dtype=torch.bfloat16)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token
    # left-pad so that generation continues from the end of each prompt
    tokenizer.padding_side = "left"

    input_texts = []

    for i, example in enumerate(ds):
        # stop once NUM_SAMPLES prompts have been collected
        if i >= NUM_SAMPLES:
            break

        # set seed for reproducible sampling
        torch.manual_seed(42)

        input_text = example["comment_text"]
        # truncate raw comments to 2000 characters before tokenization
        input_texts.append(input_text[:2000])

        if (i + 1) % BATCH_SIZE == 0:
            inputs = tokenizer(input_texts, return_tensors="pt", padding=True).to(device)
            # truncate along the sequence dimension so prompts stay within `context_length` tokens
            inputs.input_ids = inputs.input_ids[:, :context_length]
            inputs.attention_mask = inputs.attention_mask[:, :context_length]
            outputs = model.generate(**inputs, do_sample=True, max_new_tokens=max_new_tokens, use_cache=True)
            generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            # keep only the continuation by stripping the prompt from each decoded text
            generated_texts = [
                generated_text.replace(input_texts[i], "") for i, generated_text in enumerate(generated_texts)
            ]
            toxicity_score = toxicity.compute(predictions=generated_texts)
            input_texts = []

            if model_id not in toxicities:
                toxicities[model_id] = []
            toxicities[model_id].extend(toxicity_score["toxicity"])

    # last (possibly partial) batch
    if input_texts:
        inputs = tokenizer(input_texts, return_tensors="pt", padding=True).to(device)
        inputs.input_ids = inputs.input_ids[:, :context_length]
        inputs.attention_mask = inputs.attention_mask[:, :context_length]
        outputs = model.generate(**inputs, do_sample=True, max_new_tokens=max_new_tokens)
        generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        generated_texts = [
            generated_text.replace(input_texts[i], "") for i, generated_text in enumerate(generated_texts)
        ]
        toxicity_score = toxicity.compute(predictions=generated_texts)
        toxicities[model_id].extend(toxicity_score["toxicity"])

    # compute mean & std using np
    mean = np.mean(toxicities[model_id])
    std = np.std(toxicities[model_id])

    # save to file
    writer.writerow([model_id, mean, std])

    # print
    print(f"Model: {model_id} - Mean: {mean} - Std: {std}")

    # release the model and free accelerator memory before loading the next one
    model = None
    if is_torch_xpu_available():
        torch.xpu.empty_cache()
    elif is_torch_npu_available():
        torch.npu.empty_cache()
    else:
        torch.cuda.empty_cache()

# close file
file.close()
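
# A minimal example invocation (a sketch, assuming this script is saved as
# evaluate-toxicity.py; the flag values shown below are simply the defaults):
#
#   python evaluate-toxicity.py --model_type all --output_file toxicity.csv \
#       --batch_size 64 --num_samples 400 --context_length 2000 --max_new_tokens 30
#
# The resulting CSV contains one row per model with its mean and standard
# deviation of toxicity scores over the sampled completions.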