from transformers import is_torch_available
from transformers.testing_utils import (
    TestCasePlus,
    backend_device_count,
    execute_subprocess_async,
    get_torch_dist_unique_port,
    require_accelerate,
    require_fp8,
    require_torch_multi_accelerator,
    run_first,
    torch_device,
)


if is_torch_available():
    import torch
    import torch.distributed
    import torch.utils.data

    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        DataCollatorForSeq2Seq,
        EvalPrediction,
        GenerationConfig,
        HfArgumentParser,
        PreTrainedTokenizerBase,
        Seq2SeqTrainer,
        Seq2SeqTrainingArguments,
    )

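    # Tiny eval dataset: eight short sentences tokenized up front, with labels
    # mirroring input_ids so the causal-LM loss and accuracy are well defined.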
    class DummyTextDataset(torch.utils.data.Dataset[dict[str, torch.Tensor]]):
        def __init__(self, tokenizer: PreTrainedTokenizerBase) -> None:
            data = 4 * [
                "Hello world!",
                "The quick brown fox jumps over the lazy dog.",
            ]
            self.data = [
                {k: v.squeeze(0) for k, v in tokenizer(item, return_tensors="pt", return_attention_mask=True).items()}
                for item in data
            ]
            for item in self.data:
                item["labels"] = item["input_ids"]

        def __len__(self) -> int:
            return len(self.data)

        def __getitem__(self, i: int) -> dict[str, torch.Tensor]:
            return self.data[i]


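# Each test below re-launches this very file under `accelerate launch --use_fsdp`;
# the `if __name__ == "__main__"` block at the bottom is what the spawned worker
# processes execute.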
class TestFSDPTrainer(TestCasePlus):
    @require_torch_multi_accelerator
    @require_accelerate
    @run_first
    def test_trainer(self):
        output_dir = self.get_auto_remove_tmp_dir()
        cmd = [
            "accelerate",
            "launch",
            "--use_fsdp",
            "--main_process_port",
            f"{get_torch_dist_unique_port()}",
            "--num_processes",
            f"{backend_device_count(torch_device)}",
            "--fsdp_transformer_layer_cls_to_wrap",
            "GPT2Block",
            f"{self.test_file_dir}/test_trainer_fsdp.py",
            "--output_dir",
            f"{output_dir}",
            "--report_to",
            "none",
        ]
        execute_subprocess_async(cmd, env=self.get_env())


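# Same launch as above, but with FP8 mixed precision (hence the extra
# `--mixed_precision fp8` flag and the `require_fp8` guard).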
class TestFSDPTrainerFP8(TestCasePlus):
    @require_torch_multi_accelerator
    @require_accelerate
    @require_fp8
    @run_first
    def test_trainer(self):
        output_dir = self.get_auto_remove_tmp_dir()
        cmd = [
            "accelerate",
            "launch",
            "--use_fsdp",
            "--main_process_port",
            f"{get_torch_dist_unique_port()}",
            "--num_processes",
            f"{backend_device_count(torch_device)}",
            "--mixed_precision",
            "fp8",
            "--fsdp_transformer_layer_cls_to_wrap",
            "GPT2Block",
            f"{self.test_file_dir}/test_trainer_fsdp.py",
            "--output_dir",
            f"{output_dir}",
            "--report_to",
            "none",
        ]
        execute_subprocess_async(cmd, env=self.get_env())


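# Variant that passes `--auto_find_batch_size True`, letting the Trainer back
# off the batch size on out-of-memory errors while the model is wrapped in FSDP.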
class TestFSDPTrainerWrap(TestCasePlus):
    @require_torch_multi_accelerator
    @require_accelerate
    @run_first
    def test_trainer(self):
        output_dir = self.get_auto_remove_tmp_dir()
        cmd = [
            "accelerate",
            "launch",
            "--use_fsdp",
            "--main_process_port",
            f"{get_torch_dist_unique_port()}",
            "--num_processes",
            f"{backend_device_count(torch_device)}",
            "--fsdp_transformer_layer_cls_to_wrap",
            "GPT2Block",
            f"{self.test_file_dir}/test_trainer_fsdp.py",
            "--output_dir",
            f"{output_dir}",
            "--report_to",
            "none",
            "--auto_find_batch_size",
            "True",
        ]
        execute_subprocess_async(cmd, env=self.get_env())


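# Variant that adds `--torch_compile_mode default` to exercise FSDP combined
# with torch.compile.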
class TestFSDPTrainerTorchCompile(TestCasePlus):
    @require_torch_multi_accelerator
    @require_accelerate
    @run_first
    def test_trainer(self):
        output_dir = self.get_auto_remove_tmp_dir()
        cmd = [
            "accelerate",
            "launch",
            "--use_fsdp",
            "--main_process_port",
            f"{get_torch_dist_unique_port()}",
            "--num_processes",
            f"{backend_device_count(torch_device)}",
            "--fsdp_transformer_layer_cls_to_wrap",
            "GPT2Block",
            f"{self.test_file_dir}/test_trainer_fsdp.py",
            "--torch_compile_mode",
            "default",
            "--output_dir",
            f"{output_dir}",
            "--report_to",
            "none",
        ]
        execute_subprocess_async(cmd, env=self.get_env())


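# Worker entry point: executed in each process spawned by `accelerate launch`
# from the tests above, not when the test module is merely imported.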
if __name__ == "__main__":
    parser = HfArgumentParser((Seq2SeqTrainingArguments,))
    training_args = parser.parse_args_into_dataclasses()[0]
    training_args.per_device_eval_batch_size = 1
    training_args.use_legacy_prediction_loop = False
    training_args.predict_with_generate = True
    training_args.generation_config = GenerationConfig(max_length=30)

    pretrained_model_name = "hf-internal-testing/tiny-random-gpt2"
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
    tokenizer.pad_token = tokenizer.eos_token
    # Each spawned process picks the accelerator whose index matches its rank.
    device = torch.device(torch.distributed.get_rank())
    model = AutoModelForCausalLM.from_pretrained(pretrained_model_name).to(device)

    def compute_metrics(p: EvalPrediction) -> dict[str, float]:
        return {"accuracy": (p.predictions == p.label_ids).mean()}

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        data_collator=DataCollatorForSeq2Seq(tokenizer, model),
        eval_dataset=DummyTextDataset(tokenizer),
        compute_metrics=compute_metrics,
    )

    metrics = trainer.evaluate()