# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
from typing import Optional, Any, Tuple
from torch import Tensor
from packaging import version
import deepspeed.comm as dist
from deepspeed.accelerator import get_accelerator
try:
import flash_attn
from flash_attn.flash_attn_interface import _flash_attn_forward, _flash_attn_backward
flash_attn_version = version.parse(flash_attn.__version__)
except ImportError:
    _flash_attn_forward = None
    _flash_attn_backward = None
    flash_attn_version = None  # FPDT_Attention.__init__ raises ImportError before this is used
from einops import rearrange
from .layer import single_all_to_all, apply_rotary_pos_emb
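# Hand-written backward passes for the rotary embedding: the chunked attention
# below drives flash-attn's private kernels directly, so RoPE gradients cannot
# come from autograd. _rotate_half_backward is the adjoint of the usual
# rotate_half(x) = cat(-x2, x1).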
def _rotate_half_backward(x):
x = rearrange(x, '... (j d) -> ... j d', j=2)
x1, x2 = x.unbind(dim=-2)
return torch.cat((x2, -x1), dim=-1)
def apply_rotary_pos_emb_backward(grad_output, freqs_cos, freqs_sin):
rot_dim = freqs_cos.shape[-1]
grad, grad_pass = grad_output[..., :rot_dim], grad_output[..., rot_dim:]
grad_t = (grad * freqs_cos) + (_rotate_half_backward(grad * freqs_sin))
grad = grad_t if grad_pass.shape[-1] == 0 else torch.cat((grad_t, grad_pass), dim=-1)
return grad
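# Numerically stable merge of two partial attention results, as in blockwise /
# ring attention: given a running (out, lse) and a new block's (block_out,
# block_lse),
#   new_lse = log(exp(lse) + exp(block_lse))
#   out     = exp(lse - new_lse) * out + exp(block_lse - new_lse) * block_out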
def _update_out_and_lse(
out: torch.Tensor,
lse: torch.Tensor,
block_out: torch.Tensor,
block_lse: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
block_out = block_out.to(torch.float32)
block_lse = block_lse.transpose(-2, -1).unsqueeze(dim=-1)
    new_lse = torch.logaddexp(lse, block_lse)  # stable form of lse + log1p(exp(block_lse - lse))
out = torch.exp(lse - new_lse) * out + torch.exp(block_lse - new_lse) * block_out
lse = new_lse
return out, lse
def update_out_and_lse(
out: Optional[torch.Tensor],
lse: Optional[torch.Tensor],
block_out: torch.Tensor,
block_lse: torch.Tensor,
slice_=None,
) -> Tuple[torch.Tensor, torch.Tensor]:
if out is None:
if slice_ is not None:
            raise RuntimeError("the first call to update_out_and_lse must not pass the slice_ argument")
out = block_out.to(torch.float32)
        lse = block_lse.permute(0, 2, 1).unsqueeze(dim=-1).contiguous()
elif slice_ is not None:
slice_out, slice_lse = out[slice_], lse[slice_]
slice_out, slice_lse = _update_out_and_lse(slice_out, slice_lse, block_out, block_lse)
out[slice_], lse[slice_] = slice_out, slice_lse
else:
out, lse = _update_out_and_lse(out, lse, block_out, block_lse)
return out, lse
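# Reorders the global sequence so that each sequence-parallel rank owns a
# strided set of chunks instead of one contiguous slice. Under causal
# attention, later tokens attend to more keys, so contiguous slices would give
# the rank holding the sequence tail far more work; striding mixes cheap and
# expensive chunks on every rank. E.g., with sp_size=2 and 4 global chunks,
# rank 0 holds chunks [0, 2] and rank 1 holds chunks [1, 3].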
class FPDT_InputConstruct(torch.nn.Module):
def __init__(self, tokens, labels, loss_mask, attention_mask, position_ids, args, sp_size, sp_rank) -> None:
super(FPDT_InputConstruct, self).__init__()
self.tokens = tokens
self.labels = labels
self.loss_mask = loss_mask
self.attention_mask = attention_mask
self.position_ids = position_ids
global_seq_len = tokens.shape[1]
batch_size = tokens.shape[0]
assert global_seq_len % sp_size == 0
assert global_seq_len % args.ds_sequence_parallel_fpdt_chunk_size == 0
num_chunk_per_gpu = global_seq_len // args.ds_sequence_parallel_fpdt_chunk_size
local_seq_len = global_seq_len // sp_size
assert local_seq_len % num_chunk_per_gpu == 0
self.num_chunk_per_gpu = num_chunk_per_gpu
self.chunk_size = local_seq_len // num_chunk_per_gpu
self.sp_size = sp_size
self.sp_rank = sp_rank
self.global_seq_len = global_seq_len
self.local_seq_len = local_seq_len
self.batch_size = batch_size
self.device = tokens.device
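    # Returns tokens, labels, and position_ids permuted into this rank's
    # load-balanced chunk order; the loss mask keeps the full (global)
    # permutation and the attention mask is passed through unchanged.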
def generate(self):
device = self.device
totalChunks = self.global_seq_len // self.chunk_size
token_chunk_idx = torch.arange(self.global_seq_len, device=device, dtype=torch.int) // self.chunk_size
chunk_to_gpu = torch.arange(totalChunks, device=device, dtype=torch.int)
chunk_to_gpu = chunk_to_gpu.reshape(self.num_chunk_per_gpu, -1).t().contiguous()
gather_chunk = chunk_to_gpu.flatten().unsqueeze(1).contiguous()
mask = gather_chunk == token_chunk_idx
indices = mask.nonzero(as_tuple=False)
gather_indices = indices[:, 0]
token_chunk_indices = indices[:, 1]
indices = torch.cat([token_chunk_indices[gather_indices == i] for i in range(gather_chunk.shape[0])])
load_balanced_loss_mask = self.loss_mask[:, indices] if self.loss_mask is not None else self.loss_mask
        local_chunks = slice(self.num_chunk_per_gpu * self.sp_rank, self.num_chunk_per_gpu * (self.sp_rank + 1))
        indices = indices.reshape(-1, self.chunk_size)[local_chunks].flatten().contiguous()
load_balanced_tokens = self.tokens[:, indices]
load_balanced_labels = self.labels[:, indices] if self.labels is not None else self.labels
        load_balanced_attention_mask = self.attention_mask  # not sequence-indexed, so no reordering is needed
        load_balanced_position_ids = self.position_ids[:, indices] if self.position_ids is not None else self.position_ids
return load_balanced_tokens, load_balanced_labels, load_balanced_loss_mask, load_balanced_attention_mask, load_balanced_position_ids
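# FPDT attention without host offloading: the local sequence is processed in
# chunks, each chunk's QKV projection is exchanged via all-to-all so every rank
# holds full-sequence heads for that chunk, and blockwise flash-attention
# partials are merged with update_out_and_lse.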
class _FPDTGPUAttentionImpl_(torch.autograd.Function):
generate_vmap_rule = False
@staticmethod
def forward(ctx: Any,
layernorm_output,
attention_mask,
inference_params,
rotary_pos_emb,
spg,
scatter_idx,
gather_idx,
hidden_size,
projection_size,
hidden_size_per_attention_head,
kv_projection_size,
qkv_linear_weight,
qkv_linear_bias,
dropout,
num_chunks=8,
cpu_offloading=True):
do_save = layernorm_output.requires_grad
if rotary_pos_emb is not None:
pos_emb_cos, pos_emb_sin = rotary_pos_emb[0].permute(1, 0, 2, 3), rotary_pos_emb[1].permute(1, 0, 2, 3)
ctx.pos_emb_cos = pos_emb_cos
ctx.pos_emb_sin = pos_emb_sin
else:
ctx.pos_emb_cos = None
ctx.pos_emb_sin = None
with torch.no_grad():
per_gpu_seq_len = layernorm_output.shape[0]
chunk_size = per_gpu_seq_len // num_chunks
assert chunk_size * num_chunks == per_gpu_seq_len
assert attention_mask is None
ctx.num_chunks = num_chunks
ctx.cpu_offloading = cpu_offloading
ctx.spg = spg
ctx.scatter_idx = scatter_idx
ctx.gather_idx = gather_idx
device = get_accelerator().current_device_name()
ctx.device = device
ctx.dtype = layernorm_output.dtype
ctx.projection_size = projection_size
ctx.kv_projection_size = kv_projection_size
global_q = []
global_k = []
global_v = []
ctx.softmax_scale = hidden_size_per_attention_head**(-0.5)
ctx.dropout_p = dropout
ctx.window_size = (-1, -1)
ctx.alibi_slopes = None
batch_size = layernorm_output.shape[1]
global_o = [None for _ in range(num_chunks)]
global_lse = [None for _ in range(num_chunks)]
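            # Stream over local chunks: project QKV for one chunk, all-to-all
            # each projection across the sequence-parallel group, apply RoPE,
            # then attend the new q chunk against every k/v chunk seen so far.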
for i in range(num_chunks):
st = chunk_size * i
ed = st + chunk_size
qkv_chunk = torch.matmul(layernorm_output[st:ed], qkv_linear_weight.t()) + qkv_linear_bias
q_chunk = qkv_chunk[:, :, :projection_size].contiguous().reshape(
qkv_chunk.shape[0], qkv_chunk.shape[1], -1,
hidden_size_per_attention_head).permute(1, 0, 2, 3).contiguous() # b, l, nh, hd
q_chunk = single_all_to_all(q_chunk, scatter_idx, gather_idx, 0, spg)
global_q_chunk_len = q_chunk.shape[1]
if rotary_pos_emb is not None:
q_chunk = apply_rotary_pos_emb(q_chunk,
pos_emb_cos[:, global_q_chunk_len * i:global_q_chunk_len * (i + 1)],
pos_emb_sin[:, global_q_chunk_len * i:global_q_chunk_len * (i + 1)])
global_q.append(q_chunk)
k_chunk = qkv_chunk[:, :, projection_size:projection_size + kv_projection_size].contiguous().reshape(
qkv_chunk.shape[0], qkv_chunk.shape[1], -1,
hidden_size_per_attention_head).permute(1, 0, 2, 3).contiguous() # b, l, nh, hd
k_chunk = single_all_to_all(k_chunk, scatter_idx, gather_idx, 0, spg)
if rotary_pos_emb is not None:
k_chunk = apply_rotary_pos_emb(k_chunk,
pos_emb_cos[:, global_q_chunk_len * i:global_q_chunk_len * (i + 1)],
pos_emb_sin[:, global_q_chunk_len * i:global_q_chunk_len * (i + 1)])
global_k.append(k_chunk)
v_chunk = qkv_chunk[:, :, projection_size + kv_projection_size:].contiguous().reshape(
qkv_chunk.shape[0], qkv_chunk.shape[1], -1,
hidden_size_per_attention_head).permute(1, 0, 2, 3).contiguous() # b, l, nh, hd
v_chunk = single_all_to_all(v_chunk, scatter_idx, gather_idx, 0, spg)
global_v.append(v_chunk)
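                # Chunks are laid out causally, so q chunk i attends only to
                # k/v chunks 0..i, and only the diagonal block (k_i == i) needs
                # the causal mask. flash-attn >= 2.6.0 added the softcap
                # argument, hence the version gate.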
for k_i in range(len(global_k)):
causal_chunk = i == k_i
if flash_attn_version >= version.parse("2.6.0"):
block_out, _, _, _, _, block_lse, _, _ = _flash_attn_forward(global_q[i],
global_k[k_i],
global_v[k_i],
ctx.dropout_p,
ctx.softmax_scale,
causal=causal_chunk,
window_size=ctx.window_size,
softcap=0.0,
alibi_slopes=ctx.alibi_slopes,
return_softmax=False)
else:
block_out, _, _, _, _, block_lse, _, _ = _flash_attn_forward(global_q[i],
global_k[k_i],
global_v[k_i],
ctx.dropout_p,
ctx.softmax_scale,
causal=causal_chunk,
window_size=ctx.window_size,
alibi_slopes=ctx.alibi_slopes,
return_softmax=False)
global_o[i], global_lse[i] = update_out_and_lse(global_o[i], global_lse[i], block_out, block_lse)
global_o[i] = global_o[i].to(q_chunk.dtype)
output = [None for i in range(num_chunks)]
for i in range(num_chunks):
                global_lse[i] = global_lse[i][:, :, :, 0].permute(0, 2, 1).contiguous()  # restore (batch, heads, seq) layout
output[i] = single_all_to_all(global_o[i].to(ctx.dtype).contiguous(), gather_idx, scatter_idx, 0, spg)
output = torch.cat(output, dim=1)
head_dim = output.shape[-1]
if do_save:
ctx.save_for_backward(layernorm_output)
ctx.global_q = global_q
ctx.global_k = global_k
ctx.global_v = global_v
ctx.attn_output = global_o
ctx.attn_lse = global_lse
ctx.head_dim = head_dim
ctx.batch_size = batch_size
ctx.qkv_linear_weight = qkv_linear_weight
ctx.qkv_linear_bias = qkv_linear_bias
return output
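    # Backward mirrors the forward chunking: for each k/v chunk i, dk/dv are
    # accumulated over every q chunk with q_i >= i (blocks with q_i < i were
    # masked out by causality), then the all-to-alls are reversed and the
    # per-chunk results are folded into the shared QKV linear gradients.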
@staticmethod
def backward(ctx, grad_output):
num_chunks = ctx.num_chunks
device = ctx.device
dtype = ctx.dtype
spg = ctx.spg
scatter_idx = ctx.scatter_idx
gather_idx = ctx.gather_idx
softmax_scale = ctx.softmax_scale
dropout_p = ctx.dropout_p
window_size = ctx.window_size
alibi_slopes = ctx.alibi_slopes
projection_size = ctx.projection_size
kv_projection_size = ctx.kv_projection_size
layernorm_output = ctx.saved_tensors[0]
global_q = ctx.global_q
global_k = ctx.global_k
global_v = ctx.global_v
attn_output = ctx.attn_output
lse = ctx.attn_lse
qkv_linear_weight = ctx.qkv_linear_weight
qkv_linear_bias = ctx.qkv_linear_bias
input_chunk_size = layernorm_output.shape[0] // num_chunks
grad_layernorm_output = [
torch.zeros((input_chunk_size, layernorm_output.shape[1], layernorm_output.shape[2]),
device=device,
dtype=dtype) for _ in range(num_chunks)
]
grad_global_attn_output = []
chunk_size = grad_output.shape[1] // num_chunks
for i in range(num_chunks):
st = chunk_size * i
ed = st + chunk_size
grad_global_attn_output.append(
single_all_to_all(grad_output[:, st:ed].contiguous(), scatter_idx, gather_idx, 0, spg))
del grad_output
        dq = [torch.zeros(global_q[0].shape, dtype=torch.float, device=device) for _ in range(num_chunks)]
        # dk/dv must match the k/v chunk shapes, which differ from q's under grouped-query attention
        dk = [torch.zeros(global_k[0].shape, dtype=torch.float, device=device) for _ in range(num_chunks)]
        dv = [torch.zeros(global_v[0].shape, dtype=torch.float, device=device) for _ in range(num_chunks)]
grad_qkv_linear_weight = torch.zeros(qkv_linear_weight.shape,
device=qkv_linear_weight.device,
dtype=torch.float)
grad_qkv_linear_bias = torch.zeros(qkv_linear_bias.shape, device=qkv_linear_weight.device, dtype=torch.float)
for i in range(num_chunks):
k_chunk = global_k[i]
v_chunk = global_v[i]
for q_i in range(num_chunks):
no_computation = q_i < i
if no_computation:
continue
causal_chunk = q_i == i
q_chunk = global_q[q_i]
attn_output_chunk = attn_output[q_i]
lse_chunk = lse[q_i]
d_out = grad_global_attn_output[q_i]
dq_this = torch.zeros(global_q[0].shape, dtype=dtype, device=device)
dk_this = torch.zeros(global_k[0].shape, dtype=dtype, device=device)
dv_this = torch.zeros(global_v[0].shape, dtype=dtype, device=device)
if flash_attn_version >= version.parse("2.6.0"):
_flash_attn_backward(d_out,
q_chunk,
k_chunk,
v_chunk,
attn_output_chunk,
lse_chunk,
dq_this,
dk_this,
dv_this,
dropout_p,
softmax_scale,
causal_chunk,
window_size,
softcap=0.0,
alibi_slopes=alibi_slopes,
deterministic=False,
rng_state=None)
else:
_flash_attn_backward(d_out,
q_chunk,
k_chunk,
v_chunk,
attn_output_chunk,
lse_chunk,
dq_this,
dk_this,
dv_this,
dropout_p,
softmax_scale,
causal_chunk,
window_size,
alibi_slopes=alibi_slopes,
deterministic=False,
rng_state=None)
dq[q_i].add_(dq_this.to(torch.float))
dk[i].add_(dk_this.to(torch.float))
dv[i].add_(dv_this.to(torch.float))
dk_seq_len = dk[i].shape[1]
if ctx.pos_emb_cos is not None:
dk[i] = apply_rotary_pos_emb_backward(dk[i].to(dtype),
ctx.pos_emb_cos[:, dk_seq_len * i:dk_seq_len * (i + 1)],
ctx.pos_emb_sin[:, dk_seq_len * i:dk_seq_len * (i + 1)])
else:
dk[i] = dk[i].to(dtype)
dv[i] = dv[i].to(dtype)
dk[i] = single_all_to_all(dk[i].contiguous(), gather_idx, scatter_idx, 0, spg)
dv[i] = single_all_to_all(dv[i].contiguous(), gather_idx, scatter_idx, 0, spg)
input_st = i * input_chunk_size
input_ed = input_st + input_chunk_size
input_chunk = layernorm_output[input_st:input_ed].reshape(-1, layernorm_output.shape[-1])
dk[i] = dk[i].flatten(2).permute(1, 0, 2)
dv[i] = dv[i].flatten(2).permute(1, 0, 2)
l, b = dk[i].shape[0], dk[i].shape[1]
grad_qkv_linear_weight[projection_size:projection_size + kv_projection_size].add_(
torch.matmul(dk[i].reshape(l * b, -1).t(), input_chunk))
grad_qkv_linear_weight[projection_size + kv_projection_size:].add_(
torch.matmul(dv[i].reshape(l * b, -1).t(), input_chunk))
grad_qkv_linear_bias[projection_size:projection_size + kv_projection_size].add_(dk[i].sum(0).sum(0))
grad_qkv_linear_bias[projection_size + kv_projection_size:].add_(dv[i].sum(0).sum(0))
grad_layernorm_output[i].add_(
torch.matmul(dk[i], qkv_linear_weight[projection_size:projection_size + kv_projection_size]))
grad_layernorm_output[i].add_(torch.matmul(dv[i],
qkv_linear_weight[projection_size + kv_projection_size:]))
dk[i] = None
dv[i] = None
for i in range(num_chunks):
dq_seq_len = dq[i].shape[1]
if ctx.pos_emb_cos is not None:
dq[i] = apply_rotary_pos_emb_backward(dq[i].to(dtype),
ctx.pos_emb_cos[:, dq_seq_len * i:dq_seq_len * (i + 1)],
ctx.pos_emb_sin[:, dq_seq_len * i:dq_seq_len * (i + 1)])
else:
dq[i] = dq[i].to(dtype)
dq[i] = single_all_to_all(dq[i].to(dtype).contiguous(), gather_idx, scatter_idx, 0, spg)
input_chunk = layernorm_output[:input_chunk_size].reshape(-1, layernorm_output.shape[-1])
layernorm_output = layernorm_output[input_chunk_size:]
dq[i] = dq[i].flatten(2).permute(1, 0, 2)
l, b = dq[i].shape[0], dq[i].shape[1]
grad_qkv_linear_weight[:projection_size].add_(torch.matmul(dq[i].reshape(l * b, -1).t(), input_chunk))
grad_qkv_linear_bias[:projection_size].add_(dq[i].sum(0).sum(0))
grad_layernorm_output[i].add_(torch.matmul(dq[i], qkv_linear_weight[:projection_size]))
dq[i] = None
        # one gradient slot per forward input; only layernorm_output and the
        # qkv linear weight/bias receive gradients
        grad_input = torch.cat(grad_layernorm_output, dim=0).to(dtype)
        return (grad_input, None, None, None, None, None, None, None, None, None, None,
                grad_qkv_linear_weight.to(dtype), grad_qkv_linear_bias.to(dtype), None, None, None)
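# Holder for one sequence chunk that keeps a pinned CPU mirror plus an optional
# accelerator copy, so chunks can be offloaded after use and prefetched again
# with non-blocking copies on a side stream.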
class SequenceChunk:
def __init__(self, chunk: torch.Tensor, device=None, is_in_use=False):
self.chunk_shape = chunk.shape
self.chunk_dtype = chunk.dtype
self.device = chunk.device if device is None else device
cpu_chunk = torch.empty(chunk.shape, dtype=chunk.dtype, device='cpu', pin_memory=True)
if get_accelerator().on_accelerator(chunk):
cpu_chunk.copy_(chunk, non_blocking=True)
else:
cpu_chunk = chunk
self.cpu_chunk = cpu_chunk
self.gpu_chunk = chunk if is_in_use else None
    def load_to_gpu(self):
        assert self.gpu_chunk is None
        gpu_chunk = torch.empty(self.chunk_shape, device=self.device, dtype=self.chunk_dtype)
        gpu_chunk.copy_(self.cpu_chunk, non_blocking=True)
        self.gpu_chunk = gpu_chunk
def get_gpu_chunk(self):
assert self.gpu_chunk is not None and self.gpu_chunk.device == self.device
return self.gpu_chunk
    def check_gpu_chunk(self):
        location = None if self.gpu_chunk is None else self.gpu_chunk.device
        assert location == self.device, f"gpu_chunk should be on {self.device}, but it is on {location}"
        return True
def offload(self):
assert self.gpu_chunk is not None and self.gpu_chunk.device == self.device
del self.gpu_chunk
self.gpu_chunk = None
def overwrite_to_cpu(self):
assert self.gpu_chunk is not None and self.gpu_chunk.device == self.device
self.cpu_chunk.copy_(self.gpu_chunk, non_blocking=True)
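# FPDT attention with host offloading: the same chunked algorithm as
# _FPDTGPUAttentionImpl_, but q/k/v/output/lse chunks live in pinned CPU memory
# as SequenceChunks and are prefetched back to the accelerator on dedicated
# streams while the current block is computed.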
class _FPDTGPUOffloadingAttentionImpl_(torch.autograd.Function):
generate_vmap_rule = False
@staticmethod
def forward(ctx: Any,
layernorm_output,
attention_mask,
inference_params,
rotary_pos_emb,
spg,
scatter_idx,
gather_idx,
hidden_size,
projection_size,
hidden_size_per_attention_head,
kv_projection_size,
qkv_linear_weight,
qkv_linear_bias,
dropout,
num_chunks=8,
cpu_offloading=True):
do_save = layernorm_output.requires_grad
if rotary_pos_emb is not None:
pos_emb_cos, pos_emb_sin = rotary_pos_emb[0].permute(1, 0, 2, 3), rotary_pos_emb[1].permute(1, 0, 2, 3)
ctx.pos_emb_cos = pos_emb_cos
ctx.pos_emb_sin = pos_emb_sin
else:
ctx.pos_emb_cos = None
ctx.pos_emb_sin = None
with torch.no_grad():
per_gpu_seq_len = layernorm_output.shape[0]
chunk_size = per_gpu_seq_len // num_chunks
assert chunk_size * num_chunks == per_gpu_seq_len
assert attention_mask is None
ctx.num_chunks = num_chunks
ctx.cpu_offloading = cpu_offloading
ctx.spg = spg
ctx.scatter_idx = scatter_idx
ctx.gather_idx = gather_idx
ctx.chunk_size = chunk_size
device = get_accelerator().current_device_name()
ctx.device = device
ctx.dtype = layernorm_output.dtype
ctx.projection_size = projection_size
ctx.kv_projection_size = kv_projection_size
global_q = []
global_k = []
global_v = []
ctx.softmax_scale = hidden_size_per_attention_head**(-0.5)
ctx.dropout_p = dropout
ctx.window_size = (-1, -1)
ctx.alibi_slopes = None
batch_size = layernorm_output.shape[1]
global_o = []
global_lse = []
layernorm_output_cpu = []
final_output = []
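            # Three streams: one prefetches/offloads attention chunks, one
            # offloads layernorm inputs and attention outputs, and the default
            # stream runs compute.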
offload_stream = get_accelerator().Stream()
general_offload_stream = get_accelerator().Stream()
compute_stream = get_accelerator().default_stream()
q_compute_chunk_idx = 0
kv_compute_chunk_idx = 0
for i in range(num_chunks):
                # (chunk, batch, projection_size + 2 * kv_projection_size)
                qkv_chunk = torch.matmul(layernorm_output[:chunk_size], qkv_linear_weight.t()) + qkv_linear_bias
with get_accelerator().stream(general_offload_stream):
layernorm_output_cpu.append(SequenceChunk(layernorm_output[:chunk_size]))
layernorm_output = layernorm_output[chunk_size:]
q_chunk = qkv_chunk[:, :, :projection_size].contiguous().reshape(
qkv_chunk.shape[0], qkv_chunk.shape[1], -1,
hidden_size_per_attention_head).permute(1, 0, 2, 3).contiguous() # b, l, nh, hd
q_chunk = single_all_to_all(q_chunk, scatter_idx, gather_idx, 0, spg)
global_q_chunk_len = q_chunk.shape[1]
k_chunk = qkv_chunk[:, :, projection_size:projection_size + kv_projection_size].contiguous().reshape(
qkv_chunk.shape[0], qkv_chunk.shape[1], -1,
hidden_size_per_attention_head).permute(1, 0, 2, 3).contiguous() # b, l, nh, hd
k_chunk = single_all_to_all(k_chunk, scatter_idx, gather_idx, 0, spg)
v_chunk = qkv_chunk[:, :, projection_size + kv_projection_size:].contiguous().reshape(
qkv_chunk.shape[0], qkv_chunk.shape[1], -1,
hidden_size_per_attention_head).permute(1, 0, 2, 3).contiguous() # b, l, nh, hd
v_chunk = single_all_to_all(v_chunk, scatter_idx, gather_idx, 0, spg)
dist.barrier()
if ctx.pos_emb_cos is not None:
pos_emb_cos_chunk = pos_emb_cos[:, global_q_chunk_len * i:global_q_chunk_len * (i + 1)]
pos_emb_sin_chunk = pos_emb_sin[:, global_q_chunk_len * i:global_q_chunk_len * (i + 1)]
q_chunk = apply_rotary_pos_emb(q_chunk, pos_emb_cos_chunk, pos_emb_sin_chunk)
k_chunk = apply_rotary_pos_emb(k_chunk, pos_emb_cos_chunk, pos_emb_sin_chunk)
compute_stream.wait_stream(offload_stream)
compute_stream.synchronize()
with get_accelerator().stream(offload_stream):
global_q.append(SequenceChunk(q_chunk, is_in_use=True))
global_k.append(SequenceChunk(k_chunk, is_in_use=True))
global_v.append(SequenceChunk(v_chunk, is_in_use=True))
del qkv_chunk
cur_attn_output = None
cur_attn_lse = None
for k_i in range(len(global_k)):
causal_chunk = i == k_i
with get_accelerator().stream(compute_stream):
if flash_attn_version >= version.parse("2.6.0"):
block_out, _, _, _, _, block_lse, _, _ = _flash_attn_forward(
global_q[q_compute_chunk_idx].get_gpu_chunk(),
global_k[kv_compute_chunk_idx].get_gpu_chunk(),
global_v[kv_compute_chunk_idx].get_gpu_chunk(),
ctx.dropout_p,
ctx.softmax_scale,
causal=causal_chunk,
window_size=ctx.window_size,
softcap=0.0,
alibi_slopes=ctx.alibi_slopes,
return_softmax=False)
else:
block_out, _, _, _, _, block_lse, _, _ = _flash_attn_forward(
global_q[q_compute_chunk_idx].get_gpu_chunk(),
global_k[kv_compute_chunk_idx].get_gpu_chunk(),
global_v[kv_compute_chunk_idx].get_gpu_chunk(),
ctx.dropout_p,
ctx.softmax_scale,
causal=causal_chunk,
window_size=ctx.window_size,
alibi_slopes=ctx.alibi_slopes,
return_softmax=False)
cur_attn_output, cur_attn_lse = update_out_and_lse(cur_attn_output, cur_attn_lse, block_out,
block_lse)
can_offload_kv = True
if k_i != (len(global_k) - 1) or i != (num_chunks - 1):
if k_i != (len(global_k) - 1):
next_kv_compute_chunk_idx = k_i + 1
else:
next_kv_compute_chunk_idx = 0
if next_kv_compute_chunk_idx == kv_compute_chunk_idx:
can_offload_kv = False
else:
if next_kv_compute_chunk_idx != (len(global_k) - 1):
with get_accelerator().stream(offload_stream):
global_k[next_kv_compute_chunk_idx].load_to_gpu()
global_v[next_kv_compute_chunk_idx].load_to_gpu()
                    if i == num_chunks - 1 and k_i == num_chunks - 1:  # last block: prefetch chunk 0, which backward consumes first
with get_accelerator().stream(offload_stream):
global_q[0].load_to_gpu()
global_k[0].load_to_gpu()
global_v[0].load_to_gpu()
global_o[0].load_to_gpu()
global_lse[0].load_to_gpu()
compute_stream.wait_stream(offload_stream)
compute_stream.synchronize()
if can_offload_kv:
global_k[kv_compute_chunk_idx].offload()
global_v[kv_compute_chunk_idx].offload()
kv_compute_chunk_idx = next_kv_compute_chunk_idx
global_q[q_compute_chunk_idx].offload()
q_compute_chunk_idx += 1
all2all_output = single_all_to_all(
cur_attn_output.to(ctx.dtype).contiguous(), gather_idx, scatter_idx, 0, spg)
final_output.append(all2all_output)
with get_accelerator().stream(general_offload_stream):
global_o.append(SequenceChunk(cur_attn_output.to(ctx.dtype)))
global_lse.append(SequenceChunk(cur_attn_lse[:, :, :, 0].permute(0, 2, 1).contiguous()))
compute_stream.wait_stream(general_offload_stream)
compute_stream.synchronize()
final_output = torch.cat(final_output, dim=1)
head_dim = final_output.shape[-1]
if do_save:
ctx.layernorm_output = layernorm_output_cpu
ctx.global_q = global_q
ctx.global_k = global_k
ctx.global_v = global_v
ctx.attn_output = global_o
ctx.attn_lse = global_lse
ctx.head_dim = head_dim
ctx.batch_size = batch_size
ctx.qkv_linear_weight = qkv_linear_weight
ctx.qkv_linear_bias = qkv_linear_bias
return final_output
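    # Backward with the same double buffering: dq accumulators page between CPU
    # and accelerator, dk/dv accumulate on-device per k/v chunk, and grad_output
    # chunks are fetched via all-to-all one chunk at a time.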
@staticmethod
def backward(ctx, grad_output):
num_chunks = ctx.num_chunks
device = grad_output.device
dtype = ctx.dtype
spg = ctx.spg
scatter_idx = ctx.scatter_idx
gather_idx = ctx.gather_idx
softmax_scale = ctx.softmax_scale
dropout_p = ctx.dropout_p
window_size = ctx.window_size
alibi_slopes = ctx.alibi_slopes
projection_size = ctx.projection_size
kv_projection_size = ctx.kv_projection_size
layernorm_output = ctx.layernorm_output
global_q = ctx.global_q
global_k = ctx.global_k
global_v = ctx.global_v
attn_output = ctx.attn_output
lse = ctx.attn_lse
qkv_linear_weight = ctx.qkv_linear_weight
qkv_linear_bias = ctx.qkv_linear_bias
offload_stream = get_accelerator().Stream()
general_offload_stream = get_accelerator().Stream()
compute_stream = get_accelerator().default_stream()
chunk_size = grad_output.shape[1] // num_chunks
assert chunk_size == layernorm_output[0].cpu_chunk.shape[0]
grad_layernorm_output = [
torch.zeros(layernorm_output[0].chunk_shape, device=device, dtype=dtype) for _ in range(num_chunks)
]
grad_global_attn_output = [None for _ in range(num_chunks)]
q_compute_chunk_idx = 0
kv_compute_chunk_idx = 0
last_q_accum_idx = 0
with get_accelerator().stream(general_offload_stream):
layernorm_output[0].load_to_gpu()
grad_qkv_linear_weight = torch.zeros(qkv_linear_weight.shape,
device=qkv_linear_weight.device,
dtype=torch.float)
grad_qkv_linear_bias = torch.zeros(qkv_linear_bias.shape,
device=qkv_linear_weight.device,
dtype=torch.float)
grad_global_attn_output_chunk = single_all_to_all(grad_output[:, :chunk_size].contiguous(), scatter_idx,
gather_idx, 0, spg)
get_accelerator().synchronize()
grad_output = grad_output[:, chunk_size:]
with get_accelerator().stream(offload_stream):
grad_global_attn_output[0] = SequenceChunk(grad_global_attn_output_chunk, is_in_use=True)
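        # dq accumulators: the first starts resident on the accelerator since
        # chunk 0 is consumed immediately; the rest begin as pinned CPU buffers
        # and are paged in when their q chunk becomes current.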
dq = [
SequenceChunk(torch.zeros(global_q[0].chunk_shape, dtype=torch.float, device=device), is_in_use=True)
] + [
SequenceChunk(torch.zeros(global_q[0].chunk_shape, dtype=torch.float, device='cpu', pin_memory=True),
device) for _ in range(num_chunks - 1)
]
dk_accum = torch.zeros(global_k[0].chunk_shape, dtype=torch.float, device=device)
dv_accum = torch.zeros(global_v[0].chunk_shape, dtype=torch.float, device=device)
for i in range(num_chunks):
for q_i in range(num_chunks):
no_computation = q_i < i
if no_computation:
continue
causal_chunk = q_i == i
dq_this = torch.zeros(global_q[0].chunk_shape, dtype=dtype, device=device)
dk_this = torch.zeros(global_k[0].chunk_shape, dtype=dtype, device=device)
dv_this = torch.zeros(global_v[0].chunk_shape, dtype=dtype, device=device)
with get_accelerator().stream(compute_stream):
if flash_attn_version >= version.parse("2.6.0"):
_flash_attn_backward(grad_global_attn_output[q_compute_chunk_idx].get_gpu_chunk(),
global_q[q_compute_chunk_idx].get_gpu_chunk(),
global_k[kv_compute_chunk_idx].get_gpu_chunk(),
global_v[kv_compute_chunk_idx].get_gpu_chunk(),
attn_output[q_compute_chunk_idx].get_gpu_chunk(),
lse[q_compute_chunk_idx].get_gpu_chunk(),
dq_this,
dk_this,
dv_this,
dropout_p,
softmax_scale,
causal_chunk,
window_size,
softcap=0.0,
alibi_slopes=alibi_slopes,
deterministic=False,
rng_state=None)
else:
_flash_attn_backward(grad_global_attn_output[q_compute_chunk_idx].get_gpu_chunk(),
global_q[q_compute_chunk_idx].get_gpu_chunk(),
global_k[kv_compute_chunk_idx].get_gpu_chunk(),
global_v[kv_compute_chunk_idx].get_gpu_chunk(),
attn_output[q_compute_chunk_idx].get_gpu_chunk(),
lse[q_compute_chunk_idx].get_gpu_chunk(),
dq_this,
dk_this,
dv_this,
dropout_p,
softmax_scale,
causal_chunk,
window_size,
alibi_slopes=alibi_slopes,
deterministic=False,
rng_state=None)
if i != (len(global_k) - 1):
if q_i != (len(global_q) - 1):
next_q_compute_chunk_idx = q_i + 1
else:
next_q_compute_chunk_idx = i + 1
can_offload_q = True
if next_q_compute_chunk_idx == q_compute_chunk_idx:
can_offload_q = False
else:
with get_accelerator().stream(offload_stream):
if i > 0 or q_i > 0:
                                if can_offload_q and last_q_accum_idx != i:  # the first q chunk computed in this pass will be reduced and sent out, so do not offload it
dq[last_q_accum_idx].offload()
dq[next_q_compute_chunk_idx].load_to_gpu()
global_q[next_q_compute_chunk_idx].load_to_gpu()
attn_output[next_q_compute_chunk_idx].load_to_gpu()
lse[next_q_compute_chunk_idx].load_to_gpu()
if grad_global_attn_output[next_q_compute_chunk_idx] is not None:
grad_global_attn_output[next_q_compute_chunk_idx].load_to_gpu()
if grad_global_attn_output[next_q_compute_chunk_idx] is None:
grad_global_attn_output_chunk = single_all_to_all(grad_output[:, :chunk_size].contiguous(),
scatter_idx, gather_idx, 0, spg)
dist.barrier()
grad_output = grad_output[:, chunk_size:]
grad_global_attn_output[next_q_compute_chunk_idx] = SequenceChunk(
grad_global_attn_output_chunk, is_in_use=True)
compute_stream.wait_stream(offload_stream)
compute_stream.synchronize()
with get_accelerator().stream(compute_stream):
dq[q_compute_chunk_idx].check_gpu_chunk()
dq[q_compute_chunk_idx].gpu_chunk.add_(dq_this)
dk_accum.add_(dk_this)
dv_accum.add_(dv_this)
offload_stream.wait_stream(compute_stream)
with get_accelerator().stream(offload_stream):
dq[q_compute_chunk_idx].overwrite_to_cpu()
if can_offload_q:
global_q[q_compute_chunk_idx].offload()
attn_output[q_compute_chunk_idx].offload()
lse[q_compute_chunk_idx].offload()
grad_global_attn_output[q_compute_chunk_idx].offload()
last_q_accum_idx = q_compute_chunk_idx
q_compute_chunk_idx = next_q_compute_chunk_idx
compute_stream.wait_stream(offload_stream)
compute_stream.synchronize()
dk_seq_len = dk_accum.shape[1]
if ctx.pos_emb_cos is not None:
dq_accum = apply_rotary_pos_emb_backward(dq[kv_compute_chunk_idx].get_gpu_chunk().to(dtype),
ctx.pos_emb_cos[:, dk_seq_len * i:dk_seq_len * (i + 1)],
ctx.pos_emb_sin[:, dk_seq_len * i:dk_seq_len * (i + 1)])
dk_accum = apply_rotary_pos_emb_backward(dk_accum.to(dtype),
ctx.pos_emb_cos[:, dk_seq_len * i:dk_seq_len * (i + 1)],
ctx.pos_emb_sin[:, dk_seq_len * i:dk_seq_len * (i + 1)])
else:
dq_accum = dq[kv_compute_chunk_idx].get_gpu_chunk().to(dtype)
dk_accum = dk_accum.to(dtype)
dv_accum = dv_accum.to(dtype)
dq_accum = single_all_to_all(dq_accum.contiguous(), gather_idx, scatter_idx, 0, spg)
dk_accum = single_all_to_all(dk_accum.contiguous(), gather_idx, scatter_idx, 0, spg)
dv_accum = single_all_to_all(dv_accum.contiguous(), gather_idx, scatter_idx, 0, spg)
general_offload_stream.synchronize()
compute_stream.wait_stream(general_offload_stream)
dist.barrier()
with get_accelerator().stream(compute_stream):
input_chunk = layernorm_output[i].get_gpu_chunk().reshape(-1, layernorm_output[i].chunk_shape[-1])
dq_accum = dq_accum.flatten(2).permute(1, 0, 2)
dk_accum = dk_accum.flatten(2).permute(1, 0, 2)
dv_accum = dv_accum.flatten(2).permute(1, 0, 2)
l, b = dk_accum.shape[0], dk_accum.shape[1]
grad_qkv_linear_weight[:projection_size].add_(
torch.matmul(dq_accum.reshape(l * b, -1).t(), input_chunk))
grad_qkv_linear_weight[projection_size:projection_size + kv_projection_size].add_(
torch.matmul(dk_accum.reshape(l * b, -1).t(), input_chunk))
grad_qkv_linear_weight[projection_size + kv_projection_size:].add_(
torch.matmul(dv_accum.reshape(l * b, -1).t(), input_chunk))
grad_qkv_linear_bias[:projection_size].add_(dq_accum.sum(0).sum(0))
grad_qkv_linear_bias[projection_size:projection_size + kv_projection_size].add_(dk_accum.sum(0).sum(0))
grad_qkv_linear_bias[projection_size + kv_projection_size:].add_(dv_accum.sum(0).sum(0))
grad_layernorm_output[i].add_(torch.matmul(dq_accum, qkv_linear_weight[:projection_size]))
grad_layernorm_output[i].add_(
torch.matmul(dk_accum, qkv_linear_weight[projection_size:projection_size + kv_projection_size]))
grad_layernorm_output[i].add_(
torch.matmul(dv_accum, qkv_linear_weight[projection_size + kv_projection_size:]))
del dq_accum, dk_accum, dv_accum
dk_accum = torch.zeros(global_k[i].chunk_shape, dtype=torch.float, device=device)
dv_accum = torch.zeros(global_v[i].chunk_shape, dtype=torch.float, device=device)
dq[kv_compute_chunk_idx].offload()
dq[kv_compute_chunk_idx] = None
if i != (len(global_k) - 1):
next_kv_compute_chunk_idx = kv_compute_chunk_idx + 1
with get_accelerator().stream(offload_stream):
global_k[next_kv_compute_chunk_idx].load_to_gpu()
global_v[next_kv_compute_chunk_idx].load_to_gpu()
with get_accelerator().stream(general_offload_stream):
layernorm_output[next_kv_compute_chunk_idx].load_to_gpu()
compute_stream.wait_stream(offload_stream)
compute_stream.synchronize()
layernorm_output[kv_compute_chunk_idx].offload()
global_k[kv_compute_chunk_idx].offload()
global_v[kv_compute_chunk_idx].offload()
kv_compute_chunk_idx = next_kv_compute_chunk_idx
        # one gradient slot per forward input; only layernorm_output and the
        # qkv linear weight/bias receive gradients
        grad_input = torch.cat(grad_layernorm_output, dim=0).to(dtype)
        return (grad_input, None, None, None, None, None, None, None, None, None, None,
                grad_qkv_linear_weight.to(dtype), grad_qkv_linear_bias.to(dtype), None, None, None)
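# Module wrapper that dispatches between the on-device and host-offloading
# implementations (offloading is skipped when disabled or when the sequence
# fits in a single chunk) and then applies the output dense projection.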
class FPDT_Attention(torch.nn.Module):
def __init__(self,
config,
first_weight,
first_bias,
second_weight,
second_bias,
sequence_process_group,
gather_idx: int = 0,
scatter_idx: int = 2,
return_bias=True,
chunk_size=65536,
enable_offloading=True) -> None:
super(FPDT_Attention, self).__init__()
if _flash_attn_forward is None or _flash_attn_backward is None:
raise ImportError(
"DeepSpeed FPDT requires flash-attn 2.6.3. Please install it with `pip install flash-attn --no-build-isolation`."
)
self.spg = sequence_process_group
self.scatter_idx = scatter_idx
self.gather_idx = gather_idx
self.config = config
self.projection_size = config.kv_channels * config.num_attention_heads
self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads
self.kv_projection_size = config.kv_channels * config.num_key_value_heads
self.hidden_size = config.hidden_size
self.qkv_linear_weight = first_weight
self.qkv_linear_bias = first_bias
self.qkv_dense_weight = second_weight
self.qkv_dense_bias = second_bias
        self.return_bias = return_bias
self.dropout = config.attention_dropout
self.chunk_size = chunk_size
self.double_buffer = enable_offloading
def forward(self,
layernorm_output,
attention_mask,
inference_params,
rotary_pos_emb,
cpu_offloading=True) -> Tensor:
self.num_chunks_attn = layernorm_output.shape[0] * dist.get_world_size(self.spg) // self.chunk_size
if not cpu_offloading or self.num_chunks_attn == 1:
output = _FPDTGPUAttentionImpl_.apply(layernorm_output, attention_mask, inference_params, rotary_pos_emb,
self.spg, self.scatter_idx, self.gather_idx, self.hidden_size,
self.projection_size, self.hidden_size_per_attention_head,
self.kv_projection_size, self.qkv_linear_weight,
self.qkv_linear_bias, self.dropout, self.num_chunks_attn,
cpu_offloading)
else:
output = _FPDTGPUOffloadingAttentionImpl_.apply(
layernorm_output, attention_mask, inference_params, rotary_pos_emb, self.spg, self.scatter_idx,
self.gather_idx, self.hidden_size, self.projection_size, self.hidden_size_per_attention_head,
self.kv_projection_size, self.qkv_linear_weight, self.qkv_linear_bias, self.dropout,
self.num_chunks_attn, cpu_offloading)
output = output.flatten(2).permute(1, 0, 2).contiguous()
output = torch.matmul(output, self.qkv_dense_weight.t())
        if not self.return_bias:
            output += self.qkv_dense_bias
        return output, self.qkv_dense_bias if self.return_bias else None
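# Tanh approximation of GELU and its derivative; 0.79788456 is sqrt(2/pi).
# Despite the Megatron-style names, these take no separate bias argument.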
@torch.jit.script
def bias_gelu(x):
return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
@torch.jit.script
def bias_gelu_back(g, x):
tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
# sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
return ff * g
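# Chunked MLP: forward computes gelu(x @ w1.T + b1) @ w2.T (+ b2) one sequence
# chunk at a time so the intermediate activation is never fully materialized;
# backward recomputes that activation per chunk and reuses the saved input
# buffer x in place to store the gradient w.r.t. the input.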
class FPDT_FFN(torch.autograd.Function):
generate_vmap_rule = False
@staticmethod
def forward(ctx: Any, x, w1, b1, w2, b2, add_bias, chunk_size):
do_save = x.requires_grad
ctx.add_bias = add_bias
device = x.device
with torch.no_grad():
num_chunk = x.shape[0] // chunk_size
ctx.num_chunk = num_chunk
result = torch.empty(x.shape, device=device, dtype=x.dtype)
assert chunk_size * num_chunk == x.shape[0]
for i in range(num_chunk):
st = i * chunk_size
ed = st + chunk_size
x_ = torch.matmul(x[st:ed], w1.t()) + b1
x_ = bias_gelu(x_)
if add_bias:
result[st:ed] = torch.matmul(x_, w2.t()) + b2
else:
result[st:ed] = torch.matmul(x_, w2.t())
del x_
if do_save:
ctx.device = device
ctx.dtype = x.dtype
ctx.save_for_backward(x, w1, b1, w2, b2)
ctx.grad_x_shape = x.shape
return result.to(x.dtype), b2 if not add_bias else None
@staticmethod
def backward(ctx, grad_output, grad_bias):
x, w1, b1, w2, b2 = ctx.saved_tensors
device = ctx.device
dtype = ctx.dtype
add_bias = ctx.add_bias
num_chunk = ctx.num_chunk
chunk_size = x.shape[0] // num_chunk
assert chunk_size * num_chunk == grad_output.shape[0]
grad_w2 = torch.zeros(w2.shape, device=device, dtype=torch.float)
grad_b2 = torch.zeros(b2.shape, device=device, dtype=torch.float)
grad_w1 = torch.zeros(w1.shape, device=device, dtype=torch.float)
grad_b1 = torch.zeros(b1.shape, device=device, dtype=torch.float)
for i in range(num_chunk):
st = i * chunk_size
ed = st + chunk_size
x_chunk = x[st:ed]
before_act = (torch.matmul(x_chunk, w1.t()) + b1)
before_act_2 = before_act**2
tanh_out = torch.tanh(0.79788456 * before_act * (1 + 0.044715 * before_act_2))
ff = 0.5 * before_act * ((1 - tanh_out * tanh_out) *
(0.79788456 + 0.1070322243 * before_act_2)) + 0.5 * (1 + tanh_out)
grad_w2.add_(
torch.matmul(grad_output[st:ed].reshape(-1, grad_output.shape[2]).t(),
(before_act * 0.5 * (1 + tanh_out)).reshape(-1, before_act.shape[2])))
del before_act, before_act_2, tanh_out
grad_inter = torch.matmul(grad_output[st:ed], w2) * ff
del ff
grad_w1.add_(torch.matmul(
grad_inter.reshape(-1, grad_inter.shape[2]).t(), x_chunk.reshape(-1, x.shape[2])))
grad_b1.add_(grad_inter.sum(0).sum(0))
x[st:ed].copy_(torch.matmul(grad_inter, w1))
del grad_inter
if add_bias:
grad_b2.add_(grad_output[st:ed].sum(0).sum(0))
return x, grad_w1.to(dtype), grad_b1.to(dtype), grad_w2.to(dtype), grad_b2.to(dtype), None, None
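# Chunked logits + NLL loss: the (seq, batch, vocab) logits are materialized
# one chunk at a time, lm_output is parked on the CPU for the backward, and the
# per-token loss is allgathered across the sequence-parallel group.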
class FPDT_LogitsLoss(torch.autograd.Function):
generate_vmap_rule = False
@staticmethod
def forward(ctx: Any, lm_output, labels, logit_weights, rank, spg_size, spg, num_chunk):
labels = labels.t()
chunk_size = lm_output.shape[0] // num_chunk
assert chunk_size * num_chunk == lm_output.shape[0]
batch_size, local_seq_len = lm_output.shape[1], lm_output.shape[0]
loss = torch.empty((batch_size, local_seq_len), dtype=torch.float, device=lm_output.device)
ctx.num_chunk = num_chunk
ctx.chunk_size = chunk_size
ctx.device = lm_output.device
ctx.dtype = lm_output.dtype
ctx.rank = rank
ctx.local_seq_len = local_seq_len
with torch.no_grad():
for i in range(num_chunk):
st = i * chunk_size
ed = st + chunk_size
logits_chunk = torch.matmul(lm_output[st:ed], logit_weights.t()).float()
vocab_size = logits_chunk.size(2)
                # nll via log_softmax, the numerically stable form of softmax().log()
                log_probs = torch.nn.functional.log_softmax(logits_chunk, dim=-1)
                loss_chunk = torch.nn.functional.nll_loss(log_probs.reshape(-1, vocab_size).contiguous(),
                                                          labels[st:ed, :].reshape(-1).contiguous(),
                                                          reduction='none')
loss[:, st:ed] = loss_chunk.reshape(chunk_size, batch_size).t()
del logits_chunk
ctx.save_for_backward(lm_output.to('cpu'), labels)
ctx.logit_weights = logit_weights
seqlen = local_seq_len * spg_size
batch_size = loss.size(0)
loss = loss.t().contiguous()
loss_all = torch.empty(seqlen, batch_size, dtype=loss.dtype, device=loss.device).contiguous()
dist.allgather_fn(loss_all, loss, group=spg)
return loss_all
@staticmethod
def backward(ctx, grad_output):
lm_output, labels = ctx.saved_tensors
logit_weights = ctx.logit_weights
device = ctx.device
dtype = ctx.dtype
num_chunk = ctx.num_chunk
chunk_size = ctx.chunk_size
rank = ctx.rank
local_seq_len = ctx.local_seq_len
grad_output = grad_output[rank * local_seq_len:(rank + 1) * local_seq_len]
grad_lm_output = [None for _ in range(num_chunk)]
grad_logit_weights = torch.zeros(logit_weights.shape, device=grad_output.device, dtype=torch.float)
for i in range(num_chunk):
st = i * chunk_size
ed = st + chunk_size
lm_output_chunk = lm_output[st:ed].to(device)
logits_chunk = torch.matmul(lm_output_chunk, logit_weights.t()).float()
# nll
softmax = torch.nn.functional.softmax(logits_chunk, dim=-1)
vocab_size = logits_chunk.size(2)
grad_input = softmax
grad_2d = grad_input.reshape(-1, vocab_size).contiguous()
arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=device)
grad_2d[arange_1d, labels[st:ed, :].reshape(-1).contiguous()] -= 1
grad_input.mul_(grad_output[:chunk_size, :].unsqueeze(dim=-1))
grad_input = grad_input.to(dtype)
grad_output = grad_output[chunk_size:].contiguous()
grad_lm_output_chunk = torch.matmul(grad_input, logit_weights)
grad_lm_output[i] = grad_lm_output_chunk
grad_logit_weights.add_(
torch.matmul(
grad_input.reshape(-1, grad_input.shape[2]).t(),
lm_output_chunk.reshape(-1, lm_output_chunk.shape[2])))
return torch.cat(grad_lm_output, dim=0).to(dtype), None, grad_logit_weights.to(dtype), None, None, None, None