| """ | |
| Concise re-implementation of | |
| ``https://github.com/openai/CLIP'' and | |
| ``https://github.com/mlfoundations/open_clip''. | |
| """ | |
| import math | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| import torchvision.transforms as T | |
| from .wan_video_dit import flash_attention | |


class SelfAttention(nn.Module):
    def __init__(self, dim, num_heads, dropout=0.1, eps=1e-5):
        assert dim % num_heads == 0
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.eps = eps

        # layers
        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)
        self.o = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """
        x: [B, L, C].
        """
        b, s, c, n, d = *x.size(), self.num_heads, self.head_dim

        # compute query, key, value
        q = self.q(x).reshape(b, s, n, d).permute(0, 2, 1, 3)
        k = self.k(x).reshape(b, s, n, d).permute(0, 2, 1, 3)
        v = self.v(x).reshape(b, s, n, d).permute(0, 2, 1, 3)

        # compute attention
        p = self.dropout.p if self.training else 0.0
        x = F.scaled_dot_product_attention(q, k, v, mask, p)
        x = x.permute(0, 2, 1, 3).reshape(b, s, c)

        # output
        x = self.o(x)
        x = self.dropout(x)
        return x


class AttentionBlock(nn.Module):
    def __init__(self, dim, num_heads, post_norm, dropout=0.1, eps=1e-5):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.post_norm = post_norm
        self.eps = eps

        # layers
        self.attn = SelfAttention(dim, num_heads, dropout, eps)
        self.norm1 = nn.LayerNorm(dim, eps=eps)
        self.ffn = nn.Sequential(
            nn.Linear(dim, dim * 4),
            nn.GELU(),
            nn.Linear(dim * 4, dim),
            nn.Dropout(dropout),
        )
        self.norm2 = nn.LayerNorm(dim, eps=eps)

    def forward(self, x, mask):
        if self.post_norm:
            x = self.norm1(x + self.attn(x, mask))
            x = self.norm2(x + self.ffn(x))
        else:
            x = x + self.attn(self.norm1(x), mask)
            x = x + self.ffn(self.norm2(x))
        return x


class XLMRoberta(nn.Module):
    """
    XLMRobertaModel with no pooler and no LM head.
    """

    def __init__(
        self,
        vocab_size=250002,
        max_seq_len=514,
        type_size=1,
        pad_id=1,
        dim=1024,
        num_heads=16,
        num_layers=24,
        post_norm=True,
        dropout=0.1,
        eps=1e-5,
    ):
        super().__init__()
        self.vocab_size = vocab_size
        self.max_seq_len = max_seq_len
        self.type_size = type_size
        self.pad_id = pad_id
        self.dim = dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.post_norm = post_norm
        self.eps = eps

        # embeddings
        self.token_embedding = nn.Embedding(vocab_size, dim, padding_idx=pad_id)
        self.type_embedding = nn.Embedding(type_size, dim)
        self.pos_embedding = nn.Embedding(max_seq_len, dim, padding_idx=pad_id)
        self.dropout = nn.Dropout(dropout)

        # blocks
        self.blocks = nn.ModuleList(
            [
                AttentionBlock(dim, num_heads, post_norm, dropout, eps)
                for _ in range(num_layers)
            ]
        )

        # norm layer
        self.norm = nn.LayerNorm(dim, eps=eps)

    def forward(self, ids):
        """
        ids: [B, L] of torch.LongTensor.
        """
        b, s = ids.shape
        mask = ids.ne(self.pad_id).long()

        # embeddings
        x = (
            self.token_embedding(ids)
            + self.type_embedding(torch.zeros_like(ids))
            + self.pos_embedding(self.pad_id + torch.cumsum(mask, dim=1) * mask)
        )
        if self.post_norm:
            x = self.norm(x)
        x = self.dropout(x)

        # blocks
        mask = torch.where(mask.view(b, 1, 1, s).gt(0), 0.0, torch.finfo(x.dtype).min)
        for block in self.blocks:
            x = block(x, mask)

        # output
        if not self.post_norm:
            x = self.norm(x)
        return x
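

# Illustrative sketch (not part of the original file): how XLMRoberta.forward
# derives position ids from the padding mask. Real tokens are numbered upwards
# from pad_id + 1, while padding positions collapse onto pad_id, which is also
# the padding_idx of pos_embedding.
def _example_xlm_roberta_position_ids():
    pad_id = 1
    ids = torch.tensor([[0, 5, 6, 7, 2, 1, 1, 1]])  # 5 real tokens, 3 pads
    mask = ids.ne(pad_id).long()
    pos = pad_id + torch.cumsum(mask, dim=1) * mask
    assert pos.tolist() == [[2, 3, 4, 5, 6, 1, 1, 1]]
    return pos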


def xlm_roberta_large(pretrained=False, return_tokenizer=False, device="cpu", **kwargs):
    """
    XLMRobertaLarge adapted from Huggingface.
    """
    # params
    cfg = dict(
        vocab_size=250002,
        max_seq_len=514,
        type_size=1,
        pad_id=1,
        dim=1024,
        num_heads=16,
        num_layers=24,
        post_norm=True,
        dropout=0.1,
        eps=1e-5,
    )
    cfg.update(**kwargs)

    # init model
    if pretrained:
        # the `sora` package belongs to the original training codebase and is
        # only needed for this pretrained-loading path
        from sora import DOWNLOAD_TO_CACHE

        # init a meta model
        with torch.device("meta"):
            model = XLMRoberta(**cfg)

        # load checkpoint
        model.load_state_dict(
            torch.load(
                DOWNLOAD_TO_CACHE("models/xlm_roberta/xlm_roberta_large.pth"),
                map_location=device,
            ),
            assign=True,
        )
    else:
        # init a model on device
        with torch.device(device):
            model = XLMRoberta(**cfg)

    # init tokenizer
    if return_tokenizer:
        from sora.data import HuggingfaceTokenizer

        tokenizer = HuggingfaceTokenizer(
            name="xlm-roberta-large",
            seq_len=model.max_seq_len,  # XLMRoberta has no `text_len` attribute
            clean="whitespace",
        )
        return model, tokenizer
    else:
        return model


def pos_interpolate(pos, seq_len):
    if pos.size(1) == seq_len:
        return pos
    else:
        src_grid = int(math.sqrt(pos.size(1)))
        tar_grid = int(math.sqrt(seq_len))
        n = pos.size(1) - src_grid * src_grid
        return torch.cat(
            [
                pos[:, :n],
                F.interpolate(
                    pos[:, n:]
                    .float()
                    .reshape(1, src_grid, src_grid, -1)
                    .permute(0, 3, 1, 2),
                    size=(tar_grid, tar_grid),
                    mode="bicubic",
                    align_corners=False,
                )
                .flatten(2)
                .transpose(1, 2),
            ],
            dim=1,
        )
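

# Illustrative sketch (not part of the original file): resizing a ViT positional
# embedding with one leading CLS slot from a 14x14 patch grid to 16x16.
# pos_interpolate keeps the first n = L - src_grid**2 prefix tokens untouched
# and bicubically resizes the patch-grid part.
def _example_pos_interpolate():
    pos = torch.randn(1, 1 + 14 * 14, 768)       # [1, 197, C]: CLS + 14x14 grid
    new_pos = pos_interpolate(pos, 1 + 16 * 16)  # target: CLS + 16x16 grid
    assert new_pos.shape == (1, 257, 768)
    return new_pos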


class QuickGELU(nn.Module):
    def forward(self, x):
        return x * torch.sigmoid(1.702 * x)


class LayerNorm(nn.LayerNorm):
    def forward(self, x):
        return super().forward(x).type_as(x)


# NOTE: this class and the AttentionBlock below reuse the names of the
# XLM-Roberta modules defined earlier in this file and shadow them at module
# level. Because the names are resolved when an instance is created, any
# XLMRoberta built after this module is imported would pick up these CLIP
# variants, whose signatures differ; in practice only the CLIP vision tower
# is exercised (see WanImageEncoder below), so the shadowing is harmless there.
class SelfAttention(nn.Module):
    def __init__(
        self, dim, num_heads, causal=False, attn_dropout=0.0, proj_dropout=0.0
    ):
        assert dim % num_heads == 0
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.causal = causal
        self.attn_dropout = attn_dropout
        self.proj_dropout = proj_dropout

        # layers
        self.to_qkv = nn.Linear(dim, dim * 3)
        self.proj = nn.Linear(dim, dim)

    def forward(self, x):
        """
        x: [B, L, C].
        """
        # compute query, key, value
        q, k, v = self.to_qkv(x).chunk(3, dim=-1)

        # compute attention
        x = flash_attention(q, k, v, num_heads=self.num_heads, compatibility_mode=True)

        # output
        x = self.proj(x)
        x = F.dropout(x, self.proj_dropout, self.training)
        return x


class SwiGLU(nn.Module):
    def __init__(self, dim, mid_dim):
        super().__init__()
        self.dim = dim
        self.mid_dim = mid_dim

        # layers
        self.fc1 = nn.Linear(dim, mid_dim)
        self.fc2 = nn.Linear(dim, mid_dim)
        self.fc3 = nn.Linear(mid_dim, dim)

    def forward(self, x):
        x = F.silu(self.fc1(x)) * self.fc2(x)
        x = self.fc3(x)
        return x


class AttentionBlock(nn.Module):
    def __init__(
        self,
        dim,
        mlp_ratio,
        num_heads,
        post_norm=False,
        causal=False,
        activation="quick_gelu",
        attn_dropout=0.0,
        proj_dropout=0.0,
        norm_eps=1e-5,
    ):
        assert activation in ["quick_gelu", "gelu", "swi_glu"]
        super().__init__()
        self.dim = dim
        self.mlp_ratio = mlp_ratio
        self.num_heads = num_heads
        self.post_norm = post_norm
        self.causal = causal
        self.norm_eps = norm_eps

        # layers
        self.norm1 = LayerNorm(dim, eps=norm_eps)
        self.attn = SelfAttention(dim, num_heads, causal, attn_dropout, proj_dropout)
        self.norm2 = LayerNorm(dim, eps=norm_eps)
        if activation == "swi_glu":
            self.mlp = SwiGLU(dim, int(dim * mlp_ratio))
        else:
            self.mlp = nn.Sequential(
                nn.Linear(dim, int(dim * mlp_ratio)),
                QuickGELU() if activation == "quick_gelu" else nn.GELU(),
                nn.Linear(int(dim * mlp_ratio), dim),
                nn.Dropout(proj_dropout),
            )

    def forward(self, x):
        if self.post_norm:
            x = x + self.norm1(self.attn(x))
            x = x + self.norm2(self.mlp(x))
        else:
            x = x + self.attn(self.norm1(x))
            x = x + self.mlp(self.norm2(x))
        return x


class AttentionPool(nn.Module):
    def __init__(
        self,
        dim,
        mlp_ratio,
        num_heads,
        activation="gelu",
        proj_dropout=0.0,
        norm_eps=1e-5,
    ):
        assert dim % num_heads == 0
        super().__init__()
        self.dim = dim
        self.mlp_ratio = mlp_ratio
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.proj_dropout = proj_dropout
        self.norm_eps = norm_eps

        # layers
        gain = 1.0 / math.sqrt(dim)
        self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
        self.to_q = nn.Linear(dim, dim)
        self.to_kv = nn.Linear(dim, dim * 2)
        self.proj = nn.Linear(dim, dim)
        self.norm = LayerNorm(dim, eps=norm_eps)
        self.mlp = nn.Sequential(
            nn.Linear(dim, int(dim * mlp_ratio)),
            QuickGELU() if activation == "quick_gelu" else nn.GELU(),
            nn.Linear(int(dim * mlp_ratio), dim),
            nn.Dropout(proj_dropout),
        )

    def forward(self, x):
        """
        x: [B, L, C].
        """
        b, s, c, n, d = *x.size(), self.num_heads, self.head_dim

        # compute query, key, value
        q = self.to_q(self.cls_embedding).view(1, 1, n * d).expand(b, -1, -1)
        k, v = self.to_kv(x).chunk(2, dim=-1)

        # compute attention
        x = flash_attention(q, k, v, num_heads=self.num_heads, compatibility_mode=True)
        x = x.reshape(b, 1, c)

        # output
        x = self.proj(x)
        x = F.dropout(x, self.proj_dropout, self.training)

        # mlp
        x = x + self.mlp(self.norm(x))
        return x[:, 0]


class VisionTransformer(nn.Module):
    def __init__(
        self,
        image_size=224,
        patch_size=16,
        dim=768,
        mlp_ratio=4,
        out_dim=512,
        num_heads=12,
        num_layers=12,
        pool_type="token",
        pre_norm=True,
        post_norm=False,
        activation="quick_gelu",
        attn_dropout=0.0,
        proj_dropout=0.0,
        embedding_dropout=0.0,
        norm_eps=1e-5,
    ):
        if image_size % patch_size != 0:
            print("[WARNING] image_size is not divisible by patch_size", flush=True)
        assert pool_type in ("token", "token_fc", "attn_pool")
        out_dim = out_dim or dim
        super().__init__()
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_patches = (image_size // patch_size) ** 2
        self.dim = dim
        self.mlp_ratio = mlp_ratio
        self.out_dim = out_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.pool_type = pool_type
        self.post_norm = post_norm
        self.norm_eps = norm_eps

        # embeddings
        gain = 1.0 / math.sqrt(dim)
        self.patch_embedding = nn.Conv2d(
            3, dim, kernel_size=patch_size, stride=patch_size, bias=not pre_norm
        )
        if pool_type in ("token", "token_fc"):
            self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
        self.pos_embedding = nn.Parameter(
            gain
            * torch.randn(
                1,
                self.num_patches + (1 if pool_type in ("token", "token_fc") else 0),
                dim,
            )
        )
        self.dropout = nn.Dropout(embedding_dropout)

        # transformer
        self.pre_norm = LayerNorm(dim, eps=norm_eps) if pre_norm else None
        self.transformer = nn.Sequential(
            *[
                AttentionBlock(
                    dim,
                    mlp_ratio,
                    num_heads,
                    post_norm,
                    False,
                    activation,
                    attn_dropout,
                    proj_dropout,
                    norm_eps,
                )
                for _ in range(num_layers)
            ]
        )
        # NOTE: this LayerNorm overwrites the boolean `post_norm` flag stored above
        self.post_norm = LayerNorm(dim, eps=norm_eps)

        # head
        if pool_type == "token":
            self.head = nn.Parameter(gain * torch.randn(dim, out_dim))
        elif pool_type == "token_fc":
            self.head = nn.Linear(dim, out_dim)
        elif pool_type == "attn_pool":
            self.head = AttentionPool(
                dim, mlp_ratio, num_heads, activation, proj_dropout, norm_eps
            )

    def forward(self, x, interpolation=False, use_31_block=False):
        b = x.size(0)

        # embeddings
        x = self.patch_embedding(x).flatten(2).permute(0, 2, 1)
        if self.pool_type in ("token", "token_fc"):
            x = torch.cat(
                [
                    self.cls_embedding.expand(b, -1, -1).to(
                        dtype=x.dtype, device=x.device
                    ),
                    x,
                ],
                dim=1,
            )
        if interpolation:
            e = pos_interpolate(self.pos_embedding, x.size(1))
        else:
            e = self.pos_embedding
        e = e.to(dtype=x.dtype, device=x.device)
        x = self.dropout(x + e)
        if self.pre_norm is not None:
            x = self.pre_norm(x)

        # transformer
        if use_31_block:
            # run all but the last block (the penultimate layer of the 32-layer
            # ViT-H used below); note that the pooling head is not applied in
            # either branch, so per-token features are returned
            x = self.transformer[:-1](x)
            return x
        else:
            x = self.transformer(x)
            return x
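

# Illustrative sketch (not part of the original file): token bookkeeping for
# the ViT-H/14 configuration used below (image_size=224, patch_size=14,
# vision_layers=32, pool_type="token"). One CLS token is prepended to the
# patch tokens, and `use_31_block=True` runs only the first 31 of the 32
# blocks, returning penultimate-layer features of shape [B, 257, dim].
def _example_vit_h14_token_count():
    image_size, patch_size = 224, 14
    num_patches = (image_size // patch_size) ** 2  # 16 * 16 = 256 patch tokens
    num_tokens = num_patches + 1                   # + 1 CLS token = 257
    return num_tokens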


class CLIP(nn.Module):
    def __init__(
        self,
        embed_dim=512,
        image_size=224,
        patch_size=16,
        vision_dim=768,
        vision_mlp_ratio=4,
        vision_heads=12,
        vision_layers=12,
        vision_pool="token",
        vision_pre_norm=True,
        vision_post_norm=False,
        vocab_size=49408,
        text_len=77,
        text_dim=512,
        text_mlp_ratio=4,
        text_heads=8,
        text_layers=12,
        text_causal=True,
        text_pool="argmax",
        text_head_bias=False,
        logit_bias=None,
        activation="quick_gelu",
        attn_dropout=0.0,
        proj_dropout=0.0,
        embedding_dropout=0.0,
        norm_eps=1e-5,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.image_size = image_size
        self.patch_size = patch_size
        self.vision_dim = vision_dim
        self.vision_mlp_ratio = vision_mlp_ratio
        self.vision_heads = vision_heads
        self.vision_layers = vision_layers
        self.vision_pool = vision_pool
        self.vision_pre_norm = vision_pre_norm
        self.vision_post_norm = vision_post_norm
        self.vocab_size = vocab_size
        self.text_len = text_len
        self.text_dim = text_dim
        self.text_mlp_ratio = text_mlp_ratio
        self.text_heads = text_heads
        self.text_layers = text_layers
        self.text_causal = text_causal
        self.text_pool = text_pool
        self.text_head_bias = text_head_bias
        self.norm_eps = norm_eps

        # models
        self.visual = VisionTransformer(
            image_size=image_size,
            patch_size=patch_size,
            dim=vision_dim,
            mlp_ratio=vision_mlp_ratio,
            out_dim=embed_dim,
            num_heads=vision_heads,
            num_layers=vision_layers,
            pool_type=vision_pool,
            pre_norm=vision_pre_norm,
            post_norm=vision_post_norm,
            activation=activation,
            attn_dropout=attn_dropout,
            proj_dropout=proj_dropout,
            embedding_dropout=embedding_dropout,
            norm_eps=norm_eps,
        )
        # NOTE: TextTransformer is not defined in this file, so CLIP cannot be
        # instantiated as-is; in this file it only serves as the default
        # `model_cls` of `_clip` below, which clip_xlm_roberta_vit_h_14
        # overrides with XLMRobertaCLIP.
        self.textual = TextTransformer(
            vocab_size=vocab_size,
            text_len=text_len,
            dim=text_dim,
            mlp_ratio=text_mlp_ratio,
            out_dim=embed_dim,
            num_heads=text_heads,
            num_layers=text_layers,
            causal=text_causal,
            pool_type=text_pool,
            head_bias=text_head_bias,
            activation=activation,
            attn_dropout=attn_dropout,
            proj_dropout=proj_dropout,
            embedding_dropout=embedding_dropout,
            norm_eps=norm_eps,
        )
        self.log_scale = nn.Parameter(math.log(1 / 0.07) * torch.ones([]))
        if logit_bias is not None:
            self.logit_bias = nn.Parameter(logit_bias * torch.ones([]))

        # initialize weights
        self.init_weights()

    def forward(self, imgs, txt_ids):
        """
        imgs: [B, 3, H, W] of torch.float32.
        - mean: [0.48145466, 0.4578275, 0.40821073]
        - std: [0.26862954, 0.26130258, 0.27577711]
        txt_ids: [B, L] of torch.long. Encoded by data.CLIPTokenizer.
        """
        xi = self.visual(imgs)
        xt = self.textual(txt_ids)
        return xi, xt

    def init_weights(self):
        # embeddings
        nn.init.normal_(self.textual.token_embedding.weight, std=0.02)
        nn.init.normal_(self.visual.patch_embedding.weight, std=0.1)

        # attentions
        for modality in ["visual", "textual"]:
            dim = self.vision_dim if modality == "visual" else self.text_dim
            transformer = getattr(self, modality).transformer
            proj_gain = (1.0 / math.sqrt(dim)) * (1.0 / math.sqrt(2 * len(transformer)))
            attn_gain = 1.0 / math.sqrt(dim)
            mlp_gain = 1.0 / math.sqrt(2.0 * dim)
            for block in transformer:
                nn.init.normal_(block.attn.to_qkv.weight, std=attn_gain)
                nn.init.normal_(block.attn.proj.weight, std=proj_gain)
                nn.init.normal_(block.mlp[0].weight, std=mlp_gain)
                nn.init.normal_(block.mlp[2].weight, std=proj_gain)

    def param_groups(self):
        groups = [
            {
                "params": [
                    p
                    for n, p in self.named_parameters()
                    if "norm" in n or n.endswith("bias")
                ],
                "weight_decay": 0.0,
            },
            {
                "params": [
                    p
                    for n, p in self.named_parameters()
                    if not ("norm" in n or n.endswith("bias"))
                ]
            },
        ]
        return groups


class XLMRobertaWithHead(XLMRoberta):
    def __init__(self, **kwargs):
        self.out_dim = kwargs.pop("out_dim")
        super().__init__(**kwargs)

        # head
        mid_dim = (self.dim + self.out_dim) // 2
        self.head = nn.Sequential(
            nn.Linear(self.dim, mid_dim, bias=False),
            nn.GELU(),
            nn.Linear(mid_dim, self.out_dim, bias=False),
        )

    def forward(self, ids):
        # xlm-roberta
        x = super().forward(ids)

        # average pooling
        mask = ids.ne(self.pad_id).unsqueeze(-1).to(x)
        x = (x * mask).sum(dim=1) / mask.sum(dim=1)

        # head
        x = self.head(x)
        return x
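

# Illustrative sketch (not part of the original file): the masked average
# pooling used in XLMRobertaWithHead.forward, shown on plain tensors. Padding
# positions (pad_id=1) contribute nothing to the mean.
def _example_masked_mean_pooling():
    pad_id = 1
    ids = torch.tensor([[0, 5, 6, 2, 1, 1]])   # 4 real tokens, 2 pads
    x = torch.randn(1, 6, 8)                   # [B, L, C] token features
    mask = ids.ne(pad_id).unsqueeze(-1).to(x)  # [B, L, 1] in x's dtype
    pooled = (x * mask).sum(dim=1) / mask.sum(dim=1)
    assert torch.allclose(pooled, x[:, :4].mean(dim=1))
    return pooled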


class XLMRobertaCLIP(nn.Module):
    def __init__(
        self,
        embed_dim=1024,
        image_size=224,
        patch_size=14,
        vision_dim=1280,
        vision_mlp_ratio=4,
        vision_heads=16,
        vision_layers=32,
        vision_pool="token",
        vision_pre_norm=True,
        vision_post_norm=False,
        activation="gelu",
        vocab_size=250002,
        max_text_len=514,
        type_size=1,
        pad_id=1,
        text_dim=1024,
        text_heads=16,
        text_layers=24,
        text_post_norm=True,
        text_dropout=0.1,
        attn_dropout=0.0,
        proj_dropout=0.0,
        embedding_dropout=0.0,
        norm_eps=1e-5,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.image_size = image_size
        self.patch_size = patch_size
        self.vision_dim = vision_dim
        self.vision_mlp_ratio = vision_mlp_ratio
        self.vision_heads = vision_heads
        self.vision_layers = vision_layers
        self.vision_pre_norm = vision_pre_norm
        self.vision_post_norm = vision_post_norm
        self.activation = activation
        self.vocab_size = vocab_size
        self.max_text_len = max_text_len
        self.type_size = type_size
        self.pad_id = pad_id
        self.text_dim = text_dim
        self.text_heads = text_heads
        self.text_layers = text_layers
        self.text_post_norm = text_post_norm
        self.norm_eps = norm_eps

        # models
        self.visual = VisionTransformer(
            image_size=image_size,
            patch_size=patch_size,
            dim=vision_dim,
            mlp_ratio=vision_mlp_ratio,
            out_dim=embed_dim,
            num_heads=vision_heads,
            num_layers=vision_layers,
            pool_type=vision_pool,
            pre_norm=vision_pre_norm,
            post_norm=vision_post_norm,
            activation=activation,
            attn_dropout=attn_dropout,
            proj_dropout=proj_dropout,
            embedding_dropout=embedding_dropout,
            norm_eps=norm_eps,
        )
        # the XLM-Roberta text tower is dropped in this re-implementation;
        # only the vision tower is used (see WanImageEncoder below)
        self.textual = None
        self.log_scale = nn.Parameter(math.log(1 / 0.07) * torch.ones([]))

    def forward(self, imgs, txt_ids):
        """
        imgs: [B, 3, H, W] of torch.float32.
        - mean: [0.48145466, 0.4578275, 0.40821073]
        - std: [0.26862954, 0.26130258, 0.27577711]
        txt_ids: [B, L] of torch.long. Encoded by data.CLIPTokenizer.

        Note: `self.textual` is None in this re-implementation, so this joint
        forward is non-functional; callers use `self.visual` directly.
        """
        xi = self.visual(imgs)
        xt = self.textual(txt_ids)
        return xi, xt

    def param_groups(self):
        groups = [
            {
                "params": [
                    p
                    for n, p in self.named_parameters()
                    if "norm" in n or n.endswith("bias")
                ],
                "weight_decay": 0.0,
            },
            {
                "params": [
                    p
                    for n, p in self.named_parameters()
                    if not ("norm" in n or n.endswith("bias"))
                ]
            },
        ]
        return groups
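

# Illustrative sketch (not part of the original file): `param_groups` puts
# norm-layer and bias parameters into a group with weight_decay=0.0 so only
# the remaining weights receive decay. A hypothetical optimizer setup with a
# deliberately tiny configuration:
def _example_param_groups_optimizer():
    model = XLMRobertaCLIP(
        embed_dim=64, image_size=32, patch_size=16,
        vision_dim=64, vision_heads=4, vision_layers=2,
    )
    # the per-group "weight_decay": 0.0 entry overrides the optimizer default
    optimizer = torch.optim.AdamW(model.param_groups(), lr=1e-4, weight_decay=0.1)
    return optimizer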


def _clip(
    pretrained=False,
    pretrained_name=None,
    model_cls=CLIP,
    return_transforms=False,
    return_tokenizer=False,
    tokenizer_padding="eos",
    dtype=torch.float32,
    device="cpu",
    **kwargs,
):
    # init model
    if pretrained and pretrained_name:
        # BUCKET, DOWNLOAD_TO_CACHE and object_exists come from the original
        # training codebase; this branch is unused when pretrained=False
        from sora import BUCKET, DOWNLOAD_TO_CACHE

        # init a meta model
        with torch.device("meta"):
            model = model_cls(**kwargs)

        # checkpoint path
        checkpoint = f"models/clip/{pretrained_name}"
        if dtype in (torch.float16, torch.bfloat16):
            suffix = "-" + {torch.float16: "fp16", torch.bfloat16: "bf16"}[dtype]
            if object_exists(BUCKET, f"{checkpoint}{suffix}.pth"):
                checkpoint = f"{checkpoint}{suffix}"
        checkpoint += ".pth"

        # load
        model.load_state_dict(
            torch.load(DOWNLOAD_TO_CACHE(checkpoint), map_location=device),
            assign=True,
            strict=False,
        )
    else:
        # init a model on device
        with torch.device(device):
            model = model_cls(**kwargs)
    output = (model,)

    # init transforms
    if return_transforms:
        # mean and std
        if "siglip" in pretrained_name.lower():
            mean, std = [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]
        else:
            mean = [0.48145466, 0.4578275, 0.40821073]
            std = [0.26862954, 0.26130258, 0.27577711]

        # transforms
        transforms = T.Compose(
            [
                T.Resize(
                    (model.image_size, model.image_size),
                    interpolation=T.InterpolationMode.BICUBIC,
                ),
                T.ToTensor(),
                T.Normalize(mean=mean, std=std),
            ]
        )
        output += (transforms,)

    # init tokenizer
    if return_tokenizer:
        from sora import data

        if "siglip" in pretrained_name.lower():
            tokenizer = data.HuggingfaceTokenizer(
                name=f"timm/{pretrained_name}",
                seq_len=model.text_len,
                clean="canonicalize",
            )
        elif "xlm" in pretrained_name.lower():
            tokenizer = data.HuggingfaceTokenizer(
                name="xlm-roberta-large",
                seq_len=model.max_text_len - 2,
                clean="whitespace",
            )
        elif "mba" in pretrained_name.lower():
            tokenizer = data.HuggingfaceTokenizer(
                name="facebook/xlm-roberta-xl",
                seq_len=model.max_text_len - 2,
                clean="whitespace",
            )
        else:
            tokenizer = data.CLIPTokenizer(
                seq_len=model.text_len, padding=tokenizer_padding
            )
        output += (tokenizer,)
    return output[0] if len(output) == 1 else output


def clip_xlm_roberta_vit_h_14(
    pretrained=False,
    pretrained_name="open-clip-xlm-roberta-large-vit-huge-14",
    **kwargs,
):
    cfg = dict(
        embed_dim=1024,
        image_size=224,
        patch_size=14,
        vision_dim=1280,
        vision_mlp_ratio=4,
        vision_heads=16,
        vision_layers=32,
        vision_pool="token",
        activation="gelu",
        vocab_size=250002,
        max_text_len=514,
        type_size=1,
        pad_id=1,
        text_dim=1024,
        text_heads=16,
        text_layers=24,
        text_post_norm=True,
        text_dropout=0.1,
        attn_dropout=0.0,
        proj_dropout=0.0,
        embedding_dropout=0.0,
    )
    cfg.update(**kwargs)
    return _clip(pretrained, pretrained_name, XLMRobertaCLIP, **cfg)
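

# Illustrative sketch (not part of the original file): building the randomly
# initialized ViT-H/14 tower together with its preprocessing transforms, the
# same call WanImageEncoder makes below. Note this constructs the full-size
# model; weights are expected to be loaded separately via load_state_dict.
def _example_build_and_preprocess():
    from PIL import Image

    model, transforms = clip_xlm_roberta_vit_h_14(
        pretrained=False, return_transforms=True, device="cpu"
    )
    image = Image.new("RGB", (512, 320))  # placeholder image
    x = transforms(image).unsqueeze(0)    # [1, 3, 224, 224], CLIP mean/std normalized
    return model, x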


class WanImageEncoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # init model
        self.model, self.transforms = clip_xlm_roberta_vit_h_14(
            pretrained=False,
            return_transforms=True,
            return_tokenizer=False,
            dtype=torch.float32,
            device="cpu",
        )

    def encode_image(self, videos):
        # preprocess
        size = (self.model.image_size,) * 2
        videos = torch.cat(
            [
                F.interpolate(u, size=size, mode="bicubic", align_corners=False)
                for u in videos
            ]
        )
        # map [-1, 1] inputs to [0, 1], then apply the CLIP normalization
        videos = self.transforms.transforms[-1](videos.mul_(0.5).add_(0.5))

        # forward
        dtype = next(iter(self.model.visual.parameters())).dtype
        videos = videos.to(dtype)
        out = self.model.visual(videos, use_31_block=True)
        return out

    @staticmethod
    def state_dict_converter():
        return WanImageEncoderStateDictConverter()


class WanImageEncoderStateDictConverter:
    def __init__(self):
        pass

    def from_diffusers(self, state_dict):
        return state_dict

    def from_civitai(self, state_dict):
        state_dict_ = {}
        for name, param in state_dict.items():
            if name.startswith("textual."):
                continue
            name = "model." + name
            state_dict_[name] = param
        return state_dict_
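

# Illustrative sketch (not part of the original file): typical use of
# WanImageEncoder. The checkpoint path is hypothetical; `from_civitai` drops
# the unused "textual.*" weights and prefixes the remaining keys with
# "model." so they match the wrapped XLMRobertaCLIP.
def _example_wan_image_encoder():
    encoder = WanImageEncoder().eval()
    # state_dict = torch.load("path/to/clip_vision_checkpoint.pth", map_location="cpu")  # hypothetical path
    # encoder.load_state_dict(WanImageEncoder.state_dict_converter().from_civitai(state_dict))
    image = torch.rand(1, 3, 480, 832) * 2 - 1    # [N, 3, H, W] in [-1, 1]
    with torch.no_grad():
        features = encoder.encode_image([image])  # [N, 257, 1280] penultimate tokens
    return features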