# Spaces: Running on Zero
import torch
import numpy as np
from encodec import EncodecModel
from encodec.utils import convert_audio
from core.memory import model_manager, ModelEnum, env
from core.bark.custom_context import inference_mode
def encodec_decode_fine_tokens_to_audio(fine_tokens: torch.Tensor) -> np.ndarray:
    """
    Decode fine tokens into an audio waveform using EnCodec's decoder.

    Expects ``fine_tokens`` of shape [codebook_size, timestep], concretely
    [8, 75 * duration_in_sec] for the 24kHz model. A pre-batched input of
    shape [B, codebook_size, timestep] is also accepted.

    Returns:
        np.ndarray of shape (B, C, T), C = 1 for mono audio.
    """
    model_info = ModelEnum.ENCODEC24k.value
    model_wrapper = model_manager.get_model(model_info)
    model: EncodecModel = model_wrapper.model
    device = next(model.parameters()).device

    # EnCodec's quantizer.decode expects codes shaped [n_q, B, T]. For the
    # documented unbatched [n_q, T] input we must insert a batch dimension
    # first: transposing a 2-D tensor directly would yield [T, n_q], which
    # does not match the quantizer's layout.
    tokens = fine_tokens if fine_tokens.dim() == 3 else fine_tokens.unsqueeze(0)
    input_tensor = tokens.transpose(0, 1).to(device)

    # Run without autograd bookkeeping, mirroring encodec_encode_audio.
    with inference_mode():
        emb = model.quantizer.decode(input_tensor)
        output: torch.Tensor = model.decoder(emb)

    audio_arr = output.detach().cpu().numpy()
    # Drop tensor references promptly so device memory can be reclaimed.
    del input_tensor, emb, output
    return audio_arr
def encodec_encode_audio(
    audio_sample: torch.Tensor, audio_sample_rate: int
) -> torch.Tensor:
    """
    Compress an audio sample into EnCodec discrete codes.

    ``audio_sample`` is expected as (channels, samples). Returns a codes
    tensor of shape [n_q, T], where n_q typically is 8 and T is the
    compressed time step dimension (75 per second for the 24kHz model).
    """
    wrapper = model_manager.get_model(ModelEnum.ENCODEC24k.value)
    codec: EncodecModel = wrapper.model
    target_device = next(codec.parameters()).device

    # Remix/resample to the codec's native channel count and sample rate,
    # then add a batch dimension and move onto the model's device.
    prepared = convert_audio(
        audio_sample, audio_sample_rate, codec.sample_rate, codec.channels
    )
    prepared = prepared.unsqueeze(0).float().to(target_device)

    # Extract discrete codes from EnCodec
    with inference_mode():
        encoded_frames = codec.encode(prepared)
        codes = torch.cat([frame[0] for frame in encoded_frames], dim=-1)  # [B, n_q, T]
    return codes[0, :, :]