from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel
from diffusers import LMSDiscreteScheduler
import torch
from tqdm.auto import tqdm
from PIL import Image
import gradio as gr
from IPython.display import display

# Load the CLIP tokenizer and text encoder used to condition the UNet on the prompt.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.float16)
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.float16).to("cuda")

# Here we use a different VAE from the original release, one that has been fine-tuned for more steps.
vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-ema", torch_dtype=torch.float16).to("cuda")

# The UNet performs the actual denoising in latent space.
unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet", torch_dtype=torch.float16).to("cuda")

# Sampling hyperparameters.
beta_start, beta_end = 0.00085, 0.012
height = 512
width = 512
num_inference_steps = 70
guidance_scale = 7.5
batch_size = 1

# The LMS scheduler controls how much noise is removed at each timestep.
scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear", num_train_timesteps=1000)

# prompt = ["a photograph of an astronaut riding a horse"]


def text_enc(prompts, maxlen=None):
    """Tokenize the prompts and return the CLIP text embeddings."""
    if maxlen is None:
        maxlen = tokenizer.model_max_length
    inp = tokenizer(prompts, padding="max_length", max_length=maxlen, truncation=True, return_tensors="pt")
    return text_encoder(inp.input_ids.to("cuda"))[0].half()


def do_both(prompts):
    def mk_img(t):
        # Rescale the decoded latents from [-1, 1] to [0, 255] and convert to a PIL image.
        image = (t / 2 + 0.5).clamp(0, 1).detach().cpu().permute(1, 2, 0).numpy()
        return Image.fromarray((image * 255).round().astype("uint8"))

    def mk_samples(prompts, g=7.5, seed=100, steps=70):
        bs = len(prompts)
        # Conditional and unconditional (empty-prompt) embeddings for classifier-free guidance.
        text = text_enc(prompts)
        uncond = text_enc([""] * bs, text.shape[1])
        emb = torch.cat([uncond, text])
        if seed:
            torch.manual_seed(seed)

        # Start from random latents; the latent space is 8x smaller than the image in each dimension.
        latents = torch.randn((bs, unet.config.in_channels, height // 8, width // 8))
        scheduler.set_timesteps(steps)
        latents = latents.to("cuda").half() * scheduler.init_noise_sigma

        for ts in tqdm(scheduler.timesteps):
            # Run the UNet on a doubled batch (unconditional + conditional) in a single pass.
            inp = scheduler.scale_model_input(torch.cat([latents] * 2), ts)
            with torch.no_grad():
                u, t = unet(inp, ts, encoder_hidden_states=emb).sample.chunk(2)
            # Classifier-free guidance: move the prediction away from the unconditional output.
            pred = u + g * (t - u)
            latents = scheduler.step(pred, ts, latents).prev_sample

        # Decode the final latents back to pixel space (0.18215 is Stable Diffusion's latent scaling factor).
        with torch.no_grad():
            return vae.decode(1 / 0.18215 * latents).sample

    images = mk_samples([prompts])
    return mk_img(images[0])


# do_both(prompt)
# images = mk_samples(prompt)
# iface = gr.Interface(fn=do_both, inputs=gr.inputs.Textbox(lines=2, label="Enter text prompt"), outputs=gr.outputs.Image(type="numpy", label="Generated Image")).launch()
gr.Interface(do_both, gr.Text(), gr.Image(), title="Stable Diffusion model from scratch").launch(share=True, debug=True)
# for img in images: display(mk_img(img))