markhristov committed
Commit ffd1a8e · 1 Parent(s): b32485e

Add application file

Files changed (2)
  1. app.py +62 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,62 @@
+ from transformers import CLIPTextModel, CLIPTokenizer
+ from diffusers import AutoencoderKL, UNet2DConditionModel, LMSDiscreteScheduler
+ import torch
+ from tqdm.auto import tqdm
+ from PIL import Image
+ import gradio as gr
+
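+ # Load the Stable Diffusion v1 pieces individually rather than through a
+ # pipeline: a CLIP tokenizer/text encoder for prompts, a VAE to decode
+ # latents to pixels, and the U-Net that predicts noise at each step.
+ # The models are loaded in fp16 and moved to CUDA.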
+ tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+ text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.float16).to("cuda")
+
+ # Here we use a different VAE from the original release, one that has been fine-tuned for more steps
+ vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-ema", torch_dtype=torch.float16).to("cuda")
+ unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet", torch_dtype=torch.float16).to("cuda")
+
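+ # Sampling settings; the scaled-linear beta schedule below matches the one
+ # Stable Diffusion v1 was trained with.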
+ beta_start, beta_end = 0.00085, 0.012
+ height = 512
+ width = 512
+ num_inference_steps = 70
+ guidance_scale = 7.5
+ batch_size = 1
+ scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear", num_train_timesteps=1000)
+
+ # prompt = ["a photograph of an astronaut riding a horse"]
+
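+ # Turn a batch of prompts into CLIP text embeddings, padded/truncated to a
+ # fixed length so conditional and unconditional embeddings can be stacked.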
+ def text_enc(prompts, maxlen=None):
+     if maxlen is None: maxlen = tokenizer.model_max_length
+     inp = tokenizer(prompts, padding="max_length", max_length=maxlen, truncation=True, return_tensors="pt")
+     return text_encoder(inp.input_ids.to("cuda"))[0].half()
+
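+ # End-to-end generation: denoise random latents under classifier-free
+ # guidance, decode them with the VAE, and convert the result to a PIL image.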
+ def do_both(prompts):
+     def mk_img(t):
+         # Map the decoded image from [-1, 1] to [0, 1], then to an 8-bit PIL image
+         image = (t/2+0.5).clamp(0,1).detach().cpu().permute(1, 2, 0).numpy()
+         return Image.fromarray((image*255).round().astype("uint8"))
+
+     def mk_samples(prompts, g=7.5, seed=100, steps=70):
+         bs = len(prompts)
+         text = text_enc(prompts)
+         uncond = text_enc([""] * bs, text.shape[1])
+         emb = torch.cat([uncond, text])
+         if seed: torch.manual_seed(seed)
+
+         latents = torch.randn((bs, unet.config.in_channels, height//8, width//8))
+         scheduler.set_timesteps(steps)
+         latents = latents.to("cuda").half() * scheduler.init_noise_sigma
+
+         for i, ts in enumerate(tqdm(scheduler.timesteps)):
+             # Run the U-Net once on a doubled batch to get unconditional and conditional noise predictions
+             inp = scheduler.scale_model_input(torch.cat([latents] * 2), ts)
+             with torch.no_grad(): u, t = unet(inp, ts, encoder_hidden_states=emb).sample.chunk(2)
+             # Classifier-free guidance: push the prediction away from the unconditional one
+             pred = u + g*(t-u)
+             latents = scheduler.step(pred, ts, latents).prev_sample
+
+         # Undo the VAE's latent scaling factor (0.18215) before decoding to pixel space
+         with torch.no_grad(): return vae.decode(1 / 0.18215 * latents).sample
+
+     images = mk_samples([prompts])
+     return mk_img(images[0])
+
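+ # Expose the generator as a Gradio app: a text prompt in, one image out.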
+ # do_both(prompt)
+ # images = mk_samples(prompt)
+ # iface = gr.Interface(fn=do_both, inputs=gr.inputs.Textbox(lines=2, label="Enter text prompt"), outputs=gr.outputs.Image(type="numpy", label="Generated Image")).launch()
+ gr.Interface(do_both, gr.Text(), gr.Image(), title='Stable Diffusion model from scratch').launch(share=True, debug=True)
+ # for img in images: display(mk_img(img))
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ diffusers
+ transformers
+ torch