inoculatemedia committed on
Commit
cff3d7d
·
verified ·
1 Parent(s): 1797626

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +334 -0
app.py ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ from audioldm import text_to_audio, build_model
4
+
5
+ # from share_btn import community_icon_html, loading_icon_html, share_js
6
+
7
# Identifier of the reference AudioLDM checkpoint (presumably a Hugging Face
# Hub repo id). NOTE(review): nothing in the visible file reads this name.
model_id = "haoheliu/AudioLDM-S-Full"

# Process-wide cache used by text2audio(): the model object built most
# recently and the model name it was built for. Both start out empty so the
# first request triggers a build.
audioldm, current_model_name = None, None
11
+ # audioldm=None
12
+
13
+ # def predict(input, history=[]):
14
+ # # tokenize the new input sentence
15
+ # new_user_input_ids = tokenizer.encode(input + tokenizer.eos_token, return_tensors='pt')
16
+
17
+ # # append the new user input tokens to the chat history
18
+ # bot_input_ids = torch.cat([torch.LongTensor(history), new_user_input_ids], dim=-1)
19
+
20
+ # # generate a response
21
+ # history = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id).tolist()
22
+
23
+ # # convert the tokens to text, and then split the responses into lines
24
+ # response = tokenizer.decode(history[0]).split("<|endoftext|>")
25
+ # response = [(response[i], response[i+1]) for i in range(0, len(response)-1, 2)] # convert to tuples of list
26
+ # return response, history
27
+
28
def text2audio(text, duration, guidance_scale, random_seed, n_candidates, model_name):
    """Generate audio for a text prompt and render it as waveform video(s).

    The AudioLDM model is cached in the module-level ``audioldm`` global and
    is (re)built with ``build_model`` only when nothing is cached yet or when
    ``model_name`` differs from the name the cached model was built for.

    Args:
        text: Prompt forwarded to ``text_to_audio`` as ``text``.
        duration: Forwarded to ``text_to_audio`` as ``duration``.
        guidance_scale: Forwarded to ``text_to_audio`` as ``guidance_scale``.
        random_seed: Forwarded to ``text_to_audio`` as ``seed``.
        n_candidates: Cast to ``int`` and forwarded as
            ``n_candidate_gen_per_text``.
        model_name: Name passed to ``build_model`` when a (re)build is needed.

    Returns:
        The single ``gr.make_waveform`` result when exactly one waveform was
        generated, otherwise the list of all results.
    """
    global audioldm, current_model_name

    # Rebuild only when there is no cached model or a different one is wanted.
    needs_build = audioldm is None or model_name != current_model_name
    if needs_build:
        audioldm = build_model(model_name=model_name)
        current_model_name = model_name

    # The original author annotated this result as [bs, 1, samples].
    generated = text_to_audio(
        latent_diffusion=audioldm,
        text=text,
        seed=random_seed,
        duration=duration,
        guidance_scale=guidance_scale,
        n_candidate_gen_per_text=int(n_candidates),
    )

    # Each entry's first channel is paired with 16000 in the sample-rate slot
    # of Gradio's (rate, data) audio tuple and drawn over "bg.png".
    rendered = []
    for wave in generated:
        rendered.append(gr.make_waveform((16000, wave[0]), bg_image="bg.png"))

    # Unwrap a lone result so a single output component receives one value.
    return rendered[0] if len(rendered) == 1 else rendered
51
+
52
+
53
+ # iface = gr.Interface(fn=text2audio, inputs=[
54
+ # gr.Textbox(value="A man is speaking in a huge room", max_lines=1),
55
+ # gr.Slider(2.5, 10, value=5, step=2.5),
56
+ # gr.Slider(0, 5, value=2.5, step=0.5),
57
+ # gr.Number(value=42)
58
+ # ], outputs=[gr.Audio(label="Output", type="numpy"), gr.Audio(label="Output", type="numpy")],
59
+ # allow_flagging="never"
60
+ # )
61
+ # iface.launch(share=True)
62
+
63
+
64
# Custom stylesheet for the demo page; it is handed to gr.Blocks(css=...).
#
# Fix: --tw-ring-shadow previously used `calc(3px var(--tw-ring-offset-width))`.
# calc() requires an explicit operator between its operands, so that value was
# invalid and the focus-ring box-shadow that references it could not resolve.
# It now reads `calc(3px + var(--tw-ring-offset-width))`.
css = """
    a {
        color: inherit;
        text-decoration: underline;
    }
    .gradio-container {
        font-family: 'IBM Plex Sans', sans-serif;
    }
    .gr-button {
        color: white;
        border-color: #000000;
        background: #000000;
    }
    input[type='range'] {
        accent-color: #000000;
    }
    .dark input[type='range'] {
        accent-color: #dfdfdf;
    }
    .container {
        max-width: 730px;
        margin: auto;
        padding-top: 1.5rem;
    }
    #gallery {
        min-height: 22rem;
        margin-bottom: 15px;
        margin-left: auto;
        margin-right: auto;
        border-bottom-right-radius: .5rem !important;
        border-bottom-left-radius: .5rem !important;
    }
    #gallery>div>.h-full {
        min-height: 20rem;
    }
    .details:hover {
        text-decoration: underline;
    }
    .gr-button {
        white-space: nowrap;
    }
    .gr-button:focus {
        border-color: rgb(147 197 253 / var(--tw-border-opacity));
        outline: none;
        box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
        --tw-border-opacity: 1;
        --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);
        --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px + var(--tw-ring-offset-width)) var(--tw-ring-color);
        --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
        --tw-ring-opacity: .5;
    }
    #advanced-btn {
        font-size: .7rem !important;
        line-height: 19px;
        margin-top: 12px;
        margin-bottom: 12px;
        padding: 2px 8px;
        border-radius: 14px !important;
    }
    #advanced-options {
        margin-bottom: 20px;
    }
    .footer {
        margin-bottom: 45px;
        margin-top: 35px;
        text-align: center;
        border-bottom: 1px solid #e5e5e5;
    }
    .footer>p {
        font-size: .8rem;
        display: inline-block;
        padding: 0 10px;
        transform: translateY(10px);
        background: white;
    }
    .dark .footer {
        border-color: #303030;
    }
    .dark .footer>p {
        background: #0b0f19;
    }
    .acknowledgments h4{
        margin: 1.25em 0 .25em 0;
        font-weight: bold;
        font-size: 115%;
    }
    #container-advanced-btns{
        display: flex;
        flex-wrap: wrap;
        justify-content: space-between;
        align-items: center;
    }
    .animate-spin {
        animation: spin 1s linear infinite;
    }
    @keyframes spin {
        from {
            transform: rotate(0deg);
        }
        to {
            transform: rotate(360deg);
        }
    }
    #share-btn-container {
        display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color: #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
        margin-top: 10px;
        margin-left: auto;
    }
    #share-btn {
        all: initial; color: #ffffff;font-weight: 600; cursor:pointer; font-family: 'IBM Plex Sans', sans-serif; margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem !important;right:0;
    }
    #share-btn * {
        all: unset;
    }
    #share-btn-container div:nth-child(-n+2){
        width: auto !important;
        min-height: 0px !important;
    }
    #share-btn-container .wrap {
        display: none !important;
    }
    .gr-form{
        flex: 1 1 50%; border-top-right-radius: 0; border-bottom-right-radius: 0;
    }
    #prompt-container{
        gap: 0;
    }
    #generated_id{
        min-height: 700px
    }
    #setting_id{
        margin-bottom: 12px;
        text-align: center;
        font-weight: 900;
    }
"""
200
# Build the demo page: a title header, the prompt box with its advanced
# settings, the video output, the submit button wired to text2audio(), a
# footer, and a collapsible acknowledgments section.
#
# Changes in this revision are limited to user-facing text: spelling and
# grammar of the control labels ("relevancy", "recommended", "controls",
# "leads", "Changing this value") and the missing space before the footer
# link. Component order, ids, values and wiring are untouched.
#
# NOTE(review): the reviewed copy of this file had its indentation stripped,
# so the container nesting below is a reconstruction -- confirm it against the
# rendered layout before merging.
iface = gr.Blocks(css=css)

with iface:
    # Title plus links to the paper and the project page.
    gr.HTML(
        """
        <div style="text-align: center; max-width: 700px; margin: 0 auto;">
          <div
            style="
              display: inline-flex;
              align-items: center;
              gap: 0.8rem;
              font-size: 1.75rem;
            "
          >
            <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
              AudioLDM: Text-to-Audio Generation with Latent Diffusion Models
            </h1>
          </div>
          <p style="margin-bottom: 10px; font-size: 94%">
            <a href="https://arxiv.org/abs/2301.12503">[Paper]</a> <a href="https://audioldm.github.io/">[Project page]</a>
          </p>
        </div>
        """
    )
    with gr.Group():
        with gr.Box():
            ############# Input
            textbox = gr.Textbox(
                value="A hammer is hitting a wooden surface",
                max_lines=1,
                label="Input your text here. Please ensure it is descriptive and of moderate length.",
                elem_id="prompt-in",
            )

            with gr.Accordion("Click to modify detailed configurations", open=False):
                seed = gr.Number(
                    value=42,
                    label="Changing this value (any integer) will lead to a different generation result.",
                )
                duration = gr.Slider(
                    2.5, 10, value=5, step=2.5, label="Duration (seconds)"
                )
                guidance_scale = gr.Slider(
                    0,
                    5,
                    value=2.5,
                    step=0.5,
                    label="Guidance scale (Large => better quality and relevancy to text; Small => better diversity)",
                )
                n_candidates = gr.Slider(
                    1,
                    5,
                    value=3,
                    step=1,
                    label="Automatic quality control. This number controls the number of candidates (e.g., generate three audios and choose the best to show you). A larger value usually leads to better quality with heavier computation",
                )
                model_name = gr.Dropdown(
                    [
                        "audioldm-s-full",
                        "audioldm-l-full",
                        "audioldm-s-full-v2",
                        "audioldm-m-text-ft",
                        "audioldm-s-text-ft",
                        "audioldm-m-full",
                    ],
                    value="audioldm-m-full",
                    label="Choose the model to use. audioldm-m-text-ft and audioldm-s-text-ft are recommended. -s- means small, -m- means medium and -l- means large",
                )
            ############# Output
            # outputs=gr.Audio(label="Output", type="numpy")
            outputs = gr.Video(label="Output", elem_id="output-video")

            # with gr.Group(elem_id="container-advanced-btns"):
            #   # advanced_button = gr.Button("Advanced options", elem_id="advanced-btn")
            #   with gr.Group(elem_id="share-btn-container"):
            #     community_icon = gr.HTML(community_icon_html, visible=False)
            #     loading_icon = gr.HTML(loading_icon_html, visible=False)
            #     share_button = gr.Button("Share to community", elem_id="share-btn", visible=False)
            # outputs=[gr.Audio(label="Output", type="numpy"), gr.Audio(label="Output", type="numpy")]
            btn = gr.Button("Submit").style(full_width=True)

        # with gr.Group(elem_id="share-btn-container", visible=False):
        #     community_icon = gr.HTML(community_icon_html)
        #     loading_icon = gr.HTML(loading_icon_html)
        #     share_button = gr.Button("Share to community", elem_id="share-btn")

        # The input order must match text2audio's positional parameters:
        # (text, duration, guidance_scale, random_seed, n_candidates, model_name).
        btn.click(
            text2audio,
            inputs=[textbox, duration, guidance_scale, seed, n_candidates, model_name],
            outputs=[outputs],
        )

        # share_button.click(None, [], [], _js=share_js)
        gr.HTML(
            """
            <div class="footer" style="text-align: center; max-width: 700px; margin: 0 auto;">
              <p>Follow the latest update of AudioLDM on our <a href="https://github.com/haoheliu/AudioLDM" style="text-decoration: underline;" target="_blank">Github repo</a>
              </p>
              <br>
              <p>Model by <a href="https://twitter.com/LiuHaohe" style="text-decoration: underline;" target="_blank">Haohe Liu</a></p>
              <br>
            </div>
            """
        )
        # gr.Examples(
        #     [
        #         ["A hammer is hitting a wooden surface", 5, 2.5, 45, 3, "audioldm-s-full"],
        #         [
        #             "Peaceful and calming ambient music with singing bowl and other instruments.",
        #             5,
        #             2.5,
        #             45,
        #             3,
        #             "audioldm-s-full"
        #         ],
        #         ["A man is speaking in a small room.", 5, 2.5, 45, 3, "audioldm-s-full"],
        #         ["A female is speaking followed by footstep sound", 5, 2.5, 45, 3, "audioldm-s-full"],
        #         [
        #             "Wooden table tapping sound followed by water pouring sound.",
        #             5,
        #             2.5,
        #             45,
        #             3,
        #             "audioldm-s-full"
        #         ],
        #     ],
        #     fn=text2audio,
        #     inputs=[textbox, duration, guidance_scale, seed, n_candidates, model_name],
        #     outputs=[outputs],
        #     cache_examples=True,
        # )
        with gr.Accordion("Additional information", open=False):
            gr.HTML(
                """
                <div class="acknowledgments">
                  <p> We build the model with data from <a href="http://research.google.com/audioset/">AudioSet</a>, <a href="https://freesound.org/">Freesound</a> and <a href="https://sound-effects.bbcrewind.co.uk/">BBC Sound Effect library</a>. We share this demo based on the <a href="https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/375954/Research.pdf">UK copyright exception</a> of data for academic research. </p>
                </div>
                """
            )
330
+ # <p>This demo is strictly for research demo purpose only. For commercial use please <a href="haoheliu@gmail.com">contact us</a>.</p>
331
+
332
# Enable Gradio's request queue (concurrency_count=3) and then start the
# server in debug mode without requesting a public share link.
# NOTE(review): relies on Blocks.queue() returning the Blocks instance, as in
# the documented `demo.queue().launch()` idiom -- confirm for the pinned
# Gradio version.
iface.queue(concurrency_count=3).launch(debug=True, share=False)