Spaces:
Sleeping
Sleeping
File size: 8,344 Bytes
7453da0 cbbd024 c657020 9738ed3 5e703c7 88cbac0 b16b354 5e703c7 396214f 80aa4e5 508045d 9e731de 9a8e899 9e731de 72aa658 508045d 6122bcd 027ffa8 455006b 027ffa8 dd150ca 027ffa8 26ae92c 027ffa8 958f37c f680781 26ae92c e377751 6122bcd 508045d fc0768e 508045d 6122bcd f7b9ef5 508045d 80aa4e5 aef38d7 682c6da 418f0de f680781 88cbac0 f680781 88cbac0 f680781 906ee65 418f0de b686b04 418f0de dd150ca b686b04 dd150ca b686b04 dd150ca 1f91b20 c540930 1f91b20 2a421af 5813089 1f91b20 5813089 2a421af 5813089 c204afa 1f91b20 c540930 dd150ca 1f91b20 2a421af 9962de6 2a421af 418f0de ace78d6 dd150ca 418f0de a56420e 396214f dd150ca 2a421af a56420e dd150ca b686b04 2a421af a56420e 2a421af a56420e 508045d f680781 9a8e899 b9bed89 b89ea66 b9bed89 3127104 b9bed89 b89ea66 fc13525 d36a529 b9bed89 e377751 b9bed89 e377751 b9bed89 72aa658 b9bed89 776a974 e377751 776a974 418f0de f680781 776a974 f680781 776a974 e377751 8b03fdc 447fd1f e377751 776a974 e377751 5bddbaf f680781 fc13525 5bddbaf e377751 f680781 b686b04 958f37c 2a421af a56420e b16b354 b9bed89 f680781 b9bed89 5bddbaf 447fd1f 2a421af a56420e dd150ca b9bed89 dd150ca |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 |
import gradio as gr
import spaces
import json
import re
import random
import numpy as np
from gradio_client import Client, handle_file
# Largest value representable by a signed 32-bit integer.
# NOTE(review): presumably an upper bound for random seeds (going by the name),
# but it is not referenced anywhere in the visible code — confirm it is needed.
MAX_SEED = np.iinfo(np.int32).max
import re
import torch
from transformers import pipeline
# Hub id of the chat model that backs both text-generation helpers in this file
# (get_outfit_prompt and get_parsed_outfit_items call `pipe`).
zephyr_model = "HuggingFaceH4/zephyr-7b-beta"
#mixtral_model = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# Built once at import time and shared by every request; weights are loaded in
# bfloat16 and device placement is delegated to `device_map="auto"`.
pipe = pipeline("text-generation", model=zephyr_model, torch_dtype=torch.bfloat16, device_map="auto")
# System prompt for the outfit-suggestion step (interpolated by get_outfit_prompt).
# This is a plain string on purpose: it has no placeholders, and as an f-string
# any literal `{` / `}` added to the prompt later would be parsed as a
# replacement field.
# Every few-shot output starts with "A person dressed in" so the examples agree
# with the exact output format the prompt demands.
standard_sys = """
You are an AI Art Director that specializes in translating music and audio descriptions into visually expressive fashion outfit ideas.
Your task:
- Given a description of a piece of music or sound, generate a **single outfit suggestion** that captures the mood, tempo, and emotional tone of the audio.
- Be specific. Mention the type of clothing, colors, materials, accessories, and any stylistic flourishes.
- The response must be friendly but concise (max 1-2 sentences), directly delivering the outfit description.
- **Only return the outfit in the following exact format**, within double quotes:
"A person dressed in [...]."
Do not include any explanations or extra commentary.
Examples:
Input:
"This song features a female vocalist singing a beautiful and emotional melody. The melody is accompanied by the sound of a piano playing a slow and melancholic tune. The song has a dreamy and ethereal feel to it. The lyrics of the song are about the beauty of love and the joy it brings to one's life."
Output:
"A person dressed in a flowy, pastel-colored dress paired with strappy sandals and a wide-brimmed hat, accessorized with delicate jewelry, such as dainty earrings and a necklace."
Input:
"A hard-hitting techno track with industrial beats, glitchy textures, and a driving, relentless rhythm."
Output:
"A person dressed in a black leather jacket over a mesh top, paired with chunky combat boots and silver accessories, with bold eyeliner completing the edgy, cyberpunk look."
Always output in this format and stop immediately.
"""
@spaces.GPU
def get_outfit_prompt(user_prompt):
    """Ask the chat model for a one-sentence outfit matching an audio description.

    Args:
        user_prompt: Text describing the audio (in this file, the SALMONN answer).

    Returns:
        The model's reply with leading newlines removed.
    """
    # Chat layout: system turn, user turn, then an *open* assistant turn.
    # Without the trailing assistant marker the reply could only be isolated
    # when the model happened to emit that marker on its own; otherwise the
    # whole system prompt leaked into the returned "outfit".
    prompt = (
        f"<|system|>\n{standard_sys}</s>\n"
        f"<|user|>\n{user_prompt}</s>\n"
        "<|assistant|>\n"
    )
    outputs = pipe(
        prompt,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        # Return only the newly generated text, not prompt + completion.
        return_full_text=False,
    )
    generated = outputs[0]["generated_text"]

    # Defensive clean-up, kept from the previous implementation: drop any
    # system...assistant span should one still show up in the output.
    pattern = r'\<\|system\|\>(.*?)\<\|assistant\|\>'
    cleaned_text = re.sub(pattern, '', generated, flags=re.DOTALL)

    print(f"SUGGESTED Musical prompt: {cleaned_text}")
    return cleaned_text.lstrip("\n")
def get_salmonn(audio_in, prompt, token):
    """Send an audio file and a text question to the SALMONN Space.

    Args:
        audio_in: Path of the audio file; wrapped with ``handle_file`` for upload.
        prompt: Text instruction passed as ``text_input``.
        token: Hugging Face token used to authenticate the client.

    Returns:
        Whatever the ``/gradio_answer_1`` endpoint returns (also printed).
    """
    salmonn_client = Client("fffiloni/SALMONN-7B-gradio", hf_token=token)
    request = {
        "speech": handle_file(audio_in),
        "text_input": prompt,
        "num_beams": 4,
        "temperature": 1,
        "top_p": 0.9,
    }
    answer = salmonn_client.predict(api_name="/gradio_answer_1", **request)
    print(answer)
    return answer
def sdxl_image(suggested_outfit_prompt, token):
    """Generate an image for a text prompt through the SDXL-Lightning Space.

    Args:
        suggested_outfit_prompt: Text prompt forwarded as ``prompt``.
        token: Hugging Face token used to authenticate the client.

    Returns:
        Whatever the ``/generate_image`` endpoint returns (also printed).
    """
    lightning_client = Client("ByteDance/SDXL-Lightning", hf_token=token)
    generated = lightning_client.predict(
        prompt=suggested_outfit_prompt,
        ckpt="4-Step",  # checkpoint choice exposed by the remote endpoint
        api_name="/generate_image",
    )
    print(generated)
    return generated
def extract_json(text):
    """Extract and parse the first JSON object found in ``text``.

    The whole string is tried first. If that fails, or yields something other
    than an object, every ``{`` is tried in order as the start of an object, so
    surrounding commentary, trailing braces or a second object do not prevent
    the first valid one from being found.

    Args:
        text: Text that may contain a JSON object (``str``; UTF-8 ``bytes`` are
            decoded). Any other type yields ``{}``.

    Returns:
        The first JSON object as a ``dict``, or ``{}`` when none can be parsed.
    """
    if isinstance(text, (bytes, bytearray)):
        text = text.decode("utf-8", errors="replace")
    if not isinstance(text, str):
        return {}

    # Fast path: the entire string is a JSON document.
    try:
        whole = json.loads(text)
    except json.JSONDecodeError:
        whole = None
    if isinstance(whole, dict):
        return whole

    # Fallback: raw_decode parses exactly one value starting at the given
    # index and ignores whatever follows it, unlike a greedy `{.*}` regex that
    # stretches to the last `}` in the text.
    decoder = json.JSONDecoder()
    last_error = None
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError as e:
            last_error = e
        else:
            if isinstance(candidate, dict):
                return candidate
        start = text.find("{", start + 1)

    if last_error is not None:
        print("⚠️ JSON decode failed after match:", last_error)
    return {}
@spaces.GPU
def get_parsed_outfit_items(outfit_sentence):
    """Split an outfit sentence into per-item image prompts using the chat model.

    Args:
        outfit_sentence: One-sentence outfit description.

    Returns:
        The dict produced by ``extract_json`` from the model reply (item label
        -> image prompt, as requested by the system prompt); ``{}`` when no
        JSON object could be parsed.
    """
    parser_sys = """
You are a fashion assistant AI that helps e-commerce designers turn full outfit descriptions into individual product image prompts.
Your task:
- Given an outfit description (1 sentence), break it into key labeled parts: dress, top, bottom, shoes, outerwear, jewelry, hat, accessories.
- Write one short, specific image-generation prompt per part.
- Focus on describing each item visually and clearly as it would appear in a product photo.
- Respond only in raw JSON like this:
{
"shoes": "High-quality product image of brown leather boots, white background",
"hat": "Studio photo of a navy beret on a stand, isolated on white"
}
Respond only with a valid JSON object. Ensure the JSON is properly formatted with correct commas between fields.
Do not forget commas between entries. Validate before finishing your response.
Do not include any explanations or markdown syntax. No commentary. No extra text.
Start directly with `{` and end with `}`.
"""
    # The prompt ends with an open assistant turn. Without it, the reply could
    # only be separated from the prompt if the model emitted the marker itself,
    # and the echoed system prompt (which contains its own `{...}` example)
    # would then corrupt the JSON extraction below.
    prompt = f"""<|system|>
{parser_sys}</s>
<|user|>
"{outfit_sentence}"</s>
<|assistant|>
"""
    # NOTE(review): `do_sample` is not set here, unlike get_outfit_prompt;
    # confirm whether temperature/top_k/top_p are meant to take effect.
    outputs = pipe(
        prompt,
        max_new_tokens=512,
        temperature=0.7,
        top_k=50,
        top_p=0.9,
        # Return only the newly generated text, not prompt + completion.
        return_full_text=False,
    )
    generated = outputs[0]["generated_text"]

    # Defensive clean-up, kept from the previous implementation.
    pattern = r'\<\|system\|\>(.*?)\<\|assistant\|\>'
    cleaned_text = re.sub(pattern, '', generated, flags=re.DOTALL)
    print(f"\n🧾 Raw LLM response:\n{cleaned_text}")

    item_dict = extract_json(cleaned_text)
    print(f"\n🧩 Parsed outfit parts:\n{json.dumps(item_dict, indent=2)}")
    return item_dict
def generate_sdxl_images_list_dynamic(item_prompts, hf_token):
    """Render one SDXL image per outfit item.

    Args:
        item_prompts: Mapping of item label -> image prompt.
        hf_token: Hugging Face token forwarded to ``sdxl_image``.

    Returns:
        A list of ``(image_result, label)`` tuples in the mapping's iteration
        order; empty when the mapping is empty.
    """
    def _render(label, text):
        # Announce progress before the remote call, one item at a time.
        print(f"Generating image for {label}...")
        return sdxl_image(text, hf_token), label

    return [_render(label, text) for label, text in item_prompts.items()]
def infer(audio_in, oauth_token: gr.OAuthToken):
    """Run the audio -> description -> outfit -> image chain, streaming results.

    Args:
        audio_in: Path of the input audio file.
        oauth_token: OAuth token object; its ``token`` attribute authenticates
            the remote Space calls.

    Yields:
        ``(outfit_sentence, audio_description, outfit_image)`` tuples, emitted
        after each stage; slots that are not computed yet are ``None``.
    """
    hf_token = oauth_token.token
    describe_request = "Please describe the audio in detail."

    # Stage 1: audio understanding.
    gr.Info("Calling SALMONN to understand audio...")
    audio_description = get_salmonn(audio_in, describe_request, hf_token)
    yield None, audio_description, None

    # Stage 2: outfit sentence from the description.
    gr.Info("Creating an outfit suggestion based on audio understanding...")
    outfit = get_outfit_prompt(audio_description)
    yield outfit, audio_description, None

    # Stage 3: image for the full outfit.
    gr.Info("Generate an image with SDXL Lightning...")
    rendered = sdxl_image(outfit, hf_token)

    # Disabled follow-up stage, kept for reference (per-item shopping gallery):
    #   gr.Info("Get outfit parts...")
    #   item_prompts = get_parsed_outfit_items(outfit_sentence)
    #   gr.Info("Generate shopping gallery...")
    #   images_with_labels = generate_sdxl_images_list_dynamic(item_prompts, gradio_auth_token)

    yield outfit, audio_description, rendered
# Heading and sub-heading text interpolated into the HTML header of the page.
demo_title = "Music to Outfit"
description = "Get an outfit idea from audio/music input"
# Custom stylesheet passed to gr.Blocks(css=...):
#   - #col-container : centres the page column and caps its width
#   - #inspi-prompt  : larger, bolder text for the outfit prompt textbox
# NOTE(review): `div#component-11` looks like an auto-generated component id and
# no examples gallery is visible in this file — confirm the selector still
# matches something.
css = """
#col-container {
margin: 0 auto;
max-width: 980px;
text-align: left;
}
#inspi-prompt textarea {
font-size: 20px;
line-height: 24px;
font-weight: 600;
}
/* fix examples gallery width on mobile */
div#component-11 > .gallery > .gallery-item > .container > img {
width: auto!important;
}
"""
# Page layout: a centred column holding the header and a two-column row
# (inputs on the left, generated prompt and image on the right).
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML(f"""
<h2 style="text-align: center;">{demo_title}</h2>
<p style="text-align: center;">{description}</p>
""")
        with gr.Row():
            with gr.Column():
                # infer() takes a gr.OAuthToken parameter, so the user has to
                # sign in before submitting.
                gr.LoginButton()
                audio_in = gr.Audio(
                    label="Audio reference",
                    type="filepath",
                    elem_id="audio-in",
                )
                submit_btn = gr.Button("Make an outfit from my sound !")
                salmonn_desc = gr.Textbox(label="SALMONN audio understanding")
            with gr.Column():
                caption = gr.Textbox(
                    label="Inspirational outfit prompt",
                    interactive=False,
                    elem_id="inspi-prompt",
                )
                result = gr.Image(
                    label="Outfit proposal",
                )
                #clothes_gallery = gr.Gallery()

    # Output order must match the tuples yielded by infer():
    # (outfit sentence, audio description, outfit image).
    submit_btn.click(
        fn=infer,
        inputs=[
            audio_in,
        ],
        outputs=[
            caption,
            salmonn_desc,
            result,
            #clothes_gallery
        ],
    )

demo.queue().launch(show_api=False, show_error=True, ssr_mode=False)