renhang committed on
Commit
a1ddd2f
·
1 Parent(s): deaa9a6
Files changed (2) hide show
  1. app.py +8 -3
  2. utils.py +31 -5
app.py CHANGED
@@ -95,6 +95,11 @@ def load_example(example_idx, examples):
95
  # Load examples at startup
96
  examples = load_examples()
97
 
 
 
 
 
 
98
  # Gradio interface
99
  with gr.Blocks() as demo:
100
  gr.Markdown("# Jamify: Music Generation from Lyrics and Style")
@@ -116,12 +121,12 @@ with gr.Blocks() as demo:
116
  label="Lyrics",
117
  lines=10,
118
  placeholder="Enter lyrics in format: word[start:end] word[start:end]...\nExample: It's[4.96:5.52] a[5.52:5.84] long[5.84:6.16] way[6.16:6.48]...",
119
- value=""
120
  )
121
- duration_slider = gr.Slider(minimum=5, maximum=230, value=120, step=30, label="Duration (seconds)")
122
 
123
  with gr.Tab("Style from Audio"):
124
- reference_audio = gr.File(label="Reference Audio (.mp3, .wav)", type="filepath")
125
  with gr.Tab("Style from Text"):
126
  style_prompt = gr.Textbox(label="Style Prompt", lines=3, placeholder="e.g., A high-energy electronic dance track with a strong bassline and euphoric synths.")
127
 
 
95
  # Load examples at startup
96
  examples = load_examples()
97
 
98
+ # Get default values from first example
99
+ default_audio = examples[0]['audio_path'] if examples else None
100
+ default_lyrics = examples[0]['lyrics_text'] if examples else ""
101
+ default_duration = examples[0]['duration'] if examples else 120
102
+
103
  # Gradio interface
104
  with gr.Blocks() as demo:
105
  gr.Markdown("# Jamify: Music Generation from Lyrics and Style")
 
121
  label="Lyrics",
122
  lines=10,
123
  placeholder="Enter lyrics in format: word[start:end] word[start:end]...\nExample: It's[4.96:5.52] a[5.52:5.84] long[5.84:6.16] way[6.16:6.48]...",
124
+ value=default_lyrics
125
  )
126
+ duration_slider = gr.Slider(minimum=5, maximum=230, value=default_duration, step=30, label="Duration (seconds)")
127
 
128
  with gr.Tab("Style from Audio"):
129
+ reference_audio = gr.File(label="Reference Audio (.mp3, .wav)", type="filepath", value=default_audio)
130
  with gr.Tab("Style from Text"):
131
  style_prompt = gr.Textbox(label="Style Prompt", lines=3, placeholder="e.g., A high-energy electronic dance track with a strong bassline and euphoric synths.")
132
 
utils.py CHANGED
@@ -95,7 +95,10 @@ def words_to_text(words: list[dict]) -> str:
95
  word_text = word.get('word', '')
96
  start = word.get('start', 0.0)
97
  end = word.get('end', 0.0)
98
- text_parts.append(f"{word_text}[{start}:{end}]")
 
 
 
99
 
100
  return " ".join(text_parts)
101
 
@@ -104,30 +107,53 @@ def json_to_text(json_data: dict) -> str:
104
  """
105
  Convert JSON lyrics data to text format for display.
106
  Only uses the 'word' layer from the JSON structure.
 
107
 
108
  Args:
109
  json_data: Dictionary with 'word' key containing list of word objects
110
 
111
  Returns:
112
- String in format "word[start:end] word[start:end]..."
113
  """
114
  if not isinstance(json_data, dict) or 'word' not in json_data:
115
  return ""
116
 
117
  words = json_data['word']
118
- return words_to_text(words)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
 
121
  def text_to_json(text: str) -> dict:
122
  """
123
  Convert text format to JSON structure expected by the model.
124
  Creates the 'word' layer that the model needs.
 
125
 
126
  Args:
127
- text: String in format "word[start:end] word[start:end]..."
128
 
129
  Returns:
130
  Dictionary with 'word' key containing list of word objects
131
  """
132
- words = text_to_words(text)
 
 
133
  return {"word": words}
 
95
  word_text = word.get('word', '')
96
  start = word.get('start', 0.0)
97
  end = word.get('end', 0.0)
98
+ # Format timestamps to max 2 decimal places
99
+ start_str = f"{start:.2f}".rstrip('0').rstrip('.')
100
+ end_str = f"{end:.2f}".rstrip('0').rstrip('.')
101
+ text_parts.append(f"{word_text}[{start_str}:{end_str}]")
102
 
103
  return " ".join(text_parts)
104
 
 
107
  """
108
  Convert JSON lyrics data to text format for display.
109
  Only uses the 'word' layer from the JSON structure.
110
+ Groups words into sentences/lines for better readability.
111
 
112
  Args:
113
  json_data: Dictionary with 'word' key containing list of word objects
114
 
115
  Returns:
116
+ String with words grouped into lines: "word[start:end] word[start:end]...\nword[start:end]..."
117
  """
118
  if not isinstance(json_data, dict) or 'word' not in json_data:
119
  return ""
120
 
121
  words = json_data['word']
122
+
123
+ # Group words into segments using the existing regroup_words function
124
+ segments = regroup_words(words, max_len=15.0, gap=0.50)
125
+
126
+ # Convert each segment to text format
127
+ segment_lines = []
128
+ for seg in segments:
129
+ # Extract words for this segment based on time range
130
+ seg_words = []
131
+ for word in words:
132
+ if seg['start'] <= word['start'] < seg['end'] or (
133
+ word['start'] <= seg['start'] < word['end']
134
+ ):
135
+ seg_words.append(word)
136
+
137
+ if seg_words:
138
+ segment_text = words_to_text(seg_words)
139
+ segment_lines.append(segment_text)
140
+
141
+ return '\n'.join(segment_lines)
142
 
143
 
144
  def text_to_json(text: str) -> dict:
145
  """
146
  Convert text format to JSON structure expected by the model.
147
  Creates the 'word' layer that the model needs.
148
+ Handles multi-line input by joining lines.
149
 
150
  Args:
151
+ text: String in format "word[start:end] word[start:end]..." (can be multi-line)
152
 
153
  Returns:
154
  Dictionary with 'word' key containing list of word objects
155
  """
156
+ # Join multiple lines into single line for parsing
157
+ single_line_text = ' '.join(line.strip() for line in text.split('\n') if line.strip())
158
+ words = text_to_words(single_line_text)
159
  return {"word": words}