Spaces:

declare-lab
/

JAM

Running on Zero

JAM

File size: 10,681 Bytes


def regroup_words(
    words: list[dict],
    max_len: float = 15.0,
    gap: float = 0.50,
) -> list[dict]:
    """
    Merge consecutive timed words into segments.

    A word opens a new segment when appending it would make the current
    segment (segment start to that word's end) longer than ``max_len``, or
    when it starts more than ``gap`` after the previous word ended. The first
    word of a segment is always accepted, so a single word longer than
    ``max_len`` still forms a segment of its own.

    Args:
        words: Word dictionaries read through the keys 'word', 'start', 'end'.
        max_len: Largest allowed span of one segment, in timestamp units.
        gap: Largest pause allowed between two words of the same segment.

    Returns:
        List of segment dictionaries with keys 'start', 'end' and 'segment'
        (the words' text joined by single spaces). Empty input gives [].
    """
    if not words:
        return []

    def _close(first, last, members):
        # Build the record for one finished run of words.
        return {
            "start": first,
            "end": last,
            "segment": " ".join(m["word"] for m in members),
        }

    segments = []
    current = []
    current_start = words[0]["start"]
    previous_end = current_start

    for item in words:
        item_end = item["end"]
        item_start = item["start"]
        too_long = item_end - current_start > max_len
        paused = item_start - previous_end > gap

        # Only split when there is something to close; this keeps the very
        # first word (and any over-long single word) inside a segment.
        if current and (too_long or paused):
            segments.append(_close(current_start, previous_end, current))
            current = []
            current_start = item_start

        current.append(item)
        previous_end = item_end

    # Whatever is still open after the loop is the last segment.
    segments.append(_close(current_start, previous_end, current))
    return segments


def text_to_words(text: str) -> list[dict]:
    """
    Convert text format like "word[start:end] word[start:end]..." to word list.

    Tokens whose timestamps are not numeric are skipped. The literal
    placeholder 'xxx' is accepted for either timestamp and read as 0.0.

    Args:
        text: String in format "It's[4.96:5.52] a[5.52:5.84] long[5.84:6.16]..."

    Returns:
        List of word dictionaries with keys: 'word', 'start', 'end'
    """
    import re

    if not text.strip():
        return []

    # Pattern to match word[start:end] format. Brackets are excluded from both
    # timestamp groups so that a malformed token (e.g. "oops[1.0]", missing its
    # end time) cannot stretch across the next "[" and swallow the well-formed
    # token that follows it.
    pattern = r'(\S+?)\[([^:\[\]]+):([^\[\]]+)\]'

    def _parse_stamp(raw: str) -> float:
        # 'xxx' marks an unknown timestamp; everything else must be a float.
        return 0.0 if raw == 'xxx' else float(raw)

    words = []
    for word, start_str, end_str in re.findall(pattern, text):
        try:
            start = _parse_stamp(start_str)
            end = _parse_stamp(end_str)
        except ValueError:
            # Skip invalid entries
            continue
        words.append({
            'word': word,
            'start': start,
            'end': end,
        })

    return words


def words_to_text(words: list[dict]) -> str:
    """
    Render a word list as "word[start:end] word[start:end]...".

    Missing keys fall back to an empty word and 0.0 timestamps.

    Args:
        words: List of word dictionaries with keys: 'word', 'start', 'end'

    Returns:
        Space-separated tokens such as "It's[4.96:5.52] a[5.52:5.84]";
        an empty string when there are no words.
    """
    def _compact(value) -> str:
        # Two decimals at most, then trim trailing zeros and a dangling dot
        # ("5.50" -> "5.5", "10.00" -> "10").
        return f"{value:.2f}".rstrip('0').rstrip('.')

    if not words:
        return ""

    tokens = []
    for entry in words:
        label = entry.get('word', '')
        begin = _compact(entry.get('start', 0.0))
        finish = _compact(entry.get('end', 0.0))
        tokens.append(f"{label}[{begin}:{finish}]")

    return " ".join(tokens)


def json_to_text(json_data: dict) -> str:
    """
    Convert JSON lyrics data to text format for display.
    Only uses the 'word' layer from the JSON structure.
    Groups words into sentences/lines for better readability.

    Grouping is delegated to regroup_words (max_len=5, gap=0.50, in the
    timestamps' own unit). Every input word appears exactly once in the
    output, in its original order.

    Args:
        json_data: Dictionary with 'word' key containing list of word objects

    Returns:
        String with words grouped into lines separated by a blank line:
        "word[start:end] word[start:end]...\\n\\nword[start:end]..."
        Empty string if json_data is not a dict, has no 'word' key, or the
        word list is empty.
    """
    if not isinstance(json_data, dict) or 'word' not in json_data:
        return ""

    words = json_data['word']
    if not words:
        return ""

    # regroup_words reports each segment only as start/end plus the
    # space-joined 'word' values. Matching words back to segments by time
    # range loses zero-duration words at a segment's tail and duplicates words
    # whose timestamps overlap a boundary. Instead, group stand-ins whose
    # 'word' is the list index, so the 'segment' string tells us exactly which
    # words belong to each segment.
    indexed = [
        {'word': str(position), 'start': word['start'], 'end': word['end']}
        for position, word in enumerate(words)
    ]
    segments = regroup_words(indexed, max_len=5, gap=0.50)

    # Convert each segment to text format
    segment_lines = []
    for seg in segments:
        seg_words = [words[int(position)] for position in seg['segment'].split()]
        if seg_words:
            segment_lines.append(words_to_text(seg_words))

    return '\n\n'.join(segment_lines)


def round_to_quarter_beats(beat_position: float) -> float:
    """Snap a beat position to the nearest multiple of 0.25 (sample display).

    Ties are resolved by Python's built-in round().
    """
    quarter_count = round(beat_position * 4)
    return quarter_count / 4


def beats_to_seconds(beat_position: float, bpm: float) -> float:
    """Return the time in seconds at which ``beat_position`` falls at ``bpm``."""
    scaled = beat_position * 60.0
    return scaled / bpm


def seconds_to_beats(time_seconds: float, bpm: float) -> float:
    """Return the beat position reached after ``time_seconds`` at ``bpm``."""
    scaled = time_seconds * bpm
    return scaled / 60.0


def convert_text_time_to_beats(text: str, bpm: float, round_to_quarters: bool = False) -> str:
    """
    Rewrite "word[start_sec:end_sec]" tokens with beat positions.

    Args:
        text: String in format "word[start_sec:end_sec] ..."
        bpm: Beats per minute for conversion
        round_to_quarters: If True, round beats to quarter notes (for sample display)

    Returns:
        Single-line string in format "word[start_beat:end_beat] ...";
        empty string for blank input.
    """
    if not text.strip():
        return ""

    def _trim(value) -> str:
        # Two decimals at most, without trailing zeros or a dangling dot.
        return f"{value:.2f}".rstrip('0').rstrip('.')

    pieces = []
    for entry in text_to_words(text):
        first_beat = seconds_to_beats(entry['start'], bpm)
        last_beat = seconds_to_beats(entry['end'], bpm)

        if round_to_quarters:
            # Quarter-note grid for sample display.
            first_beat = round_to_quarter_beats(first_beat)
            last_beat = round_to_quarter_beats(last_beat)

        pieces.append(f"{entry['word']}[{_trim(first_beat)}:{_trim(last_beat)}]")

    return " ".join(pieces)


def beats_to_text_with_regrouping(text: str, bpm: float, round_to_quarters: bool = False) -> str:
    """
    Convert time-based text to beats format with regrouping (like time mode).

    Words are converted to beat positions, grouped by regroup_words
    (at most 20 beats per group, split on gaps longer than 2 beats) and
    written one group per paragraph. Every parsed word appears exactly once,
    in its original order, even when rounding collapses it to zero length.

    Args:
        text: String in format "word[start_sec:end_sec] ..."
        bpm: Beats per minute for conversion
        round_to_quarters: If True, round beats to quarter notes (for sample display)

    Returns:
        String with beats format grouped into lines separated by a blank line;
        empty string for blank input or when no token can be parsed.
    """
    if not text.strip():
        return ""

    # First convert to beats format
    beat_words = []
    for word in text_to_words(text):
        start_beat = seconds_to_beats(word['start'], bpm)
        end_beat = seconds_to_beats(word['end'], bpm)

        # Round to quarter notes for sample display
        if round_to_quarters:
            start_beat = round_to_quarter_beats(start_beat)
            end_beat = round_to_quarter_beats(end_beat)

        beat_words.append({
            'word': word['word'],
            'start': start_beat,
            'end': end_beat,
        })

    if not beat_words:
        return ""

    # regroup_words reports each segment only as start/end plus the
    # space-joined 'word' values. Matching words back to segments by beat
    # range drops words whose start equals the segment end (common once
    # quarter rounding makes start == end) and duplicates words that overlap a
    # boundary. Instead, group stand-ins whose 'word' is the list index, so the
    # 'segment' string identifies each segment's members exactly.
    indexed = [
        {'word': str(position), 'start': word['start'], 'end': word['end']}
        for position, word in enumerate(beat_words)
    ]
    segments = regroup_words(indexed, max_len=20, gap=2.0)  # 20 beats max, 2 beat gap

    # Convert each segment to text format
    segment_lines = []
    for seg in segments:
        seg_words = [beat_words[int(position)] for position in seg['segment'].split()]
        if seg_words:
            # words_to_text formats these as word[beat:beat]
            segment_lines.append(words_to_text(seg_words))

    return '\n\n'.join(segment_lines)


def convert_text_beats_to_time(text: str, bpm: float) -> str:
    """
    Rewrite "word[start_beat:end_beat]" tokens with times in seconds.

    Args:
        text: String in format "word[start_beat:end_beat] ..."
        bpm: Beats per minute for conversion

    Returns:
        Single-line string in format "word[start_sec:end_sec] ...";
        empty string for blank input.
    """
    if not text.strip():
        return ""

    def _trim(value) -> str:
        # Two decimals at most, without trailing zeros or a dangling dot.
        return f"{value:.2f}".rstrip('0').rstrip('.')

    pieces = []
    # The beats format shares its token syntax with the time format, so the
    # same parser is used.
    for entry in text_to_words(text):
        begin_sec = beats_to_seconds(entry['start'], bpm)
        finish_sec = beats_to_seconds(entry['end'], bpm)
        pieces.append(f"{entry['word']}[{_trim(begin_sec)}:{_trim(finish_sec)}]")

    return " ".join(pieces)


def convert_text_beats_to_time_with_regrouping(text: str, bpm: float) -> str:
    """
    Convert beats-based text to time-based text, one input line at a time.

    Blank lines are kept as empty lines. A non-blank line that yields no
    parseable token contributes nothing to the output.

    Args:
        text: String in format "word[start_beat:end_beat] ..." (can be multi-line)
        bpm: Beats per minute for conversion

    Returns:
        String in format "word[start_sec:end_sec] ..." with lines joined by
        a newline; empty string for blank input.
    """
    if not text.strip():
        return ""

    def _trim(value) -> str:
        # Two decimals at most, without trailing zeros or a dangling dot.
        return f"{value:.2f}".rstrip('0').rstrip('.')

    out_lines = []
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()

        if not stripped:
            # Keep the blank line so the segmentation survives the round trip.
            out_lines.append("")
            continue

        rendered = []
        for entry in text_to_words(stripped):
            begin_sec = beats_to_seconds(entry['start'], bpm)
            finish_sec = beats_to_seconds(entry['end'], bpm)
            rendered.append(f"{entry['word']}[{_trim(begin_sec)}:{_trim(finish_sec)}]")

        if rendered:
            out_lines.append(" ".join(rendered))

    return "\n".join(out_lines)


def text_to_json(text: str) -> dict:
    """
    Build the JSON structure holding the 'word' layer from text format.

    Multi-line input is flattened first: every line is stripped, blank lines
    are discarded, and the rest are joined with single spaces before parsing.

    Args:
        text: String in format "word[start:end] word[start:end]..." (can be multi-line)

    Returns:
        Dictionary with 'word' key containing list of word objects
    """
    stripped_lines = (raw_line.strip() for raw_line in text.split('\n'))
    flattened = ' '.join(piece for piece in stripped_lines if piece)
    return {"word": text_to_words(flattened)}