File size: 4,115 Bytes
c301481
 
 
72ae2e5
8279ac9
7ca09b0
 
8279ac9
7ca09b0
 
 
c301481
8279ac9
 
 
 
72ae2e5
 
 
 
 
 
8279ac9
 
72ae2e5
 
 
 
 
 
 
 
 
 
 
 
 
8279ac9
72ae2e5
 
 
 
 
 
 
 
8279ac9
 
 
 
 
 
 
 
 
 
 
 
 
72ae2e5
 
 
c301481
 
 
 
8dbb6cc
c301481
 
8dbb6cc
 
05a5c90
8dbb6cc
c301481
 
 
 
 
 
 
 
 
 
 
 
 
 
 
02e83fb
c301481
 
 
 
 
 
 
 
 
 
 
8279ac9
 
 
c301481
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import xml.etree.ElementTree as ET
from datetime import datetime
import os
from huggingface_hub import InferenceClient
import re

# Module-level Hugging Face inference client, created once at import time.
# The API token is read from the HF_TOKEN environment variable
# (os.getenv returns None when the variable is unset).
client = InferenceClient(
    "meta-llama/Llama-3.1-8B-Instruct",
    provider="hf-inference",
    token=os.getenv("HF_TOKEN"),
)

def clean_label(line):
    """Strip a leading "Headline"/"Description" label from *line*.

    The label is matched case-insensitively at the start of the string and
    may be wrapped in markdown emphasis asterisks and followed by a colon.
    Text without such a leading label is returned unchanged.
    """
    label_prefix = re.compile(
        r"^\s*(\*\*?)?(Headline|Description)\:?\*?\*?\s*",
        re.IGNORECASE,
    )
    return label_prefix.sub("", line)

def generate_headline_and_description(subject: str, steering_question: str | None = None) -> tuple[str, str]:
    """Ask the LLM for a headline and a short description for the podcast episode.

    Args:
        subject: Paper text or topic. Only the first 10,000 characters are
            included in the prompt.
        steering_question: Accepted for interface compatibility; it is not
            currently included in the prompt.
            NOTE(review): confirm whether it should steer the generation.

    Returns:
        A ``(headline, description)`` tuple. When the model returns at least
        two non-empty lines, the first is the headline and the rest are joined
        with spaces into the description. When it returns a single line, that
        line is the description and its first 80 characters are the headline.
        Both are empty strings when the model returns no text.
    """
    prompt = f"""You are a world-class podcast producer. Given the following paper or topic, generate:
1. A catchy, informative headline for a podcast episode about it (max 15 words).
2. A short, engaging description (2-3 sentences, max 60 words) that summarizes what listeners will learn or why the topic is exciting.

Output ONLY the headline on the first line, and the description on the second line. Do NOT include any labels, markdown, or extra formatting.

Here is the topic:
{subject[:10000]}
"""
    messages = [
        {"role": "system", "content": "You are a world-class podcast producer."},
        {"role": "user", "content": prompt},
    ]
    response = client.chat_completion(
        messages,
        max_tokens=512,
    )
    # Treat a missing (None) content field as empty text instead of crashing
    # on ``None.strip()``.
    full_text = (response.choices[0].message.content or "").strip()

    # Strip labels first, THEN drop empty lines: a label-only line such as
    # "Headline:" cleans down to "" and must not be taken as the headline.
    cleaned = (clean_label(raw.strip()) for raw in full_text.splitlines())
    lines = [text for text in cleaned if text]

    if len(lines) >= 2:
        return lines[0], " ".join(lines[1:])

    # Single-line (or empty) response: reuse the cleaned line so a stray
    # "Headline:" label does not leak into the title.
    only_line = lines[0] if lines else ""
    return only_line[:80], only_line

def indent(elem, level=0):
    """Pretty-print *elem* in place by rewriting whitespace-only text/tails.

    Each nesting level is indented by two spaces. Text and tails that contain
    non-whitespace characters are left untouched.

    Args:
        elem: The ``xml.etree.ElementTree.Element`` to indent (modified in place).
        level: Nesting depth of *elem*; 0 for the root.
    """
    pad = "\n" + level * "  "
    if len(elem):
        # Whitespace before the first child: one level deeper than elem.
        if not elem.text or not elem.text.strip():
            elem.text = pad + "  "
        if not elem.tail or not elem.tail.strip():
            elem.tail = pad
        last_child = None
        for child in elem:
            indent(child, level + 1)
            last_child = child
        # The last child's tail precedes elem's closing tag, so it must use
        # elem's own indentation rather than the children's; otherwise the
        # closing tag is printed one level too deep.
        if not last_child.tail or not last_child.tail.strip():
            last_child.tail = pad
    elif level and (not elem.tail or not elem.tail.strip()):
        # Childless non-root element: only the tail needs setting.
        elem.tail = pad

# -----------------------------------------------------------------------------
# UPDATE RSS
# -----------------------------------------------------------------------------
def get_next_episode_number(podcast_dir="podcasts"):
    """Return one more than the number of ``.wav`` entries in *podcast_dir*.

    Only names ending in ``.wav`` (case-sensitive) are counted.
    """
    wav_count = sum(1 for name in os.listdir(podcast_dir) if name.endswith(".wav"))
    return wav_count + 1

def update_rss(subject, audio_url, audio_length, paper_id=None, rss_path="rss.xml"):
    """Insert a new episode ``<item>`` at the top of the RSS feed at *rss_path*.

    The item's title and description are produced by
    ``generate_headline_and_description(subject)``. The feed file is rewritten
    in place with indentation applied.

    Args:
        subject: Paper text or topic passed to the headline/description generator.
        audio_url: URL of the episode audio; used for the enclosure and the guid.
        audio_length: Value written (as a string) to the enclosure ``length``.
        paper_id: Optional Hugging Face paper id; when truthy, a link to
            ``https://huggingface.co/papers/<paper_id>`` is appended to the
            description.
        rss_path: Path of the RSS XML file to update.

    Raises:
        ValueError: If the feed has no ``<channel>`` element.
    """
    # Imported locally because the file-level imports do not provide them.
    from datetime import timezone
    from email.utils import format_datetime

    itunes_ns = "http://www.itunes.com/dtds/podcast-1.0.dtd"
    # Make the serializer emit the conventional "itunes" prefix for this URI.
    ET.register_namespace("itunes", itunes_ns)

    # Parse and validate the feed first so a broken feed fails before the
    # (slow, remote) LLM call is made.
    tree = ET.parse(rss_path)
    root = tree.getroot()
    channel = root.find("channel")
    if channel is None:
        raise ValueError(f"No <channel> element found in {rss_path!r}")

    # Generate headline and description automatically
    title, description = generate_headline_and_description(subject)
    if paper_id:
        paper_url = f"https://huggingface.co/papers/{paper_id}"
        description += f'\n\n<a href="{paper_url}">[Read the paper on Hugging Face]</a>'

    # RFC 2822 date in UTC. format_datetime always uses English day/month
    # names, unlike strftime("%a ... %b"), which follows the process locale.
    now_rfc2822 = format_datetime(datetime.now(timezone.utc))

    # Update lastBuildDate (left absent if the feed does not have one)
    last_build_date = channel.find("lastBuildDate")
    if last_build_date is not None:
        last_build_date.text = now_rfc2822

    # Create new item
    item = ET.Element("item")
    ET.SubElement(item, "title").text = title
    ET.SubElement(item, "description").text = description
    ET.SubElement(item, "pubDate").text = now_rfc2822
    ET.SubElement(item, "enclosure", url=audio_url, length=str(audio_length), type="audio/wav")
    ET.SubElement(item, "guid").text = audio_url
    # Use the namespace-qualified tag so ElementTree declares xmlns:itunes on
    # output; a literal "itunes:explicit" tag can leave the prefix unbound.
    ET.SubElement(item, f"{{{itunes_ns}}}explicit").text = "false"

    # Insert before the first existing <item>, or append if there are none.
    first_item = channel.find("item")
    if first_item is not None:
        channel.insert(list(channel).index(first_item), item)
    else:
        channel.append(item)

    # Write back to file with pretty formatting
    indent(root)
    tree.write(rss_path, encoding="utf-8", xml_declaration=True)