podcast-jobs / update_rss.py
fdaudens's picture
fdaudens HF Staff
paper url in show notes
05a5c90
import xml.etree.ElementTree as ET
from datetime import datetime
import os
from huggingface_hub import InferenceClient
import re
# Module-level Hugging Face inference client, created once at import time and
# used by generate_headline_and_description() for its chat completion call.
# The access token is read from the HF_TOKEN environment variable; os.getenv
# yields None when the variable is not set.
client = InferenceClient(
    "meta-llama/Llama-3.1-8B-Instruct",
    provider="hf-inference",
    token=os.getenv("HF_TOKEN"),
)
# Leading "Headline"/"Description" label, optionally wrapped in markdown
# emphasis. The word only counts as a label when it is followed by a colon or
# closed by an emphasis marker, so ordinary text that merely starts with the
# same letters ("Headlines from ...", "Description logics ...") is untouched.
_LABEL_PREFIX_RE = re.compile(
    r"^\s*\*{0,2}(?:Headline|Description)(?:\s*:\*{0,2}|\*{1,2}\s*:?)\s*",
    flags=re.IGNORECASE,
)


def clean_label(line):
    """Strip a leading ``Headline``/``Description`` label from *line*.

    Recognised forms (case-insensitive) include ``Headline: x``,
    ``**Headline:** x``, ``**Headline**: x``, ``*Description:* x`` and
    ``**Description** x``. Only the label and the whitespace right after it
    are removed; the rest of the line is returned unchanged.

    Args:
        line: A single line of model output.

    Returns:
        The line without the label prefix, or the line as-is when it does not
        start with a recognisable label.
    """
    return _LABEL_PREFIX_RE.sub("", line, count=1)
def generate_headline_and_description(subject: str, steering_question: str | None = None) -> tuple[str, str]:
    """Ask the LLM for a headline and a short description for the podcast episode.

    Args:
        subject: Paper text or topic; only the first 10,000 characters are
            placed in the prompt.
        steering_question: Accepted for interface compatibility; this function
            does not currently use it when building the prompt.

    Returns:
        A ``(headline, description)`` tuple. When the model returns at least
        two non-empty lines, the first is the headline and the rest are joined
        with spaces into the description. When it returns a single line, the
        headline is that line truncated to 80 characters and the description
        is the whole line. Both are empty strings if the model returns nothing.
    """
    prompt = f"""You are a world-class podcast producer. Given the following paper or topic, generate:
1. A catchy, informative headline for a podcast episode about it (max 15 words).
2. A short, engaging description (2-3 sentences, max 60 words) that summarizes what listeners will learn or why the topic is exciting.
Output ONLY the headline on the first line, and the description on the second line. Do NOT include any labels, markdown, or extra formatting.
Here is the topic:
{subject[:10000]}
"""
    messages = [
        {"role": "system", "content": "You are a world-class podcast producer."},
        {"role": "user", "content": prompt},
    ]
    response = client.chat_completion(
        messages,
        max_tokens=512,
    )
    # The message content is optional in the API response type; treat a
    # missing body as empty text instead of failing on None.strip().
    full_text = (response.choices[0].message.content or "").strip()

    # Strip any "Headline:"/"Description:" labels the model added anyway, then
    # drop lines that were nothing but a label so they cannot become an empty
    # headline (e.g. "**Headline:**" on a line of its own).
    cleaned = (clean_label(raw.strip()) for raw in full_text.splitlines() if raw.strip())
    lines = [text for text in cleaned if text]

    if len(lines) >= 2:
        return lines[0], " ".join(lines[1:])

    # Single-line (or empty) reply: reuse the cleaned text for both fields.
    only_line = lines[0] if lines else ""
    return only_line[:80], only_line
def indent(elem, level=0):
    """Pretty-print an ElementTree subtree in place by rewriting whitespace.

    Each nesting level is indented by one space. Existing ``text``/``tail``
    values are only overwritten when they are missing or whitespace-only, so
    real character data is preserved.

    Args:
        elem: The element whose subtree should be indented.
        level: Nesting depth of *elem*; callers normally leave this at 0.
    """
    pad = "\n" + level * " "
    if len(elem):
        # Whitespace before the first child: one level deeper than elem.
        if not elem.text or not elem.text.strip():
            elem.text = pad + " "
        for child in elem:
            indent(child, level + 1)
        # The recursion leaves every child's tail at the children's depth,
        # which is right between siblings but wrong after the last one: that
        # tail precedes elem's closing tag and must sit at elem's own depth.
        last_child = elem[-1]
        if not last_child.tail or not last_child.tail.strip():
            last_child.tail = pad
        if not elem.tail or not elem.tail.strip():
            elem.tail = pad
    elif level and (not elem.tail or not elem.tail.strip()):
        # Leaf elements below the root only need their trailing whitespace.
        elem.tail = pad
# -----------------------------------------------------------------------------
# UPDATE RSS
# -----------------------------------------------------------------------------
def get_next_episode_number(podcast_dir="podcasts"):
    """Return the next episode number for *podcast_dir*.

    The number is one more than the count of directory entries whose name
    ends with ``.wav``.
    """
    wav_count = sum(1 for entry in os.listdir(podcast_dir) if entry.endswith(".wav"))
    return wav_count + 1
def update_rss(subject, audio_url, audio_length, paper_id=None, rss_path="rss.xml"):
    """Prepend a new episode ``<item>`` to the RSS feed at *rss_path*.

    The episode title and description are generated from *subject* by
    ``generate_headline_and_description``. The feed file is rewritten in place.

    Args:
        subject: Paper text or topic used to generate title and description.
        audio_url: URL of the episode audio; also used as the item's ``guid``.
        audio_length: Value for the enclosure's ``length`` attribute
            (converted with ``str``).
        paper_id: Optional Hugging Face paper id; when given, a link to
            ``https://huggingface.co/papers/<paper_id>`` is appended to the
            description.
        rss_path: Path of the RSS XML file to update.

    Raises:
        ValueError: If the feed has no ``<channel>`` element under its root.
    """
    # Function-scope imports: the file only imports `datetime` at module level.
    from datetime import timezone
    from email.utils import format_datetime

    itunes_ns = "http://www.itunes.com/dtds/podcast-1.0.dtd"
    # Make the serializer emit the conventional "itunes:" prefix (instead of
    # an auto-generated ns0) for elements in the iTunes namespace.
    ET.register_namespace("itunes", itunes_ns)

    # Parse and validate the feed before the LLM call so a broken or missing
    # file fails fast.
    tree = ET.parse(rss_path)
    root = tree.getroot()
    channel = root.find("channel")
    if channel is None:
        raise ValueError(f"No <channel> element found in RSS file: {rss_path}")

    # Generate headline and description automatically
    title, description = generate_headline_and_description(subject)
    if paper_id:
        paper_url = f"https://huggingface.co/papers/{paper_id}"
        description += f'\n\n<a href="{paper_url}">[Read the paper on Hugging Face]</a>'

    # RFC 2822 timestamp in UTC ("..., +0000"). format_datetime always uses
    # English day/month names, unlike strftime("%a ... %b") which follows the
    # process locale.
    now_rfc2822 = format_datetime(datetime.now(timezone.utc))

    # Update lastBuildDate
    last_build_date = channel.find("lastBuildDate")
    if last_build_date is not None:
        last_build_date.text = now_rfc2822

    # Create new item
    item = ET.Element("item")
    ET.SubElement(item, "title").text = title
    ET.SubElement(item, "description").text = description
    ET.SubElement(item, "pubDate").text = now_rfc2822
    ET.SubElement(item, "enclosure", url=audio_url, length=str(audio_length), type="audio/wav")
    ET.SubElement(item, "guid").text = audio_url
    # Use the namespace-qualified tag so the writer declares xmlns:itunes
    # itself; a literal "itunes:explicit" tag is written verbatim and leaves
    # the prefix undeclared unless some other element uses the namespace.
    ET.SubElement(item, f"{{{itunes_ns}}}explicit").text = "false"

    # Newest episode first: insert before the first existing <item>, or
    # append when the channel has no items yet.
    first_item = channel.find("item")
    if first_item is not None:
        channel.insert(list(channel).index(first_item), item)
    else:
        channel.append(item)

    # Write back to file with pretty formatting
    indent(root)
    tree.write(rss_path, encoding="utf-8", xml_declaration=True)