"""Generate podcast episode metadata with an LLM and add episodes to an RSS feed."""

import os
import re
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from email.utils import format_datetime

from huggingface_hub import InferenceClient

client = InferenceClient(
    "meta-llama/Llama-3.1-8B-Instruct",
    provider="hf-inference",
    token=os.getenv("HF_TOKEN"),
)

# Namespace URI behind the ``itunes:`` prefix used in the feed.
_ITUNES_NS = "http://www.itunes.com/dtds/podcast-1.0.dtd"

# Leading "Headline"/"Description" label, optionally wrapped in markdown
# asterisks and followed by a colon (e.g. "**Headline:** ").
_LABEL_RE = re.compile(
    r"^\s*(\*\*?)?(Headline|Description)\:?\*?\*?\s*",
    flags=re.IGNORECASE,
)


def clean_label(line):
    """Return *line* with a leading "Headline"/"Description" label removed.

    The match is case-insensitive and anchored at the start of the string;
    text that does not begin with such a label is returned unchanged.
    """
    return _LABEL_RE.sub("", line)


def generate_headline_and_description(subject: str, steering_question: str | None = None) -> tuple[str, str]:
    """Ask the LLM for a headline and a short description for the podcast episode.

    Args:
        subject: Paper text or topic; only the first 10000 characters are
            sent to the model.
        steering_question: Accepted for interface compatibility; this
            function does not currently use it.

    Returns:
        A ``(headline, description)`` tuple. When the model answers with at
        least two non-empty lines, the first is the headline and the rest are
        joined with spaces into the description. Otherwise the single line
        (or the raw reply) is used for both, with the headline truncated to
        80 characters.
    """
    prompt = (
        "You are a world-class podcast producer. Given the following paper or topic, generate: "
        "1. A catchy, informative headline for a podcast episode about it (max 15 words). "
        "2. A short, engaging description (2-3 sentences, max 60 words) that summarizes "
        "what listeners will learn or why the topic is exciting. "
        "Output ONLY the headline on the first line, and the description on the second line. "
        "Do NOT include any labels, markdown, or extra formatting. \n"
        f"Here is the topic: {subject[:10000]} "
    )
    messages = [
        {"role": "system", "content": "You are a world-class podcast producer."},
        {"role": "user", "content": prompt},
    ]
    response = client.chat_completion(
        messages,
        max_tokens=512,
    )
    # The message content may be None; treat that as an empty reply
    # instead of failing on ``.strip()``.
    full_text = (response.choices[0].message.content or "").strip()

    # Strip labels first, then drop empties: a line consisting only of a
    # label ("Headline:") cleans to "" and must not become the headline.
    cleaned = [clean_label(raw.strip()) for raw in full_text.splitlines()]
    lines = [line for line in cleaned if line]

    if len(lines) >= 2:
        return lines[0], " ".join(lines[1:])

    # Fewer than two usable lines: reuse the one we have for both fields.
    text = lines[0] if lines else full_text
    return text[:80], text


def indent(elem, level=0):
    """Pretty-print *elem* in place by rewriting whitespace-only text/tail.

    Each nesting level is indented by one space. Text and tails that contain
    non-whitespace characters are left untouched.

    Args:
        elem: The element whose subtree is re-indented.
        level: Nesting depth of *elem*; callers normally leave this at 0.
    """
    pad = "\n" + level * " "
    if len(elem):
        if not elem.text or not elem.text.strip():
            elem.text = pad + " "
        for child in elem:
            indent(child, level + 1)
        # The recursion leaves the last child's tail at the children's depth;
        # pull it back so the closing tag lines up with the opening tag.
        last_child = elem[-1]
        if not last_child.tail or not last_child.tail.strip():
            last_child.tail = pad
        if not elem.tail or not elem.tail.strip():
            elem.tail = pad
    elif level and (not elem.tail or not elem.tail.strip()):
        elem.tail = pad


# -----------------------------------------------------------------------------
# UPDATE RSS
# -----------------------------------------------------------------------------
def get_next_episode_number(podcast_dir="podcasts"):
    """Return one more than the number of ``.wav`` files in *podcast_dir*."""
    wav_count = sum(1 for name in os.listdir(podcast_dir) if name.endswith(".wav"))
    return wav_count + 1


def update_rss(subject, audio_url, audio_length, paper_id=None, rss_path="rss.xml"):
    """Insert a new episode as the first ``<item>`` of the RSS feed.

    The title and description are generated from *subject* by the LLM, the
    channel's ``lastBuildDate`` (if present) is refreshed, and the file at
    *rss_path* is rewritten in place.

    Args:
        subject: Paper text or topic used to generate the title/description.
        audio_url: URL of the audio file; used for the enclosure and the guid.
        audio_length: Written (via ``str()``) as the enclosure's ``length``
            attribute; presumably the file size in bytes.
        paper_id: Optional Hugging Face paper id; when given, a link to the
            paper page is appended to the description.
        rss_path: Path of the RSS file to update.

    Raises:
        ValueError: If the feed has no ``<channel>`` element under its root.
    """
    # Generate headline and description automatically
    title, description = generate_headline_and_description(subject)
    if paper_id:
        paper_url = f"https://huggingface.co/papers/{paper_id}"
        # ElementTree escapes this markup when serialising the description.
        description += f'\n\n<a href="{paper_url}">[Read the paper on Hugging Face]</a>'

    # Serialise the iTunes namespace with its conventional prefix.
    ET.register_namespace("itunes", _ITUNES_NS)

    tree = ET.parse(rss_path)
    root = tree.getroot()
    channel = root.find("channel")
    if channel is None:
        raise ValueError(f"{rss_path!r} has no <channel> element")

    # RFC 2822 timestamp in UTC; format_datetime is locale-independent,
    # unlike strftime("%a ... %b").
    now_rfc2822 = format_datetime(datetime.now(timezone.utc))

    # Update lastBuildDate
    last_build_date = channel.find("lastBuildDate")
    if last_build_date is not None:
        last_build_date.text = now_rfc2822

    # Create new item
    item = ET.Element("item")
    ET.SubElement(item, "title").text = title
    ET.SubElement(item, "description").text = description
    ET.SubElement(item, "pubDate").text = now_rfc2822
    ET.SubElement(item, "enclosure", url=audio_url, length=str(audio_length), type="audio/wav")
    ET.SubElement(item, "guid").text = audio_url
    # Use the namespace-qualified tag so the writer always emits a matching
    # xmlns:itunes declaration, even if no other iTunes element exists.
    ET.SubElement(item, f"{{{_ITUNES_NS}}}explicit").text = "false"

    # Find the first <item> and insert before it, or append if none exist
    first_item = channel.find("item")
    if first_item is not None:
        channel.insert(list(channel).index(first_item), item)
    else:
        channel.append(item)

    # Write back to file with pretty formatting
    indent(root)
    tree.write(rss_path, encoding="utf-8", xml_declaration=True)