Spaces:

fdaudens
/

podcast-jobs

Running on Zero

App Files Files Community

podcast-jobs / update_rss.py

fdaudens HF Staff

paper url in show notes

05a5c90 about 1 month ago

raw

history blame contribute delete

4.12 kB

	import xml.etree.ElementTree as ET
	from datetime import datetime
	import os
	from huggingface_hub import InferenceClient
	import re

	# Module-wide inference client used for every LLM request in this file.
	# The access token comes from the HF_TOKEN environment variable; it is None
	# when the variable is not set.
	client = InferenceClient(
	    "meta-llama/Llama-3.1-8B-Instruct",
	    provider="hf-inference",
	    token=os.environ.get("HF_TOKEN"),
	)

	# Leading label such as "Headline:", "**Description:**", "**Headline**:" or
	# "1. Headline:". The colon is mandatory so ordinary sentences that merely
	# start with the word (e.g. "Headlines of the day") are left untouched.
	_LABEL_PREFIX_RE = re.compile(
	    r"^\s*(?:\d+[.)]\s*)?(?:\*\*)?\s*(?:Headline|Description)\s*(?:\*\*)?\s*:\s*(?:\*\*)?\s*",
	    flags=re.IGNORECASE,
	)


	def clean_label(line):
	    """Strip a leading "Headline:" / "Description:" label from *line*.

	    The match is case-insensitive and tolerates leading whitespace, a list
	    number ("1." or "2)") and markdown bold markers around the label.

	    Args:
	        line: A single line of model output.

	    Returns:
	        The line without the label prefix; unchanged if no label is present.
	    """
	    return _LABEL_PREFIX_RE.sub("", line)

	def generate_headline_and_description(subject: str, steering_question: str | None = None) -> tuple[str, str]:
	    """Ask the LLM for a headline and a short description for the podcast episode.

	    Args:
	        subject: Paper text or topic. Only the first 10,000 characters are
	            included in the prompt.
	        steering_question: Accepted for interface compatibility; it is not
	            used when building the prompt.

	    Returns:
	        A ``(headline, description)`` tuple. When the model returns at least
	        two non-empty lines, the first is the headline and the rest are
	        joined with spaces into the description. Otherwise the single line
	        is used for both, with the headline cut to 80 characters.
	    """
	    # Built from explicit pieces so the prompt text does not depend on how
	    # this file happens to be indented.
	    prompt = (
	        "You are a world-class podcast producer. Given the following paper or topic, generate:\n"
	        "1. A catchy, informative headline for a podcast episode about it (max 15 words).\n"
	        "2. A short, engaging description (2-3 sentences, max 60 words) that summarizes what listeners will learn or why the topic is exciting.\n"
	        "\n"
	        "Output ONLY the headline on the first line, and the description on the second line. Do NOT include any labels, markdown, or extra formatting.\n"
	        "\n"
	        "Here is the topic:\n"
	        f"{subject[:10000]}\n"
	    )
	    messages = [
	        {"role": "system", "content": "You are a world-class podcast producer."},
	        {"role": "user", "content": prompt},
	    ]
	    response = client.chat_completion(
	        messages,
	        max_tokens=512,
	    )
	    # The message content can be None (e.g. an empty completion); treat it
	    # as an empty string instead of crashing on .strip().
	    full_text = (response.choices[0].message.content or "").strip()

	    # Strip labels first, then drop lines that ended up empty (for example a
	    # bare "Headline:" line followed by the real headline on the next line).
	    cleaned = (clean_label(raw.strip()).strip() for raw in full_text.splitlines())
	    lines = [line for line in cleaned if line]

	    if len(lines) >= 2:
	        return lines[0], " ".join(lines[1:])

	    # Single-line (or empty) answer: reuse the text for both fields.
	    only = lines[0] if lines else full_text
	    return only[:80], only

	def indent(elem, level=0):
	    """Pretty-print *elem* in place by rewriting whitespace-only text and tails.

	    Each nesting level is indented by one space. Text or tails that contain
	    non-whitespace characters are never modified.

	    Args:
	        elem: The element whose subtree is re-indented.
	        level: Nesting depth of *elem*; 0 for the root.
	    """
	    pad = "\n" + level * " "
	    children = list(elem)
	    if children:
	        if not elem.text or not elem.text.strip():
	            # Whitespace before the first child: one level deeper than elem.
	            elem.text = pad + " "
	        for child in children:
	            indent(child, level + 1)
	        # The last child's tail sits right before elem's closing tag, so it
	        # must use elem's own indentation rather than the child's; otherwise
	        # every closing tag is pushed one level too deep.
	        last = children[-1]
	        if not last.tail or not last.tail.strip():
	            last.tail = pad
	        if not elem.tail or not elem.tail.strip():
	            elem.tail = pad
	    elif level and (not elem.tail or not elem.tail.strip()):
	        # A leaf only gets a tail when it is nested inside another element.
	        elem.tail = pad

	# -----------------------------------------------------------------------------
	# UPDATE RSS
	# -----------------------------------------------------------------------------
	def get_next_episode_number(podcast_dir="podcasts"):
	    """Return one more than the number of ``.wav`` entries in *podcast_dir*."""
	    wav_count = sum(1 for name in os.listdir(podcast_dir) if name.endswith(".wav"))
	    return wav_count + 1

	def update_rss(subject, audio_url, audio_length, paper_id=None, rss_path="rss.xml"):
	    """Prepend a new episode ``<item>`` to the RSS feed at *rss_path*.

	    The episode title and description are generated from *subject* by
	    ``generate_headline_and_description``. The feed file is rewritten in
	    place with refreshed indentation.

	    Args:
	        subject: Paper text or topic the episode is about.
	        audio_url: Public URL of the audio file; also used as the item GUID.
	        audio_length: Size of the audio file, written to the enclosure's
	            ``length`` attribute.
	        paper_id: Optional paper identifier; when given, a link to
	            ``https://huggingface.co/papers/<paper_id>`` is appended to the
	            description.
	        rss_path: Path of the RSS XML file to update.

	    Raises:
	        ValueError: If the feed has no ``<channel>`` element.
	    """
	    # Local imports: the module only imports `datetime` from `datetime`.
	    from datetime import timezone
	    from email.utils import format_datetime

	    itunes_ns = "http://www.itunes.com/dtds/podcast-1.0.dtd"
	    # Make the serializer emit the conventional "itunes:" prefix.
	    ET.register_namespace("itunes", itunes_ns)

	    # Generate headline and description automatically
	    title, description = generate_headline_and_description(subject)
	    if paper_id:
	        paper_url = f"https://huggingface.co/papers/{paper_id}"
	        description += f'\n\n<a href="{paper_url}">[Read the paper on Hugging Face]</a>'

	    tree = ET.parse(rss_path)
	    root = tree.getroot()
	    channel = root.find("channel")
	    if channel is None:
	        raise ValueError(f"{rss_path} has no <channel> element")

	    # RFC 2822 timestamp in UTC ("Mon, 01 Jan 2024 00:00:00 +0000").
	    # format_datetime is locale-independent, unlike strftime("%a ... %b").
	    now_rfc2822 = format_datetime(datetime.now(timezone.utc))

	    # Update lastBuildDate
	    last_build_date = channel.find("lastBuildDate")
	    if last_build_date is not None:
	        last_build_date.text = now_rfc2822

	    # Create new item
	    item = ET.Element("item")
	    ET.SubElement(item, "title").text = title
	    ET.SubElement(item, "description").text = description
	    ET.SubElement(item, "pubDate").text = now_rfc2822
	    ET.SubElement(item, "enclosure", url=audio_url, length=str(audio_length), type="audio/wav")
	    ET.SubElement(item, "guid").text = audio_url
	    # Namespace-qualified tag so the serializer always declares xmlns:itunes;
	    # a literal "itunes:explicit" tag can be written with an undeclared prefix.
	    ET.SubElement(item, f"{{{itunes_ns}}}explicit").text = "false"

	    # Newest episode goes first: insert before the first existing <item>,
	    # or append when the channel has no items yet.
	    items = channel.findall("item")
	    if items:
	        channel.insert(list(channel).index(items[0]), item)
	    else:
	        channel.append(item)

	    # Write back to file with pretty formatting
	    indent(root)
	    tree.write(rss_path, encoding="utf-8", xml_declaration=True)