Spaces:
Running
Running
from typing import Dict, Any | |
from langchain_core.documents import Document | |
from ask_candid.retrieval.sources.schema import ElasticSourceConfig, ElasticHitsResult | |
from ask_candid.retrieval.sources.utils import get_context | |
# Elasticsearch source settings for the Candid YouTube index.
# NOTE(review): presumably `text_fields` are the fields used for retrieval/context and
# `excluded_fields` are omitted from returned hits -- confirm against ElasticSourceConfig.
YoutubeConfig = ElasticSourceConfig(
    index_name="search-semantic-youtube-elser_ve1",
    text_fields=(
        "captions_cleaned",
        "description_cleaned",
        "title",
    ),
    excluded_fields=(
        "captions",
        "description",
        "text_cleaned",
    ),
)
def process_youtube_hit(hit: ElasticHitsResult) -> Document:
    """Convert a YouTube search hit into a ``Document``.

    The page content is the video title followed by the ``get_context`` output for the
    description and the captions, separated by blank lines. The metadata carries the
    title, a fixed source label, the video id and the watch URL built from that id.

    Args:
        hit: Search hit whose ``source`` mapping holds the video fields; ``video_id``
            is required, ``title`` falls back to an empty string when absent.

    Returns:
        Document built from the hit.

    Raises:
        KeyError: If ``hit.source`` has no ``video_id``.
    """
    source = hit.source
    video_title = source.get("title", "")

    # Only the long text fields are passed through get_context; the title is used as-is.
    # NOTE(review): get_context presumably trims each field to the text around the
    # matched passages -- confirm against ask_candid.retrieval.sources.utils.
    long_fields = ("description_cleaned", "captions_cleaned")
    context_texts = [get_context(name, hit, context_length=12) for name in long_fields]

    video_id = source['video_id']
    metadata = {
        "title": video_title,
        "source": "Candid YouTube",
        "source_id": video_id,
        "url": f"https://www.youtube.com/watch?v={video_id}",
    }
    return Document(
        page_content='\n\n'.join([video_title, *context_texts]),
        metadata=metadata,
    )
def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
    """Render an HTML card for a YouTube video: title link, embedded player, description.

    Title, description and video id are HTML-escaped before being placed in the markup,
    so text coming from the index cannot inject tags or break out of attributes.

    Args:
        doc: Video record. ``video_id`` is required. ``title`` and ``description_cleaned``
            are optional; a missing or ``None`` value renders as empty.
        height_px: Height in pixels applied to the two outer card containers.
        show_chunks: Unused by this card; kept so existing callers that pass it still work.

    Returns:
        The HTML fragment for the card.

    Raises:
        KeyError: If ``doc`` has no ``video_id``.
    """
    # Imported locally so the block is self-contained; the rendered markup is bound to
    # ``card_html`` below rather than ``html`` to avoid shadowing the stdlib module name.
    from html import escape

    # Combined budget of raw (unescaped) characters across title + description.
    max_chars = 999

    video_id = escape(str(doc['video_id']))
    url = f"https://www.youtube.com/watch?v={video_id}"

    fields_dict = {}
    fields_len = 0
    for field in ("title", "description_cleaned"):
        value = doc.get(field)
        if value is None:
            fields_dict[field] = ""
            fields_dict[field + "_txt"] = ""
            continue

        text = str(value)
        fields_dict[field] = escape(text)

        # Truncate the raw text first and escape afterwards, so an entity such as
        # ``&amp;`` is never cut in half by the character budget.
        remaining = max_chars - fields_len
        if len(text) <= remaining:
            fields_dict[field + "_txt"] = f"<div>{escape(text)}</div>"
        elif remaining > 0:
            fields_dict[field + "_txt"] = f"<div>{escape(text[:remaining])}[...]</div>"
        else:
            # Budget already used up by earlier fields: show only the ellipsis marker.
            fields_dict[field + "_txt"] = "<span>[...]</span>"
        fields_len += len(text)

    # Use the normalised title: indexing ``doc`` directly raised KeyError when the title
    # was absent and rendered the literal "None" when it was null.
    title_html = fields_dict["title"]
    description_html = fields_dict["description_cleaned_txt"]

    card_html = f"""
    <div style='height: {height_px}px; padding: 5px;'>
        <div style='height: {height_px}px; border: 1px solid #febe10;'>
            <span style='padding-left: 10px; display: inline-block; width: 100%;'>
                <div>
                    <span>
                        <b>Candid Youtube video:</b>
                        <a href='{url}' target='_blank' style='text-decoration: none;'>
                            {title_html}
                        </a>
                    </span>
                    <iframe
                        width="426"
                        height="240"
                        src="https://www.youtube.com/embed/{video_id}?si=0-y6eRrOzXTUSBDY"
                        title="YouTube video player"
                        frameborder="0"
                        allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
                        referrerpolicy="strict-origin-when-cross-origin"
                        allowfullscreen
                        style="display: inline-block; float: left;padding-right: 10px;padding-top: 5px;">
                    </iframe>
                    <br>
                    <br>
                    {description_html}
                </div>
            </span>
        </div>
    </div>
    """
    return card_html