# Page-header residue from the hosting site, kept as a comment:
#   author avatar: brainsqueeze
#   commit f86d7f2 (verified): "Smarter document context retrieval"
from typing import Dict, Any
from langchain_core.documents import Document
from ask_candid.retrieval.sources.schema import ElasticSourceConfig, ElasticHitsResult
from ask_candid.retrieval.sources.utils import get_context
# Elasticsearch source configuration for the YouTube index.
# NOTE(review): presumably `text_fields` are the fields used for retrieval and
# `excluded_fields` are omitted from returned hits -- confirm against
# ElasticSourceConfig, which is defined outside this file.
YoutubeConfig = ElasticSourceConfig(
index_name="search-semantic-youtube-elser_ve1",
text_fields=("captions_cleaned", "description_cleaned", "title"),
excluded_fields=("captions", "description", "text_cleaned")
)
def process_youtube_hit(hit: ElasticHitsResult) -> Document:
    """Convert one YouTube search hit into a LangChain ``Document``.

    The page content is the title, the description context and the captions
    context joined by blank lines. The metadata carries the title, a fixed
    source label, the video id and the watch URL built from that id.

    Args:
        hit: Search hit whose ``source`` mapping holds ``video_id`` and,
            optionally, ``title``.

    Returns:
        The assembled ``Document``.

    Raises:
        KeyError: If ``hit.source`` has no ``video_id`` entry.
    """
    video_title = hit.source.get("title", "")

    # Only the long text fields are passed through get_context; the title is
    # used verbatim.
    # NOTE(review): get_context is defined elsewhere -- presumably it returns
    # the matched passages with surrounding context; confirm.
    description_context = get_context("description_cleaned", hit, context_length=12)
    captions_context = get_context("captions_cleaned", hit, context_length=12)

    sections = [video_title, description_context, captions_context]
    body = '\n\n'.join(sections)

    video_id = hit.source['video_id']
    metadata = {
        "title": video_title,
        "source": "Candid YouTube",
        "source_id": video_id,
        "url": f"https://www.youtube.com/watch?v={video_id}",
    }
    return Document(page_content=body, metadata=metadata)
def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
    """Render an HTML card with an embedded player for one YouTube document.

    The card shows the title as a link to the video, an embedded player and
    the (possibly truncated) cleaned description. All document text is
    HTML-escaped before being placed in the markup.

    Args:
        doc: Document fields. ``video_id`` is required; ``title`` and
            ``description_cleaned`` are optional and may be ``None``.
        height_px: Height in pixels applied to the two outer ``div`` elements.
        show_chunks: Accepted for interface compatibility; not used here.

    Returns:
        The HTML snippet for the card.

    Raises:
        KeyError: If ``doc`` has no ``video_id`` entry.
    """
    # Function-scope import so the module's import block is left untouched.
    from html import escape

    # Raw-character budget shared by the fields below, counted in order. The
    # title consumes budget even though only the description is rendered
    # from the truncated text.
    max_chars = 999

    # The id lands inside quoted attributes, so quotes are escaped as well.
    video_id = escape(str(doc['video_id']), quote=True)
    url = f"https://www.youtube.com/watch?v={video_id}"

    fields = ["title", "description_cleaned"]
    fields_dict = {}
    fields_len = 0
    for field in fields:
        value = doc.get(field)
        if value is None:
            fields_dict[field] = ""
            fields_dict[field + "_txt"] = ""
            continue

        fields_dict[field] = value
        remaining = max_chars - fields_len
        if len(value) <= remaining:
            fields_dict[field + "_txt"] = f"<div>{escape(str(value))}</div>"
        elif remaining > 0:
            # Truncate the raw text first, then escape, so an entity is never
            # cut in half and the budget counts visible characters.
            fields_dict[field + "_txt"] = f"<div>{escape(str(value[:remaining]))}[...]</div>"
        else:
            fields_dict[field + "_txt"] = "<span>[...]</span>"
        fields_len += len(value)

    # Use the value collected above rather than doc['title'], so a missing or
    # None title yields an empty link text instead of a KeyError or "None".
    title_html = escape(str(fields_dict["title"]))
    description_html = fields_dict["description_cleaned_txt"]

    card = f"""
<div style='height: {height_px}px; padding: 5px;'>
<div style='height: {height_px}px; border: 1px solid #febe10;'>
<span style='padding-left: 10px; display: inline-block; width: 100%;'>
<div>
<span>
<b>Candid Youtube video:</b>
<a href='{url}' target='_blank' style='text-decoration: none;'>
{title_html}
</a>
</span>
<iframe
width="426"
height="240"
src="https://www.youtube.com/embed/{video_id}?si=0-y6eRrOzXTUSBDY"
title="YouTube video player"
frameborder="0"
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
referrerpolicy="strict-origin-when-cross-origin"
allowfullscreen
style="display: inline-block; float: left;padding-right: 10px;padding-top: 5px;">
</iframe>
<br>
<br>
{description_html}
</div>
</span>
</div>
</div>
"""
    return card