File size: 3,454 Bytes
a0e37e2
cc80c3d
f86d7f2
 
 
 
cc80c3d
 
 
 
 
 
a0e37e2
 
f86d7f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0e37e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from typing import Dict, Any

from langchain_core.documents import Document

from ask_candid.retrieval.sources.schema import ElasticSourceConfig, ElasticHitsResult
from ask_candid.retrieval.sources.utils import get_context

# Elasticsearch source configuration for the Candid YouTube index.
# NOTE(review): the meaning of each argument is defined by ElasticSourceConfig
# (imported from ask_candid.retrieval.sources.schema, not visible here).
# Presumably `text_fields` are the fields queried for text and
# `excluded_fields` are omitted from returned hits -- confirm against the schema.
YoutubeConfig = ElasticSourceConfig(
    index_name="search-semantic-youtube-elser_ve1",
    text_fields=("captions_cleaned", "description_cleaned", "title"),
    excluded_fields=("captions", "description", "text_cleaned")
)


def process_youtube_hit(hit: ElasticHitsResult) -> Document:
    """Turn a YouTube search hit into a ``Document``.

    The page content is the video title followed by the context strings that
    ``get_context`` returns for the cleaned description and cleaned captions,
    separated by blank lines. The metadata carries the title, a fixed source
    label, the video id and the watch URL built from that id.

    Args:
        hit: Search hit whose ``source`` mapping holds at least ``video_id``
            and optionally ``title``.

    Returns:
        A ``Document`` for the video.

    Raises:
        KeyError: If ``hit.source`` has no ``video_id`` entry.
    """
    source = hit.source
    video_title = source.get("title", "")

    # we only need to process long texts
    sections = [video_title]
    for long_field in ("description_cleaned", "captions_cleaned"):
        sections.append(get_context(long_field, hit, context_length=12))
    content = '\n\n'.join(sections)

    # Looked up after the content is assembled, as in the original ordering.
    video_id = source['video_id']
    meta = {
        "title": video_title,
        "source": "Candid YouTube",
        "source_id": video_id,
        "url": f"https://www.youtube.com/watch?v={video_id}",
    }
    return Document(page_content=content, metadata=meta)


def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
    """Render an HTML card for a Candid YouTube video.

    The card shows the linked video title, an embedded player and the cleaned
    description. Title and description share a budget of 999 characters: the
    title is always shown in full, and the description is cut to whatever
    budget the title leaves over, followed by ``[...]``. When the title alone
    uses up the budget, the description is replaced by ``<span>[...]</span>``.

    All document values are HTML-escaped before interpolation so that
    characters such as ``<``, ``&`` or quotes in the title/description cannot
    break (or inject into) the card markup.

    Args:
        doc: Video record. Must contain ``video_id``; ``title`` and
            ``description_cleaned`` are optional and may be ``None``.
        height_px: Height of the card container in pixels.
        show_chunks: Currently unused; kept for interface compatibility.

    Returns:
        The card as an HTML string.

    Raises:
        KeyError: If ``doc`` has no ``video_id`` entry.
    """
    # Function-scope import so the block does not depend on file-level changes.
    from html import escape

    max_chars = 999

    video_id = escape(str(doc["video_id"]))
    url = f"https://www.youtube.com/watch?v={video_id}"

    # A missing or None title renders as empty text instead of raising
    # KeyError or printing the literal "None".
    raw_title = doc.get("title")
    title = "" if raw_title is None else str(raw_title)
    title_html = escape(title)

    description_html = ""
    raw_description = doc.get("description_cleaned")
    if raw_description is not None:
        description = str(raw_description)
        remaining = max_chars - len(title)
        # Truncate on the raw text first, then escape, so an HTML entity is
        # never cut in half.
        if len(description) <= remaining:
            description_html = f"<div>{escape(description)}</div>"
        elif remaining > 0:
            description_html = f"<div>{escape(description[:remaining])}[...]</div>"
        else:
            description_html = "<span>[...]</span>"

    card = f"""
    <div style='height: {height_px}px; padding: 5px;'>
        <div style='height: {height_px}px; border: 1px solid #febe10;'>
            <span style='padding-left: 10px; display: inline-block; width: 100%;'>
                <div>
                    <span>
                        <b>Candid Youtube video:</b>
                        <a href='{url}' target='_blank' style='text-decoration: none;'>
                            {title_html} 
                        </a>
                    </span>
                    <iframe 
                        width="426"
                        height="240"
                        src="https://www.youtube.com/embed/{video_id}?si=0-y6eRrOzXTUSBDY"
                        title="YouTube video player"
                        frameborder="0"
                        allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
                        referrerpolicy="strict-origin-when-cross-origin"
                        allowfullscreen
                        style="display: inline-block; float: left;padding-right: 10px;padding-top: 5px;">
                    </iframe>
                    <br>
                    <br>
                    {description_html}
                </div>
            </span>
        </div>
    </div>
    """
    return card