from ask_candid.retrieval.sources.schema import ElasticHitsResult


def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
    """Pads the relevant chunk of text with context before and after

    Parameters
    ----------
    field_name : str
        a field with the long text that was chunked into pieces
    hit : ElasticHitsResult
    context_length : int, optional
        length of text to add before and after the chunk, by default 1024

    Returns
    -------
    str
        longer chunks stuffed together
    """

    chunks: list[str] = []
    # NOTE: chunks contain tokenized text while the long text is raw; the raw
    # text may also contain HTML markup that tokenizes unpredictably
    long_text = hit.source.get(field_name) or ""
    long_text = long_text.lower()

    inner_hits_field = f"embeddings.{field_name}.chunks"
    found_chunks = hit.inner_hits.get(inner_hits_field, {})
    if found_chunks:
        hits = found_chunks.get("hits", {}).get("hits", [])
        for h in hits:
            chunk = h.get("fields", {})[inner_hits_field][0]["chunk"][0]

            # trim a few characters from each end of the chunk, where
            # tokenization artifacts may appear
            chunk = chunk[3:-3]

            if add_context:
                # locate the chunk in the long text using its first 20
                # characters as a lowercased probe
                start_index = long_text.find(chunk[:20].lower())

                if start_index != -1:
                    # pad the chunk with up to context_length characters on each side
                    end_index = start_index + len(chunk)
                    pre_start_index = max(0, start_index - context_length)
                    post_end_index = min(len(long_text), end_index + context_length)
                    chunks.append(long_text[pre_start_index:post_end_index])
                else:
                    # fall back to the bare chunk if it cannot be located
                    chunks.append(chunk)
            else:
                chunks.append(chunk)
    return '\n\n'.join(chunks)
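

# A minimal, self-contained usage sketch. The stand-in hit class, the
# "description" field name, and the sample data below are illustrative
# assumptions; the real ElasticHitsResult schema and index contents may differ.
if __name__ == "__main__":
    from dataclasses import dataclass, field as dc_field
    from typing import Any, Dict

    @dataclass
    class _FakeHit:
        # exposes only the two attributes get_context reads
        source: Dict[str, Any] = dc_field(default_factory=dict)
        inner_hits: Dict[str, Any] = dc_field(default_factory=dict)

    sample_text = (
        "An opening sentence that frames the document. "
        "This middle sentence is the chunk we want to locate. "
        "A trailing sentence that provides context after the chunk."
    )
    # the leading "xx " and trailing " yy" mimic the tokenization artifacts
    # that get_context trims from each end of a chunk
    fake_hit = _FakeHit(
        source={"description": sample_text},
        inner_hits={
            "embeddings.description.chunks": {
                "hits": {
                    "hits": [{
                        "fields": {
                            "embeddings.description.chunks": [
                                {"chunk": ["xx this middle sentence is the chunk we want to locate. yy"]}
                            ]
                        }
                    }]
                }
            }
        },
    )
    print(get_context("description", fake_hit, context_length=30))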