from ask_candid.retrieval.sources.schema import ElasticHitsResult


def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
    """Pads the relevant chunk of text with context before and after.

    Parameters
    ----------
    field_name : str
        A field with the long text that was chunked into pieces
    hit : ElasticHitsResult
        Search hit holding the document source and the inner hits for the matched chunks
    context_length : int, optional
        Length of text to add before and after the chunk, by default 1024
    add_context : bool, optional
        If True, locate each chunk in the full text and pad it with surrounding context;
        if False, return the raw chunks as-is, by default True

    Returns
    -------
    str
        The context-padded chunks joined by blank lines
    """
    chunks = []
    # NOTE chunks contain tokenized text; the long text is plain text but may contain
    # HTML, which also gets mangled by tokenization
    long_text = hit.source.get(field_name) or ""
    long_text = long_text.lower()
    inner_hits_field = f"embeddings.{field_name}.chunks"
    found_chunks = hit.inner_hits.get(inner_hits_field, {})
    if found_chunks:
        hits = found_chunks.get("hits", {}).get("hits", [])
        for h in hits:
            chunk = h.get("fields", {})[inner_hits_field][0]["chunk"][0]

            # Trim the first and last few characters because tokenization artifacts
            # tend to appear at the chunk boundaries
            chunk = chunk[3:-3]

            if add_context:
                # Find the start and end indices of the chunk in the long text
                start_index = long_text.find(chunk[:20])

                # Chunk was found in the long text
                if start_index != -1:
                    end_index = start_index + len(chunk)
                    pre_start_index = max(0, start_index - context_length)
                    post_end_index = min(len(long_text), end_index + context_length)
                    chunks.append(long_text[pre_start_index:post_end_index])
            else:
                chunks.append(chunk)
    return '\n\n'.join(chunks)
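

# --- Usage sketch (illustrative only) ----------------------------------------
# A minimal sketch of how get_context might be called. The real ElasticHitsResult
# comes from ask_candid.retrieval.sources.schema; the stand-in below only assumes
# it exposes `source` (stored document fields) and `inner_hits` (nested chunk
# matches), mirroring how the function accesses them above. The "description"
# field name and all values are hypothetical.
if __name__ == "__main__":
    from types import SimpleNamespace

    description = "Our mission is to fund community gardens across the region. " * 40

    fake_hit = SimpleNamespace(
        source={"description": description},
        inner_hits={
            "embeddings.description.chunks": {
                "hits": {
                    "hits": [
                        {
                            "fields": {
                                "embeddings.description.chunks": [
                                    {"chunk": ["xx our mission is to fund community gardens xx"]}
                                ]
                            }
                        }
                    ]
                }
            }
        },
    )

    # The matched chunk is located in the lowercased description and padded with
    # up to 200 characters of surrounding text on each side.
    print(get_context("description", fake_hit, context_length=200))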