# Provenance: commit 21b3016 (verified) by brainsqueeze — "updated wrong utils.py"
from ask_candid.retrieval.sources.schema import ElasticHitsResult
def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
    """Pads the relevant chunk of text with context before and after.

    Looks up the inner-hit chunks for ``field_name`` on the Elasticsearch hit,
    locates each chunk inside the full (lower-cased) field text, and returns the
    chunk expanded by up to ``context_length`` characters on each side.

    Parameters
    ----------
    field_name : str
        a field with the long text that was chunked into pieces
    hit : ElasticHitsResult
        search hit whose ``inner_hits`` carry the matched chunks under
        ``embeddings.{field_name}.chunks``
    context_length : int, optional
        length of text to add before and after the chunk, by default 1024
    add_context : bool, optional
        if False, return the bare chunks without surrounding context, by default True

    Returns
    -------
    str
        longer chunks stuffed together, joined by blank lines; empty string
        when there are no inner-hit chunks
    """
    chunks = []
    # NOTE chunks have tokens, long text is a normal text, but may contain html
    # that also gets weird after tokenization
    long_text = (hit.source.get(field_name) or "").lower()

    inner_hits_field = f"embeddings.{field_name}.chunks"
    found_chunks = hit.inner_hits.get(inner_hits_field, {})
    if found_chunks:
        for h in found_chunks.get("hits", {}).get("hits", []):
            fields = h.get("fields", {})
            # FIX: the original indexed fields[inner_hits_field] right after a
            # .get default, raising KeyError on hits missing the payload; skip those.
            if inner_hits_field not in fields:
                continue
            chunk = fields[inner_hits_field][0]["chunk"][0]
            # cutting the middle because we may have tokenizing artifacts there
            chunk = chunk[3:-3]
            if add_context:
                # FIX: long_text was lower-cased above, so the 20-char probe must be
                # lower-cased too — otherwise any uppercase in the chunk made `find`
                # miss text that is actually present.
                start_index = long_text.find(chunk[:20].lower())
                if start_index != -1:
                    end_index = start_index + len(chunk)
                    pre_start_index = max(0, start_index - context_length)
                    post_end_index = min(len(long_text), end_index + context_length)
                    chunks.append(long_text[pre_start_index:post_end_index])
                else:
                    # FIX: the original silently dropped chunks it could not locate;
                    # fall back to the bare chunk so retrieval content is not lost.
                    chunks.append(chunk)
            else:
                chunks.append(chunk)
    return '\n\n'.join(chunks)