# Provenance: commit 21b3016 (verified) by brainsqueeze — "updated wrong utils.py"
from ask_candid.retrieval.sources.schema import ElasticHitsResult
def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
    """Pads the relevant chunk of text with context before and after.

    Looks up the inner-hit chunks for ``field_name`` on the Elasticsearch hit,
    locates each chunk inside the full (lower-cased) field text, and returns the
    chunk expanded by up to ``context_length`` characters on each side.

    Parameters
    ----------
    field_name : str
        a field with the long text that was chunked into pieces
    hit : ElasticHitsResult
        search hit whose ``inner_hits`` carry the matched chunks under
        ``embeddings.{field_name}.chunks``
    context_length : int, optional
        length of text to add before and after the chunk, by default 1024
    add_context : bool, optional
        if False, return the bare chunks without surrounding context, by default True

    Returns
    -------
    str
        longer chunks stuffed together, joined by blank lines; empty string
        when there are no inner-hit chunks
    """
    chunks = []
    # NOTE chunks have tokens, long text is a normal text, but may contain html
    # that also gets weird after tokenization
    long_text = (hit.source.get(field_name) or "").lower()

    inner_hits_field = f"embeddings.{field_name}.chunks"
    found_chunks = hit.inner_hits.get(inner_hits_field, {})
    if found_chunks:
        for h in found_chunks.get("hits", {}).get("hits", []):
            fields = h.get("fields", {})
            # FIX: the original indexed fields[inner_hits_field] right after a
            # .get default, raising KeyError on hits missing the payload; skip those.
            if inner_hits_field not in fields:
                continue
            chunk = fields[inner_hits_field][0]["chunk"][0]
            # cutting the middle because we may have tokenizing artifacts there
            chunk = chunk[3:-3]
            if add_context:
                # FIX: long_text was lower-cased above, so the 20-char probe must be
                # lower-cased too — otherwise any uppercase in the chunk made `find`
                # miss text that is actually present.
                start_index = long_text.find(chunk[:20].lower())
                if start_index != -1:
                    end_index = start_index + len(chunk)
                    pre_start_index = max(0, start_index - context_length)
                    post_end_index = min(len(long_text), end_index + context_length)
                    chunks.append(long_text[pre_start_index:post_end_index])
                else:
                    # FIX: the original silently dropped chunks it could not locate;
                    # fall back to the bare chunk so retrieval content is not lost.
                    chunks.append(chunk)
            else:
                chunks.append(chunk)
    return '\n\n'.join(chunks)