Spaces:

ConradLinus
/

AML_UG_ChatBot

Running

Upload folder using huggingface_hub

d631808 verified 2 months ago

1.31 kB

	import re
	from typing import Callable


	def get_sentence_based_splitter(
	    min_sentence_length: int = 20,
	) -> Callable[[str], tuple[str, str]]:
	    """Build a splitter that carves complete sentences off a text buffer.

	    Args:
	        min_sentence_length: Minimum number of characters the joined complete
	            sentences must reach before they are released as a chunk.

	    Returns:
	        A function that maps a text buffer to a
	        ``(text_to_process, remaining_text_buffer)`` tuple.
	    """
	    # A sentence boundary is a run of whitespace that follows ".", "!" or "?".
	    # The lookbehind keeps the punctuation attached to the sentence it ends.
	    # Compiled once here so every call of the returned splitter reuses it.
	    sentence_boundary = re.compile(r"(?<=[.!?])\s+")

	    def sentence_based_text_splitter(text_buffer: str) -> tuple[str, str]:
	        """Split off the complete sentences at the front of ``text_buffer``.

	        This lets a caller hand finished sentences to a downstream consumer
	        (for example a TTS model) without waiting for the whole text.

	        The last piece after the final boundary is always kept back as the
	        remaining buffer, since more text may still be appended to it.

	        Args:
	            text_buffer: The text accumulated so far.

	        Returns:
	            ``(text_to_process, remaining_text_buffer)``. When a chunk is
	            released, the sentences are joined with single spaces and the
	            remainder comes from the stripped buffer (no surrounding
	            whitespace). When nothing can be released yet, the result is
	            ``("", text_buffer)`` with the buffer returned unmodified.
	        """
	        sentences = sentence_boundary.split(text_buffer.strip())

	        # re.split always yields at least one piece, so a boundary was found
	        # only when there are two or more. Checking ">= 1" would let a
	        # non-positive min_sentence_length "release" an empty chunk and
	        # return a stripped copy of the buffer instead of the original.
	        if len(sentences) > 1:
	            *complete, remainder = sentences
	            combined_sentences = " ".join(complete)
	            if len(combined_sentences) >= min_sentence_length:
	                return combined_sentences, remainder

	        # Not enough complete text yet: keep accumulating.
	        return "", text_buffer

	    return sentence_based_text_splitter