Spaces:

spark-nlp
/

SparkNLP_NER

Build error

App Files Files Community

SparkNLP_NER / _highlight.py

aemin

Upload _highlight.py

092c3fc over 3 years ago

raw

history blame

3.21 kB

	import re
	from rich.console import Console
	from rich.highlighter import RegexHighlighter
	from typing import Tuple, List


	class NullHighlighter(RegexHighlighter):
	    """Regex highlighter configured with an empty style and an empty pattern.

	    The only entry in ``highlights`` is the empty regular expression and
	    ``base_style`` is the empty string.
	    NOTE(review): judging by the class name this is presumably meant as a
	    "do nothing" highlighter for rich -- confirm against how it is used.
	    """

	    # Single pattern: the empty regex.
	    highlights = [""]
	    # Empty string as the base style.
	    base_style = ""


	def highlight_document(doc: str,
	                       keywords: List[Tuple[str, float]]) -> str:
	    """Highlight keywords in a document.

	    Dispatches to the single-word highlighter when every keyword is one
	    word, and to the n-gram highlighter as soon as any keyword contains a
	    space.

	    Arguments:
	        doc: The document for which to extract keywords/keyphrases
	        keywords: the top n keywords for a document with their respective
	                  distances to the input document; only the keyword text
	                  is used here, the distance is ignored
	    Returns:
	        highlighted_text: The document with each keyword occurrence wrapped
	                          in an HTML ``<span>`` with a yellow background.
	                          With an empty keyword list nothing is highlighted.
	    """
	    keywords_only = [keyword for keyword, _ in keywords]

	    # Longest keyword measured in space-separated words. ``default=1`` keeps
	    # an empty keyword list from raising ValueError in ``max``; it then
	    # takes the single-word path, which highlights nothing.
	    max_len = max((len(keyword.split(" ")) for keyword in keywords_only),
	                  default=1)

	    if max_len == 1:
	        return _highlight_one_gram(doc, keywords_only)
	    return _highlight_n_gram(doc, keywords_only)


	def _highlight_one_gram(doc: str,
	                        keywords: List[str]) -> str:
	    """Highlight 1-gram keywords in a document.

	    Newlines are turned into spaces and runs of spaces are collapsed, then
	    the text is split on single spaces. A token is highlighted when the
	    whole token, compared case-insensitively, equals one of the keywords;
	    a token with attached punctuation (``"word,"``) therefore does not
	    match the keyword ``"word"``.

	    Arguments:
	        doc: The document for which to extract keywords/keyphrases
	        keywords: the top n keywords for a document
	    Returns:
	        highlighted_text: The whitespace-normalised document with each
	                          matching token wrapped in an HTML ``<span>`` with
	                          a yellow background
	    """
	    # Lowercase the keywords as well: the token is lowercased before the
	    # comparison, so a keyword containing uppercase could never match
	    # otherwise. A set also makes each membership test O(1).
	    keyword_set = {keyword.lower() for keyword in keywords}

	    tokens = re.sub(r' +', ' ', doc.replace("\n", " ")).split(" ")

	    highlighted_text = " ".join(
	        f'<span style="background-color: #FFFF00">{token}</span>'
	        if token.lower() in keyword_set
	        else token
	        for token in tokens
	    ).strip()

	    return highlighted_text


	def _highlight_n_gram(doc: str,
	                      keywords: List[str]) -> str:
	    """Highlight n-gram keywords in a document.

	    Newlines are turned into spaces, runs of spaces are collapsed and the
	    text is split on single spaces. The tokens are then scanned left to
	    right; at each position the longest run of tokens (at most as many
	    words as the longest keyword) that equals a keyword, compared
	    case-insensitively, is highlighted as one unit. Tokens that start no
	    match are copied through unchanged, so every token of the document
	    appears exactly once in the result.

	    Arguments:
	        doc: The document for which to extract keywords/keyphrases
	        keywords: the top n keywords for a document
	    Returns:
	        highlighted_text: The whitespace-normalised document with each
	                          matched keyword wrapped in an HTML ``<span>``
	                          with a yellow background
	    """
	    # Compare case-insensitively on both sides; the set gives O(1) lookups.
	    keyword_set = {keyword.lower() for keyword in keywords}
	    # ``default=1`` avoids a ValueError from ``max`` on an empty keyword list.
	    max_len = max((len(keyword.split(" ")) for keyword in keywords),
	                  default=1)
	    tokens = re.sub(r' +', ' ', doc.replace("\n", " ")).strip().split(" ")

	    highlighted_text = []
	    position = 0
	    while position < len(tokens):
	        # Near the end of the document fewer than max_len tokens remain.
	        longest = min(max_len, len(tokens) - position)

	        # Try the longest candidate first so "new york" wins over "new".
	        for length in range(longest, 0, -1):
	            n_gram = " ".join(tokens[position:position + length])
	            if n_gram.lower() in keyword_set:
	                highlighted_text.append(
	                    f'<span style="background-color: #FFFF00">{n_gram}</span>'
	                )
	                # Advance by exactly the tokens that were emitted, so
	                # nothing is dropped or repeated.
	                position += length
	                break
	        else:
	            # No keyword starts at this token: emit it as-is.
	            highlighted_text.append(tokens[position])
	            position += 1

	    return " ".join(highlighted_text)