# Copyright The Lightning team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Copyright 2017 Maja Popovic # The code is derived from https://github.com/m-popovic/chrF/blob/6d3c384/chrF%2B%2B.py # The original author and copyright holder have agreed to relicense the derived code under the Apache License, # Version 2.0 (the "License") # Reference to the approval: https://github.com/Lightning-AI/torchmetrics/pull/2701#issuecomment-2316891785 from collections import defaultdict from collections.abc import Sequence from itertools import chain from typing import List, Optional, Union import torch from torch import Tensor, tensor from torchmetrics.functional.text.helper import _validate_inputs _EPS_SMOOTHING = tensor(1e-16) # Taken from https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/metrics/chrf.py _PUNCTUATIONS = set("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~") def _prepare_n_grams_dicts( n_char_order: int, n_word_order: int ) -> tuple[ dict[int, Tensor], dict[int, Tensor], dict[int, Tensor], dict[int, Tensor], dict[int, Tensor], dict[int, Tensor] ]: """Prepare dictionaries with default zero values for total ref, hypothesis and matching character and word n-grams. Args: n_char_order: A character n-gram order. n_word_order: A word n-gram order. Return: Dictionaries with default zero values for total reference, hypothesis and matching character and word n-grams. """ total_preds_char_n_grams: dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_char_order)} total_preds_word_n_grams: dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_word_order)} total_target_char_n_grams: dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_char_order)} total_target_word_n_grams: dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_word_order)} total_matching_char_n_grams: dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_char_order)} total_matching_word_n_grams: dict[int, Tensor] = {n + 1: tensor(0.0) for n in range(n_word_order)} return ( total_preds_char_n_grams, total_preds_word_n_grams, total_target_char_n_grams, total_target_word_n_grams, total_matching_char_n_grams, total_matching_word_n_grams, ) def _get_characters(sentence: str, whitespace: bool) -> list[str]: """Split sentence into individual characters. Args: sentence: An input sentence to split. whitespace: An indication whether to keep whitespaces during character n-gram extraction. Return: A list of separated characters. """ if whitespace: return list(sentence) return list(sentence.strip().replace(" ", "")) def _separate_word_and_punctuation(word: str) -> list[str]: """Separates out punctuations from beginning and end of words for chrF. Adapted from https://github.com/m-popovic/chrF and https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/metrics/chrf.py. Args: word: An input word to be separated from a punctuation if present. Return: A list of a single word or a separated word and punctuation. """ if len(word) == 1: return [word] if word[-1] in _PUNCTUATIONS: return [word[:-1], word[-1]] if word[0] in _PUNCTUATIONS: return [word[0], word[1:]] return [word] def _get_words_and_punctuation(sentence: str) -> list[str]: """Separates out punctuations from beginning and end of words for chrF for all words in the sentence. Args: sentence: An input sentence to split Return: An aggregated list of separated words and punctuations. """ return list(chain.from_iterable(_separate_word_and_punctuation(word) for word in sentence.strip().split())) def _ngram_counts(char_or_word_list: list[str], n_gram_order: int) -> dict[int, dict[tuple[str, ...], Tensor]]: """Calculate n-gram counts. Args: char_or_word_list: A list of characters of words n_gram_order: The largest number of n-gram. Return: A dictionary of dictionaries with a counts of given n-grams. """ ngrams: dict[int, dict[tuple[str, ...], Tensor]] = defaultdict(lambda: defaultdict(lambda: tensor(0.0))) for n in range(1, n_gram_order + 1): for ngram in (tuple(char_or_word_list[i : i + n]) for i in range(len(char_or_word_list) - n + 1)): ngrams[n][ngram] += tensor(1) return ngrams def _get_n_grams_counts_and_total_ngrams( sentence: str, n_char_order: int, n_word_order: int, lowercase: bool, whitespace: bool ) -> tuple[ dict[int, dict[tuple[str, ...], Tensor]], dict[int, dict[tuple[str, ...], Tensor]], dict[int, Tensor], dict[int, Tensor], ]: """Get n-grams and total n-grams. Args: sentence: An input sentence n_char_order: A character n-gram order. n_word_order: A word n-gram order. lowercase: An indication whether to enable case-insensitivity. whitespace: An indication whether to keep whitespaces during character n-gram extraction. Return: char_n_grams_counts: A dictionary of dictionaries with sentence character n-grams. word_n_grams_counts: A dictionary of dictionaries with sentence word n-grams. total_char_n_grams: A dictionary containing a total number of sentence character n-grams. total_word_n_grams: A dictionary containing a total number of sentence word n-grams. """ def _char_and_word_ngrams_counts( sentence: str, n_char_order: int, n_word_order: int, lowercase: bool ) -> tuple[dict[int, dict[tuple[str, ...], Tensor]], dict[int, dict[tuple[str, ...], Tensor]]]: """Get a dictionary of dictionaries with a counts of given n-grams.""" if lowercase: sentence = sentence.lower() char_n_grams_counts = _ngram_counts(_get_characters(sentence, whitespace), n_char_order) word_n_grams_counts = _ngram_counts(_get_words_and_punctuation(sentence), n_word_order) return char_n_grams_counts, word_n_grams_counts def _get_total_ngrams(n_grams_counts: dict[int, dict[tuple[str, ...], Tensor]]) -> dict[int, Tensor]: """Get total sum of n-grams over n-grams w.r.t n.""" total_n_grams: dict[int, Tensor] = defaultdict(lambda: tensor(0.0)) for n in n_grams_counts: total_n_grams[n] = sum(n_grams_counts[n].values()).detach().clone() # type: ignore return total_n_grams char_n_grams_counts, word_n_grams_counts = _char_and_word_ngrams_counts( sentence, n_char_order, n_word_order, lowercase ) total_char_n_grams = _get_total_ngrams(char_n_grams_counts) total_word_n_grams = _get_total_ngrams(word_n_grams_counts) return char_n_grams_counts, word_n_grams_counts, total_char_n_grams, total_word_n_grams def _get_ngram_matches( hyp_n_grams_counts: dict[int, dict[tuple[str, ...], Tensor]], ref_n_grams_counts: dict[int, dict[tuple[str, ...], Tensor]], ) -> dict[int, Tensor]: """Get a number of n-gram matches between reference and hypothesis n-grams. Args: hyp_n_grams_counts: n-grams counts for hypothesis ref_n_grams_counts: n-grams counts for reference Return: matching_n_grams """ matching_n_grams: dict[int, Tensor] = defaultdict(lambda: tensor(0.0)) for n in hyp_n_grams_counts: min_n_grams = [ torch.min(ref_n_grams_counts[n][n_gram], hyp_n_grams_counts[n][n_gram]) for n_gram in hyp_n_grams_counts[n] ] matching_n_grams[n] = sum(min_n_grams).detach().clone() # type: ignore return matching_n_grams def _sum_over_dicts(total_n_grams: dict[int, Tensor], n_grams: dict[int, Tensor]) -> dict[int, Tensor]: """Aggregate total n-grams to keep corpus-level statistics. Args: total_n_grams: A dictionary containing a total corpus-level number of n-grams. n_grams: A dictionary containing a sentence-level number of n-grams. Return: A dictionary containing a total corpus-level number of n-grams. """ for n in n_grams: total_n_grams[n] += n_grams[n] return total_n_grams def _calculate_fscore( matching_char_n_grams: dict[int, Tensor], matching_word_n_grams: dict[int, Tensor], hyp_char_n_grams: dict[int, Tensor], hyp_word_n_grams: dict[int, Tensor], ref_char_n_grams: dict[int, Tensor], ref_word_n_grams: dict[int, Tensor], n_order: float, beta: float, ) -> Tensor: """Calculate sentence-level chrF/chrF++ score. For given hypothesis and reference statistics (either sentence-level or corpus-level) the chrF/chrF++ score is returned. Args: matching_char_n_grams: A total number of matching character n-grams between the best matching reference and hypothesis. matching_word_n_grams: A total number of matching word n-grams between the best matching reference and hypothesis. hyp_char_n_grams: A total number of hypothesis character n-grams. hyp_word_n_grams: A total number of hypothesis word n-grams. ref_char_n_grams: A total number of reference character n-grams. ref_word_n_grams: A total number of reference word n-grams. n_order: A sum of character and word n-gram order. beta: A parameter determining an importance of recall w.r.t. precision. If `beta=1`, their importance is equal. Return: A chrF/chrF++ score. This function is universal both for sentence-level and corpus-level calculation. """ def _get_n_gram_fscore( matching_n_grams: dict[int, Tensor], ref_n_grams: dict[int, Tensor], hyp_n_grams: dict[int, Tensor], beta: float ) -> dict[int, Tensor]: """Get n-gram level f-score.""" precision: dict[int, Tensor] = { n: matching_n_grams[n] / hyp_n_grams[n] if hyp_n_grams[n] > 0 else tensor(0.0) for n in matching_n_grams } recall: dict[int, Tensor] = { n: matching_n_grams[n] / ref_n_grams[n] if ref_n_grams[n] > 0 else tensor(0.0) for n in matching_n_grams } denominator: dict[int, Tensor] = { n: torch.max(beta**2 * precision[n] + recall[n], _EPS_SMOOTHING) for n in matching_n_grams } f_score: dict[int, Tensor] = { n: (1 + beta**2) * precision[n] * recall[n] / denominator[n] for n in matching_n_grams } return f_score char_n_gram_f_score = _get_n_gram_fscore(matching_char_n_grams, ref_char_n_grams, hyp_char_n_grams, beta) word_n_gram_f_score = _get_n_gram_fscore(matching_word_n_grams, ref_word_n_grams, hyp_word_n_grams, beta) return (sum(char_n_gram_f_score.values()) + sum(word_n_gram_f_score.values())) / tensor(n_order) def _calculate_sentence_level_chrf_score( targets: list[str], pred_char_n_grams_counts: dict[int, dict[tuple[str, ...], Tensor]], pred_word_n_grams_counts: dict[int, dict[tuple[str, ...], Tensor]], pred_char_n_grams: dict[int, Tensor], pred_word_n_grams: dict[int, Tensor], n_char_order: int, n_word_order: int, n_order: float, beta: float, lowercase: bool, whitespace: bool, ) -> tuple[Tensor, dict[int, Tensor], dict[int, Tensor], dict[int, Tensor], dict[int, Tensor]]: """Calculate the best sentence-level chrF/chrF++ score. For a given pre-processed hypothesis, all references are evaluated and score and statistics for the best matching reference is returned. Args: targets: An iterable of references. pred_char_n_grams_counts: A dictionary of dictionaries with hypothesis character n-grams. pred_word_n_grams_counts: A dictionary of dictionaries with hypothesis word n-grams. pred_char_n_grams: A total number of hypothesis character n-grams. pred_word_n_grams: A total number of hypothesis word n-grams. n_char_order: A character n-gram order. n_word_order: A word n-gram order. n_order: A sum of character and word n-gram order. beta: A parameter determining an importance of recall w.r.t. precision. If `beta=1`, their importance is equal. lowercase: An indication whether to enable case-insensitivity. whitespace: An indication whether to keep whitespaces during character n-gram extraction. Return: Return chrF/chrF++ score and statistics for the best matching hypothesis and reference. f_score: A sentence-level chrF/chrF++ score. matching_char_n_grams: A total number of matching character n-grams between the best matching reference and hypothesis. matching_word_n_grams: A total number of matching word n-grams between the best matching reference and hypothesis. target_char_n_grams: A total number of reference character n-grams. target_word_n_grams: A total number of reference word n-grams. """ best_f_score = tensor(0.0) best_matching_char_n_grams: dict[int, Tensor] = defaultdict(lambda: tensor(0.0)) best_matching_word_n_grams: dict[int, Tensor] = defaultdict(lambda: tensor(0.0)) best_target_char_n_grams: dict[int, Tensor] = defaultdict(lambda: tensor(0.0)) best_target_word_n_grams: dict[int, Tensor] = defaultdict(lambda: tensor(0.0)) for target in targets: ( target_char_n_grams_counts, target_word_n_grams_counts, target_char_n_grams, target_word_n_grams, ) = _get_n_grams_counts_and_total_ngrams(target, n_char_order, n_word_order, lowercase, whitespace) matching_char_n_grams = _get_ngram_matches(target_char_n_grams_counts, pred_char_n_grams_counts) matching_word_n_grams = _get_ngram_matches(target_word_n_grams_counts, pred_word_n_grams_counts) f_score = _calculate_fscore( matching_char_n_grams, matching_word_n_grams, pred_char_n_grams, pred_word_n_grams, target_char_n_grams, target_word_n_grams, n_order, beta, ) if f_score > best_f_score: best_f_score = f_score best_matching_char_n_grams = matching_char_n_grams best_matching_word_n_grams = matching_word_n_grams best_target_char_n_grams = target_char_n_grams best_target_word_n_grams = target_word_n_grams return ( best_f_score, best_matching_char_n_grams, best_matching_word_n_grams, best_target_char_n_grams, best_target_word_n_grams, ) def _chrf_score_update( preds: Union[str, Sequence[str]], target: Union[Sequence[str], Sequence[Sequence[str]]], total_preds_char_n_grams: dict[int, Tensor], total_preds_word_n_grams: dict[int, Tensor], total_target_char_n_grams: dict[int, Tensor], total_target_word_n_grams: dict[int, Tensor], total_matching_char_n_grams: dict[int, Tensor], total_matching_word_n_grams: dict[int, Tensor], n_char_order: int, n_word_order: int, n_order: float, beta: float, lowercase: bool, whitespace: bool, sentence_chrf_score: Optional[List[Tensor]] = None, ) -> tuple[ dict[int, Tensor], dict[int, Tensor], dict[int, Tensor], dict[int, Tensor], dict[int, Tensor], dict[int, Tensor], Optional[List[Tensor]], ]: """Update function for chrf score. Args: preds: An iterable of hypothesis corpus. target: An iterable of iterables of reference corpus. total_preds_char_n_grams: A dictionary containing a total number of hypothesis character n-grams. total_preds_word_n_grams: A dictionary containing a total number of hypothesis word n-grams. total_target_char_n_grams: A dictionary containing a total number of reference character n-grams. total_target_word_n_grams: A dictionary containing a total number of reference word n-grams. total_matching_char_n_grams: A dictionary containing a total number of matching character n-grams between references and hypotheses. total_matching_word_n_grams: A dictionary containing a total number of total matching word n-grams between references and hypotheses. n_char_order: A character n-gram order. n_word_order: A word n-gram order. n_order: Sum of character and word n-gram order. beta: A parameter determining an importance of recall w.r.t. precision. If `beta=1`, their importance is equal. lowercase: An indication whether to enable case-insensitivity. whitespace: An indication whether to keep whitespaces during character n-gram extraction. sentence_chrf_score: A list of sentence-level chrF/chrF++ scores. Return: total_target_char_n_grams: number of reference character n-grams. total_target_word_n_grams: number of reference word n-grams. total_preds_char_n_grams: number of hypothesis character n-grams. total_preds_word_n_grams: number of hypothesis word n-grams. total_matching_char_n_grams: number of matching character n-grams between references and hypotheses. total_matching_word_n_grams: number of total matching word n-grams between references and hypotheses. sentence_chrf_score: A list of sentence-level chrF/chrF++ scores. Raises: ValueError: If length of ``preds`` and ``target`` differs. """ target_corpus, preds = _validate_inputs(target, preds) for pred, targets in zip(preds, target_corpus): ( pred_char_n_grams_counts, pred_word_n_grams_counts, pred_char_n_grams, pred_word_n_grams, ) = _get_n_grams_counts_and_total_ngrams(pred, n_char_order, n_word_order, lowercase, whitespace) total_preds_char_n_grams = _sum_over_dicts(total_preds_char_n_grams, pred_char_n_grams) total_preds_word_n_grams = _sum_over_dicts(total_preds_word_n_grams, pred_word_n_grams) ( sentence_level_f_score, matching_char_n_grams, matching_word_n_grams, target_char_n_grams, target_word_n_grams, ) = _calculate_sentence_level_chrf_score( targets, # type: ignore pred_char_n_grams_counts, pred_word_n_grams_counts, pred_char_n_grams, pred_word_n_grams, n_char_order, n_word_order, n_order, beta, lowercase, whitespace, ) if sentence_chrf_score is not None: sentence_chrf_score.append(sentence_level_f_score.unsqueeze(0)) total_target_char_n_grams = _sum_over_dicts(total_target_char_n_grams, target_char_n_grams) total_target_word_n_grams = _sum_over_dicts(total_target_word_n_grams, target_word_n_grams) total_matching_char_n_grams = _sum_over_dicts(total_matching_char_n_grams, matching_char_n_grams) total_matching_word_n_grams = _sum_over_dicts(total_matching_word_n_grams, matching_word_n_grams) return ( total_preds_char_n_grams, total_preds_word_n_grams, total_target_char_n_grams, total_target_word_n_grams, total_matching_char_n_grams, total_matching_word_n_grams, sentence_chrf_score, ) def _chrf_score_compute( total_preds_char_n_grams: dict[int, Tensor], total_preds_word_n_grams: dict[int, Tensor], total_target_char_n_grams: dict[int, Tensor], total_target_word_n_grams: dict[int, Tensor], total_matching_char_n_grams: dict[int, Tensor], total_matching_word_n_grams: dict[int, Tensor], n_order: float, beta: float, ) -> Tensor: """Compute chrF/chrF++ score based on pre-computed target, prediction and matching character and word n-grams. Args: total_preds_char_n_grams: number of hypothesis character n-grams. total_preds_word_n_grams: number of hypothesis word n-grams. total_target_char_n_grams: number of reference character n-grams. total_target_word_n_grams: number of reference word n-grams. total_matching_char_n_grams: number of matching character n-grams between references and hypotheses. total_matching_word_n_grams: number of total matching word n-grams between references and hypotheses. n_order: A sum of character and word n-gram order. beta: A parameter determining an importance of recall w.r.t. precision. If `beta=1`, their importance is equal. Return: A corpus-level chrF/chrF++ score. """ return _calculate_fscore( total_matching_char_n_grams, total_matching_word_n_grams, total_preds_char_n_grams, total_preds_word_n_grams, total_target_char_n_grams, total_target_word_n_grams, n_order, beta, ) def chrf_score( preds: Union[str, Sequence[str]], target: Sequence[Union[str, Sequence[str]]], n_char_order: int = 6, n_word_order: int = 2, beta: float = 2.0, lowercase: bool = False, whitespace: bool = False, return_sentence_level_score: bool = False, ) -> Union[Tensor, tuple[Tensor, Tensor]]: """Calculate `chrF score`_ of machine translated text with one or more references. This implementation supports both chrF score computation introduced in [1] and chrF++ score introduced in `chrF++ score`_. This implementation follows the implementations from https://github.com/m-popovic/chrF and https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/metrics/chrf.py. Args: preds: An iterable of hypothesis corpus. target: An iterable of iterables of reference corpus. n_char_order: A character n-gram order. If `n_char_order=6`, the metrics refers to the official chrF/chrF++. n_word_order: A word n-gram order. If `n_word_order=2`, the metric refers to the official chrF++. If `n_word_order=0`, the metric is equivalent to the original chrF. beta: A parameter determining an importance of recall w.r.t. precision. If `beta=1`, their importance is equal. lowercase: An indication whether to enable case-insensitivity. whitespace: An indication whether to keep whitespaces during character n-gram extraction. return_sentence_level_score: An indication whether a sentence-level chrF/chrF++ score to be returned. Return: A corpus-level chrF/chrF++ score. (Optionally) A list of sentence-level chrF/chrF++ scores if `return_sentence_level_score=True`. Raises: ValueError: If ``n_char_order`` is not an integer greater than or equal to 1. ValueError: If ``n_word_order`` is not an integer greater than or equal to 0. ValueError: If ``beta`` is smaller than 0. Example: >>> from torchmetrics.functional.text import chrf_score >>> preds = ['the cat is on the mat'] >>> target = [['there is a cat on the mat', 'a cat is on the mat']] >>> chrf_score(preds, target) tensor(0.8640) References: [1] chrF: character n-gram F-score for automatic MT evaluation by Maja Popović `chrF score`_ [2] chrF++: words helping character n-grams by Maja Popović `chrF++ score`_ """ if not isinstance(n_char_order, int) or n_char_order < 1: raise ValueError("Expected argument `n_char_order` to be an integer greater than or equal to 1.") if not isinstance(n_word_order, int) or n_word_order < 0: raise ValueError("Expected argument `n_word_order` to be an integer greater than or equal to 0.") if beta < 0: raise ValueError("Expected argument `beta` to be greater than 0.") n_order = float(n_char_order + n_word_order) ( total_preds_char_n_grams, total_preds_word_n_grams, total_target_char_n_grams, total_target_word_n_grams, total_matching_char_n_grams, total_matching_word_n_grams, ) = _prepare_n_grams_dicts(n_char_order, n_word_order) sentence_chrf_score: Optional[List[Tensor]] = [] if return_sentence_level_score else None ( total_preds_char_n_grams, total_preds_word_n_grams, total_target_char_n_grams, total_target_word_n_grams, total_matching_char_n_grams, total_matching_word_n_grams, sentence_chrf_score, ) = _chrf_score_update( preds, target, total_preds_char_n_grams, total_preds_word_n_grams, total_target_char_n_grams, total_target_word_n_grams, total_matching_char_n_grams, total_matching_word_n_grams, n_char_order, n_word_order, n_order, beta, lowercase, whitespace, sentence_chrf_score, ) chrf_f_score = _chrf_score_compute( total_preds_char_n_grams, total_preds_word_n_grams, total_target_char_n_grams, total_target_word_n_grams, total_matching_char_n_grams, total_matching_word_n_grams, n_order, beta, ) if sentence_chrf_score: return chrf_f_score, torch.cat(sentence_chrf_score) return chrf_f_score