# MIT License
#
# Copyright (c) 2024 The HuggingFace Team
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import logging
import re
from dataclasses import dataclass, field, replace
from functools import lru_cache
from itertools import groupby
from typing import Literal, Sequence

import sympy
from sympy import Basic, MatrixBase, Number
from sympy.parsing import parse_expr

from latex2sympy2_extended.latex2sympy2 import (
    NormalizationConfig,
    latex2sympy,
    normalize_latex,
)
from latex2sympy2_extended.sets import FiniteSet

from math_verify.errors import TimeoutException
from math_verify.grader import should_treat_as_complex
from math_verify.utils import timeout

logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class LatexExtractionConfig:
    """Config for extracting latex from the prediction.

    Attributes:
        try_extract_without_anchor (bool): Whether to try extracting latex without
            requiring specific anchors like "answer:" or "final answer is"
        boxed_match_priority (int): Priority for matching boxed expressions (e.g., \\boxed{}).
            - 0: Highest priority (matched first)
            - 50: Default priority (matched after final answer patterns)
            - -1: Disable boxed expression matching
        normalization_config (NormalizationConfig): Configuration for LaTeX normalization.
            Controls preprocessing of LaTeX expressions including:
            - Basic LaTeX cleanup
            - Unit handling
            - Operator formatting
            - Boxed expression extraction
            - Equation parsing
            Defaults to a comprehensive normalization configuration.
    """

    try_extract_without_anchor: bool = True
    boxed_match_priority: int = 50
    normalization_config: NormalizationConfig = field(
        default_factory=lambda: NormalizationConfig(
            basic_latex=True,
            units=True,
            malformed_operators=True,
            nits=True,
            boxed="all",
            equations=False,
        )
    )


@dataclass(frozen=True)
class ExprExtractionConfig:
    """Config for extracting mathematical expressions.

    Attributes:
        try_extract_without_anchor (bool): Whether to try extracting expressions
            without requiring specific anchors like "answer:" or "final answer is"
    """

    try_extract_without_anchor: bool = True


@dataclass(frozen=True)
class StringExtractionConfig:
    """Config for extracting literal strings.

    Attributes:
        strings (tuple[str, ...]): The strings to extract
        try_extract_without_anchor (bool): Whether to try extracting strings without
            requiring specific anchors like "answer:" or "final answer is"
        lowercase (bool): Whether to lowercase the parsed string before returning it
    """

    strings: tuple[str, ...] = field(default_factory=lambda: ("A", "B", "C", "D"))
    try_extract_without_anchor: bool = True
    lowercase: bool = True


ExtractionTarget = LatexExtractionConfig | ExprExtractionConfig | StringExtractionConfig


# All of the regexes are cached, to avoid repeated compiling during processing of same task
@lru_cache(maxsize=10)
def lazy_string_regex(
    string_extraction_config: StringExtractionConfig,
) -> list[tuple[re.Pattern[str], int]]:
    """Builds (pattern, priority) pairs for extracting literal answer strings.

    Lower priority value means the pattern is tried first.
    """
    # First get indices to predict; the extracted value is captured in the
    # "string_keys" group (read back by extract_string).
    string_keys = f"(?P<string_keys>{'|'.join([re.escape(i) for i in string_extraction_config.strings])})"

    # The strings are either surrounded with **answer**., or 'answer.' or the same without the dot
    full_stop_re = r"\."
    comma_re = r","
    colon_re = r":"
    space_re = r"\s"

    answer_prefix_re = rf"(^|{space_re})(?:\*\*)?"
    answer_suffix_re = (
        rf"(?:\*\*)?(?:{full_stop_re}|{comma_re}|{colon_re}|{space_re}|$)"
    )
    answer_re = f"{answer_prefix_re}{string_keys}{answer_suffix_re}"
    answer_re_start = rf"^(?:\*\*)?{string_keys}{answer_suffix_re}"
    answer_word = "(?i:answer)"

    regexes = []
    final_answer_prefixed_re = rf"(?i:final answer is)\:?\s*{string_keys}\.?\s?I hope"
    # To allow stuff like "final answer is to your question"
    final_answer_prefixed_just_is = (
        rf"(?i:final answer.{{0,100}}?)\s+is\:?\s*{string_keys}"
    )
    regexes.extend(
        [
            (final_answer_prefixed_re, 0),
            (final_answer_prefixed_just_is, 50),
        ]
    )

    regexes.extend(
        [
            # Most specific patterns first
            (f"{answer_word}{colon_re}.{{0,50}}?{answer_re}", 100),
            # Answer word patterns
            (f"{answer_word}.{{0,50}}?{answer_re}", 200),
        ]
    )

    if string_extraction_config.try_extract_without_anchor:
        # Start of line patterns
        regexes.append((answer_re_start, 250))
        # Plain string patterns
        regexes.append((answer_re, 300))

    return [(re.compile(pattern), priority) for pattern, priority in regexes]


# All of the regexes are cached, to avoid repeated compiling during processing of same task
@lru_cache(maxsize=1)
def lazy_expr_regex(
    expr_config: ExprExtractionConfig,
) -> list[tuple[re.Pattern[str], int]]:
    """Builds (pattern, priority) pairs for extracting plain numbers/expressions.

    Group names ("integer*", "decimal*", "expr", "percent") are read back by
    extract_expr.
    """
    # Basic number patterns (no LaTeX)
    number_re = (
        r"(?:"
        # Format 1: Numbers with thousand separators (e.g., "1,234.56" or "1 234.56")
        r"(?P<integer1>-?[1-9]\d{0,2}(?:[ ,]\d{3})+)(?P<decimal1>\.\d+)?|"
        # Format 2: Simple numbers with decimal point or comma (e.g., "123.45" or "123,45")
        r"(?P<integer2>-?\d+)(?P<decimal2>[.,]\d+)|"
        # Format 3: Decimal part only (e.g., ".123")
        r"(?P<decimal3>\.\d+)|"
        # Format 4: Integer only (e.g., "123")
        r"(?P<integer3>-?\d+)"
        r")(?P<percent>\s*(?:%|[Pp]ercent|\s*[Pp]ercentage|\s*[Pp]ct))?"
    )

    # Expressions such as 1/2
    operators = [r"\+", r"\-", r"\*", r"\×", r"\/", r"\^", r"\(", r"\)", r"\÷"]
    operators_re = "".join(operators)
    all_expr_chars = r"[\d\.\s" + operators_re + r"]"
    # Expression should have at minimum at least one operator and must start with a digit
    expr_re = (
        rf"(?P<expr>-?\(?-?\d{all_expr_chars}*[{operators_re}]{all_expr_chars}+\)?)"
    )

    # Punctuation regexes
    full_stop_re = r"\."
    comma_re = r","
    colon_re = r":"
    space_re = r"\s"

    currency_units = re.escape("$€£¥₹₽₪₩₫฿₡₢₣₤₥₦₧₨₩₪₫₭₮₯₰₱₲₳₴₵₶₷₸₹₺₻₼₽₾₿")
    expr_prefix_re = rf"(?:^|{space_re}|\=)(?:\*\*)?"
    expr_suffix_re = (
        rf"(?:\*\*)?(?:{full_stop_re}|{comma_re}|{colon_re}|{space_re}|\)|\$|$)"
    )
    # Expressions must be prefixed and suffixed while, digits don't need suffix and can have currency units preceeded, this is to ensure
    # That we can extract stuff like $100 or 100m2, while we don't extract XDY2K as 2
    expr_with_anchors = rf"(?:{expr_prefix_re}{expr_re}{expr_suffix_re})"
    number_with_anchors = rf"(?:{expr_prefix_re}[{currency_units}]?{number_re})"
    expr_or_number = rf"(?:{expr_with_anchors}|{number_with_anchors})"

    regexes: list[tuple[str, int]] = []
    final_answer_prefixed_re = (
        rf"(?i:final answer is)\:?\s*{expr_or_number}\.?\s?I hope"
    )
    final_answer_prefixed_just_is = (
        rf"(?i:final answer.{{0,100}}?)\s+is\:?{expr_or_number}"
    )
    regexes.append((final_answer_prefixed_re, 0))
    regexes.append((final_answer_prefixed_just_is, 50))

    answer_prefix_re = r"(?i:answer)"

    # Match after the last equals with answer word - require the number pattern,
    equals_re_colon = rf"{answer_prefix_re}{colon_re}(?:.{{0,100}}=\s*|.{{0,50}}?){expr_or_number}(?!\s*=)"
    equals_re = (
        rf"{answer_prefix_re}(?:.{{0,100}}=\s*|.{{0,50}}?){expr_or_number}(?!\s*=)"
    )
    regexes.extend([(equals_re_colon, 100), (equals_re, 200)])

    if expr_config.try_extract_without_anchor:
        # If everything fails, try to match plain expr/number
        regexes.append((expr_with_anchors, 300))
        regexes.append((number_with_anchors, 300))

    return [(re.compile(pattern), priority) for pattern, priority in regexes]


def make_latex_env_pattern(
    prefix: str = "", context: Literal["boxed", "plain"] = "plain"
) -> str:
    """Creates a LaTeX environment pattern with uniquely prefixed group names.

    Args:
        prefix (str): Prefix to add to group names to make them unique
        context (Literal["boxed", "plain"]): Type of content to match inside the environments
            - "boxed": Match environments containing \\boxed{...}
            - "plain": Match any LaTeX content (also matches bare fractions)

    Returns:
        str: Regex pattern for matching LaTeX environments with percent suffix
    """
    percent_re_group = rf"(?P<{prefix}percent>(?:\\?%|[Pp]ercent|[Pp]ercentage|[Pp]ct))"

    # Define base content patterns
    display_dollar_content = r"(?:[^$]|\$(?!\$))"
    # Either \ not followed by ] or everything but \
    display_content_bracket = r"(?:[^\\]|\\(?!\]))"
    inline_dollar_content = r"(?:\\[$]|[^\n$])"
    inline_content_parenthesis = r"(?:[^\\\n]|\\(?!\)))"
    inline_content_bracket = r"[^\n\]\[]"

    if context == "boxed":
        # Rewrite patterns to optionally include boxed content
        display_dollar_content = rf"{display_dollar_content}*?\\boxed{{{display_dollar_content}+?}}{display_dollar_content}*?"
        display_content_bracket = rf"{display_content_bracket}*?\\boxed{{{display_content_bracket}+?}}{display_content_bracket}*?"
        inline_dollar_content = rf"{inline_dollar_content}*?\\boxed{{{inline_dollar_content}+?}}{inline_dollar_content}*?"
        inline_content_parenthesis = rf"{inline_content_parenthesis}*?\\boxed{{{inline_content_parenthesis}+?}}{inline_content_parenthesis}*?"
        inline_content_bracket = rf"{inline_content_bracket}*?\\boxed{{{inline_content_bracket}+?}}{inline_content_bracket}*?"
    else:
        display_dollar_content = rf"{display_dollar_content}+?"
        display_content_bracket = rf"{display_content_bracket}+?"
        inline_dollar_content = rf"{inline_dollar_content}+?"
        inline_content_parenthesis = rf"{inline_content_parenthesis}+?"
        inline_content_bracket = rf"{inline_content_bracket}+?"

    # Build list of regex patterns
    patterns = [
        # Display math environments (allow multiline)
        rf"(?<!\\)\$\$(?P<{prefix}latexDisplayDollar>{display_dollar_content})(?<!\\)\$\$",
        rf"(?<!\\)\\\[(?P<{prefix}latexDisplayBracket>{display_content_bracket})(?<!\\)\\\]",
        # Inline math environments (don't allow multiline)
        rf"(?<!\\|\$)\$(?P<{prefix}latexInlineDollar>{inline_dollar_content})(?<!\\)\$",
        rf"(?<!\\)\\\((?P<{prefix}latexInlineParenthesis>{inline_content_parenthesis})(?<!\\)\\\)",
        # Bare bracket math sandwiched between whitespace
        rf"\s\[(?P<{prefix}latexInlineBracket>{inline_content_bracket})\]\s",
    ]

    if context == "plain":
        simple_number = r"-?\d+(?:[.,]\d+)?"
        patterns.append(
            rf"(?P<{prefix}latexFraction>-?\\frac{{{simple_number}}}{{{simple_number}}})"
        )

    # Join patterns with | and wrap in parentheses
    latex_env_re = rf"(?:(?:{'|'.join(patterns)})\s*{percent_re_group}?)"
    return latex_env_re


@lru_cache(maxsize=1)
def lazy_latex_regex(
    latex_config: LatexExtractionConfig,
) -> list[tuple[re.Pattern[str], int]]:
    """Builds (pattern, priority) pairs for extracting LaTeX answers."""
    # Pattern for multiple latex environments connected by and/or (also considering oxford comma)
    # Create patterns for up to 5 connected expressions
    first_latex_group = make_latex_env_pattern("first_")
    next_groups = "".join(
        [
            rf"(?:\s*(?:,?and|,?or|,)\s*{make_latex_env_pattern(f'next{i}_')})?"
            for i in range(1, 6)
        ]
    )

    latex_envs_re = rf"(?:{first_latex_group}{next_groups})"
    colon_re = r":"
    answer_prefix_re = r"(?i:answer)"

    # We first match boxed env, for some reason that's the most common case of output
    # Then we match the latex with environments, then we try to match the fraction
    regexes: list[tuple[str, int]] = []
    for latex_re in [latex_envs_re]:
        final_answer_prefixed_re = rf"(?i:final answer is)\:?\s*{latex_re}\.?\s?I hope"
        final_answer_prefixed_just_is = (
            rf"(?i:final answer.{{0,100}}?)\s+is\:?\s*{latex_re}"
        )
        regexes.append((final_answer_prefixed_re, 0))
        regexes.append((final_answer_prefixed_just_is, 50))

        # Match with answer word - higher priority than plain latex
        answer_re_colon = f"{answer_prefix_re}{colon_re}.{{0,50}}?{latex_re}"
        answer_re = f"{answer_prefix_re}.{{0,50}}?{latex_re}"
        regexes.extend([(answer_re_colon, 100), (answer_re, 200)])

        # Match plain LaTeX - lowest priority
        if latex_config.try_extract_without_anchor:
            regexes.append((latex_re, 300))

    # This ensures that boxed is matched right after the final answer xxxx
    if latex_config.boxed_match_priority >= 0:
        latex_re_boxed = make_latex_env_pattern(prefix="first_", context="boxed")
        next_groups = "".join(
            [
                rf"\s*(?:\s*(?:,?and|,?or|,)\s*{make_latex_env_pattern(f'next{i}_', context='boxed')})?"
                for i in range(1, 6)
            ]
        )
        latex_re_boxed = rf"{latex_re_boxed}{next_groups}"
        regexes.append((latex_re_boxed, latex_config.boxed_match_priority))
        # Match plain boxed. The issue with plain boxed is that it's impossible to know
        # where it stops, so we greedily capture till the last }. We do the actual
        # extraction in the normalization step.
        regexes.append(
            (r"(?P<first_latexBoxed>\\boxed{.+})", latex_config.boxed_match_priority)
        )

    return [(re.compile(pattern, re.DOTALL), priority) for pattern, priority in regexes]


def get_extraction_regexes(
    target_types: Sequence[ExtractionTarget],
) -> list[tuple[list[tuple[re.Pattern[str], int]], ExtractionTarget]]:
    """Resolves each extraction config to its (cached) regex list."""
    extraction_regexes: list[
        tuple[list[tuple[re.Pattern[str], int]], ExtractionTarget]
    ] = [
        (
            (lazy_latex_regex(target_type), target_type)
            if isinstance(target_type, LatexExtractionConfig)
            else (
                (lazy_expr_regex(target_type), target_type)
                if isinstance(target_type, ExprExtractionConfig)
                else (lazy_string_regex(target_type), target_type)
            )
        )
        for target_type in target_types
    ]
    return extraction_regexes


# Small cache, to catch repeated calls with invalid parsing
@lru_cache(maxsize=20)
def parse_latex_cached(latex: str):
    """Parses a latex string to sympy; falls back to the last equation part."""
    # First try to parse the latex as is
    try:
        return latex2sympy(
            latex,
            is_real=not should_treat_as_complex(latex),
            convert_degrees=False,
            normalization_config=None,
        )
    except Exception as e:
        # If that fails, try to parse just the last equation
        last_eq_latex = get_last_eq(latex)
        if last_eq_latex != latex:
            return latex2sympy(
                last_eq_latex,
                is_real=not should_treat_as_complex(last_eq_latex),
                convert_degrees=False,
                normalization_config=None,
            )
        else:
            raise e


@lru_cache(maxsize=20)
def parse_expr_cached(expr: str):
    """Cached sympy parse_expr without evaluation."""
    return parse_expr(expr, evaluate=False)


def extract_expr(match: re.Match) -> tuple[str | sympy.Expr | None, str]:
    """Converts a regex match from lazy_expr_regex into a sympy object.

    Returns a (parsed_value_or_None, raw_string) pair.
    """
    # First combine the number
    groups = match.groupdict()
    # Expr group will always exist because every regex has it
    expr = groups.get("expr", "")
    integer = next(
        (val for name, val in groups.items() if name.startswith("integer") and val), ""
    )
    decimal = next(
        (val for name, val in groups.items() if name.startswith("decimal") and val), ""
    )

    is_percentage = True if groups.get("percent", None) else False

    if integer or decimal:
        # This makes sure we can convert numbers like 0001 to 1. Do note that this can
        # convert 0 to '', so we assume an empty string was 0 and convert it back afterwards.
        integer = integer.translate(str.maketrans("", "", ", ")).lstrip("0")
        if len(integer) == 0:
            integer = "0"
        decimal = decimal.replace(",", ".")
        number_str = f"{integer}{decimal}"
        number = Number(number_str)
        if is_percentage:
            number = convert_to_pct(number)
        return number, number_str

    # Otherwise just return the expression
    # Remove new lines and spaces
    if expr:
        try:
            return (
                parse_expr_cached(expr.replace("\n", " ").replace("^", "**")),
                expr,
            )
        except Exception:
            pass
    return None, expr


def convert_to_pct(number: Number):
    """Wraps a number as number/100 without evaluating it."""
    return sympy.Mul(number, sympy.Rational(1, 100), evaluate=False)


# An "=" that is not part of a comparison operator (<=, >=, !=, ==, ~=).
# NOTE(review): reconstructed pattern — the original was corrupted to r"(?)=".
equation_split_regex = re.compile(r"(?<![<>~!=])=(?!=)")


def get_last_eq(latex: str):
    """Returns the right-most side of an equation chain (a = b = c -> c)."""
    # This is to ensure that a=1,b=2 is not splitted
    if "," not in latex and ";" not in latex:
        eq_parts = equation_split_regex.split(latex)
        # We only shorten if there are more than 2 parts, otherwise we keep equation as is
        if len(eq_parts) > 2:
            return eq_parts[-1]
    return latex


@lru_cache(maxsize=20)
def extract_latex(
    match: re.Match, latex_config: LatexExtractionConfig
) -> tuple[sympy.Expr | str | None, str]:
    """Normalizes and parses all latex groups captured by lazy_latex_regex.

    Returns a (parsed_value_or_None, normalized_string) pair; multiple parsed
    expressions are joined into a FiniteSet.
    """
    latex_exprs = []
    latex_strs = []

    # Get all latex groups (both first_ and nextN_ prefixes)
    first_latex_group = next(
        (
            (val, name)
            for name, val in match.groupdict().items()
            if name.startswith("first_latex") and val
        ),
        None,
    )

    # Get all nextN_ groups
    next_latex_groups = [
        next(
            (
                (val, name)
                for name, val in match.groupdict().items()
                if name.startswith(f"next{i}_latex") and val
            ),
            None,
        )
        for i in range(1, 6)
    ]

    all_latex = list(
        filter(lambda x: x is not None, [first_latex_group] + next_latex_groups)
    )

    for latex, name in all_latex:
        name_without_prefix = name.split("_")[0]
        group_name = name.split("_")[1] if len(name.split("_")) > 1 else None
        is_percentage = (
            True if match.groupdict().get(f"{name_without_prefix}_percent") else False
        )

        # Use modified config if group name is 'boxed'
        config = latex_config.normalization_config
        if group_name == "latexBoxed":
            config = replace(config, boxed="last")  # Use replace to modify single field

        normalized_latex = normalize_latex(
            latex,
            config=config,
        )
        latex_strs.append(normalized_latex)

        try:
            parsed_latex = parse_latex_cached(normalized_latex)
            if is_percentage:
                parsed_latex = convert_to_pct(parsed_latex)
            latex_exprs.append(parsed_latex)
        except Exception:
            latex_exprs.append(None)

    if not latex_exprs:
        return None, ""

    # If we have multiple expressions and all of them are parsed, wrap them in a Tuple
    if len(latex_exprs) > 1 and all(expr is not None for expr in latex_exprs):
        # To handle solution is: 1,2 and 3
        all_elements = []
        for expr in latex_exprs:
            if isinstance(expr, FiniteSet):
                all_elements.extend(expr.args)
            else:
                all_elements.append(expr)
        return FiniteSet(*all_elements), " and ".join(latex_strs)

    # Otherwise return the single expression
    return latex_exprs[0], latex_strs[0]


def extract_string(match: re.Match, string_config: StringExtractionConfig):
    """Returns the matched literal string, optionally lowercased."""
    extracted_str = match.group("string_keys")
    parsed_str = extracted_str
    if string_config.lowercase:
        parsed_str = extracted_str.lower()
    return parsed_str, extracted_str


def extract_match(
    match: re.Match, target_type: ExtractionTarget
) -> tuple[Basic | MatrixBase | str | None, str]:
    """Extracts the match from the regex match.

    Args:
        match (re.Match): The regex match object containing the extracted text
        target_type (ExtractionTarget): The type of extraction to perform
            (latex, expression, or string)

    Returns:
        tuple[Basic | MatrixBase | str | None, str]: A tuple containing:
            - The extracted and parsed value (if successful) or None (if parsing failed)
            - The string representation of the extracted text
    """
    if isinstance(target_type, LatexExtractionConfig):
        return extract_latex(match, target_type)
    elif isinstance(target_type, ExprExtractionConfig):
        return extract_expr(match)
    elif isinstance(target_type, StringExtractionConfig):
        return extract_string(match, target_type)


def extract_target_from_pred(
    pred: str,
    target_res: list[tuple[list[tuple[re.Pattern[str], int]], ExtractionTarget]],
    fallback_mode: Literal["no_fallback", "first_match"] = "no_fallback",
    extraction_mode: Literal["first_match", "any_match"] = "any_match",
):
    """Extracts targets from a prediction string using regex patterns.
    Returns first successfully extracted match.

    Args:
        pred (str): The prediction string to extract from
        target_res (list[tuple[list[tuple[re.Pattern[str], int]], ExtractionTarget]]):
            List of regex patterns and their priorities for each target type
        fallback_mode (Literal["no_fallback", "first_match"], optional): How to handle
            extraction failures. Defaults to "no_fallback".
            - "no_fallback": Return only successfully parsed match
            - "first_match": Additionally include the first string match no matter how parsing finished
        extraction_mode (Literal["first_match", "any_match"], optional): How to handle
            extraction failures. Defaults to "any_match".
            - "first_match": Only tries to extract the first match
            - "any_match": Tries to extract any match

    Returns:
        list: List of extracted predictions, with first fallback string appended
            if fallback_mode is "first_match"
    """
    extracted_predictions = []
    fallbacks = []

    # Get all patterns and sort by priority
    all_patterns = [
        (pattern, target_type, priority)
        for target_patterns, target_type in target_res
        for pattern, priority in target_patterns
    ]

    # Group patterns by priority using itertools.groupby
    match_found = False
    sorted_patterns = sorted(all_patterns, key=lambda x: x[2])
    grouped_patterns = list(
        (gr, list(val)) for gr, val in groupby(sorted_patterns, key=lambda x: x[2])
    )
    for _, patterns_group in grouped_patterns:
        # Find all matches for each pattern in this priority group
        matches_with_pos = (
            (match, match.start(), match.end(), target_type)
            for pattern, target_type, _ in patterns_group
            for match in pattern.finditer(pred)
        )

        # Sort matches by end position (rightmost first) and then by start position (leftmost first)
        matches_with_pos = sorted(
            matches_with_pos, key=lambda x: (x[2], -x[1]), reverse=True
        )

        # Try to extract from each match, starting from rightmost
        for match, _, _, target_type in matches_with_pos:
            extracted_match, str_fallback = extract_match(match, target_type)

            match_found = True
            if str_fallback:
                fallbacks.append(str_fallback)

            if extracted_match is not None:
                extracted_predictions.append(extracted_match)
                break

            if extraction_mode == "first_match":
                break

        # If we extracted something or found something and we're in first_match mode,
        # stop processing other priorities
        if extracted_predictions or (match_found and extraction_mode == "first_match"):
            break

    if fallback_mode == "first_match" and fallbacks:
        extracted_predictions += [fallbacks[0]]

    return extracted_predictions


def parse(
    pred: str,
    extraction_config: Sequence[ExtractionTarget] = (
        LatexExtractionConfig(),
        ExprExtractionConfig(),
    ),
    fallback_mode: Literal["no_fallback", "first_match"] = "first_match",
    extraction_mode: Literal["first_match", "any_match"] = "any_match",
    parsing_timeout: int = 5,
):
    """Extracts and parses mathematical expressions from a prediction string.

    This function attempts to extract mathematical expressions from text using
    various strategies (LaTeX, plain expressions, etc.) and converts them to
    SymPy objects.

    Args:
        pred (str): The prediction string to parse.
        extraction_config (Sequence[ExtractionTarget], optional): Configuration for what
            types of expressions to extract and how to extract them. Defaults to
            (LatexExtractionConfig(), ExprExtractionConfig()).
        fallback_mode (Literal["no_fallback", "first_match"], optional): How to handle
            extraction failures. Defaults to "first_match".
            - "no_fallback": Return only successfully parsed expressions
            - "first_match": Include the first string match even if parsing failed
        extraction_mode (Literal["first_match", "any_match"], optional): Strategy for
            extracting matches. Defaults to "any_match".
            - "first_match": Stop after finding the first match
            - "any_match": Try to extract any match, stops after first successful
              parsing attempt
        parsing_timeout (int, optional): Maximum time in seconds to spend parsing each
            expression. Defaults to 5.

    Returns:
        list: List of extracted predictions. Each prediction can be:
            - SymPy expression (for successfully parsed mathematical expressions)
            - String (for fallback matches when fallback_mode="first_match")
            Empty list if no matches are found.

    Examples:
        >>> parse("The answer is $\\frac{1}{2}$")
        [Rational(1, 2)]
        >>> parse("The answer is 1/2")
        [Rational(1, 2)]
        >>> parse("The answer is A", extraction_config=[StringExtractionConfig()])
        ['a']
    """
    try:
        target_res = get_extraction_regexes(extraction_config)
        return timeout(timeout_seconds=parsing_timeout)(extract_target_from_pred)(
            pred,
            target_res,
            fallback_mode=fallback_mode,
            extraction_mode=extraction_mode,
        )
    # TimeoutException must be caught before the generic Exception handler,
    # otherwise this branch is unreachable.
    except TimeoutException:
        logger.error("Timeout during parsing: %s", pred)
        return []
    except Exception:
        logger.exception("Error parsing: %s", pred)
        return []