Spaces:
Running
Running
Tabular is not defined
Browse files- src/utils/latex_converter.py +86 -66
src/utils/latex_converter.py
CHANGED
@@ -28,7 +28,7 @@ class LatexConverter:
|
|
28 |
processed_text = latex_text
|
29 |
|
30 |
# Stage 1: Pre-process tables before standard conversion
|
31 |
-
processed_text = LatexConverter.
|
32 |
|
33 |
# Stage 2: Convert using latex2markdown library
|
34 |
try:
|
@@ -42,63 +42,115 @@ class LatexConverter:
|
|
42 |
# Stage 3: Post-process to fix any remaining issues
|
43 |
processed_text = LatexConverter._postprocess_markdown(processed_text)
|
44 |
|
|
|
|
|
|
|
45 |
return processed_text
|
46 |
|
47 |
@staticmethod
|
48 |
-
def
|
49 |
"""
|
50 |
-
|
51 |
|
52 |
Args:
|
53 |
latex_text: Raw LaTeX text
|
54 |
|
55 |
Returns:
|
56 |
-
|
57 |
"""
|
58 |
processed_text = latex_text
|
|
|
59 |
|
60 |
# Find all tabular environments
|
61 |
table_pattern = r'\\begin{tabular}(.*?)\\end{tabular}'
|
62 |
tables = re.findall(table_pattern, processed_text, re.DOTALL)
|
63 |
|
64 |
for i, table_content in enumerate(tables):
|
65 |
-
|
66 |
-
|
67 |
-
if not col_spec_match:
|
68 |
-
continue
|
69 |
-
|
70 |
-
# Process the table content
|
71 |
-
rows_text = re.sub(r'{[^}]*}', '', table_content, count=1) # Remove the column spec
|
72 |
|
73 |
-
#
|
74 |
-
|
75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
|
|
84 |
|
85 |
-
#
|
86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
92 |
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
-
#
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
|
|
|
|
100 |
|
101 |
-
return
|
102 |
|
103 |
@staticmethod
|
104 |
def _postprocess_markdown(markdown_text: str) -> str:
|
@@ -113,38 +165,6 @@ class LatexConverter:
|
|
113 |
"""
|
114 |
processed_text = markdown_text
|
115 |
|
116 |
-
# Fix common issues with tables
|
117 |
-
# 1. Fix pipe tables that may be malformed
|
118 |
-
table_lines = []
|
119 |
-
in_table = False
|
120 |
-
|
121 |
-
for line in processed_text.split('\n'):
|
122 |
-
if '|' in line and not line.strip().startswith('|') and not in_table:
|
123 |
-
# This might be the start of a table, add the missing pipe
|
124 |
-
line = '| ' + line
|
125 |
-
in_table = True
|
126 |
-
|
127 |
-
if in_table:
|
128 |
-
if '|' in line:
|
129 |
-
# Ensure line ends with pipe
|
130 |
-
if not line.strip().endswith('|'):
|
131 |
-
line = line + ' |'
|
132 |
-
table_lines.append(line)
|
133 |
-
else:
|
134 |
-
# End of table
|
135 |
-
in_table = False
|
136 |
-
|
137 |
-
# If this is a table, add a header separator row after the first row
|
138 |
-
if len(table_lines) > 0:
|
139 |
-
col_count = table_lines[0].count('|') - 1
|
140 |
-
separator = '| ' + ' | '.join(['---'] * col_count) + ' |'
|
141 |
-
table_lines.insert(1, separator)
|
142 |
-
|
143 |
-
# Add the current line and the processed table
|
144 |
-
for table_line in table_lines:
|
145 |
-
processed_text = processed_text.replace(table_line, table_line)
|
146 |
-
table_lines = []
|
147 |
-
|
148 |
# Fix math blocks
|
149 |
processed_text = re.sub(r'\\\[(.*?)\\\]', r'$$\1$$', processed_text, flags=re.DOTALL)
|
150 |
processed_text = re.sub(r'\\\((.*?)\\\)', r'$\1$', processed_text, flags=re.DOTALL)
|
|
|
28 |
processed_text = latex_text
|
29 |
|
30 |
# Stage 1: Pre-process tables before standard conversion
|
31 |
+
processed_text, tables_dict = LatexConverter._extract_tables(processed_text)
|
32 |
|
33 |
# Stage 2: Convert using latex2markdown library
|
34 |
try:
|
|
|
42 |
# Stage 3: Post-process to fix any remaining issues
|
43 |
processed_text = LatexConverter._postprocess_markdown(processed_text)
|
44 |
|
45 |
+
# Stage 4: Reinsert tables as markdown tables
|
46 |
+
processed_text = LatexConverter._reinsert_tables(processed_text, tables_dict)
|
47 |
+
|
48 |
return processed_text
|
49 |
|
50 |
@staticmethod
def _extract_tables(latex_text: str) -> tuple:
    """
    Extract tables from LaTeX and replace with placeholders.

    Each tabular environment is swapped, in a single pass, for its own
    TABLE_PLACEHOLDER_<n> token. The text between the begin/end markers
    (column specification included) is stored under that token so it can
    be converted and reinserted after the main conversion has run.

    Args:
        latex_text: Raw LaTeX text

    Returns:
        tuple: (processed text with placeholders, dict of tables)
    """
    table_pattern = re.compile(r'\\begin{tabular}(.*?)\\end{tabular}', re.DOTALL)

    # Zero-pad every index to the same width so that no placeholder is a
    # prefix of another one (e.g. "..._1" inside "..._10"); otherwise a plain
    # string replacement of the shorter token would corrupt the longer one.
    # With ten tables or fewer the names are identical to the unpadded form.
    table_count = sum(1 for _ in table_pattern.finditer(latex_text))
    index_width = len(str(max(table_count - 1, 0)))

    tables_dict = {}

    def _swap_for_placeholder(match):
        # One placeholder per occurrence, numbered in document order
        placeholder = f"TABLE_PLACEHOLDER_{len(tables_dict):0{index_width}d}"
        tables_dict[placeholder] = match.group(1)
        return placeholder

    processed_text = table_pattern.sub(_swap_for_placeholder, latex_text)

    return processed_text, tables_dict
|
79 |
+
|
80 |
+
@staticmethod
def _reinsert_tables(markdown_text: str, tables_dict: dict) -> str:
    """
    Convert LaTeX tables to Markdown tables and reinsert them.

    All placeholders are substituted in one pass over the text, so a
    placeholder that is a prefix of a longer one (e.g. "..._1" and
    "..._10") cannot clobber it, and text inside an already inserted
    table is never rescanned for further placeholders.

    Args:
        markdown_text: Processed markdown text with placeholders
        tables_dict: Dictionary of tables extracted from LaTeX

    Returns:
        str: Markdown text with tables converted and reinserted
    """
    if not tables_dict:
        return markdown_text

    # Convert every extracted table once, keyed by its placeholder
    rendered_tables = {
        placeholder: LatexConverter._convert_table_to_markdown(table_content)
        for placeholder, table_content in tables_dict.items()
    }

    # Longest alternatives first: regex alternation takes the first branch
    # that matches, so the longer token must be tried before its prefix.
    ordered_placeholders = sorted(rendered_tables, key=len, reverse=True)
    placeholder_pattern = re.compile(
        "|".join(re.escape(placeholder) for placeholder in ordered_placeholders)
    )

    # A callable replacement inserts the table verbatim (no backslash
    # escape processing on the Markdown text).
    return placeholder_pattern.sub(
        lambda match: rendered_tables[match.group(0)],
        markdown_text,
    )
|
102 |
+
|
103 |
+
@staticmethod
def _convert_table_to_markdown(table_content: str) -> str:
    """
    Convert a LaTeX table to Markdown format.

    The first row of the table becomes the Markdown header row. Rows that
    are shorter than the widest row are padded with empty cells so every
    line of the result has the same number of columns.

    Args:
        table_content: LaTeX table content (the body of a tabular
            environment, starting with its column specification)

    Returns:
        str: Markdown table, or "[Table conversion failed]" when no
            column specification can be found, or "[Empty table]" when
            the table has no rows
    """
    text = table_content.lstrip()

    # Skip the optional vertical-position argument, e.g. "[t]"
    if text.startswith('['):
        closing_bracket = text.find(']')
        if closing_bracket != -1:
            text = text[closing_bracket + 1:].lstrip()

    if not text.startswith('{'):
        return "[Table conversion failed]"

    # Find the end of the column specification by brace matching, so nested
    # groups such as {|p{3cm}|c|} are consumed as a whole.
    depth = 0
    spec_end = -1
    for index, char in enumerate(text):
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                spec_end = index
                break
    if spec_end == -1:
        return "[Table conversion failed]"

    # Split into rows by \\ or \hline and drop the empty fragments
    rows_text = text[spec_end + 1:]
    rows = [row.strip() for row in re.split(r'\\\\|\\hline', rows_text)]
    rows = [row for row in rows if row]

    if not rows:
        return "[Empty table]"

    # Split cells on unescaped "&" only (an escaped \& is a literal
    # ampersand), and escape bare pipes so they cannot be read as Markdown
    # column separators.
    table_cells = [
        [
            re.sub(r'(?<!\\)\|', r'\\|', cell.strip())
            for cell in re.split(r'(?<!\\)&', row)
        ]
        for row in rows
    ]

    # Size the table by its widest row so no row overflows the header
    num_cols = max(len(cells) for cells in table_cells)

    def _format_row(cells):
        padded = cells + [""] * (num_cols - len(cells))
        return "| " + " | ".join(padded) + " |"

    header_cells = table_cells[0]
    data_rows = table_cells[1:]

    markdown_table = [
        _format_row(header_cells),
        "| " + " | ".join(["---"] * num_cols) + " |",
    ]
    markdown_table.extend(_format_row(cells) for cells in data_rows)

    return "\n".join(markdown_table)
|
154 |
|
155 |
@staticmethod
|
156 |
def _postprocess_markdown(markdown_text: str) -> str:
|
|
|
165 |
"""
|
166 |
processed_text = markdown_text
|
167 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
# Fix math blocks
|
169 |
processed_text = re.sub(r'\\\[(.*?)\\\]', r'$$\1$$', processed_text, flags=re.DOTALL)
|
170 |
processed_text = re.sub(r'\\\((.*?)\\\)', r'$\1$', processed_text, flags=re.DOTALL)
|