AnseMin committed on
Commit
34d180e
·
1 Parent(s): 33f1b65

Tabular is not defined

Browse files
Files changed (1) hide show
  1. src/utils/latex_converter.py +86 -66
src/utils/latex_converter.py CHANGED
@@ -28,7 +28,7 @@ class LatexConverter:
28
  processed_text = latex_text
29
 
30
  # Stage 1: Pre-process tables before standard conversion
31
- processed_text = LatexConverter._preprocess_tables(processed_text)
32
 
33
  # Stage 2: Convert using latex2markdown library
34
  try:
@@ -42,63 +42,115 @@ class LatexConverter:
42
  # Stage 3: Post-process to fix any remaining issues
43
  processed_text = LatexConverter._postprocess_markdown(processed_text)
44
 
 
 
 
45
  return processed_text
46
 
47
  @staticmethod
48
- def _preprocess_tables(latex_text: str) -> str:
49
  """
50
- Pre-process LaTeX tables to ensure they convert correctly.
51
 
52
  Args:
53
  latex_text: Raw LaTeX text
54
 
55
  Returns:
56
- str: Pre-processed LaTeX text with table modifications
57
  """
58
  processed_text = latex_text
 
59
 
60
  # Find all tabular environments
61
  table_pattern = r'\\begin{tabular}(.*?)\\end{tabular}'
62
  tables = re.findall(table_pattern, processed_text, re.DOTALL)
63
 
64
  for i, table_content in enumerate(tables):
65
- # Extract the column specification
66
- col_spec_match = re.search(r'{([^}]*)}', table_content)
67
- if not col_spec_match:
68
- continue
69
-
70
- # Process the table content
71
- rows_text = re.sub(r'{[^}]*}', '', table_content, count=1) # Remove the column spec
72
 
73
- # Split into rows by \\ or \hline
74
- rows = re.split(r'\\\\|\\hline', rows_text)
75
- rows = [row.strip() for row in rows if row.strip()]
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
- # Calculate number of columns based on the number of & in the first non-empty row plus 1
78
- for row in rows:
79
- if '&' in row:
80
- num_cols = row.count('&') + 1
81
- break
82
- else:
83
- num_cols = 1 # Default if no & found
 
84
 
85
- # Create a clean tabular environment that's easier to parse
86
- clean_table = f"\\begin{{tabular}}{{{'|'.join(['c'] * num_cols)}}}\n"
 
 
 
 
 
 
 
 
 
 
87
 
88
- for row in rows:
89
- if row.strip():
90
- clean_row = ' & '.join([cell.strip() for cell in row.split('&')])
91
- clean_table += clean_row + " \\\\\n"
 
 
 
92
 
93
- clean_table += "\\end{tabular}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
- # Replace the original table with the clean one
96
- processed_text = processed_text.replace(
97
- f"\\begin{tabular}{table_content}\\end{tabular}",
98
- clean_table
99
- )
 
 
100
 
101
- return processed_text
102
 
103
  @staticmethod
104
  def _postprocess_markdown(markdown_text: str) -> str:
@@ -113,38 +165,6 @@ class LatexConverter:
113
  """
114
  processed_text = markdown_text
115
 
116
- # Fix common issues with tables
117
- # 1. Fix pipe tables that may be malformed
118
- table_lines = []
119
- in_table = False
120
-
121
- for line in processed_text.split('\n'):
122
- if '|' in line and not line.strip().startswith('|') and not in_table:
123
- # This might be the start of a table, add the missing pipe
124
- line = '| ' + line
125
- in_table = True
126
-
127
- if in_table:
128
- if '|' in line:
129
- # Ensure line ends with pipe
130
- if not line.strip().endswith('|'):
131
- line = line + ' |'
132
- table_lines.append(line)
133
- else:
134
- # End of table
135
- in_table = False
136
-
137
- # If this is a table, add a header separator row after the first row
138
- if len(table_lines) > 0:
139
- col_count = table_lines[0].count('|') - 1
140
- separator = '| ' + ' | '.join(['---'] * col_count) + ' |'
141
- table_lines.insert(1, separator)
142
-
143
- # Add the current line and the processed table
144
- for table_line in table_lines:
145
- processed_text = processed_text.replace(table_line, table_line)
146
- table_lines = []
147
-
148
  # Fix math blocks
149
  processed_text = re.sub(r'\\\[(.*?)\\\]', r'$$\1$$', processed_text, flags=re.DOTALL)
150
  processed_text = re.sub(r'\\\((.*?)\\\)', r'$\1$', processed_text, flags=re.DOTALL)
 
28
  processed_text = latex_text
29
 
30
  # Stage 1: Pre-process tables before standard conversion
31
+ processed_text, tables_dict = LatexConverter._extract_tables(processed_text)
32
 
33
  # Stage 2: Convert using latex2markdown library
34
  try:
 
42
  # Stage 3: Post-process to fix any remaining issues
43
  processed_text = LatexConverter._postprocess_markdown(processed_text)
44
 
45
+ # Stage 4: Reinsert tables as markdown tables
46
+ processed_text = LatexConverter._reinsert_tables(processed_text, tables_dict)
47
+
48
  return processed_text
49
 
50
  @staticmethod
51
+ def _extract_tables(latex_text: str) -> tuple:
52
  """
53
+ Extract tables from LaTeX and replace with placeholders.
54
 
55
  Args:
56
  latex_text: Raw LaTeX text
57
 
58
  Returns:
59
+ tuple: (processed text with placeholders, dict of tables)
60
  """
61
  processed_text = latex_text
62
+ tables_dict = {}
63
 
64
  # Find all tabular environments
65
  table_pattern = r'\\begin{tabular}(.*?)\\end{tabular}'
66
  tables = re.findall(table_pattern, processed_text, re.DOTALL)
67
 
68
  for i, table_content in enumerate(tables):
69
+ placeholder = f"TABLE_PLACEHOLDER_{i}"
70
+ tables_dict[placeholder] = table_content
 
 
 
 
 
71
 
72
+ # Replace the table with a placeholder
73
+ processed_text = processed_text.replace(
74
+ f"\\begin{{tabular}}{table_content}\\end{{tabular}}",
75
+ placeholder
76
+ )
77
+
78
+ return processed_text, tables_dict
79
+
80
+ @staticmethod
81
+ def _reinsert_tables(markdown_text: str, tables_dict: dict) -> str:
82
+ """
83
+ Convert LaTeX tables to Markdown tables and reinsert them.
84
+
85
+ Args:
86
+ markdown_text: Processed markdown text with placeholders
87
+ tables_dict: Dictionary of tables extracted from LaTeX
88
 
89
+ Returns:
90
+ str: Markdown text with tables converted and reinserted
91
+ """
92
+ processed_text = markdown_text
93
+
94
+ for placeholder, table_content in tables_dict.items():
95
+ # Convert LaTeX table to Markdown table
96
+ markdown_table = LatexConverter._convert_table_to_markdown(table_content)
97
 
98
+ # Replace the placeholder with the Markdown table
99
+ processed_text = processed_text.replace(placeholder, markdown_table)
100
+
101
+ return processed_text
102
+
103
+ @staticmethod
104
+ def _convert_table_to_markdown(table_content: str) -> str:
105
+ """
106
+ Convert a LaTeX table to Markdown format.
107
+
108
+ Args:
109
+ table_content: LaTeX table content
110
 
111
+ Returns:
112
+ str: Markdown table
113
+ """
114
+ # Extract the column specification
115
+ col_spec_match = re.search(r'{([^}]*)}', table_content)
116
+ if not col_spec_match:
117
+ return f"[Table conversion failed]"
118
 
119
+ # Process the table content
120
+ rows_text = re.sub(r'{[^}]*}', '', table_content, count=1) # Remove the column spec
121
+
122
+ # Split into rows by \\ or \hline
123
+ rows = re.split(r'\\\\|\\hline', rows_text)
124
+ rows = [row.strip() for row in rows if row.strip()]
125
+
126
+ if not rows:
127
+ return "[Empty table]"
128
+
129
+ # Calculate number of columns based on the number of & in the first non-empty row plus 1
130
+ num_cols = 1 # Default
131
+ for row in rows:
132
+ if '&' in row:
133
+ num_cols = row.count('&') + 1
134
+ break
135
+
136
+ # Build markdown table
137
+ markdown_table = []
138
+
139
+ # Add header row
140
+ if rows:
141
+ first_row = rows[0]
142
+ cells = [cell.strip() for cell in first_row.split('&')]
143
+ markdown_table.append("| " + " | ".join(cells + [""] * (num_cols - len(cells))) + " |")
144
 
145
+ # Add separator row
146
+ markdown_table.append("| " + " | ".join(["---"] * num_cols) + " |")
147
+
148
+ # Add data rows
149
+ for row in rows[1:]:
150
+ cells = [cell.strip() for cell in row.split('&')]
151
+ markdown_table.append("| " + " | ".join(cells + [""] * (num_cols - len(cells))) + " |")
152
 
153
+ return "\n".join(markdown_table)
154
 
155
  @staticmethod
156
  def _postprocess_markdown(markdown_text: str) -> str:
 
165
  """
166
  processed_text = markdown_text
167
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  # Fix math blocks
169
  processed_text = re.sub(r'\\\[(.*?)\\\]', r'$$\1$$', processed_text, flags=re.DOTALL)
170
  processed_text = re.sub(r'\\\((.*?)\\\)', r'$\1$', processed_text, flags=re.DOTALL)