openfree committed on
Commit
ea56eb4
·
verified ·
1 Parent(s): ed3742a

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -629
app.py DELETED
@@ -1,629 +0,0 @@
1
- import base64
2
- import json
3
- import os
4
- import time
5
- import zipfile
6
- from pathlib import Path
7
- import re
8
- import uuid
9
- import pymupdf
10
-
11
- # 이미지 전처리에 필요한 라이브러리
12
- import cv2
13
- import numpy as np
14
-
15
- ###############################
16
- # 환경 설정
17
- ###############################
18
# Environment bootstrap. Runs at import time, before any magic_pdf module is
# imported further down: replace the installed magic-pdf with the MinerU dev
# branch, install headless OpenCV, then fetch and run MinerU's model download
# script. Exit codes are not checked.
for _setup_command in (
    'pip uninstall -y magic-pdf',
    'pip install git+https://github.com/opendatalab/MinerU.git@dev',
    'pip install opencv-python-headless',
    'wget https://github.com/opendatalab/MinerU/raw/dev/scripts/download_models_hf.py -O download_models_hf.py',
    'python download_models_hf.py',
):
    os.system(_setup_command)

# Patch the magic-pdf configuration in place.
# NOTE(review): the file is presumably produced by the download script above - confirm.
with open('/home/user/magic-pdf.json', 'r') as config_file:
    data = json.load(config_file)

data['device-mode'] = "cuda"

# Title aiding by an LLM is only switched on when an API key is supplied
# through the `apikey` environment variable.
_title_api_key = os.getenv('apikey')
if _title_api_key:
    _title_aided = data['llm-aided-config']['title_aided']
    _title_aided['api_key'] = _title_api_key
    _title_aided['enable'] = True

with open('/home/user/magic-pdf.json', 'w') as config_file:
    json.dump(data, config_file, indent=4)

# Copy the bundled paddleocr directory into the user's home directory.
os.system('cp -r paddleocr /home/user/.paddleocr')
37
-
38
- ###############################
39
- # 그 외 라이브러리
40
- ###############################
41
- import gradio as gr
42
- from loguru import logger
43
- from gradio_pdf import PDF
44
-
45
- ###############################
46
- # magic_pdf 관련 모듈
47
- ###############################
48
- from magic_pdf.data.data_reader_writer import FileBasedDataReader
49
- from magic_pdf.libs.hash_utils import compute_sha256
50
- from magic_pdf.tools.common import do_parse, prepare_env
51
-
52
- ###############################
53
- # 공통 함수들
54
- ###############################
55
def create_css():
    """
    Return the base CSS stylesheet passed to ``gr.Blocks``.

    It styles the full-width gradient container, the centred title card
    (``.title-area``), the ``.invisible`` helper class used to hide
    components, and tightens the padding of Gradio blocks and boxes.
    """
    stylesheet = """
    .gradio-container {
        width: 100vw !important;
        min-height: 100vh !important;
        margin: 0 !important;
        padding: 0 !important;
        background: linear-gradient(135deg, #EFF6FF 0%, #F5F3FF 100%);
        display: flex;
        flex-direction: column;
        overflow-y: auto !important;
    }
    .title-area {
        text-align: center;
        margin: 1rem auto;
        padding: 1rem;
        background: white;
        border-radius: 1rem;
        box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
        max-width: 800px;
    }
    .title-area h1 {
        background: linear-gradient(90deg, #2563EB 0%, #7C3AED 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        font-size: 2.5rem;
        font-weight: bold;
        margin-bottom: 0.5rem;
    }
    .title-area p {
        color: #6B7280;
        font-size: 1.1rem;
    }
    .invisible {
        display: none !important;
    }
    .gr-block, .gr-box {
        padding: 0.5rem !important;
    }
    """
    return stylesheet
98
-
99
def read_fn(path):
    """
    Read the file at *path* through magic_pdf's ``FileBasedDataReader``.

    The reader is rooted at the file's parent directory and asked for the
    file by its base name; whatever ``read`` returns is passed through.
    """
    parent_dir, base_name = os.path.split(path)
    reader = FileBasedDataReader(parent_dir)
    return reader.read(base_name)
102
-
103
- ###############################
104
- # 이미지 전처리 함수 (이진화 + Deskew)
105
- ###############################
106
def preprocess_image(image_path):
    """
    Binarize and deskew an image before it is converted to PDF.

    Steps:
      1) Load as grayscale and apply Otsu binarization.
      2) Estimate the skew from the minimum-area rectangle around the
         non-zero pixels and rotate the image to compensate.

    Args:
        image_path: Path of the image file to process.

    Returns:
        Path of the written ``<image_path>_preprocessed.png`` file, or
        ``image_path`` unchanged when the file cannot be loaded as an image
        or the processed image cannot be written.
    """
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # Not an image, or loading failed: hand the input back untouched.
        return image_path

    # (a) Otsu binarization.
    _, img_bin = cv2.threshold(img, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY)

    # (b) Skew estimation from the non-zero pixels.
    # NOTE(review): with THRESH_BINARY the non-zero pixels are the bright
    # ones; on dark-text-on-white scans that is the page background rather
    # than the text - confirm this is the intended foreground.
    coords = np.column_stack(np.where(img_bin > 0))
    if coords.shape[0] == 0:
        # Nothing to fit a rectangle to (all-black result): skip the rotation
        # instead of letting minAreaRect fail on an empty point set.
        skew = 0.0
    else:
        skew = cv2.minAreaRect(coords)[-1]
        # The reported angle range depends on the OpenCV release: [-90, 0) in
        # older ones, (0, 90] in newer ones. Fold either range into
        # [-45, 45] so an upright page is never rotated by a quarter turn.
        if skew > 45:
            skew -= 90
        elif skew < -45:
            skew += 90
    angle = -skew

    (h, w) = img_bin.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    img_rotated = cv2.warpAffine(
        img_bin, M, (w, h),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_CONSTANT,
        borderValue=255  # fill the exposed corners with white
    )

    # Save next to the input as a temporary file.
    preprocessed_path = image_path + "_preprocessed.png"
    if not cv2.imwrite(preprocessed_path, img_rotated):
        # Writing failed; fall back to the original image rather than
        # returning a path that does not exist.
        return image_path
    return preprocessed_path
141
-
142
- ###############################
143
- # PDF or Image -> PDF 변환
144
- # (불필요한 f.close() 제거)
145
- ###############################
146
def to_pdf(file_path):
    """
    Return a path to a PDF version of *file_path*.

    A file that pymupdf identifies as a PDF is returned as-is. Anything else
    is treated as an image: it is run through ``preprocess_image``, converted
    to PDF bytes by pymupdf, and written under a random UUID file name in the
    same directory as the input.
    """
    # Only the format check needs the source document open.
    with pymupdf.open(file_path) as source_doc:
        already_pdf = source_doc.is_pdf
    if already_pdf:
        return file_path

    # Image input: preprocess it, then let pymupdf convert the result.
    cleaned_image_path = preprocess_image(file_path)
    with pymupdf.open(cleaned_image_path) as image_doc:
        converted_bytes = image_doc.convert_to_pdf()

    target_path = os.path.join(os.path.dirname(file_path), f"{uuid.uuid4()}.pdf")
    with open(target_path, 'wb') as target_file:
        target_file.write(converted_bytes)

    return target_path
170
-
171
def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_enable, table_enable, language):
    """
    Parse a PDF with magic_pdf and return where its output was written.

    Args:
        doc_path: Path of the PDF to parse.
        output_dir: Root output directory; created if missing.
        end_page_id: Forwarded to ``do_parse`` as ``end_page_id``.
        is_ocr: When true the parse method is "ocr", otherwise "auto".
        layout_mode: Forwarded to ``do_parse`` as ``layout_model``.
        formula_enable: Forwarded to ``do_parse`` as ``formula_enable``.
        table_enable: Forwarded to ``do_parse`` as ``table_enable``.
        language: Forwarded to ``do_parse`` as ``lang``.

    Returns:
        ``(local_md_dir, file_name)``: the markdown directory returned by
        ``prepare_env`` and the generated output name (document stem plus a
        timestamp).

    Raises:
        Exception: Any failure is logged and then re-raised. Previously the
            error was swallowed and ``None`` was returned, which made every
            caller fail on tuple unpacking with an unrelated ``TypeError``.
    """
    os.makedirs(output_dir, exist_ok=True)
    try:
        # The timestamp keeps repeated conversions of the same document apart.
        file_name = f"{str(Path(doc_path).stem)}_{time.time()}"
        pdf_data = read_fn(doc_path)
        parse_method = "ocr" if is_ocr else "auto"
        local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
        do_parse(
            output_dir,
            file_name,
            pdf_data,
            [],     # NOTE(review): presumably the pre-computed model output list - confirm
            parse_method,
            False,  # NOTE(review): presumably the debug flag - confirm
            end_page_id=end_page_id,
            layout_model=layout_mode,
            formula_enable=formula_enable,
            table_enable=table_enable,
            lang=language,
            f_dump_orig_pdf=False
        )
        return local_md_dir, file_name
    except Exception as e:
        logger.exception(e)
        raise
195
-
196
def compress_directory_to_zip(directory_path, output_zip_path):
    """
    Zip every file under *directory_path* into *output_zip_path*.

    Entries are stored with paths relative to *directory_path* and
    compressed with DEFLATE.

    Returns:
        0 on success, -1 if any exception occurred (the exception is logged).
    """
    try:
        with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as archive:
            for folder, _subdirs, filenames in os.walk(directory_path):
                for filename in filenames:
                    absolute_path = os.path.join(folder, filename)
                    archive.write(
                        absolute_path,
                        os.path.relpath(absolute_path, directory_path),
                    )
    except Exception as e:
        logger.exception(e)
        return -1
    return 0
208
-
209
def image_to_base64(image_path):
    """Return the contents of the file at *image_path* as a base64 string."""
    raw_bytes = Path(image_path).read_bytes()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
212
-
213
def replace_image_with_base64(markdown_text, image_dir_path):
    """
    Inline local images referenced by Markdown as base64 data URIs.

    Every ``![alt](path)`` whose path resolves to an existing file under
    *image_dir_path* is rewritten to ``![path](data:image/jpeg;base64,...)``.
    Links that do not point at a local file (remote URLs, data URIs, missing
    files) are left exactly as written instead of raising
    ``FileNotFoundError``.

    Args:
        markdown_text: Markdown source to rewrite.
        image_dir_path: Directory that relative image paths are resolved against.

    Returns:
        The Markdown text with local images embedded.
    """
    # Captures the link target of a Markdown image; the alt text is ignored.
    pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)'

    def replace(match):
        relative_path = match.group(1)
        full_path = os.path.join(image_dir_path, relative_path)
        if not os.path.isfile(full_path):
            # Nothing to embed; keep the original link untouched.
            return match.group(0)
        base64_image = image_to_base64(full_path)
        return f"![{relative_path}](data:image/jpeg;base64,{base64_image})"

    return re.sub(pattern, replace, markdown_text)
223
-
224
def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language, progress=gr.Progress(track_tqdm=False)):
    """
    Convert an uploaded PDF/image to Markdown, reporting progress as it goes.

    Pipeline: upload -> PDF (``to_pdf``) -> magic_pdf parse (``parse_pdf``)
    -> zip of the output directory -> read the generated ``.md`` -> inline
    its images as base64.

    Args:
        file_path: Path of the uploaded file.
        end_pages: Maximum number of pages to convert; capped at 20.
        is_ocr, layout_mode, formula_enable, table_enable, language:
            Forwarded to ``parse_pdf``.
        progress: Gradio progress tracker.

    Returns:
        The Markdown text with images embedded as data URIs.

    Raises:
        gr.Error: If no file has been uploaded.
    """
    if not file_path:
        raise gr.Error("PDF 또는 이미지 파일을 먼저 업로드하세요.")

    # gr.Progress takes a completion fraction between 0 and 1; the previous
    # 20/50/70/90/100 values were far outside that range.
    # NOTE(review): the short sleeps look like pacing so each status message
    # is visible in the UI - confirm before removing them.
    progress(0, "PDF로 변환 중...")
    file_path = to_pdf(file_path)
    time.sleep(0.5)

    # Hard cap on the number of pages converted per request.
    end_pages = min(end_pages, 20)

    progress(0.2, "문서 파싱 중...")
    local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr,
                                        layout_mode, formula_enable, table_enable, language)
    time.sleep(0.5)

    progress(0.5, "압축(zip) 생성 중...")
    archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
    zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
    if zip_archive_success == 0:
        logger.info("압축 성공")
    else:
        logger.error("압축 실패")
    time.sleep(0.5)

    progress(0.7, "마크다운 읽는 중...")
    md_path = os.path.join(local_md_dir, file_name + ".md")
    with open(md_path, 'r', encoding='utf-8') as f:
        txt_content = f.read()
    time.sleep(0.5)

    progress(0.9, "이미지 base64 변환 중...")
    md_content = replace_image_with_base64(txt_content, local_md_dir)
    time.sleep(0.5)

    progress(1.0, "변환 완료!")
    return md_content
262
-
263
def init_model():
    """
    Warm up the magic-pdf models.

    Requests two models from ``ModelSingleton``: ``get_model(False, False)``
    (logged as "txt_model") and ``get_model(True, False)`` (logged as
    "ocr_model").

    Returns:
        0 when both requests succeed, -1 if an exception was raised (the
        exception is logged).
    """
    from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
    try:
        manager = ModelSingleton()
        # NOTE(review): the first flag presumably selects the OCR variant - confirm.
        for first_flag, label in ((False, "txt_model"), (True, "ocr_model")):
            manager.get_model(first_flag, False)
            logger.info(f"{label} init final")
    except Exception as e:
        logger.exception(e)
        return -1
    return 0


# Initialise the models once at import time and record the outcome.
model_init = init_model()
logger.info(f"model_init: {model_init}")
281
-
282
- ###############################
283
- # 언어 목록
284
- ###############################
285
# Language codes offered in the language dropdowns, grouped by script.
latin_lang = [
    'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
    'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
    'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
    'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german',
]
arabic_lang = ['ar', 'fa', 'ug', 'ur']
cyrillic_lang = [
    'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
    'dar', 'inh', 'che', 'lbe', 'lez', 'tab',
]
devanagari_lang = [
    'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
    'sa', 'bgc',
]
other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']

# Full dropdown list: the empty choice and 'auto' first, then every group.
all_lang = [
    '',
    'auto',
    *other_lang,
    *latin_lang,
    *arabic_lang,
    *cyrillic_lang,
    *devanagari_lang,
]
297
-
298
- ###############################
299
- # (1) PDF Chat 용 LLM 관련
300
- ###############################
301
- import google.generativeai as genai
302
- from gradio import ChatMessage
303
- from typing import Iterator
304
-
305
# Configure the Gemini client from the GEMINI_API_KEY environment variable
# and create the module-level model used by the chat tab.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel(model_name="gemini-2.0-flash-thinking-exp-1219")
308
-
309
def format_chat_history(messages: list) -> list:
    """
    Convert chat messages into the ``{"role", "parts"}`` dicts Gemini expects.

    Assistant messages that carry a metadata title (the "Thinking" bubbles
    created while streaming) are left out. Every other message is kept:
    user turns with role "user", assistant turns with role "model".

    Args:
        messages: Objects exposing ``role``, ``content`` and, optionally,
            ``metadata`` (a dict or an object with a ``title`` attribute).

    Returns:
        List of ``{"role": ..., "parts": [content]}`` dicts in input order.
    """
    formatted_history = []
    for message in messages:
        # A bare hasattr(message, "metadata") check is true for every
        # ChatMessage, which silently dropped all assistant turns. Only
        # messages whose metadata actually has a title are thought bubbles.
        metadata = getattr(message, "metadata", None)
        if isinstance(metadata, dict):
            title = metadata.get("title")
        else:
            title = getattr(metadata, "title", None)
        if message.role == "assistant" and title:
            continue

        formatted_history.append({
            # Gemini names the assistant side of a conversation "model".
            "role": "user" if message.role == "user" else "model",
            "parts": [message.content]
        })
    return formatted_history
321
-
322
def convert_chat_messages_to_gradio_format(messages):
    """
    Collapse a ChatMessage list into ``[(user_text, bot_text), ...]`` pairs.

    Each user message opens a new pair. All non-user messages that follow
    are concatenated, with no separator, into that pair's bot text. Non-user
    messages that appear before any user message are paired with an empty
    user text. Missing sides are rendered as empty strings.
    """
    pairs = []
    turn = [None, None]  # [user text, accumulated assistant text]

    def flush():
        # Emit the turn collected so far, unless nothing has been collected.
        question, answer = turn
        if question is not None or answer is not None:
            pairs.append((question or "", answer or ""))

    for entry in messages:
        if entry.role == "user":
            flush()
            turn[:] = [entry.content, None]
            continue

        if turn[0] is None:
            turn[0] = ""
        turn[1] = entry.content if turn[1] is None else turn[1] + entry.content

    flush()
    return pairs
347
-
348
def stream_gemini_response(user_message: str, messages: list) -> Iterator[list]:
    """
    Stream a Gemini reply and yield the chat in Gradio tuple format.

    The prompt sent to the model is the trailing user turn of *messages*
    when there is one (the submit handler appends it and clears the textbox
    before this step runs), otherwise *user_message*, otherwise a
    placeholder. The trailing user turn is kept out of the history given to
    ``start_chat`` so it is not sent twice.

    While streaming, a "Thinking" assistant message is appended to
    *messages* and updated in place; once a chunk arrives with two parts,
    the second part starts the final answer, which is appended as a
    separate assistant message. *messages* is mutated in place.

    Args:
        user_message: Text from the input box; may be empty.
        messages: ChatMessage list holding the conversation so far.

    Yields:
        The conversation converted by ``convert_chat_messages_to_gradio_format``
        after every update. On failure an assistant message describing the
        error is appended and yielded instead of raising.
    """
    def thinking_message(text):
        # Fresh metadata dict per message, so messages never share one.
        return ChatMessage(
            role="assistant",
            content=text,
            metadata={"title": "⚙️ Thinking: *The thoughts produced by the model are experimental"}
        )

    # Work out what to send. Previously the placeholder text was sent as the
    # prompt while the real question sat as the last history entry.
    has_pending_turn = bool(messages) and messages[-1].role == "user"
    prior_messages = messages[:-1] if has_pending_turn else messages
    if has_pending_turn and messages[-1].content.strip():
        prompt = messages[-1].content
    elif user_message and user_message.strip():
        prompt = user_message
    else:
        prompt = "...(No content from user)..."

    try:
        print(f"\n=== [Gemini] New Request ===\nUser message: '{prompt}'")
        chat_history = format_chat_history(prior_messages)
        chat = model.start_chat(history=chat_history)
        response = chat.send_message(prompt, stream=True)

        thought_buffer = ""
        response_buffer = ""
        thinking_complete = False

        # Placeholder "Thinking" bubble, filled in as thought chunks arrive.
        messages.append(thinking_message(""))
        yield convert_chat_messages_to_gradio_format(messages)

        for chunk in response:
            parts = chunk.candidates[0].content.parts
            if not parts:
                # Nothing to display in this chunk; avoid indexing an empty list.
                continue
            current_chunk = parts[0].text

            if len(parts) == 2 and not thinking_complete:
                # Two parts: [end of the thinking, start of the final answer].
                thought_buffer += current_chunk
                messages[-1] = thinking_message(thought_buffer)
                yield convert_chat_messages_to_gradio_format(messages)

                response_buffer = parts[1].text
                messages.append(ChatMessage(role="assistant", content=response_buffer))
                thinking_complete = True
            elif thinking_complete:
                # Already streaming the final answer.
                response_buffer += current_chunk
                messages[-1] = ChatMessage(role="assistant", content=response_buffer)
            else:
                # Still thinking.
                thought_buffer += current_chunk
                messages[-1] = thinking_message(thought_buffer)

            yield convert_chat_messages_to_gradio_format(messages)

        print(f"\n=== [Gemini] Final Response ===\n{response_buffer}")

    except Exception as e:
        print(f"\n=== [Gemini] Error ===\n{str(e)}")
        messages.append(ChatMessage(role="assistant", content=f"I encountered an error: {str(e)}"))
        yield convert_chat_messages_to_gradio_format(messages)
413
-
414
def user_message(msg: str, history: list, doc_text: str) -> tuple[str, list]:
    """
    Append the user's question to the chat history.

    When *doc_text* (the converted Markdown) is non-blank, the question is
    wrapped in a prompt that puts the document in front of it; otherwise
    the question is used as typed.

    Returns:
        ``("", history)``: an empty string to clear the input box, and the
        same history list with the new user message appended.
    """
    has_document = bool(doc_text.strip())
    user_query = (
        f"다음 문서를 참고하여 답변:\n\n{doc_text}\n\n질문: {msg}"
        if has_document
        else msg
    )
    history.append(ChatMessage(role="user", content=user_query))
    return "", history
425
-
426
def reset_states(_):
    """
    Return cleared values for a newly uploaded file.

    The single argument is ignored. The returned tuple is, in order:
      - chat_history -> empty list
      - md_state     -> empty string
      - chatbot      -> empty list of tuples
    """
    cleared_history = []
    cleared_markdown = ""
    cleared_chatbot = []
    return cleared_history, cleared_markdown, cleared_chatbot
434
-
435
- ###############################
436
- # (2) OCR FLEX 전용 (스니펫)
437
- ###############################
438
# LaTeX delimiters handed to the Markdown renderer: `$$...$$` is rendered as
# display math, `$...$` as inline math.
latex_delimiters = [
    dict(left="$$", right="$$", display=True),
    dict(left="$", right="$", display=False),
]
442
-
443
def to_markdown_ocr_flex(file_path, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language):
    """
    Convert an uploaded PDF/image for the "OCR FLEX" tab.

    Pipeline: upload -> PDF (``to_pdf``, which preprocesses images) ->
    magic_pdf parse -> zip of the output directory -> Markdown.

    Args:
        file_path: Path of the uploaded file.
        end_pages: Maximum number of pages to convert; capped at 20.
        is_ocr, layout_mode, formula_enable, table_enable, language:
            Forwarded to ``parse_pdf``.

    Returns:
        ``(md_content, txt_content, archive_zip_path, new_pdf_path)``:
        Markdown with images inlined as base64, the raw Markdown text, the
        path of the zip archive (``None`` if the archive could not be
        created), and the path of the ``<name>_layout.pdf`` file inside the
        output directory.

    Raises:
        gr.Error: If no file has been uploaded.
    """
    if not file_path:
        raise gr.Error("PDF 또는 이미지 파일을 먼저 업로드하세요.")

    file_path = to_pdf(file_path)
    # Hard cap on the number of pages converted per request.
    end_pages = min(end_pages, 20)
    local_md_dir, file_name = parse_pdf(
        file_path, './output', end_pages - 1, is_ocr,
        layout_mode, formula_enable, table_enable, language
    )

    archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
    zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
    if zip_archive_success == 0:
        logger.info("압축 성공")
    else:
        logger.error("압축 실패")
        # Do not hand the download component a path that was never written.
        archive_zip_path = None

    md_path = os.path.join(local_md_dir, file_name + ".md")
    with open(md_path, 'r', encoding='utf-8') as f:
        txt_content = f.read()

    md_content = replace_image_with_base64(txt_content, local_md_dir)
    new_pdf_path = os.path.join(local_md_dir, file_name + "_layout.pdf")

    return md_content, txt_content, archive_zip_path, new_pdf_path
470
-
471
- ###############################
472
- # UI 통합
473
- ###############################
474
if __name__ == "__main__":
    # Build the two-tab UI and start the server.
    # NOTE(review): the nesting of the layout containers below was
    # reconstructed from a flattened listing - confirm it against the
    # intended layout.
    with gr.Blocks(title="VisionOCR", css=create_css()) as demo:
        with gr.Tabs():
            ###############################
            # Tab (1): PDF -> Markdown conversion + chat
            ###############################
            with gr.Tab("PDF Chat with LLM"):
                # Subtitle typo fixed ("추 LLM" -> "추론 LLM") to match the
                # section heading further down.
                gr.HTML("""
                <div class="title-area">
                    <h1>VisionOCR</h1>
                    <p>PDF/이미지 -> 텍스트(마크다운) 변환 후, 추론 LLM과 대화</p>
                </div>
                """)

                md_state = gr.State("")      # converted Markdown text
                chat_history = gr.State([])  # list of ChatMessage

                with gr.Row():
                    file = gr.File(label="PDF/이미지 업로드", file_types=[".pdf", ".png", ".jpeg", ".jpg"], interactive=True)
                    convert_btn = gr.Button("변환하기")

                chatbot = gr.Chatbot(height=600)

                # A new upload clears the conversation and the stored Markdown.
                file.change(
                    fn=reset_states,
                    inputs=file,
                    outputs=[chat_history, md_state, chatbot]
                )

                # Hidden options: this tab always converts with these defaults.
                max_pages = gr.Slider(1, 20, 10, visible=False, elem_classes="invisible")
                layout_mode = gr.Dropdown(["layoutlmv3", "doclayout_yolo"], value="doclayout_yolo", visible=False, elem_classes="invisible")
                language = gr.Dropdown(all_lang, value='auto', visible=False, elem_classes="invisible")
                formula_enable = gr.Checkbox(value=True, visible=False, elem_classes="invisible")
                is_ocr = gr.Checkbox(value=False, visible=False, elem_classes="invisible")
                table_enable = gr.Checkbox(value=True, visible=False, elem_classes="invisible")

                convert_btn.click(
                    fn=to_markdown,
                    inputs=[file, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
                    outputs=md_state,
                    show_progress=True
                )

                gr.Markdown("## 추론 LLM과 대화")

                with gr.Row():
                    chat_input = gr.Textbox(lines=1, placeholder="질문을 입력하세요...")
                    clear_btn = gr.Button("대화 초기화")

                # Step 1 stores the question in the history and clears the
                # textbox; step 2 streams the model's reply into the chatbot.
                chat_input.submit(
                    fn=user_message,
                    inputs=[chat_input, chat_history, md_state],
                    outputs=[chat_input, chat_history]
                ).then(
                    fn=stream_gemini_response,
                    inputs=[chat_input, chat_history],
                    outputs=chatbot
                )

                def clear_all():
                    """Return cleared values for chat_history, md_state and chatbot."""
                    return [], "", []

                clear_btn.click(
                    fn=clear_all,
                    inputs=[],
                    outputs=[chat_history, md_state, chatbot]
                )

            ###############################
            # Tab (2): OCR FLEX
            ###############################
            with gr.Tab("OCR FLEX"):
                gr.HTML("""
                <div class="title-area">
                    <h1>OCR FLEX</h1>
                    <p>PDF와 이미지에서 텍스트를 빠르고 정확하게 추출하세요</p>
                </div>
                """)

                with gr.Row():
                    # Left column: input file, conversion options, PDF preview.
                    with gr.Column(variant='panel', scale=5):
                        file_ocr = gr.File(label="PDF 또는 이미지 파일", file_types=[".pdf", ".png", ".jpeg", ".jpg"])
                        max_pages_ocr = gr.Slider(1, 20, 10, step=1, label='최대 변환 페이지 수')

                        with gr.Row():
                            layout_mode_ocr = gr.Dropdown(["layoutlmv3", "doclayout_yolo"], label="레이아웃 모델", value="doclayout_yolo")
                            language_ocr = gr.Dropdown(all_lang, label="언어", value='auto')

                        with gr.Row():
                            formula_enable_ocr = gr.Checkbox(label="수식 인식 활성화", value=True)
                            is_ocr_ocr = gr.Checkbox(label="OCR 강제 활성화", value=False)
                            table_enable_ocr = gr.Checkbox(label="표 인식 활성화(테스트)", value=True)

                        with gr.Row():
                            change_bu_ocr = gr.Button("변환")
                            clear_bu_ocr = gr.ClearButton(
                                components=[
                                    file_ocr,
                                    max_pages_ocr,
                                    layout_mode_ocr,
                                    language_ocr,
                                    formula_enable_ocr,
                                    is_ocr_ocr,
                                    table_enable_ocr
                                ],
                                value="초기화"
                            )

                        pdf_show_ocr = PDF(label='PDF 미리보기', interactive=False, visible=True, height=800)

                    # Right column: downloadable result and the Markdown views.
                    with gr.Column(variant='panel', scale=5):
                        output_file_ocr = gr.File(label="변환 결과", interactive=False)

                        with gr.Tabs():
                            with gr.Tab("마크다운 렌더링"):
                                md_ocr = gr.Markdown(
                                    label="마크다운 렌더링",
                                    height=1100,
                                    show_copy_button=True,
                                    line_breaks=True,
                                    latex_delimiters=latex_delimiters
                                )

                            with gr.Tab("마크다운 텍스트"):
                                md_text_ocr = gr.TextArea(lines=45, show_copy_button=True)

                # Show the uploaded file (converted to PDF if needed) in the preview.
                file_ocr.change(
                    fn=to_pdf,
                    inputs=file_ocr,
                    outputs=pdf_show_ocr
                )

                def run_ocr_flex(*args):
                    """Forward the UI inputs to to_markdown_ocr_flex unchanged."""
                    return to_markdown_ocr_flex(*args)

                change_bu_ocr.click(
                    fn=run_ocr_flex,
                    inputs=[
                        file_ocr,
                        max_pages_ocr,
                        is_ocr_ocr,
                        layout_mode_ocr,
                        formula_enable_ocr,
                        table_enable_ocr,
                        language_ocr
                    ],
                    outputs=[
                        md_ocr,
                        md_text_ocr,
                        output_file_ocr,
                        pdf_show_ocr
                    ]
                )

    demo.launch(server_name="0.0.0.0", server_port=7860, debug=True, ssr_mode=True)