# custom_rag_project / backend.py
# Uploaded with huggingface_hub by Seanya (revision 8725d40, verified).
import os
import base64
import requests
from typing import List, Optional, Generator, Tuple
from openai import OpenAI
from docling.document_converter import DocumentConverter
import glob
from pdf2image import convert_from_path
from PIL import Image
import tempfile
import shutil
import fitz # PyMuPDF ์ถ”๊ฐ€
# Initialize the docling DocumentConverter (shared by every PDF parse below).
converter = DocumentConverter()
# Global settings for the OpenAI-compatible endpoint.
# NOTE(review): the base URL points at a private vLLM-style server and the key
# is a placeholder -- these presumably belong in environment variables; confirm.
openai_api_key = "EMPTY"
openai_api_base = "http://118.38.20.101:8080/v1"
model = "Qwen/Qwen2.5-VL-7B-Instruct-AWQ"
# Global logging system (mutated by process_request and friends).
current_log_messages = []
current_request_info = ""  # accumulated (redacted) API request dumps
# Initialize the OpenAI client.
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base
)
def load_system_prompt() -> str:
    """Read the system prompt from ``prompt_system.txt``.

    Returns:
        The stripped file contents, or a built-in Korean default when the
        file cannot be read.
    """
    fallback = "๋‹น์‹ ์€ ์ด๋ ฅ์„œ ๋ถ„์„์„ ๋„์™€์ฃผ๋Š” AI ์–ด์‹œ์Šคํ„ดํŠธ์ž…๋‹ˆ๋‹ค."
    try:
        with open("prompt_system.txt", "r", encoding="utf-8") as fh:
            text = fh.read()
    except Exception as err:
        print(f"์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ ํŒŒ์ผ ๋กœ๋“œ ์˜ค๋ฅ˜: {err}")
        return fallback
    return text.strip()
def load_user_prompt() -> str:
    """Read the default user prompt from ``prompt_user.txt``.

    Returns:
        The stripped file contents, or a built-in Korean default when the
        file cannot be read.
    """
    fallback = "[ํ…์ŠคํŠธ ์ถ”์ถœ ์ž‘์—…]\n\n์ฒจ๋ถ€๋œ ์ด๋ฏธ์ง€๋“ค์€ ์ด๋ ฅ์„œ๋ฅผ ์ด๋ฏธ์ง€ํ™” ํ•œ ๊ฒฐ๊ณผ๋ฌผ์ด์•ผ. ์ด๋ฏธ์ง€์˜ ๋‚ด์šฉ ๋ฐ ๋ ˆ์ด์•„์›ƒ์„ ์ฐธ๊ณ ํ•ด์„œ ์ด๋ ฅ์„œ์˜ ๋‚ด์šฉ์„ ์ •๋ฆฌ ํ›„ ๋งˆํฌ๋‹ค์šด ํ˜•์‹์œผ๋กœ ์ •๋ฆฌํ•ด์ค˜."
    try:
        with open("prompt_user.txt", "r", encoding="utf-8") as fh:
            text = fh.read()
    except Exception as err:
        print(f"์‚ฌ์šฉ์ž ํ”„๋กฌํ”„ํŠธ ํŒŒ์ผ ๋กœ๋“œ ์˜ค๋ฅ˜: {err}")
        return fallback
    return text.strip()
def load_postprocess_prompt() -> str:
    """Read the post-processing prompt from ``prompt_postprocess.txt``.

    Returns:
        The stripped file contents, or a built-in Korean default when the
        file cannot be read.
    """
    fallback = "[ํ…์ŠคํŠธ ๋ณ‘ํ•ฉ ์ž‘์—…]\n๋ฐฐ์น˜ ์ž‘์—…์œผ๋กœ ์ˆ˜์ง‘๋œ ํ…์ŠคํŠธ์ž…๋‹ˆ๋‹ค. ์ด์ œ ์™„์ „ํ•œ ํ˜•ํƒœ์˜ ์ด๋ ฅ์„œ๋ฅผ ๋งŒ๋“ค์–ด ์ฃผ์„ธ์š”. ์ถœ๋ ฅ ํฌ๋งท์€ ๋งˆํฌ๋‹ค์šด์ž…๋‹ˆ๋‹ค."
    try:
        with open("prompt_postprocess.txt", "r", encoding="utf-8") as fh:
            text = fh.read()
    except Exception as err:
        print(f"ํ›„์ฒ˜๋ฆฌ ํ”„๋กฌํ”„ํŠธ ํŒŒ์ผ ๋กœ๋“œ ์˜ค๋ฅ˜: {err}")
        return fallback
    return text.strip()
def encode_image_base64_from_url(image_path: str) -> str:
    """Return the base64-encoded contents of an image.

    Accepts either an HTTP(S) URL (downloaded via ``requests``) or a path to
    a local file.

    Args:
        image_path: An ``http://``/``https://`` URL or an existing file path.

    Returns:
        The image bytes encoded as a base64 ASCII string.

    Raises:
        ValueError: If ``image_path`` is neither a URL nor an existing file.
        Exception: Any download or I/O error is logged and re-raised.
    """
    try:
        # Bug fix: the original test was `'http' in image_path`, which
        # misclassified any *local* path containing "http" (e.g.
        # "./http_cache/a.png") as a URL. Only a real scheme prefix counts.
        if isinstance(image_path, str) and image_path.startswith(("http://", "https://")):
            with requests.get(image_path) as response:
                response.raise_for_status()
                return base64.b64encode(response.content).decode('utf-8')
        elif isinstance(image_path, str) and os.path.isfile(image_path):
            with open(image_path, 'rb') as image_file:
                return base64.b64encode(image_file.read()).decode('utf-8')
        else:
            raise ValueError(f"Invalid image URL or file path: {image_path}")
    except Exception as e:
        print(f"Error encoding image: {e}")
        raise
def convert_pdf_to_images(pdf_path: str, dpi: int = 200) -> List[str]:
    """Rasterize each page of a PDF into a PNG file.

    Pages are rendered with pdf2image at the given DPI and written to a fresh
    temporary directory as page_001.png, page_002.png, ...

    Args:
        pdf_path: Path to the source PDF.
        dpi: Rasterization resolution.

    Returns:
        Paths of the generated page images, in page order.

    Raises:
        Exception: Any pdf2image / I/O failure is logged and re-raised.
    """
    try:
        pages = convert_from_path(pdf_path, dpi=dpi)
        out_dir = tempfile.mkdtemp()
        saved: List[str] = []
        for page_no, page in enumerate(pages, start=1):
            target = os.path.join(out_dir, f"page_{page_no:03d}.png")
            page.save(target, "PNG")
            saved.append(target)
        return saved
    except Exception as e:
        print(f"Error converting PDF to images: {e}")
        raise
def combine_images_horizontally(image_paths: List[str]) -> List[str]:
    """Merge page images side-by-side in non-overlapping pairs.

    Pages (1,2), (3,4), ... are scaled to a common height (keeping aspect
    ratio) and pasted next to each other on a white canvas. When the page
    count is odd, the final page is re-saved on its own.

    Args:
        image_paths: Per-page PNG paths, in order.

    Returns:
        Paths of the merged images, written to a new temporary directory.
    """
    if not image_paths:
        return []
    out_dir = tempfile.mkdtemp()
    merged: List[str] = []
    for start in range(0, len(image_paths), 2):
        pair_no = start // 2 + 1
        if start + 1 >= len(image_paths):
            # Odd page count: pass the last page through unchanged.
            lone = Image.open(image_paths[start])
            target = os.path.join(out_dir, f"single_{pair_no:03d}.png")
            lone.save(target, "PNG")
            merged.append(target)
            lone.close()
            continue
        left = Image.open(image_paths[start])
        right = Image.open(image_paths[start + 1])
        # Scale both pages to the taller one's height, preserving aspect ratio.
        target_h = max(left.height, right.height)
        if left.height != target_h:
            scale = target_h / left.height
            left = left.resize((int(left.width * scale), target_h), Image.Resampling.LANCZOS)
        if right.height != target_h:
            scale = target_h / right.height
            right = right.resize((int(right.width * scale), target_h), Image.Resampling.LANCZOS)
        # Paste both pages onto one white canvas, left then right.
        canvas = Image.new('RGB', (left.width + right.width, target_h), 'white')
        canvas.paste(left, (0, 0))
        canvas.paste(right, (left.width, 0))
        target = os.path.join(out_dir, f"combined_{pair_no:03d}.png")
        canvas.save(target, "PNG")
        merged.append(target)
        left.close()
        right.close()
        canvas.close()
    return merged
def combine_images_vertically(image_paths: List[str]) -> List[str]:
    """Merge page images top-to-bottom in non-overlapping pairs.

    Pages (1,2), (3,4), ... are scaled to a common width (keeping aspect
    ratio) and stacked on a white canvas. When the page count is odd, the
    final page is re-saved on its own.

    Args:
        image_paths: Per-page PNG paths, in order.

    Returns:
        Paths of the merged images, written to a new temporary directory.
    """
    if not image_paths:
        return []
    out_dir = tempfile.mkdtemp()
    merged: List[str] = []
    for start in range(0, len(image_paths), 2):
        pair_no = start // 2 + 1
        if start + 1 >= len(image_paths):
            # Odd page count: pass the last page through unchanged.
            lone = Image.open(image_paths[start])
            target = os.path.join(out_dir, f"vertical_single_{pair_no:03d}.png")
            lone.save(target, "PNG")
            merged.append(target)
            lone.close()
            continue
        top = Image.open(image_paths[start])
        bottom = Image.open(image_paths[start + 1])
        # Scale both pages to the wider one's width, preserving aspect ratio.
        target_w = max(top.width, bottom.width)
        if top.width != target_w:
            scale = target_w / top.width
            top = top.resize((target_w, int(top.height * scale)), Image.Resampling.LANCZOS)
        if bottom.width != target_w:
            scale = target_w / bottom.width
            bottom = bottom.resize((target_w, int(bottom.height * scale)), Image.Resampling.LANCZOS)
        # Stack both pages onto one white canvas, top then bottom.
        canvas = Image.new('RGB', (target_w, top.height + bottom.height), 'white')
        canvas.paste(top, (0, 0))
        canvas.paste(bottom, (0, top.height))
        target = os.path.join(out_dir, f"vertical_combined_{pair_no:03d}.png")
        canvas.save(target, "PNG")
        merged.append(target)
        top.close()
        bottom.close()
        canvas.close()
    return merged
def combine_images_with_overlap(image_paths: List[str], direction: str = "horizontal") -> List[str]:
    """Combine images with sliding window (overlap). Returns list of combined image paths.

    Unlike the pairwise combiners, consecutive pages (1,2), (2,3), (3,4), ...
    are merged, so every page except the first and last appears in two
    outputs. ``direction`` is "horizontal"; any other value stacks vertically.
    """
    # Fewer than two pages: nothing to merge, return the input unchanged.
    if not image_paths or len(image_paths) < 2:
        return image_paths
    combined_paths = []
    temp_dir = tempfile.mkdtemp()
    # Sliding window of 2: (1,2), (2,3), (3,4), (4,5), ...
    for i in range(len(image_paths) - 1):
        img1 = Image.open(image_paths[i])
        img2 = Image.open(image_paths[i + 1])
        if direction == "horizontal":
            # Horizontal merge -- match heights (taller page wins).
            max_height = max(img1.height, img2.height)
            if img1.height != max_height:
                ratio = max_height / img1.height
                img1 = img1.resize((int(img1.width * ratio), max_height), Image.Resampling.LANCZOS)
            if img2.height != max_height:
                ratio = max_height / img2.height
                img2 = img2.resize((int(img2.width * ratio), max_height), Image.Resampling.LANCZOS)
            # Paste side by side on a white canvas.
            combined_width = img1.width + img2.width
            combined_image = Image.new('RGB', (combined_width, max_height), 'white')
            combined_image.paste(img1, (0, 0))
            combined_image.paste(img2, (img1.width, 0))
            combined_path = os.path.join(temp_dir, f"overlap_h_{i+1}_{i+2}.png")
        else:  # vertical
            # Vertical merge -- match widths (wider page wins).
            max_width = max(img1.width, img2.width)
            if img1.width != max_width:
                ratio = max_width / img1.width
                img1 = img1.resize((max_width, int(img1.height * ratio)), Image.Resampling.LANCZOS)
            if img2.width != max_width:
                ratio = max_width / img2.width
                img2 = img2.resize((max_width, int(img2.height * ratio)), Image.Resampling.LANCZOS)
            # Stack top to bottom on a white canvas.
            combined_height = img1.height + img2.height
            combined_image = Image.new('RGB', (max_width, combined_height), 'white')
            combined_image.paste(img1, (0, 0))
            combined_image.paste(img2, (0, img1.height))
            combined_path = os.path.join(temp_dir, f"overlap_v_{i+1}_{i+2}.png")
        combined_image.save(combined_path, "PNG")
        combined_paths.append(combined_path)
        img1.close()
        img2.close()
        combined_image.close()
    return combined_paths
def create_prompt_content_with_image(image_paths: List[str], prompt: str) -> list:
    """Build an OpenAI multimodal ``content`` list for one user message.

    The prompt text comes first, followed by one ``image_url`` part per image
    (base64 data URL). Images that fail to encode are logged and skipped, so
    the request still goes out (best effort).

    Args:
        image_paths: Image file paths / URLs to attach (may be empty).
        prompt: The user prompt text.

    Returns:
        A list of OpenAI content parts.
    """
    parts: list = [{"type": "text", "text": prompt}]
    for path in image_paths:
        try:
            encoded = encode_image_base64_from_url(path)
            parts.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{encoded}"
                },
            })
        except Exception as e:
            print(f"Error encoding image {path}: {e}")
    return parts
def log_api_request(messages: List[dict], model_name: str) -> str:
    """Append a redacted JSON dump of an outgoing chat request to the global
    ``current_request_info`` log and return the accumulated log text.

    Base64 image payloads are replaced with a short placeholder that records
    only their character length, so the log stays readable.

    Args:
        messages: The exact ``messages`` list about to be sent.
        model_name: The model the request targets.

    Returns:
        The full accumulated request log (all requests so far).
    """
    import json
    import datetime
    global current_request_info

    def _redact(part: dict):
        # Text parts are kept verbatim; base64 images collapse to a stub;
        # unknown part types are dropped (mirrors the request shape closely
        # enough for debugging).
        kind = part.get("type")
        if kind == "text":
            return {"type": "text", "text": part.get("text", "")}
        if kind == "image_url":
            url = part.get("image_url", {}).get("url", "")
            if url.startswith("data:image"):
                return {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/*;base64,[BASE64_DATA_{len(url)}_CHARS]"
                    }
                }
            return {"type": "image_url", "image_url": {"url": url}}
        return None

    redacted = []
    for message in messages:
        entry = {"role": message.get("role", "unknown")}
        content = message.get("content", "")
        if isinstance(content, str):
            # Plain-text content is included as-is.
            entry["content"] = content
        elif isinstance(content, list):
            # Multimodal content: redact part by part.
            entry["content"] = [r for r in (_redact(p) for p in content) if r is not None]
        redacted.append(entry)

    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    request_json = json.dumps({"model": model_name, "messages": redacted},
                              ensure_ascii=False, indent=2)
    header = f"๐Ÿš€ API ์š”์ฒญ [{timestamp}]\n{'='*80}\n"
    if current_request_info:
        # Later requests are appended behind a separator line.
        current_request_info += f"\n{'='*80}\n{header}{request_json}\n"
    else:
        # The very first request starts the log without a leading separator.
        current_request_info = f"{header}{request_json}\n"
    return current_request_info
def send_chat_completion_request(image_paths: List[str], prompt: str, system_prompt: str = ""):
    """Send one (possibly multimodal) chat-completion request.

    A blank ``system_prompt`` falls back to the contents of
    ``prompt_system.txt``. Request logging happens in the callers
    (``process_request``), not here.

    Args:
        image_paths: Images to attach; empty for a text-only request.
        prompt: The user prompt text.
        system_prompt: Optional system prompt override.

    Returns:
        The raw OpenAI completion object.
    """
    effective_system = system_prompt if system_prompt.strip() else load_system_prompt()
    user_content = create_prompt_content_with_image(image_paths, prompt)
    return client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": effective_system},
            {"role": "user", "content": user_content},
        ],
    )
def process_images_in_batches(image_paths: List[str], prompt: str, system_prompt: str, batch_size: int = 3) -> List[str]:
    """Send images to the LLM in chunks of ``batch_size`` and collect results.

    Each chunk produces one response string (printed for debugging); a failed
    chunk contributes a Korean error message instead of aborting the run.

    Args:
        image_paths: All images to process.
        prompt: User prompt applied to every chunk.
        system_prompt: System prompt (may be empty; see send_chat_completion_request).
        batch_size: Maximum images per request.

    Returns:
        One response (or error) string per chunk, in order.
    """
    if not image_paths:
        return []
    responses: List[str] = []
    for start in range(0, len(image_paths), batch_size):
        chunk = image_paths[start:start + batch_size]
        try:
            completion = send_chat_completion_request(chunk, f"{prompt}", system_prompt)
            text = completion.choices[0].message.content
            print(text)
            responses.append(text)
        except Exception as e:
            responses.append(f"๋ฐฐ์น˜ {start//batch_size + 1} ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
    return responses
def merge_batch_results(results: List[str]) -> str:
    """Merge per-batch LLM outputs into a single document.

    Behavior (unchanged): an empty list yields ``""``, a single result is
    returned as-is, and multiple results are concatenated back-to-back with
    no separator (the heading/separator decoration was intentionally
    disabled in the original).

    Args:
        results: Markdown fragments produced by each batch request.

    Returns:
        The merged markdown string.
    """
    # Idiom fix: str.join replaces the manual `+=` accumulation loop with its
    # unused index variable, and covers all three cases in one expression
    # ([] -> "", [x] -> x, many -> concatenation).
    return "".join(results)
def get_pdf_files():
    """Return the sorted list of PDF paths under ./resume_samples (recursive).

    Side effect: when no PDFs are found, the default sample directory
    ``./resume_samples/pdf/text`` is created so users have somewhere to drop
    files, and an empty list is returned.
    """
    found = sorted(glob.glob("./resume_samples/**/*.pdf", recursive=True))
    if found:
        return found
    # No PDFs yet -- make sure the default drop directory exists.
    os.makedirs("./resume_samples/pdf/text", exist_ok=True)
    return []
def save_result_to_file(content: str, filename: str) -> str:
    """Save the analysis result to a markdown file.

    Args:
        content: Markdown text to write; an empty value aborts the save.
        filename: Target file name; a ``.md`` suffix is appended if missing.

    Returns:
        A Korean status message describing success or failure.
    """
    if not content:
        return "์ €์žฅํ•  ๋‚ด์šฉ์ด ์—†์Šต๋‹ˆ๋‹ค."
    if not filename:
        return "ํŒŒ์ผ ์ด๋ฆ„์ด ์ง€์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. ํŒŒ์ผ ์ด๋ฆ„์„ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”."
    # Ensure the markdown extension.
    if not filename.endswith('.md'):
        filename += '.md'
    try:
        with open(filename, "w", encoding="utf-8") as f:
            f.write(content)
        # Bug fix: the success message previously contained the literal
        # placeholder "(unknown)" instead of the actual file name.
        return f"๊ฒฐ๊ณผ๊ฐ€ {filename}์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค."
    except Exception as e:
        return f"ํŒŒ์ผ ์ €์žฅ ์˜ค๋ฅ˜: {str(e)}"
def extract_text_with_fitz(pdf_path: str) -> str:
    """Extract plain text from a PDF with PyMuPDF (fitz).

    Each page is emitted under a ``## Page N`` heading; pages are separated
    by a horizontal rule. On failure a Korean error string is returned
    instead of raising.

    Args:
        pdf_path: Path to the PDF to read.

    Returns:
        The extracted text as markdown-ish plain text, or an error message.
    """
    try:
        doc = fitz.open(pdf_path)
        page_count = len(doc)
        chunks = []
        for page_num in range(page_count):
            page = doc.load_page(page_num)
            chunks.append(f"## Page {page_num + 1}\n\n")
            chunks.append(page.get_text("text"))
            if page_num < page_count - 1:
                chunks.append("\n\n---\n\n")
        doc.close()
        return "".join(chunks)
    except Exception as e:
        return f"Fitz ํ…์ŠคํŠธ ์ถ”์ถœ ์˜ค๋ฅ˜: {str(e)}"
def extract_text_with_docling(pdf_path: str) -> str:
    """Convert a PDF to markdown via the shared Docling converter (OCR included).

    Args:
        pdf_path: Path to the PDF to parse.

    Returns:
        The markdown export, or a Korean error string on any failure.
    """
    try:
        converted = converter.convert(pdf_path)
        return converted.document.export_to_markdown()
    except Exception as e:
        return f"Docling ํ…์ŠคํŠธ ์ถ”์ถœ ์˜ค๋ฅ˜: {str(e)}"
def preview_image_processing(pdf_path: str, processing_mode: str = "๊ฐ€๋กœ ๋ณ‘ํ•ฉ (2ํŽ˜์ด์ง€์”ฉ)", overlap_option: str = "์ผ๋ฐ˜ ๋ณ‘ํ•ฉ") -> List[str]:
    """Render a PDF and apply the selected page-merge strategy for preview.

    Args:
        pdf_path: Path to the PDF (missing/empty paths yield an empty list).
        processing_mode: One of the Korean UI mode labels (horizontal merge,
            vertical merge, or single pages).
        overlap_option: When set to the sliding-window label, overlapping
            merging is used instead of pairwise merging.

    Returns:
        Paths of the processed preview images; empty list on any failure.
    """
    try:
        if not pdf_path or not os.path.exists(pdf_path):
            return []
        pages = convert_pdf_to_images(pdf_path)
        use_overlap = overlap_option == "์ค‘๋ณต ๋ณ‘ํ•ฉ (์Šฌ๋ผ์ด๋”ฉ ์œˆ๋„์šฐ)"
        if processing_mode == "๊ฐ€๋กœ ๋ณ‘ํ•ฉ (2ํŽ˜์ด์ง€์”ฉ)":
            return combine_images_with_overlap(pages, "horizontal") if use_overlap else combine_images_horizontally(pages)
        if processing_mode == "์„ธ๋กœ ๋ณ‘ํ•ฉ (2ํŽ˜์ด์ง€์”ฉ)":
            return combine_images_with_overlap(pages, "vertical") if use_overlap else combine_images_vertically(pages)
        # Remaining mode ("๋‚ฑ๊ฐœ ํŽ˜์ด์ง€"): individual pages, unmerged.
        return pages
    except Exception as e:
        print(f"์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ ๋ฏธ๋ฆฌ๋ณด๊ธฐ ์˜ค๋ฅ˜: {e}")
        return []
def process_request(
    prompt: str,
    system_prompt: str,
    use_images: bool,
    use_docling: bool,
    pdf_file_path: str,
    uploaded_file: str,
    output_filename: str,
    image_processing_mode: str = "๊ฐ€๋กœ ๋ณ‘ํ•ฉ (2ํŽ˜์ด์ง€์”ฉ)",
    overlap_option: str = "์ผ๋ฐ˜ ๋ณ‘ํ•ฉ",
    batch_size: int = 3,
    use_postprocess: bool = True,
    postprocess_prompt: str = "",
    progress = None
) -> Generator[Tuple[str, str, str, str, str], None, None]:
    """Run the full resume-analysis pipeline, yielding UI updates as it goes.

    Every yielded tuple is (batch_result, final_result, docling_output,
    log_text, api_request_info) so the caller (presumably a Gradio streaming
    handler -- TODO confirm) can refresh all five panels at once.

    Stages (each optional, driven by the flags):
      1. Resolve the PDF path (an uploaded file wins over the dropdown path).
      2. use_images: rasterize the PDF and merge pages according to
         image_processing_mode / overlap_option.
      3. use_docling: parse the PDF to markdown and append it to the prompt.
      4. Send image batches (or one text-only request) to the LLM.
      5. use_postprocess, and only when more than one batch was needed:
         merge the batch outputs with an extra text-only LLM call.

    Temporary image directories are always removed in the finally block.
    ``output_filename`` and ``progress`` are currently unused here.
    """
    import time
    # Record the wall-clock start of the whole run.
    total_start_time = time.time()
    global current_log_messages, current_request_info
    current_log_messages = []  # reset the log
    current_request_info = ""  # reset accumulated API request info
    # Decide which PDF to process (an uploaded file takes precedence).
    final_pdf_path = uploaded_file if uploaded_file else pdf_file_path
    # Initialize working state.
    full_prompt = prompt
    docling_output = ""
    images_to_use = []
    temp_dirs_to_cleanup = []
    response_content = ""  # final result
    batch_content = ""  # batch-stage result
    def add_log(message):
        # Generator helper: append one numbered log line, then yield a full
        # 5-tuple UI update reflecting the current state. Callers iterate it
        # with `for result in add_log(...): yield result`.
        current_log_messages.append(f"[{len(current_log_messages)+1:02d}] {message}")
        log_text = "\n".join(current_log_messages)
        # Intermediate update: (batch result, final result, parse result,
        # log text, API request info).
        yield batch_content, response_content, docling_output, log_text, current_request_info
        return log_text
    # Prime the log with a first entry (the yield is consumed via next()).
    log_generator = add_log("์‹œ์ž‘...")
    next(log_generator)  # emit the first log line
    try:
        # No PDF selected or the path does not exist.
        if not final_pdf_path or not os.path.exists(final_pdf_path):
            msg = "PDF ํŒŒ์ผ์„ ์„ ํƒํ•˜๊ฑฐ๋‚˜ ์—…๋กœ๋“œํ•ด ์ฃผ์„ธ์š”."
            for result in add_log("โŒ PDF ํŒŒ์ผ์ด ์„ ํƒ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค."):
                yield result
            yield "", msg, "", "\n".join(current_log_messages), current_request_info
            return
        for result in add_log(f"โœ… ์ฒ˜๋ฆฌํ•  PDF ํŒŒ์ผ: {os.path.basename(final_pdf_path)}"):
            yield result
        # Automatically rasterize the PDF when image input is enabled.
        if use_images:
            for result in add_log("๐Ÿ–ผ๏ธ PDF๋ฅผ ์ด๋ฏธ์ง€๋กœ ๋ณ€ํ™˜ ์ค‘..."):
                yield result
            print(f"PDF๋ฅผ ์ด๋ฏธ์ง€๋กœ ๋ณ€ํ™˜ ์ค‘: {final_pdf_path}")
            # Convert the PDF to per-page images.
            pdf_images = convert_pdf_to_images(final_pdf_path)
            temp_dirs_to_cleanup.extend([os.path.dirname(path) for path in pdf_images])
            for result in add_log(f"๐Ÿ“„ PDF์—์„œ {len(pdf_images)}๊ฐœ ํŽ˜์ด์ง€ ์ถ”์ถœ ์™„๋ฃŒ"):
                yield result
            # Branch on the requested page-merge strategy.
            if image_processing_mode == "๊ฐ€๋กœ ๋ณ‘ํ•ฉ (2ํŽ˜์ด์ง€์”ฉ)":
                if overlap_option == "์ค‘๋ณต ๋ณ‘ํ•ฉ (์Šฌ๋ผ์ด๋”ฉ ์œˆ๋„์šฐ)":
                    # Overlapping horizontal merge (sliding window).
                    for result in add_log("๐Ÿ”— ํŽ˜์ด์ง€๋“ค์„ ์Šฌ๋ผ์ด๋”ฉ ์œˆ๋„์šฐ ๋ฐฉ์‹์œผ๋กœ ๊ฐ€๋กœ ๋ณ‘ํ•ฉ ์ค‘..."):
                        yield result
                    combined_images = combine_images_with_overlap(pdf_images, "horizontal")
                    for result in add_log(f"โœ… {len(combined_images)}๊ฐœ์˜ ์ค‘๋ณต ๊ฐ€๋กœ ๋ณ‘ํ•ฉ ์ด๋ฏธ์ง€ ์ƒ์„ฑ ์™„๋ฃŒ"):
                        yield result
                else:
                    # Plain pairwise horizontal merge.
                    for result in add_log("๐Ÿ”— ํŽ˜์ด์ง€๋“ค์„ 2์žฅ์”ฉ ๊ฐ€๋กœ๋กœ ๋ณ‘ํ•ฉ ์ค‘..."):
                        yield result
                    combined_images = combine_images_horizontally(pdf_images)
                    for result in add_log(f"โœ… {len(combined_images)}๊ฐœ์˜ ๊ฐ€๋กœ ๋ณ‘ํ•ฉ ์ด๋ฏธ์ง€ ์ƒ์„ฑ ์™„๋ฃŒ"):
                        yield result
                temp_dirs_to_cleanup.extend([os.path.dirname(path) for path in combined_images])
                images_to_use = combined_images
                print(f"PDF์—์„œ {len(pdf_images)}๊ฐœ ํŽ˜์ด์ง€๋ฅผ {len(combined_images)}๊ฐœ ๊ฐ€๋กœ ๋ณ‘ํ•ฉ ์ด๋ฏธ์ง€๋กœ ๋ณ€ํ™˜ ์™„๋ฃŒ")
            elif image_processing_mode == "์„ธ๋กœ ๋ณ‘ํ•ฉ (2ํŽ˜์ด์ง€์”ฉ)":
                if overlap_option == "์ค‘๋ณต ๋ณ‘ํ•ฉ (์Šฌ๋ผ์ด๋”ฉ ์œˆ๋„์šฐ)":
                    # Overlapping vertical merge (sliding window).
                    for result in add_log("๐Ÿ”— ํŽ˜์ด์ง€๋“ค์„ ์Šฌ๋ผ์ด๋”ฉ ์œˆ๋„์šฐ ๋ฐฉ์‹์œผ๋กœ ์„ธ๋กœ ๋ณ‘ํ•ฉ ์ค‘..."):
                        yield result
                    combined_images = combine_images_with_overlap(pdf_images, "vertical")
                    for result in add_log(f"โœ… {len(combined_images)}๊ฐœ์˜ ์ค‘๋ณต ์„ธ๋กœ ๋ณ‘ํ•ฉ ์ด๋ฏธ์ง€ ์ƒ์„ฑ ์™„๋ฃŒ"):
                        yield result
                else:
                    # Plain pairwise vertical merge.
                    for result in add_log("๐Ÿ”— ํŽ˜์ด์ง€๋“ค์„ 2์žฅ์”ฉ ์„ธ๋กœ๋กœ ๋ณ‘ํ•ฉ ์ค‘..."):
                        yield result
                    combined_images = combine_images_vertically(pdf_images)
                    for result in add_log(f"โœ… {len(combined_images)}๊ฐœ์˜ ์„ธ๋กœ ๋ณ‘ํ•ฉ ์ด๋ฏธ์ง€ ์ƒ์„ฑ ์™„๋ฃŒ"):
                        yield result
                temp_dirs_to_cleanup.extend([os.path.dirname(path) for path in combined_images])
                images_to_use = combined_images
                print(f"PDF์—์„œ {len(pdf_images)}๊ฐœ ํŽ˜์ด์ง€๋ฅผ {len(combined_images)}๊ฐœ ์„ธ๋กœ ๋ณ‘ํ•ฉ ์ด๋ฏธ์ง€๋กœ ๋ณ€ํ™˜ ์™„๋ฃŒ")
            else:  # remaining mode: "๋‚ฑ๊ฐœ ํŽ˜์ด์ง€" (individual pages)
                # Use the pages as-is, unmerged.
                images_to_use = pdf_images
                for result in add_log(f"โœ… {len(pdf_images)}๊ฐœ์˜ ๊ฐœ๋ณ„ ํŽ˜์ด์ง€ ์ด๋ฏธ์ง€ ์ค€๋น„ ์™„๋ฃŒ"):
                    yield result
                print(f"PDF์—์„œ {len(pdf_images)}๊ฐœ ํŽ˜์ด์ง€๋ฅผ ๊ฐœ๋ณ„ ์ด๋ฏธ์ง€๋กœ ์‚ฌ์šฉ")
        # Docling text parsing (appended to the prompt when enabled).
        if use_docling:
            for result in add_log("๐Ÿ“ Docling์œผ๋กœ PDF ํ…์ŠคํŠธ ํŒŒ์‹ฑ ์ค‘..."):
                yield result
            try:
                result = converter.convert(final_pdf_path)
                docling_output = result.document.export_to_markdown()
                full_prompt += f"\n\nํŒŒ์‹ฑ๋œ ์ด๋ ฅ์„œ ๋‚ด์šฉ: {docling_output}"
                for result in add_log(f"โœ… ํ…์ŠคํŠธ ํŒŒ์‹ฑ ์™„๋ฃŒ (๊ธธ์ด: {len(docling_output)} ๋ฌธ์ž)"):
                    yield result
            except Exception as e:
                # A Docling failure aborts the whole run.
                error_msg = f"Docling ๋ณ€ํ™˜ ์˜ค๋ฅ˜: {str(e)}"
                for result in add_log(f"โŒ Docling ๋ณ€ํ™˜ ์˜ค๋ฅ˜: {str(e)}"):
                    yield result
                for result in add_log(f"โŒ ์ฒ˜๋ฆฌ ์ค‘๋‹จ๋จ"):
                    yield result
                yield "", error_msg, docling_output, "\n".join(current_log_messages), current_request_info
                return
        # Image processing (batched LLM requests).
        if images_to_use:
            # Log only a preview of the system prompt (trimmed to 50 chars).
            for result in add_log(f"๐Ÿค– ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ: {system_prompt[:50]}{'...' if len(system_prompt) > 50 else ''}"):
                yield result
            if len(images_to_use) <= batch_size:
                # Everything fits into a single request.
                for result in add_log(f"๐Ÿ“ค API ์š”์ฒญ ์ค€๋น„ ์ค‘... (์ด๋ฏธ์ง€ {len(images_to_use)}์žฅ)"):
                    yield result
                # Log the outgoing request first so the UI shows it immediately.
                system_prompt_clean = system_prompt if system_prompt.strip() else load_system_prompt()
                messages = [
                    {"role": "system", "content": system_prompt_clean},
                    {
                        "role": "user",
                        "content": create_prompt_content_with_image(images_to_use, full_prompt)
                    }
                ]
                log_api_request(messages, model)
                # Push the request info to the UI right away.
                yield "", "", docling_output, "\n".join(current_log_messages), current_request_info
                for result in add_log(f"๐Ÿค– LLM API ์š”์ฒญ ์ค‘... (์ด๋ฏธ์ง€ {len(images_to_use)}์žฅ)"):
                    yield result
                # Record the batch start time.
                import time
                batch_start_time = time.time()
                completion = send_chat_completion_request(images_to_use, full_prompt, system_prompt)
                response_content = completion.choices[0].message.content
                batch_content = response_content  # single batch: same as batch result
                print(response_content)
                # Compute the batch duration.
                batch_duration = time.time() - batch_start_time
                for result in add_log(f"โœ… LLM ๋ถ„์„ ์™„๋ฃŒ (์ฒ˜๋ฆฌ ์‹œ๊ฐ„: {batch_duration:.1f}์ดˆ)"):
                    yield result
            else:
                # More images than batch_size: split into batches.
                num_batches = (len(images_to_use) + batch_size - 1) // batch_size
                for result in add_log(f"๐Ÿ“ฆ ์ด๋ฏธ์ง€๊ฐ€ {len(images_to_use)}์žฅ์ด๋ฏ€๋กœ {num_batches}๊ฐœ ๋ฐฐ์น˜๋กœ ๋‚˜๋ˆ„์–ด ์ฒ˜๋ฆฌ (๋ฐฐ์น˜๋‹น {batch_size}์žฅ)"):
                    yield result
                print(f"์ด๋ฏธ์ง€๊ฐ€ {len(images_to_use)}์žฅ์ด๋ฏ€๋กœ ๋ฐฐ์น˜ ์ฒ˜๋ฆฌ๋ฅผ ์‹œ์ž‘ํ•ฉ๋‹ˆ๋‹ค. (๋ฐฐ์น˜๋‹น {batch_size}์žฅ)")
                batch_results = []
                for i in range(0, len(images_to_use), batch_size):
                    batch_num = i // batch_size + 1
                    batch_images = images_to_use[i:i + batch_size]
                    for result in add_log(f"๐Ÿ“ค ๋ฐฐ์น˜ {batch_num}/{num_batches} API ์š”์ฒญ ์ค€๋น„ ์ค‘... (์ด๋ฏธ์ง€ {len(batch_images)}์žฅ)"):
                        yield result
                    # Log the outgoing request first so the UI shows it immediately.
                    batch_prompt = f"{full_prompt}"
                    system_prompt_clean = system_prompt if system_prompt.strip() else load_system_prompt()
                    messages = [
                        {"role": "system", "content": system_prompt_clean},
                        {
                            "role": "user",
                            "content": create_prompt_content_with_image(batch_images, batch_prompt)
                        }
                    ]
                    log_api_request(messages, model)
                    # Push the request info to the UI right away.
                    yield "", "", docling_output, "\n".join(current_log_messages), current_request_info
                    for result in add_log(f"๐Ÿค– ๋ฐฐ์น˜ {batch_num}/{num_batches} ์ฒ˜๋ฆฌ ์ค‘... (์ด๋ฏธ์ง€ {len(batch_images)}์žฅ)"):
                        yield result
                    try:
                        # Record the batch start time.
                        import time
                        batch_start_time = time.time()
                        completion = send_chat_completion_request(batch_images, batch_prompt, system_prompt)
                        batch_response = completion.choices[0].message.content
                        batch_results.append(batch_response)
                        print(batch_response)
                        # Compute the batch duration.
                        batch_duration = time.time() - batch_start_time
                        for result in add_log(f"โœ… ๋ฐฐ์น˜ {batch_num} ์™„๋ฃŒ (์ฒ˜๋ฆฌ ์‹œ๊ฐ„: {batch_duration:.1f}์ดˆ)"):
                            yield result
                    except Exception as e:
                        # A failed batch contributes an error string instead of aborting.
                        batch_results.append(f"๋ฐฐ์น˜ {batch_num} ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
                        for result in add_log(f"โŒ ๋ฐฐ์น˜ {batch_num} ์˜ค๋ฅ˜: {str(e)}"):
                            yield result
                batch_content = merge_batch_results(batch_results)  # store merged batch output
                response_content = batch_content  # initial result equals the batch result
                for result in add_log("๐Ÿ”— ๋ชจ๋“  ๋ฐฐ์น˜ ๊ฒฐ๊ณผ ๋ณ‘ํ•ฉ ์™„๋ฃŒ"):
                    yield result
        else:
            # No images: text-only processing. Show a system-prompt preview only.
            for result in add_log(f"๐Ÿค– ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ: {system_prompt[:50]}{'...' if len(system_prompt) > 50 else ''}"):
                yield result
            for result in add_log("๐Ÿ“ค ํ…์ŠคํŠธ ์ „์šฉ API ์š”์ฒญ ์ค€๋น„ ์ค‘..."):
                yield result
            # Log the outgoing request first so the UI shows it immediately.
            system_prompt_clean = system_prompt if system_prompt.strip() else load_system_prompt()
            messages = [
                {"role": "system", "content": system_prompt_clean},
                {
                    "role": "user",
                    "content": create_prompt_content_with_image([], full_prompt)
                }
            ]
            log_api_request(messages, model)
            # Push the request info to the UI right away.
            yield "", "", docling_output, "\n".join(current_log_messages), current_request_info
            for result in add_log("๐Ÿค– ํ…์ŠคํŠธ ์ „์šฉ LLM API ์š”์ฒญ ์ค‘..."):
                yield result
            # Record the text-only start time.
            import time
            text_start_time = time.time()
            completion = send_chat_completion_request([], full_prompt, system_prompt)
            response_content = completion.choices[0].message.content
            batch_content = response_content  # text-only: same as batch result
            print(response_content)
            # Compute the text-only duration.
            text_duration = time.time() - text_start_time
            for result in add_log(f"โœ… ํ…์ŠคํŠธ ๋ถ„์„ ์™„๋ฃŒ (์ฒ˜๋ฆฌ ์‹œ๊ฐ„: {text_duration:.1f}์ดˆ)"):
                yield result
        # Post-processing (only when multiple batches were needed).
        if use_postprocess and len(images_to_use) > batch_size:
            for result in add_log("๐Ÿ”„ ํ›„์ฒ˜๋ฆฌ ์ž‘์—…์„ ์‹œ์ž‘ํ•ฉ๋‹ˆ๋‹ค..."):
                yield result
            # Fall back to the default post-process prompt when none was given.
            if not postprocess_prompt.strip():
                postprocess_prompt = load_postprocess_prompt()
            # Combine the prompt and batch results into one text payload.
            combined_results = f"{postprocess_prompt}\n\n=== ๋ฐฐ์น˜ ์ฒ˜๋ฆฌ ๊ฒฐ๊ณผ ===\n\n{response_content}"
            for result in add_log("๐Ÿ“ค ํ›„์ฒ˜๋ฆฌ API ์š”์ฒญ ์ค€๋น„ ์ค‘..."):
                yield result
            # Log the outgoing request first so the UI shows it immediately.
            system_prompt_clean = system_prompt if system_prompt.strip() else load_system_prompt()
            messages = [
                {"role": "system", "content": system_prompt_clean},
                {
                    "role": "user",
                    "content": combined_results
                }
            ]
            log_api_request(messages, model)
            # Push the request info to the UI right away.
            yield "", "", docling_output, "\n".join(current_log_messages), current_request_info
            for result in add_log("๐Ÿค– ํ›„์ฒ˜๋ฆฌ LLM API ์š”์ฒญ ์ค‘..."):
                yield result
            # Record the post-process start time.
            import time
            postprocess_start_time = time.time()
            # Post-process call: text only, no images attached.
            completion = send_chat_completion_request([], combined_results, system_prompt)
            response_content = completion.choices[0].message.content
            print(response_content)
            # Compute the post-process duration.
            postprocess_duration = time.time() - postprocess_start_time
            for result in add_log(f"โœ… ํ›„์ฒ˜๋ฆฌ ์™„๋ฃŒ (์ฒ˜๋ฆฌ ์‹œ๊ฐ„: {postprocess_duration:.1f}์ดˆ)"):
                yield result
        # Compute and log the total duration.
        total_duration = time.time() - total_start_time
        for result in add_log(f"โฑ๏ธ ์ „์ฒด ์ฒ˜๋ฆฌ ์‹œ๊ฐ„: {total_duration:.1f}์ดˆ"):
            yield result
        for result in add_log("๐ŸŽ‰ ๋ชจ๋“  ์ฒ˜๋ฆฌ๊ฐ€ ์„ฑ๊ณต์ ์œผ๋กœ ์™„๋ฃŒ๋˜์—ˆ์Šต๋‹ˆ๋‹ค!"):
            yield result
        yield batch_content, response_content, docling_output, "\n".join(current_log_messages), current_request_info
    except Exception as e:
        # Record the total duration even on failure.
        total_duration = time.time() - total_start_time
        for result in add_log(f"โฑ๏ธ ์ „์ฒด ์ฒ˜๋ฆฌ ์‹œ๊ฐ„: {total_duration:.1f}์ดˆ (์˜ค๋ฅ˜๋กœ ์ธํ•œ ์ค‘๋‹จ)"):
            yield result
        error_msg = f"์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"
        for result in add_log(f"โŒ {error_msg}"):
            yield result
        for result in add_log("์ฒ˜๋ฆฌ๊ฐ€ ์ค‘๋‹จ๋˜์—ˆ์Šต๋‹ˆ๋‹ค."):
            yield result
        yield "", error_msg, docling_output, "\n".join(current_log_messages), current_request_info
    finally:
        # Clean up temporary image directories.
        if temp_dirs_to_cleanup:
            for result in add_log("๐Ÿงน ์ž„์‹œ ํŒŒ์ผ ์ •๋ฆฌ ์ค‘..."):
                yield result
            for temp_dir in set(temp_dirs_to_cleanup):  # de-duplicate
                try:
                    if os.path.exists(temp_dir):
                        shutil.rmtree(temp_dir)
                        print(f"์ž„์‹œ ๋””๋ ‰ํ† ๋ฆฌ ์ •๋ฆฌ: {temp_dir}")
                except Exception as e:
                    print(f"์ž„์‹œ ๋””๋ ‰ํ† ๋ฆฌ ์ •๋ฆฌ ์‹คํŒจ: {temp_dir}, ์˜ค๋ฅ˜: {e}")
def process_request_preprocessing_only(
    prompt: str,
    system_prompt: str,
    use_images: bool,
    use_docling: bool,
    pdf_file_path: str,
    uploaded_file: str,
    output_filename: str,
    image_processing_mode: str = "๊ฐ€๋กœ ๋ณ‘ํ•ฉ (2ํŽ˜์ด์ง€์”ฉ)",
    overlap_option: str = "์ผ๋ฐ˜ ๋ณ‘ํ•ฉ",
    batch_size: int = 3,
    progress = None
) -> Generator[Tuple[str, str, str, str, str], None, None]:
    """Run only the preprocessing (batch) stage.

    Thin wrapper around process_request with post-processing forcibly
    disabled; all UI-update tuples are passed straight through.
    """
    yield from process_request(
        prompt,
        system_prompt,
        use_images,
        use_docling,
        pdf_file_path,
        uploaded_file,
        output_filename,
        image_processing_mode=image_processing_mode,
        overlap_option=overlap_option,
        batch_size=batch_size,
        use_postprocess=False,  # disable post-processing
        postprocess_prompt="",
        progress=progress,
    )
def process_request_postprocessing_only(
    batch_result: str,
    system_prompt: str,
    postprocess_prompt: str = "",
    progress = None
) -> Generator[Tuple[str, str, str, str, str], None, None]:
    """Run only the post-processing stage on previously collected batch output.

    Merges ``batch_result`` into a final document with one text-only LLM
    call. Every yielded tuple is (batch_result, final_result, "",
    log_text, api_request_info); the batch result passes through unchanged
    and only the final result is produced here. ``progress`` is unused.
    """
    import time
    global current_log_messages, current_request_info
    current_log_messages = []  # reset the log
    current_request_info = ""  # reset accumulated API request info
    # Record the wall-clock start of the run.
    total_start_time = time.time()
    def add_log(message):
        # Generator helper: append one numbered log line, then yield a UI update.
        current_log_messages.append(f"[{len(current_log_messages)+1:02d}] {message}")
        log_text = "\n".join(current_log_messages)
        # Post-processing keeps the batch result as-is; only the final
        # result slot changes.
        yield batch_result, "", "", log_text, current_request_info
        return log_text
    # Prime the log with a first entry (the yield is consumed via next()).
    log_generator = add_log("ํ›„์ฒ˜๋ฆฌ ์‹œ์ž‘...")
    next(log_generator)  # emit the first log line
    try:
        if not batch_result or not batch_result.strip():
            msg = "ํ›„์ฒ˜๋ฆฌํ•  ๋ฐฐ์น˜ ๊ฒฐ๊ณผ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. ๋จผ์ € ์ „์ฒ˜๋ฆฌ๋ฅผ ์ˆ˜ํ–‰ํ•ด์ฃผ์„ธ์š”."
            for result in add_log("โŒ ๋ฐฐ์น˜ ๊ฒฐ๊ณผ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค."):
                yield result
            yield batch_result, msg, "", "\n".join(current_log_messages), current_request_info
            return
        for result in add_log("๐Ÿ”„ ํ›„์ฒ˜๋ฆฌ ์ž‘์—…์„ ์‹œ์ž‘ํ•ฉ๋‹ˆ๋‹ค..."):
            yield result
        # Fall back to the default post-process prompt when none was given.
        if not postprocess_prompt.strip():
            postprocess_prompt = load_postprocess_prompt()
        # Combine the prompt and batch results into one text payload.
        combined_results = f"{postprocess_prompt}\n\n=== ๋ฐฐ์น˜ ์ฒ˜๋ฆฌ ๊ฒฐ๊ณผ ===\n\n{batch_result}"
        for result in add_log("๐Ÿ“ค ํ›„์ฒ˜๋ฆฌ API ์š”์ฒญ ์ค€๋น„ ์ค‘..."):
            yield result
        # Log the outgoing request first so the UI shows it immediately.
        system_prompt_clean = system_prompt if system_prompt.strip() else load_system_prompt()
        messages = [
            {"role": "system", "content": system_prompt_clean},
            {
                "role": "user",
                "content": combined_results
            }
        ]
        log_api_request(messages, model)
        # Push the request info to the UI right away.
        yield batch_result, "", "", "\n".join(current_log_messages), current_request_info
        for result in add_log("๐Ÿค– ํ›„์ฒ˜๋ฆฌ LLM API ์š”์ฒญ ์ค‘..."):
            yield result
        # Record the post-process start time.
        postprocess_start_time = time.time()
        # Post-process call: text only, no images attached.
        completion = send_chat_completion_request([], combined_results, system_prompt)
        final_result = completion.choices[0].message.content
        # Compute the post-process duration.
        postprocess_duration = time.time() - postprocess_start_time
        for result in add_log(f"โœ… ํ›„์ฒ˜๋ฆฌ ์™„๋ฃŒ (์ฒ˜๋ฆฌ ์‹œ๊ฐ„: {postprocess_duration:.1f}์ดˆ)"):
            yield result
        # Compute and log the total duration.
        total_duration = time.time() - total_start_time
        for result in add_log(f"โฑ๏ธ ์ „์ฒด ์ฒ˜๋ฆฌ ์‹œ๊ฐ„: {total_duration:.1f}์ดˆ"):
            yield result
        for result in add_log("๐ŸŽ‰ ํ›„์ฒ˜๋ฆฌ๊ฐ€ ์„ฑ๊ณต์ ์œผ๋กœ ์™„๋ฃŒ๋˜์—ˆ์Šต๋‹ˆ๋‹ค!"):
            yield result
        # Final update: batch result unchanged, final result filled in.
        yield batch_result, final_result, "", "\n".join(current_log_messages), current_request_info
    except Exception as e:
        # Record the total duration even on failure.
        total_duration = time.time() - total_start_time
        for result in add_log(f"โฑ๏ธ ์ „์ฒด ์ฒ˜๋ฆฌ ์‹œ๊ฐ„: {total_duration:.1f}์ดˆ (์˜ค๋ฅ˜๋กœ ์ธํ•œ ์ค‘๋‹จ)"):
            yield result
        error_msg = f"ํ›„์ฒ˜๋ฆฌ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"
        for result in add_log(f"โŒ {error_msg}"):
            yield result
        for result in add_log("ํ›„์ฒ˜๋ฆฌ๊ฐ€ ์ค‘๋‹จ๋˜์—ˆ์Šต๋‹ˆ๋‹ค."):
            yield result
        yield batch_result, error_msg, "", "\n".join(current_log_messages), current_request_info