# Captured page header (not part of the program): Spaces - Running; file size: 5,121 bytes; 6bd37dd.
import os
import json
import gradio as gr
from PIL import Image
from dotenv import load_dotenv
from pillow_heif import register_heif_opener
from menu.llm import (
GeminiAPI,
OpenAIAPI
)
from menu.donut import DonutFinetuned
# Register pillow_heif's opener with Pillow so HEIF images can be opened.
register_heif_opener()
# Load variables from a .env file; override=True lets them replace values
# that are already set in the process environment.
load_dotenv(override=True)

# Default API tokens; an empty string means "not configured".
GEMINI_API_TOKEN = os.environ.get("GEMINI_API_TOKEN", "")
OPENAI_API_TOKEN = os.environ.get("OPENAI_API_TOKEN", "")

# Links used by the page header badge and the example gallery.
SOURCE_CODE_GH_URL = "https://github.com/ryanlinjui/menu-text-detection"
BADGE_URL = "https://img.shields.io/badge/GitHub_Code-Click_Here!!-default?logo=github"
GITHUB_RAW_URL = "https://raw.githubusercontent.com/ryanlinjui/menu-text-detection/main"
EXAMPLE_IMAGE_LIST = [
    f"{GITHUB_RAW_URL}/examples/{file_name}"
    for file_name in ("menu-hd.jpg", "menu-vs.jpg", "menu-si.jpg")
]

# Choices offered in the model dropdown.
FINETUNED_MODEL_LIST = ["Donut (Document Parsing Task) Fine-tuned Model"]
# NOTE: the code in this file treats the first three entries as Gemini
# models (via LLM_MODEL_LIST[:3]) and the rest as OpenAI models, so the
# ordering here matters.
LLM_MODEL_LIST = [
    "gemini-2.5-pro",
    "gemini-2.5-flash",
    "gemini-2.0-flash",
    "gpt-4.1",
    "gpt-4o",
    "o4-mini",
]

# Instantiated once at import time and shared by every request.
donut_finetuned = DonutFinetuned("ryanlinjui/donut-base-finetuned-menu")
def handle(image: Image.Image, model: str, api_token: str) -> str:
    """Run menu text detection on ``image`` with the selected model.

    Args:
        image: The uploaded menu image, or ``None`` if nothing was uploaded.
        model: A model name taken from ``FINETUNED_MODEL_LIST`` or
            ``LLM_MODEL_LIST``.
        api_token: Token for the LLM provider. Only used when ``model`` is
            an entry of ``LLM_MODEL_LIST``.

    Returns:
        The model result serialized as JSON (4-space indent, sorted keys,
        non-ASCII characters kept as-is).

    Raises:
        gr.Error: If no image was provided, the model name is unknown, the
            token is missing or shorter than 10 characters, or the API call
            fails.
    """
    if image is None:
        raise gr.Error("Please upload an image first.")

    if model == FINETUNED_MODEL_LIST[0]:
        result = donut_finetuned.predict(image)
    elif model in LLM_MODEL_LIST:
        # Tolerate a missing value and whitespace picked up when pasting, so
        # the length check below reflects the actual token.
        token = (api_token or "").strip()
        if len(token) < 10:
            raise gr.Error(f"Please provide a valid token for {model}.")
        try:
            # The first three entries of LLM_MODEL_LIST are the Gemini models;
            # everything after them is routed to OpenAI.
            if model in LLM_MODEL_LIST[:3]:
                result = GeminiAPI.call(image, model, token)
            else:
                result = OpenAIAPI.call(image, model, token)
        except Exception as e:
            # Surface the failure in the UI while keeping the original
            # exception as the explicit cause for server-side debugging.
            raise gr.Error(f"Failed to process with API model {model}: {str(e)}") from e
    else:
        raise gr.Error("Invalid model selection. Please choose a valid model.")

    return json.dumps(result, indent=4, ensure_ascii=False, sort_keys=True)
def UserInterface() -> gr.Blocks:
    """Build the Gradio Blocks app for menu text detection.

    The left column holds the image input, the model dropdown, the API token
    textbox, the generate button and the example images; the right column
    holds the JSON output textbox.

    Returns:
        The ``gr.Blocks`` object, ready for ``.launch()``.
    """
    with gr.Blocks(
        delete_cache=(86400, 86400),
        css="""
        .image-panel {
            display: flex;
            flex-direction: column;
            height: 600px;
        }
        .image-panel img {
            object-fit: contain;
            max-height: 600px;
            max-width: 600px;
            width: 100%;
        }
        .large-text textarea {
            font-size: 20px !important;
            height: 600px !important;
            width: 100% !important;
        }
        """
    ) as gradio_interface:
        gr.HTML(f'<a href="{SOURCE_CODE_GH_URL}"><img src="{BADGE_URL}" alt="GitHub Code"/></a>')
        gr.Markdown("# Menu Text Detection")

        with gr.Row():
            with gr.Column(scale=1, min_width=500):
                gr.Markdown("## 📷 Menu Image")
                menu_image = gr.Image(
                    type="pil",
                    label="Input menu image",
                    elem_classes="image-panel"
                )

                gr.Markdown("## 🤖 Model Selection")
                model_choice_dropdown = gr.Dropdown(
                    choices=FINETUNED_MODEL_LIST + LLM_MODEL_LIST,
                    value=FINETUNED_MODEL_LIST[0],
                    label="Select Text Detection Model"
                )
                # Hidden initially because the default (fine-tuned) model
                # does not use a token.
                api_token_textbox = gr.Textbox(
                    label="API Token",
                    placeholder="Enter your API token here...",
                    type="password",
                    visible=False
                )
                generate_button = gr.Button("Generate Menu Information", variant="primary")

                gr.Examples(
                    examples=EXAMPLE_IMAGE_LIST,
                    inputs=menu_image,
                    label="Example Menu Images"
                )

            with gr.Column(scale=1):
                gr.Markdown("## 🍽️ Menu Info")
                menu_json_textbox = gr.Textbox(
                    label="Output JSON",
                    interactive=True,
                    text_align="left",
                    elem_classes="large-text"
                )

        def update_token_visibility(choice):
            """Show the token textbox for LLM models and hide it otherwise."""
            if choice not in LLM_MODEL_LIST:
                return gr.update(visible=False)
            # The first three entries of LLM_MODEL_LIST are the Gemini models.
            # NOTE(review): this pre-fills the textbox with the token read
            # from the server environment, which presumably reaches the
            # browser - confirm that is acceptable for a public deployment.
            if choice in LLM_MODEL_LIST[:3]:
                current_token = GEMINI_API_TOKEN
            else:
                current_token = OPENAI_API_TOKEN
            return gr.update(visible=True, value=current_token)

        model_choice_dropdown.change(
            fn=update_token_visibility,
            inputs=model_choice_dropdown,
            outputs=api_token_textbox
        )
        generate_button.click(
            fn=handle,
            inputs=[menu_image, model_choice_dropdown, api_token_textbox],
            outputs=menu_json_textbox
        )

    return gradio_interface
if __name__ == "__main__":
    # Build the Blocks app and start the Gradio server when run as a script.
    demo = UserInterface()
    demo.launch()