# Log in to Hugging Face (only needs to be done once)

In [None]:
# Authenticate this session with the Hugging Face Hub via an interactive
# token prompt. The later cells push a dataset and a model to the Hub.
from huggingface_hub import interpreter_login
interpreter_login()

# Collect Menu Image Datasets
- Use `metadata.jsonl` to label each image's ground truth. See the [examples](https://github.com/ryanlinjui/menu-text-detection/tree/main/examples) for reference.
- After finishing, push to HuggingFace Datasets.
- For labeling, use either of:
 - [Google AI Studio](https://aistudio.google.com) or [OpenAI ChatGPT](https://chatgpt.com) directly.
 - Function calling through the API: start the Gradio app locally or visit the [hosted Space](https://huggingface.co/spaces/ryanlinjui/menu-text-detection).

### Menu Type
- **h**: horizontal menu
- **v**: vertical menu
- **d**: document-style menu
- **s**: in-scene menu (non-document style)
- **i**: irregular menu (menu with irregular text layout)

> Please see the [examples](https://github.com/ryanlinjui/menu-text-detection/tree/main/examples) for more details.

In [None]:
import os
import json

import numpy as np
from PIL import Image
from pillow_heif import register_heif_opener

from menu.llm import (
 GeminiAPI,
 OpenAIAPI
)

IMAGE_DIR = "datasets/images"  # set your image directory here
SELECTED_MODEL = "gemini-2.5-flash"  # set the model name here; see MODEL_LIST in app.py for more options
API_TOKEN = ""  # set your API token here
SELECTED_FUNCTION = GeminiAPI  # set to GeminiAPI or OpenAIAPI
METADATA_FILE = "metadata.jsonl"  # labels file, written next to the images

# Register the HEIF opener so Pillow can read HEIF/HEIC photos as well.
register_heif_opener()

metadata_path = os.path.join(IMAGE_DIR, METADATA_FILE)

# Label every image in IMAGE_DIR and append one JSON line per image to the
# metadata file. sorted() keeps the processing order (and therefore the line
# order in the metadata file) reproducible across runs.
# NOTE: the metadata file is opened in append mode, so re-running this cell
# adds a second entry for images that were already labelled.
for file in sorted(os.listdir(IMAGE_DIR)):
    image_path = os.path.join(IMAGE_DIR, file)

    # The metadata file lives in the same directory as the images; without
    # this guard a re-run would try to open it (and any sub-directory) as an
    # image.
    if file == METADATA_FILE or not os.path.isfile(image_path):
        continue

    print(f"Processing image: {file}")
    try:
        # Release the file handle as soon as the pixels are copied into the
        # array instead of leaving it open until garbage collection.
        with Image.open(image_path) as pil_image:
            image = np.array(pil_image)

        data = {
            "file_name": file,
            "menu": SELECTED_FUNCTION.call(image, SELECTED_MODEL, API_TOKEN),
        }
        with open(metadata_path, "a", encoding="utf-8") as metaf:
            metaf.write(json.dumps(data, ensure_ascii=False, sort_keys=True) + "\n")
    except Exception as e:
        # Best effort: one unreadable file or failed labeling call must not
        # abort the whole batch, so report it and move on to the next file.
        print(f"Skipping invalid image '{file}': {e}")

# Push Datasets to HuggingFace

In [None]:
from datasets import load_dataset

# Build the dataset from the local folder that holds the images and their
# metadata.jsonl, then upload it to the Hugging Face Hub.
# NOTE(review): the labeling cell writes metadata.jsonl into IMAGE_DIR
# ("datasets/images"); confirm that this path points at that labelled folder.
dataset = load_dataset(path="datasets/menu-zh-TW") # local directory containing the image files and metadata.jsonl
dataset.push_to_hub(repo_id="ryanlinjui/menu-zh-TW") # target dataset repo id on the Hugging Face Hub

# Prepare the dataset for training

In [None]:
from menu.utils import split_dataset
from datasets import load_dataset

# Load the dataset repo used for training and carve its "train" split into
# train/validation/test portions. The split step is optional: use it only if
# the dataset is not already split into train/validation/test.
dataset = split_dataset(
    load_dataset(path="ryanlinjui/menu-zh-TW")["train"],  # set your dataset repo id for training
    train=0.8,
    validation=0.1,
    test=0.1,
    seed=42,  # fixed seed so the same split can be recreated later
)
print(f"Dataset split: {len(dataset['train'])} train, {len(dataset['validation'])} validation, {len(dataset['test'])} test")

# Fine-tune Donut Model

In [None]:
import logging
from menu.donut import DonutTrainer

# Only show ERROR-level messages from transformers so the training output
# stays readable.
logging.getLogger("transformers").setLevel(logging.ERROR)

# Fine-tune the pretrained Donut checkpoint on the split dataset prepared in
# the previous cell (`dataset` must expose a "train" split).
DonutTrainer.train(
    dataset=dataset,
    pretrained_model_repo_id="naver-clova-ix/donut-base",  # set your pretrained model repo id for fine-tuning
    ground_truth_key="menu",  # set your ground truth key for training
    huggingface_model_id="ryanlinjui/donut-base-finetuned-menu",  # set your huggingface model repo id for saving / pushing to the hub
    epochs=15,  # set your training epochs
    train_batch_size=8,  # set your training batch size
    val_batch_size=1,  # set your validation batch size
    learning_rate=3e-5,  # set your learning rate
    val_check_interval=0.5,  # how many times we want to validate during an epoch
    check_val_every_n_epoch=1,  # how many epochs we want to validate
    gradient_clip_val=1.0,  # gradient clipping value for training stability
    # Must equal the training set size; derived from the dataset so it stays
    # correct when the dataset grows or the split ratios change (was a
    # hard-coded 198).
    num_training_samples_per_epoch=len(dataset["train"]),
    num_nodes=1,  # number of nodes for distributed training
    # Number of warmup steps for the learning rate scheduler.
    # 75 ~= 10% of 198 / 8 * 30 steps (198 samples, batch size 8, 30 epochs).
    # NOTE(review): `epochs` is 15 above, so 75 is roughly 20% of that run;
    # confirm whether 10% of the actual schedule was intended.
    warmup_steps=75,
)

# Evaluate Donut Model

In [None]:
import json
from datasets import load_dataset

from menu.utils import split_dataset
from menu.donut import DonutFinetuned

# Rebuild the train/validation/test split with the same ratios and seed as
# the training cell, then score the fine-tuned model on the "test" portion.
# The split step is optional: use it only if the dataset is not already split
# into train/validation/test.
dataset = split_dataset(
    load_dataset("ryanlinjui/menu-zh-TW")["train"],
    train=0.8,
    validation=0.1,
    test=0.1,
    seed=42,
)
donut_finetuned = DonutFinetuned(
    pretrained_model_repo_id="ryanlinjui/donut-base-finetuned-menu",
)
scores, output_list = donut_finetuned.evaluate(
    dataset=dataset["test"],
    ground_truth_key="menu",
)

# One line per entry in the returned scores mapping.
print("Evaluation scores:")
for key, value in scores.items():
    print(f"{key}: {value}")

# Pretty-print the first five outputs; ensure_ascii=False keeps non-ASCII
# menu text readable instead of escaping it.
print("\nSample outputs:")
for output in output_list[:5]:
    print(json.dumps(output, ensure_ascii=False, indent=4))

# Test Donut Model

In [None]:
from PIL import Image
from menu.donut import DonutFinetuned

# Open a single example menu image from the local examples directory.
image = Image.open("./examples/menu-hd.jpg")

# Instantiate the fine-tuned model from its repo id, run it on the image and
# print whatever predict() returns.
donut_finetuned = DonutFinetuned(pretrained_model_repo_id="ryanlinjui/donut-base-finetuned-menu")
outputs = donut_finetuned.predict(image=image)
print(outputs)