Spaces:

LanguageBind
/

UniWorld-V1

Runtime error

File size: 4,658 Bytes

0c8d55e

import pandas as pd
import numpy as np
import pandas as pd
from PIL import ImageDraw
from datasets import load_dataset, Image
from PIL import Image
# PaddleOCR is an optional dependency: when it cannot be imported the name is
# bound to None and ocr_with_paddle() reports the problem at call time.
try:
    from paddleocr import PaddleOCR
except Exception:
    # Deliberately broader than ImportError so any import-time failure still
    # falls back to None, but unlike a bare ``except:`` this no longer
    # swallows KeyboardInterrupt / SystemExit.
    PaddleOCR = None
          

def ocr_with_paddle(img):
    """Run PaddleOCR on an image and flatten each detection into one list.

    Args:
        img: Image input forwarded unchanged to ``PaddleOCR.ocr``; the caller
            in this file passes a file path.

    Returns:
        A list with one entry per detection of the first result page. Each
        entry is the detection with its trailing pair expanded in place,
        i.e. ``[..., pair[0], pair[1]]`` — presumably ``[box, text, score]``
        given how the caller names the columns (TODO confirm against the
        installed PaddleOCR version). Empty when nothing was detected.

    Raises:
        ValueError: If ``PaddleOCR`` is ``None`` because the import failed.
    """
    if PaddleOCR is None:
        raise ValueError(
            'PaddleOCR is not available. Install it with: '
            'sudo apt install swig -y && pip install paddleocr==2.7.0.3 '
            'paddle-bfloat==0.1.7 paddlepaddle==2.5.2 protobuf==3.20.2'
        )
    # NOTE: a new engine is constructed on every call.
    ocr = PaddleOCR(lang='en', use_angle_cls=True, show_log=False)
    result = ocr.ocr(img)
    # Only the first result page is used; treat a missing/empty result the
    # same as "no text found" instead of failing on result[0].
    if not result or result[0] is None:
        return []
    return [[*line[:-1], line[-1][0], line[-1][1]] for line in result[0]]

def draw_boxes(image, bounds, color='yellow', width=2):
    """Outline every detection's four-cornered box on ``image``.

    Args:
        image: Image handed to ``ImageDraw.Draw``.
        bounds: Iterable of detections; element ``[0]`` of each detection
            must unpack into exactly four corner points.
        color: Line colour passed as ``fill``.
        width: Line width passed to ``ImageDraw.line``.

    Returns:
        The same ``image`` object that was passed in.
    """
    pen = ImageDraw.Draw(image)
    for detection in bounds:
        corner_a, corner_b, corner_c, corner_d = detection[0]
        # Flatten the corners into one coordinate list, repeating the first
        # corner at the end so the outline is closed.
        outline = []
        for corner in (corner_a, corner_b, corner_c, corner_d, corner_a):
            outline.extend(corner)
        pen.line(outline, fill=color, width=width)
    return image
       

def calculate_position(box, width, height):
    """Name the cell of a 3x3 grid that contains the centre of ``box``.

    The centre is the midpoint of the box's axis-aligned extent (halfway
    between the smallest and largest x, and likewise for y).

    Args:
        box: Sequence of points, each indexable as ``point[0]`` (x) and
            ``point[1]`` (y), e.g. ``[[x1, y1], [x2, y2], [x3, y3], [x4, y4]]``.
        width: The width of the image.
        height: The height of the image.

    Returns:
        A ``"<row>-<col>"`` string where row is ``top``/``middle``/``bottom``
        and col is ``left``/``center``/``right``, e.g. ``"top-left"``,
        ``"middle-center"`` or ``"bottom-right"``.
    """
    def pick_band(value, extent, labels):
        # A value exactly on a one-third boundary belongs to the later band.
        first, second, third = labels
        if value < extent / 3:
            return first
        return second if value < 2 * extent / 3 else third

    xs = [point[0] for point in box]
    ys = [point[1] for point in box]
    mid_x = (min(xs) + max(xs)) / 2
    mid_y = (min(ys) + max(ys)) / 2

    vertical = pick_band(mid_y, height, ("top", "middle", "bottom"))
    horizontal = pick_band(mid_x, width, ("left", "center", "right"))
    return "-".join((vertical, horizontal))


def process_dataframe(df, image_width, image_height, min_score=0.9):
    """Keep confident detections and tag each with its 9-grid position.

    Args:
        df: The input Pandas DataFrame with 'box', 'text', and 'score' columns.
        image_width: The width of the image.
        image_height: The height of the image.
        min_score: Exclusive lower bound on 'score'; rows scoring at or below
            it are dropped. Defaults to 0.9.

    Returns:
        A new DataFrame holding only the rows whose score is strictly greater
        than ``min_score``, with an added 'position' column computed by
        ``calculate_position``. The input DataFrame is not modified.
    """
    # .copy() so the column assignment below writes to an independent frame
    # rather than a slice of ``df`` (avoids SettingWithCopyWarning).
    df_filtered = df[df['score'] > min_score].copy()

    df_filtered['position'] = df_filtered['box'].apply(
        lambda box: calculate_position(box, image_width, image_height)
    )

    return df_filtered



_ORDINAL_WORDS = {
    1: "first", 2: "second", 3: "third", 4: "fourth", 5: "fifth",
    6: "sixth", 7: "seventh", 8: "eighth", 9: "ninth", 10: "tenth",
    11: "eleventh", 12: "twelfth", 13: "thirteenth", 14: "fourteenth",
    15: "fifteenth", 16: "sixteenth", 17: "seventeenth", 18: "eighteenth",
    19: "nineteenth", 20: "twentieth",
}


def _ordinal_label(number):
    """Return the ordinal for ``number``: spelled out for 1-20, else e.g. "21st"."""
    word = _ORDINAL_WORDS.get(number)
    if word is not None:
        return word
    if number < 1:
        raise ValueError(f"image_number must be >= 1, got {number!r}")
    # 11, 12 and 13 (and 111, 112, ...) take "th" even though they end in 1-3.
    if number % 100 in (11, 12, 13):
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th")
    return f"{number}{suffix}"


def format_for_text_to_image_condensed(df, image_number):
    """Condense OCR rows into one sentence block for text-to-image prompts.

    Texts that share a position are grouped into a single sentence; groups
    appear in the order their position is first seen in ``df``.

    Args:
        df: DataFrame with 'position' and 'text' columns.
        image_number: 1-based image number. 1-20 are spelled out ("first",
            ...); larger numbers fall back to a numeric ordinal ("21st").

    Returns:
        An empty string when ``df`` has no rows, otherwise a string of the
        form ``In the <ordinal> image: ( <sentence> <sentence> ... )``.

    Raises:
        ValueError: If ``df`` has rows and ``image_number`` is below 1.
    """
    if len(df) == 0:
        return ''

    # Explicit validation instead of ``assert`` so it still runs under -O.
    ordinal = _ordinal_label(image_number)

    # Plain dicts keep insertion order, which fixes the sentence order.
    position_to_texts = {}
    for position, text in zip(df['position'], df['text']):
        position_to_texts.setdefault(position, []).append(text)

    sentences = [f'In the {ordinal} image: (']
    for position, texts in position_to_texts.items():
        text_string = ", ".join(f'"{text}"' for text in texts)
        sentences.append(f"The texts {text_string} are located at the {position} of the {ordinal} image.")
    return " ".join(sentences) + ' )'

def get_ocr_result(img_path: str, img_index: int = 0) -> str:
    """OCR an image file and describe where its text sits, as one sentence block.

    Args:
        img_path: Path of the image; passed both to ``ocr_with_paddle`` and
            to ``Image.open`` (to read the image size).
        img_index: Zero-based index of the image; it is converted to a
            one-based image number before formatting.

    Returns:
        The string produced by ``format_for_text_to_image_condensed`` for the
        detections kept by ``process_dataframe``.
    """
    ocr_result = ocr_with_paddle(img_path)
    ocr_result_df = pd.DataFrame(ocr_result, columns=['box', 'text', 'score'])
    # Only the dimensions are needed; the context manager closes the file
    # handle instead of leaving it open until garbage collection.
    with Image.open(img_path) as image:
        image_width, image_height = image.size
    df_processed = process_dataframe(ocr_result_df, image_width, image_height)
    return format_for_text_to_image_condensed(df_processed, image_number=img_index + 1)