File size: 4,658 Bytes
0c8d55e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import pandas as pd
import numpy as np
import pandas as pd
from PIL import ImageDraw
from datasets import load_dataset, Image
from PIL import Image
try:
    from paddleocr import PaddleOCR
except Exception:
    # paddleocr is an optional dependency: fall back to None so that
    # ocr_with_paddle can raise with install instructions instead of the
    # module failing at import time. `Exception` (not a bare except) keeps
    # the best-effort behavior for any import-time failure while letting
    # KeyboardInterrupt / SystemExit propagate.
    PaddleOCR = None
          

def ocr_with_paddle(img):
    """Run PaddleOCR on an image and flatten each detection.

    Args:
        img: Image input forwarded unchanged to ``PaddleOCR.ocr``
            (``get_ocr_result`` passes a file path).

    Returns:
        A list with one ``[box, text, score]`` entry per detection, or an
        empty list when the engine reports nothing for the image.

    Raises:
        ValueError: If the optional ``paddleocr`` package could not be
            imported; the message carries the install command.
    """
    if PaddleOCR is None:
        raise ValueError('sudo apt install swig -y && pip install paddleocr==2.7.0.3 paddle-bfloat==0.1.7 paddlepaddle==2.5.2 protobuf==3.20.2')
    ocr = PaddleOCR(lang='en', use_angle_cls=True, show_log=False)
    result = ocr.ocr(img)
    # Only the first page/image of the result is used. Guard against both an
    # empty/None result and a None first entry so "no text found" yields an
    # empty list instead of an IndexError/TypeError.
    if not result or result[0] is None:
        return []
    # Each raw entry ends with a (text, score) pair; spread that pair so the
    # row becomes [box, text, score].
    return [[*entry[:-1], entry[-1][0], entry[-1][1]] for entry in result[0]]

def draw_boxes(image, bounds, color='yellow', width=2):
    """Outline every entry of ``bounds`` on ``image``.

    Args:
        image: Image handed to ``ImageDraw.Draw``; it is drawn on directly.
        bounds: Iterable of entries whose first element is a sequence of
            exactly four corner points.
        color: Line color passed as ``fill``.
        width: Line width passed to ``ImageDraw.line``.

    Returns:
        The same ``image`` object that was passed in.
    """
    pen = ImageDraw.Draw(image)
    for detection in bounds:
        corner_a, corner_b, corner_c, corner_d = detection[0]
        # Flatten the corners into one coordinate list, repeating the first
        # corner at the end so the outline is closed.
        outline = []
        for corner in (corner_a, corner_b, corner_c, corner_d, corner_a):
            outline.extend(corner)
        pen.line(outline, fill=color, width=width)
    return image
       

def calculate_position(box, width, height):
    """Locate a bounding box inside a 3x3 grid laid over the image.

    The midpoint of the box's axis-aligned extent decides the cell.

    Args:
        box: Sequence of points, each indexable as ``[x, y]``
            (e.g. ``[[x1, y1], [x2, y2], [x3, y3], [x4, y4]]``).
        width: The width of the image.
        height: The height of the image.

    Returns:
        A ``"<row>-<col>"`` string where row is top/middle/bottom and col is
        left/center/right (e.g. "top-left", "middle-center", "bottom-right").
    """

    def _band(value, extent, names):
        # A value lying exactly on a one-third boundary belongs to the
        # later band, because both comparisons are strict.
        if value < extent / 3:
            return names[0]
        if value < 2 * extent / 3:
            return names[1]
        return names[2]

    xs = [point[0] for point in box]
    ys = [point[1] for point in box]

    mid_x = (min(xs) + max(xs)) / 2
    mid_y = (min(ys) + max(ys)) / 2

    vertical = _band(mid_y, height, ("top", "middle", "bottom"))
    horizontal = _band(mid_x, width, ("left", "center", "right"))
    return f"{vertical}-{horizontal}"


def process_dataframe(df, image_width, image_height, min_score=0.9):
    """Keep confident OCR rows and tag each with its 9-grid position.

    Args:
        df: The input Pandas DataFrame with 'box', 'text', and 'score' columns.
        image_width: The width of the image.
        image_height: The height of the image.
        min_score: Exclusive lower bound on 'score'; only rows with a score
            strictly greater than this are kept. Defaults to 0.9.

    Returns:
        A new Pandas DataFrame holding the rows that passed the score filter,
        with an added 'position' column. The input DataFrame is not modified.
    """
    # Copy the selection so adding a column neither touches the caller's
    # frame nor triggers SettingWithCopyWarning.
    confident = df[df['score'] > min_score].copy()

    # Derive the grid cell of every remaining box.
    confident['position'] = confident['box'].apply(
        lambda box: calculate_position(box, image_width, image_height)
    )

    return confident



_ORDINAL_WORDS = {
    1: "first", 2: "second", 3: "third", 4: "fourth", 5: "fifth",
    6: "sixth", 7: "seventh", 8: "eighth", 9: "ninth", 10: "tenth",
    11: "eleventh", 12: "twelfth", 13: "thirteenth", 14: "fourteenth",
    15: "fifteenth", 16: "sixteenth", 17: "seventeenth", 18: "eighteenth",
    19: "nineteenth", 20: "twentieth",
}


def _numeric_ordinal(number):
    """Return ``number`` with an English ordinal suffix (e.g. 21 -> "21st").

    Values that are not ``int`` are returned as plain ``str(number)``.
    """
    if not isinstance(number, int):
        return str(number)
    last_two = abs(number) % 100
    if 11 <= last_two <= 13:
        # 11th, 12th, 13th (and 111th, ...) are irregular.
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(last_two % 10, "th")
    return f"{number}{suffix}"


def format_for_text_to_image_condensed(df, image_number):
    """Format OCR rows as one condensed sentence for text-to-image models.

    Texts sharing the same position are grouped into a single clause, and
    the image is referred to by its ordinal.

    Args:
        df: DataFrame with 'position' and 'text' columns.
        image_number: 1-based number of the image. 1-20 are spelled out
            ("first" ... "twentieth"); any other value falls back to a
            numeric form such as "21st" instead of failing.

    Returns:
        The formatted sentence, or an empty string when ``df`` has no rows.
    """
    if len(df) == 0:
        return ''

    # Previously an `assert` rejected numbers outside the map; asserts are
    # stripped under `python -O`, and images past the twentieth crashed.
    ordinal = _ORDINAL_WORDS.get(image_number)
    if ordinal is None:
        ordinal = _numeric_ordinal(image_number)

    # Group texts by position, keeping first-seen order of positions.
    position_to_texts = {}
    for position, text in zip(df['position'], df['text']):
        position_to_texts.setdefault(position, []).append(text)

    sentences = [f'In the {ordinal} image: (']
    for position, texts in position_to_texts.items():
        text_string = ", ".join(f'"{text}"' for text in texts)
        sentences.append(f"The texts {text_string} are located at the {position} of the {ordinal} image.")
    return " ".join(sentences) + ' )'

def get_ocr_result(img_path: str, img_index: int = 0) -> str:
    """Run OCR on one image file and describe the confident text it contains.

    Args:
        img_path: Path of the image file; used both for OCR and to read the
            image dimensions.
        img_index: Zero-based index of the image; it is converted to a
            1-based number for the generated sentence.

    Returns:
        The sentence built by ``format_for_text_to_image_condensed``, which
        is an empty string when no row survives the score filter.
    """
    image_number = img_index + 1
    ocr_rows = ocr_with_paddle(img_path)
    ocr_df = pd.DataFrame(ocr_rows, columns=['box', 'text', 'score'])
    # Open the image only to read its size, and close the file handle
    # right away instead of leaving it to the garbage collector.
    with Image.open(img_path) as img:
        image_width, image_height = img.size
    confident_df = process_dataframe(ocr_df, image_width, image_height)
    return format_for_text_to_image_condensed(confident_df, image_number=image_number)