File size: 3,552 Bytes
d85cc90
00e82af
 
 
d85cc90
00e82af
 
 
 
d85cc90
00e82af
 
 
d85cc90
 
00e82af
 
 
 
 
 
 
d85cc90
 
 
00e82af
d85cc90
2041b7b
00e82af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d85cc90
00e82af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d85cc90
0ede05a
 
d85cc90
0ede05a
 
 
 
a077881
d85cc90
0ede05a
 
 
 
d85cc90
a077881
d85cc90
 
 
 
 
2041b7b
 
 
 
 
 
 
a077881
2041b7b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import gradio as gr
from PIL import Image, ImageDraw, ImageFont
import scipy.io.wavfile as wavfile
import numpy as np
import tempfile
from transformers import pipeline
from collections import Counter
import inflect

# Load models once at import time so every request reuses the same
# pipelines (first run downloads the weights from the Hugging Face Hub).
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
obj_detector = pipeline("object-detection", model="facebook/detr-resnet-50")

# Generate audio and save as temporary .wav
def generate_audio(text):
    """Synthesize *text* with the VITS narrator and write it to a temp .wav.

    Parameters
    ----------
    text : str
        Sentence to narrate.

    Returns
    -------
    str
        Path to a temporary 16-bit PCM .wav file (not auto-deleted; the
        caller/Gradio is responsible for it).
    """
    narrated = narrator(text)
    audio = narrated["audio"]
    sampling_rate = narrated["sampling_rate"]

    # The TTS pipeline may return a (1, n_samples) array; flatten to 1-D mono
    # so wavfile.write does not interpret it as an n-channel, 1-frame file.
    audio = np.asarray(audio).squeeze()

    if audio.dtype != np.int16:
        # Clip to [-1, 1] before scaling so float overshoot cannot wrap
        # around when cast to int16 (audible clicks otherwise).
        audio = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)

    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        wavfile.write(f.name, int(sampling_rate), audio)
        return f.name

# Turn detections into human-friendly text
def read_objects(detections):
    """Summarize a detection list as one natural-language sentence.

    Counts how often each label occurs, pluralizes the label names via
    inflect, and joins the counted phrases into "This picture contains ...".
    """
    if not detections:
        return "No objects were detected in this picture."

    engine = inflect.engine()
    counts = Counter(det['label'] for det in detections)

    # e.g. "2 cats", "1 dog" — inflect keeps singular forms when count == 1.
    phrases = [f"{n} {engine.plural(name, n)}" for name, n in counts.items()]

    if len(phrases) > 1:
        summary = ", ".join(phrases[:-1]) + " and " + phrases[-1]
    else:
        summary = phrases[0]

    return f"This picture contains {summary}."

# Annotate the image with bounding boxes and labels
def draw_detected_objects(image, detections, score_threshold=0.5):
    """Return a copy of *image* with a red box and label for each detection.

    Parameters
    ----------
    image : PIL.Image.Image
        Source image; it is never modified.
    detections : list[dict]
        Detection-pipeline items with "score", "label", and a "box" dict
        holding "xmin"/"ymin"/"xmax"/"ymax" pixel coordinates.
    score_threshold : float
        Detections scoring below this value are skipped.

    Returns
    -------
    PIL.Image.Image
        The annotated copy.
    """
    annotated_image = image.copy()
    draw = ImageDraw.Draw(annotated_image)

    try:
        font = ImageFont.truetype("arial.ttf", size=14)
    except OSError:
        # truetype() raises OSError when the font file cannot be found/read
        # (e.g. on non-Windows hosts); fall back to Pillow's built-in font.
        # Was a bare `except:`, which also swallowed KeyboardInterrupt.
        font = ImageFont.load_default()

    for item in detections:
        score = item["score"]
        if score < score_threshold:
            continue

        box = item["box"]
        label = item["label"]
        text = f"{label} ({score:.2f})"

        text_bbox = font.getbbox(text)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]

        # Object bounding box.
        draw.rectangle(
            [(box["xmin"], box["ymin"]), (box["xmax"], box["ymax"])],
            outline="red", width=3
        )

        # Clamp the label strip so it stays inside the image when a box
        # touches the top edge (previously it was drawn off-canvas).
        label_y = max(box["ymin"] - text_height, 0)
        draw.rectangle(
            [(box["xmin"], label_y),
             (box["xmin"] + text_width, label_y + text_height)],
            fill="red"
        )
        draw.text(
            (box["xmin"], label_y),
            text, fill="white", font=font
        )

    return annotated_image

def detect_image(image):
    """Gradio handler: detect objects, narrate them, and annotate the image.

    Returns a (PIL RGB image, wav-file path) pair. On any failure it logs
    the error and falls back to a plain gray image with no audio, so the
    UI never crashes.
    """
    try:
        detections = obj_detector(image)
        print("βœ… Detection output:", detections)

        description = read_objects(detections)
        print("πŸ“ Generated Text:", description)

        audio_file = generate_audio(description)
        print("πŸ”Š Audio Path:", audio_file)

        annotated = draw_detected_objects(image, detections)
        return annotated.convert("RGB"), audio_file  # πŸ‘ˆ Force RGB
    except Exception as e:
        print("❌ Error:", str(e))
        return Image.new("RGB", (512, 512), color="gray"), None




# Launch Gradio app: wires detect_image into a single-image upload UI whose
# outputs are the annotated image and the synthesized narration.
demo = gr.Interface(
    fn=detect_image,
    inputs=[gr.Image(label="Upload an Image", type="pil")],
    outputs=[
        gr.Image(label="Image with Detected Objects", type="pil"),
        gr.Audio(label="Audio Description")
    ],
    title="@GenAI Project 7: Object Detector with Audio",
    description="This app detects objects in images, highlights them, and generates a natural language audio description."
)

# share=True additionally exposes a temporary public gradio.live URL.
demo.launch(share=True)