import gradio as gr
from PIL import Image, ImageDraw, ImageFont
import scipy.io.wavfile as wavfile
import numpy as np
import tempfile
from transformers import pipeline
from collections import Counter
import inflect

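# Hugging Face pipelines: VITS for text-to-speech and DETR (ResNet-50 backbone)
# for object detection. Both models are downloaded from the Hub on first run.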
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
obj_detector = pipeline("object-detection", model="facebook/detr-resnet-50")

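# Narrate the description text and save it as a temporary WAV file.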
def generate_audio(text):
    narrated = narrator(text)
    audio = narrated["audio"]
    sampling_rate = narrated["sampling_rate"]

    # The TTS pipeline may return a 2-D (1, n_samples) float array;
    # flatten it and convert to 16-bit PCM before writing the WAV file.
    audio = np.squeeze(audio)
    if audio.dtype != np.int16:
        audio = (audio * 32767).astype(np.int16)

    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        wavfile.write(f.name, int(sampling_rate), audio)
    return f.name

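# Turn the raw detections into a readable sentence,
# e.g. "This picture contains 2 cats and 1 dog."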
def read_objects(detections):
    if not detections:
        return "No objects were detected in this picture."

    labels = [det["label"] for det in detections]
    label_counts = Counter(labels)

    p = inflect.engine()
    phrases = []
    for label, count in label_counts.items():
        word = p.plural(label, count)
        phrases.append(f"{count} {word}")

    if len(phrases) == 1:
        result = phrases[0]
    else:
        result = ", ".join(phrases[:-1]) + " and " + phrases[-1]

    return f"This picture contains {result}."

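# Draw bounding boxes and labelled score tags on a copy of the input image.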
def draw_detected_objects(image, detections, score_threshold=0.5):
    annotated_image = image.copy()
    draw = ImageDraw.Draw(annotated_image)

    try:
        font = ImageFont.truetype("arial.ttf", size=14)
    except OSError:
        # Fall back to PIL's built-in bitmap font if arial.ttf is unavailable
        font = ImageFont.load_default()

    for item in detections:
        score = item["score"]
        if score < score_threshold:
            continue

        box = item["box"]
        label = item["label"]
        text = f"{label} ({score:.2f})"

        # Measure the label so its background strip fits snugly above the box
        text_bbox = font.getbbox(text)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]

        # Bounding box, label background, and label text
        draw.rectangle(
            [(box["xmin"], box["ymin"]), (box["xmax"], box["ymax"])],
            outline="red", width=3
        )
        draw.rectangle(
            [(box["xmin"], box["ymin"] - text_height),
             (box["xmin"] + text_width, box["ymin"])],
            fill="red"
        )
        draw.text(
            (box["xmin"], box["ymin"] - text_height),
            text, fill="white", font=font
        )

    return annotated_image

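# Main Gradio callback: detect objects, describe them in text and audio,
# and return the annotated image alongside the narration.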
def detect_image(image):
    try:
        output = obj_detector(image)
        print("Detection output:", output)
        natural_text = read_objects(output)
        print("Generated text:", natural_text)
        audio_path = generate_audio(natural_text)
        print("Audio path:", audio_path)
        processed_image = draw_detected_objects(image, output)
        return processed_image.convert("RGB"), audio_path
    except Exception as e:
        # On any failure, return a plain gray placeholder image and no audio
        print("Error:", str(e))
        return Image.new("RGB", (512, 512), color="gray"), None

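# Gradio UI: image in, annotated image and spoken description out.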
demo = gr.Interface(
    fn=detect_image,
    inputs=[gr.Image(label="Upload an Image", type="pil")],
    outputs=[
        gr.Image(label="Image with Detected Objects", type="pil"),
        gr.Audio(label="Audio Description")
    ],
    title="@GenAI Project 7: Object Detector with Audio",
    description="This app detects objects in images, highlights them, and generates a natural language audio description."
)

demo.launch(share=True)