import gradio as gr
from PIL import Image, ImageDraw, ImageFont
import scipy.io.wavfile as wavfile
import numpy as np
import tempfile
from transformers import pipeline
from collections import Counter
import inflect

# Load models
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
obj_detector = pipeline("object-detection", model="facebook/detr-resnet-50")

# Generate audio and save it as a temporary .wav file
def generate_audio(text):
    narrated = narrator(text)
    audio = np.squeeze(narrated["audio"])  # the pipeline may return a (1, N) array
    sampling_rate = narrated["sampling_rate"]
    # Convert float audio in [-1, 1] to 16-bit PCM for the .wav container
    if audio.dtype != np.int16:
        audio = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        wavfile.write(f.name, int(sampling_rate), audio)
    return f.name
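
# Example (a sketch, not executed here): the TTS pipeline returns a dict such as
#   {"audio": float32 ndarray roughly in [-1, 1], "sampling_rate": 22050}
# so generate_audio("Two cats and a dog") writes a 16-bit PCM file and returns
# a temporary path such as /tmp/tmpXXXXXXXX.wav.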

# Turn detections into a human-friendly sentence
def read_objects(detections):
    if not detections:
        return "No objects were detected in this picture."
    labels = [det["label"] for det in detections]
    label_counts = Counter(labels)
    p = inflect.engine()
    phrases = []
    for label, count in label_counts.items():
        word = p.plural(label, count)  # pluralizes only when count > 1
        phrases.append(f"{count} {word}")
    if len(phrases) == 1:
        result = phrases[0]
    else:
        result = ", ".join(phrases[:-1]) + " and " + phrases[-1]
    return f"This picture contains {result}."

# Annotate the image with bounding boxes and labels
def draw_detected_objects(image, detections, score_threshold=0.5):
    annotated_image = image.copy()
    draw = ImageDraw.Draw(annotated_image)
    try:
        font = ImageFont.truetype("arial.ttf", size=14)
    except OSError:  # font not installed; fall back to PIL's built-in font
        font = ImageFont.load_default()
    for item in detections:
        score = item["score"]
        if score < score_threshold:
            continue
        box = item["box"]
        label = item["label"]
        text = f"{label} ({score:.2f})"
        text_bbox = font.getbbox(text)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]
        # Bounding box around the detected object
        draw.rectangle(
            [(box["xmin"], box["ymin"]), (box["xmax"], box["ymax"])],
            outline="red", width=3
        )
        # Filled background strip so the label stays readable
        draw.rectangle(
            [(box["xmin"], box["ymin"] - text_height),
             (box["xmin"] + text_width, box["ymin"])],
            fill="red"
        )
        draw.text(
            (box["xmin"], box["ymin"] - text_height),
            text, fill="white", font=font
        )
    return annotated_image
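
# Each detection dict from the object-detection pipeline has the form
#   {"score": 0.99, "label": "cat",
#    "box": {"xmin": ..., "ymin": ..., "xmax": ..., "ymax": ...}}
# with pixel coordinates; only detections at or above score_threshold are drawn.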

def detect_image(image):
    try:
        output = obj_detector(image)
        print("Detection output:", output)
        natural_text = read_objects(output)
        print("Generated text:", natural_text)
        audio_path = generate_audio(natural_text)
        print("Audio path:", audio_path)
        processed_image = draw_detected_objects(image, output)
        return processed_image.convert("RGB"), audio_path  # force RGB for display
    except Exception as e:
        print("Error:", str(e))
        return Image.new("RGB", (512, 512), color="gray"), None

# Launch the Gradio app
demo = gr.Interface(
    fn=detect_image,
    inputs=[gr.Image(label="Upload an Image", type="pil")],
    outputs=[
        gr.Image(label="Image with Detected Objects", type="pil"),
        gr.Audio(label="Audio Description")
    ],
    title="@GenAI Project 7: Object Detector with Audio",
    description="This app detects objects in images, highlights them, and generates a natural-language audio description."
)

demo.launch(share=True)
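
# Note: share=True creates a temporary public gradio.live link when the app is
# run locally; on Hugging Face Spaces the Space itself serves the app, so the
# flag is unnecessary (recent Gradio versions warn and ignore it there).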