import gradio as gr
from PIL import Image, ImageDraw, ImageFont
import scipy.io.wavfile as wavfile
import numpy as np
import tempfile
from transformers import pipeline
from collections import Counter
import inflect
# Load models
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
obj_detector = pipeline("object-detection", model="facebook/detr-resnet-50")
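# Note: both pipelines run on the CPU by default; if a GPU is available,
# passing device=0 to pipeline() speeds up detection and synthesis considerably.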
# Generate audio and save as temporary .wav
def generate_audio(text):
    narrated = narrator(text)
    # The TTS pipeline typically returns the waveform with a leading batch
    # dimension; np.squeeze flattens it to a 1-D mono signal for wavfile.
    audio = np.squeeze(narrated["audio"])
    sampling_rate = narrated["sampling_rate"]
    # Convert a float waveform in [-1, 1] to 16-bit PCM for the .wav file.
    if audio.dtype != np.int16:
        audio = (audio * 32767).astype(np.int16)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        wavfile.write(f.name, int(sampling_rate), audio)
    return f.name
# Turn detections into human-friendly text
def read_objects(detections):
    if not detections:
        return "No objects were detected in this picture."
    labels = [det["label"] for det in detections]
    label_counts = Counter(labels)
    p = inflect.engine()
    phrases = []
    for label, count in label_counts.items():
        word = p.plural(label, count)
        phrases.append(f"{count} {word}")
    if len(phrases) == 1:
        result = phrases[0]
    else:
        result = ", ".join(phrases[:-1]) + " and " + phrases[-1]
    return f"This picture contains {result}."
# Annotate the image with bounding boxes and labels
def draw_detected_objects(image, detections, score_threshold=0.5):
    annotated_image = image.copy()
    draw = ImageDraw.Draw(annotated_image)
    try:
        font = ImageFont.truetype("arial.ttf", size=14)
    except OSError:
        # Fall back to PIL's built-in font when Arial is unavailable.
        font = ImageFont.load_default()
    for item in detections:
        score = item["score"]
        if score < score_threshold:
            continue
        box = item["box"]
        label = item["label"]
        text = f"{label} ({score:.2f})"
        text_bbox = font.getbbox(text)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]
        # Red bounding box around the detected object.
        draw.rectangle(
            [(box["xmin"], box["ymin"]), (box["xmax"], box["ymax"])],
            outline="red", width=3
        )
        # Filled strip above the box so the label text stays readable.
        draw.rectangle(
            [(box["xmin"], box["ymin"] - text_height),
             (box["xmin"] + text_width, box["ymin"])],
            fill="red"
        )
        draw.text(
            (box["xmin"], box["ymin"] - text_height),
            text, fill="white", font=font
        )
    return annotated_image
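# The object-detection pipeline returns a list of dicts shaped like
# {"score": 0.99, "label": "cat", "box": {"xmin": ..., "ymin": ..., "xmax": ..., "ymax": ...}},
# with box coordinates in absolute pixels; detect_image wires the helpers together.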
def detect_image(image):
    try:
        output = obj_detector(image)
        print("Detection output:", output)
        natural_text = read_objects(output)
        print("Generated text:", natural_text)
        audio_path = generate_audio(natural_text)
        print("Audio path:", audio_path)
        processed_image = draw_detected_objects(image, output)
        return processed_image.convert("RGB"), audio_path  # force RGB for Gradio display
    except Exception as e:
        print("Error:", str(e))
        return Image.new("RGB", (512, 512), color="gray"), None
# Launch Gradio app
demo = gr.Interface(
    fn=detect_image,
    inputs=[gr.Image(label="Upload an Image", type="pil")],
    outputs=[
        gr.Image(label="Image with Detected Objects", type="pil"),
        gr.Audio(label="Audio Description")
    ],
    title="@GenAI Project 7: Object Detector with Audio",
    description="This app detects objects in images, highlights them, and generates a natural-language audio description."
)

demo.launch(share=True)
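# share=True creates a temporary public link when the script runs locally;
# on Hugging Face Spaces the app is already hosted, so demo.launch() alone suffices.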