import gradio as gr
from PIL import Image, ImageDraw, ImageFont
import scipy.io.wavfile as wavfile
import numpy as np
import tempfile
from transformers import pipeline
from collections import Counter
import inflect

# Load models
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
obj_detector = pipeline("object-detection", model="facebook/detr-resnet-50")


# Generate audio and save it as a temporary .wav file
def generate_audio(text):
    narrated = narrator(text)
    # The TTS pipeline returns float audio, sometimes shaped (1, n_samples);
    # flatten it so scipy writes a mono track instead of an n-channel file.
    audio = np.squeeze(narrated["audio"])
    sampling_rate = narrated["sampling_rate"]
    # Convert a float waveform in [-1, 1] to 16-bit PCM for WAV output
    if audio.dtype != np.int16:
        audio = (audio * 32767).astype(np.int16)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        wavfile.write(f.name, int(sampling_rate), audio)
    return f.name


# Turn detections into human-friendly text
def read_objects(detections):
    if not detections:
        return "No objects were detected in this picture."
    labels = [det["label"] for det in detections]
    label_counts = Counter(labels)
    p = inflect.engine()
    phrases = []
    for label, count in label_counts.items():
        # inflect pluralizes the label only when count != 1
        word = p.plural(label, count)
        phrases.append(f"{count} {word}")
    if len(phrases) == 1:
        result = phrases[0]
    else:
        result = ", ".join(phrases[:-1]) + " and " + phrases[-1]
    return f"This picture contains {result}."


# Annotate the image with bounding boxes and labels
def draw_detected_objects(image, detections, score_threshold=0.5):
    annotated_image = image.copy()
    draw = ImageDraw.Draw(annotated_image)
    try:
        font = ImageFont.truetype("arial.ttf", size=14)
    except OSError:
        # Fall back to PIL's built-in font if Arial is not installed
        font = ImageFont.load_default()

    for item in detections:
        score = item["score"]
        if score < score_threshold:
            continue

        box = item["box"]
        label = item["label"]
        text = f"{label} ({score:.2f})"
        text_bbox = font.getbbox(text)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]
        # Keep the label on-screen when the box touches the top edge
        text_y = max(box["ymin"] - text_height, 0)

        draw.rectangle(
            [(box["xmin"], box["ymin"]), (box["xmax"], box["ymax"])],
            outline="red",
            width=3,
        )
        draw.rectangle(
            [(box["xmin"], text_y), (box["xmin"] + text_width, text_y + text_height)],
            fill="red",
        )
        draw.text((box["xmin"], text_y), text, fill="white", font=font)

    return annotated_image


def detect_image(image):
    try:
        output = obj_detector(image)
        print("✅ Detection output:", output)

        natural_text = read_objects(output)
        print("📝 Generated text:", natural_text)

        audio_path = generate_audio(natural_text)
        print("🔊 Audio path:", audio_path)

        processed_image = draw_detected_objects(image, output)
        return processed_image.convert("RGB"), audio_path  # 👈 Force RGB
    except Exception as e:
        print("❌ Error:", str(e))
        return Image.new("RGB", (512, 512), color="gray"), None


# Launch Gradio app
demo = gr.Interface(
    fn=detect_image,
    inputs=[gr.Image(label="Upload an Image", type="pil")],
    outputs=[
        gr.Image(label="Image with Detected Objects", type="pil"),
        gr.Audio(label="Audio Description"),
    ],
    title="@GenAI Project 7: Object Detector with Audio",
    description="This app detects objects in images, highlights them, and generates a natural-language audio description.",
)

demo.launch(share=True)
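
# Optional: a quick smoke test of the pipeline without the Gradio UI.
# This is a minimal sketch, not part of the app: it assumes a local image
# file named "sample.jpg" (a hypothetical path; use any image you have).
# Uncomment to run after the models above have loaded:
#
# test_image = Image.open("sample.jpg")
# annotated, wav_path = detect_image(test_image)
# annotated.save("annotated_sample.jpg")
# print("Annotated image saved; audio written to:", wav_path)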