import gradio as gr
from PIL import Image, ImageDraw, ImageFont
import scipy.io.wavfile as wavfile
import numpy as np
from transformers import pipeline
from collections import Counter
import inflect

# Local snapshot paths, if you prefer to load the models from disk:
# tts_model_path = ("../Models/models--kakao-enterprise--vits-ljs/snapshots/"
#                   "3bcb8321394f671bd948ebf0d086d694dda95464")
# narrator = pipeline("text-to-speech", model=tts_model_path)
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")

# obj_detector_path = ("../Models/models--facebook--detr-resnet-50/snapshots/"
#                      "1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b")
# obj_detector = pipeline("object-detection", model=obj_detector_path)
obj_detector = pipeline("object-detection", model="facebook/detr-resnet-50")


def generate_audio(text, output_path="finetuned_output.wav"):
    """Narrate `text` with the TTS pipeline and write it to a WAV file."""
    narrated = narrator(text)
    audio = narrated["audio"]
    sampling_rate = narrated["sampling_rate"]

    # The pipeline may return a (1, n) float array; flatten it to mono.
    audio = np.squeeze(audio)

    # Convert float audio in [-1, 1] to int16 PCM if needed.
    if audio.dtype != np.int16:
        audio = (audio * 32767).astype(np.int16)

    wavfile.write(output_path, sampling_rate, audio)
    return output_path


def read_objects(detections: list[dict]) -> str:
    """Turn the detector output into a natural-language sentence."""
    if not detections:
        return "No objects were detected in this picture."

    # Count how many times each label appears, e.g. {"person": 2, "dog": 1}.
    labels = [det["label"] for det in detections]
    label_counts = Counter(labels)

    # Pluralize each label according to its count ("2 persons", "1 dog").
    p = inflect.engine()
    phrases = []
    for label, count in label_counts.items():
        word = p.plural(label, count)
        phrases.append(f"{count} {word}")

    if len(phrases) == 1:
        result = phrases[0]
    else:
        result = ", ".join(phrases[:-1]) + " and " + phrases[-1]

    return f"This picture contains {result}."


def draw_detected_objects(image, detections, score_threshold=0.5):
    """Draw a labeled bounding box for every detection above the threshold."""
    annotated_image = image.copy()
    draw = ImageDraw.Draw(annotated_image)

    try:
        font = ImageFont.truetype("arial.ttf", size=14)
    except OSError:
        font = ImageFont.load_default()

    for item in detections:
        score = item["score"]
        if score < score_threshold:
            continue

        box = item["box"]
        label = item["label"]
        text = f"{label} ({score:.2f})"

        # Measure the label so its background strip fits the text exactly.
        text_bbox = font.getbbox(text)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]

        # Bounding box around the detected object.
        draw.rectangle(
            [(box["xmin"], box["ymin"]), (box["xmax"], box["ymax"])],
            outline="red", width=3
        )
        # Filled strip above the box as a background for the label text.
        draw.rectangle(
            [(box["xmin"], box["ymin"] - text_height),
             (box["xmin"] + text_width, box["ymin"])],
            fill="red"
        )
        draw.text(
            (box["xmin"], box["ymin"] - text_height),
            text, fill="white", font=font
        )

    return annotated_image


def detect_image(image):
    """Gradio callback: detect objects, annotate the image, narrate the result."""
    detections = obj_detector(image)
    processed_image = draw_detected_objects(image, detections)
    natural_text = read_objects(detections)
    processed_audio = generate_audio(natural_text)
    return processed_image, processed_audio


gr.close_all()

demo = gr.Interface(
    fn=detect_image,
    inputs=[gr.Image(label="Select Image", type="pil")],
    outputs=[gr.Image(label="Processed Image", type="pil"),
             gr.Audio(label="Generated Audio")],
    title="@GenAI Project 7: Object Detector with Audio",
    description="Detects objects in an image, highlights them with bounding boxes, "
                "and generates a spoken description of what was found."
)
demo.launch()