import gradio as gr
from PIL import Image, ImageDraw, ImageFont
import scipy.io.wavfile as wavfile
import numpy as np
from transformers import pipeline
from collections import Counter
import inflect
# Optionally load the models from local snapshot paths instead of the Hub:
# tts_model_path = ("../Models/models--kakao-enterprise--vits-ljs/snapshots/"
#                   "3bcb8321394f671bd948ebf0d086d694dda95464")
# narrator = pipeline("text-to-speech", model=tts_model_path)
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")

# obj_detector_path = ("../Models/models--facebook--detr-resnet-50/snapshots/"
#                      "1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b")
# obj_detector = pipeline("object-detection", model=obj_detector_path)
obj_detector = pipeline("object-detection", model="facebook/detr-resnet-50")
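
# The object-detection pipeline returns a list of dicts shaped like
# {"score": 0.99, "label": "cat", "box": {"xmin": ..., "ymin": ..., "xmax": ..., "ymax": ...}}
# (the post-processed DETR output as exposed by transformers); the helpers
# below all consume this shape.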

def generate_audio(text, output_path="finetuned_output.wav"):
    """Synthesize `text` to speech and write it to a WAV file."""
    narrated = narrator(text)
    audio = narrated["audio"]
    sampling_rate = narrated["sampling_rate"]
    # The pipeline may return a (1, n) batch; flatten to 1-D so wavfile
    # doesn't misread it as an n-channel, one-frame file.
    audio = np.squeeze(audio)
    # Convert float waveform to int16, clipping to avoid integer overflow.
    if audio.dtype != np.int16:
        audio = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    wavfile.write(output_path, sampling_rate, audio)
    return output_path
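
# Usage sketch (illustrative sentence; writes to the function's default path):
# generate_audio("This picture contains 2 cats and 1 dog.")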

def read_objects(detections: list[dict]) -> str:
    """Summarize detections as a natural-language sentence."""
    if not detections:
        return "No objects were detected in this picture."

    labels = [det["label"] for det in detections]
    label_counts = Counter(labels)

    p = inflect.engine()
    phrases = []
    for label, count in label_counts.items():
        word = p.plural(label, count)  # inflect keeps the singular when count == 1
        phrases.append(f"{count} {word}")

    if len(phrases) == 1:
        result = phrases[0]
    else:
        result = ", ".join(phrases[:-1]) + " and " + phrases[-1]
    return f"This picture contains {result}."

def draw_detected_objects(image, detections, score_threshold=0.5):
    """Return a copy of `image` with boxes and labels drawn for confident detections."""
    annotated_image = image.copy()
    draw = ImageDraw.Draw(annotated_image)

    try:
        font = ImageFont.truetype("arial.ttf", size=14)
    except OSError:
        # Fall back to PIL's built-in font if Arial is not available.
        font = ImageFont.load_default()

    for item in detections:
        score = item["score"]
        if score < score_threshold:
            continue

        box = item["box"]
        label = item["label"]
        text = f"{label} ({score:.2f})"

        # Measure the label so its background strip fits the text exactly.
        text_bbox = font.getbbox(text)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]

        # Bounding box around the detected object.
        draw.rectangle(
            [(box["xmin"], box["ymin"]), (box["xmax"], box["ymax"])],
            outline="red", width=3
        )
        # Filled strip above the box as a background for the label text.
        draw.rectangle(
            [(box["xmin"], box["ymin"] - text_height),
             (box["xmin"] + text_width, box["ymin"])],
            fill="red"
        )
        draw.text(
            (box["xmin"], box["ymin"] - text_height),
            text, fill="white", font=font
        )
    return annotated_image
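
# Standalone sketch, assuming a local test image "sample.jpg" (hypothetical path):
# img = Image.open("sample.jpg")
# draw_detected_objects(img, obj_detector(img), score_threshold=0.7).save("annotated.jpg")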

def detect_image(image):
    """Run detection, then return the annotated image and a narrated summary."""
    output = obj_detector(image)
    processed_image = draw_detected_objects(image, output)
    natural_text = read_objects(output)
    processed_audio = generate_audio(natural_text)
    return processed_image, processed_audio
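
# Quick check without the UI, again assuming a local "sample.jpg" (hypothetical path):
# processed_image, audio_path = detect_image(Image.open("sample.jpg"))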

gr.close_all()
demo = gr.Interface(
    fn=detect_image,
    inputs=[gr.Image(label="Select Image", type="pil")],
    outputs=[gr.Image(label="Processed Image", type="pil"),
             gr.Audio(label="Generated Audio")],
    title="@GenAI Project 7: Object Detector with Audio",
    description="Detects objects in an uploaded image, highlights them with bounding boxes, and generates a spoken description.",
)
demo.launch()