Spaces:
Sleeping
Sleeping
File size: 3,584 Bytes
00e82af |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
import gradio as gr  # builds and launches the web UI at the bottom of this file
from PIL import Image, ImageDraw, ImageFont
import scipy.io.wavfile as wavfile
import numpy as np
from transformers import pipeline
from collections import Counter
import inflect
# Both pipelines are loaded by model id. To load from a local snapshot
# directory instead, pass its path as ``model``, e.g.
#   ../Models/models--kakao-enterprise--vits-ljs/snapshots/3bcb8321394f671bd948ebf0d086d694dda95464
#   ../Models/models--facebook--detr-resnet-50/snapshots/1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b

# Text-to-speech pipeline used to narrate the detection summary.
narrator = pipeline(task="text-to-speech", model="kakao-enterprise/vits-ljs")

# Object-detection pipeline run on each submitted image.
obj_detector = pipeline(task="object-detection", model="facebook/detr-resnet-50")
def generate_audio(text, output_path="finetuned_output.wav"):
    """Narrate *text* with the module-level ``narrator`` and save it as a WAV file.

    Args:
        text: Text to be spoken.
        output_path: Where the int16 WAV file is written.

    Returns:
        ``output_path``, so the caller can pass it straight on.
    """
    narrated = narrator(text)
    sampling_rate = narrated["sampling_rate"]
    audio = np.asarray(narrated["audio"])

    # The pipeline may return a batch-shaped array such as (1, n_samples).
    # scipy interprets 2-D data as (n_samples, n_channels), so a leading
    # batch axis would be written as one sample spread over n channels.
    while audio.ndim > 1 and audio.shape[0] == 1:
        audio = audio[0]

    if np.issubdtype(audio.dtype, np.floating):
        # Clip first so samples outside [-1, 1] saturate instead of wrapping
        # around when cast to int16.
        audio = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    elif audio.dtype != np.int16:
        # Presumably already integer sample values: saturate to the int16
        # range rather than rescaling them.
        limits = np.iinfo(np.int16)
        audio = np.clip(audio, limits.min, limits.max).astype(np.int16)

    wavfile.write(output_path, sampling_rate, audio)
    return output_path
def read_objects(detections: list[dict]) -> str:
    """Summarise detector output as a single sentence.

    Every entry of *detections* is read through its ``'label'`` key. Labels
    are counted in first-seen order and each one is pluralised by ``inflect``
    according to its count.

    Returns:
        A sentence listing the counted labels, or a fixed message when
        *detections* is empty.
    """
    if not detections:
        return "No objects were detected in this picture."

    tally = Counter(det['label'] for det in detections)
    engine = inflect.engine()
    parts = [f"{n} {engine.plural(name, n)}" for name, n in tally.items()]

    # Everything but the last item is comma-separated; the last is joined
    # with "and".
    *leading, final = parts
    if leading:
        summary = ", ".join(leading) + " and " + final
    else:
        summary = final
    return f"This picture contains {summary}."
def draw_detected_objects(image, detections, score_threshold=0.5):
    """Return a copy of *image* with a labelled red box around each detection.

    Args:
        image: Image to annotate; it is copied, the original is not drawn on.
        detections: Iterable of dicts with ``"score"``, ``"label"`` and a
            ``"box"`` dict holding ``xmin``/``ymin``/``xmax``/``ymax``.
        score_threshold: Detections scoring below this value are skipped.

    Returns:
        The annotated copy.
    """
    annotated_image = image.copy()
    draw = ImageDraw.Draw(annotated_image)

    try:
        font = ImageFont.truetype("arial.ttf", size=14)
    except OSError:
        # The font file could not be read (e.g. Arial is not installed):
        # fall back to Pillow's built-in font.
        font = ImageFont.load_default()

    for item in detections:
        score = item["score"]
        if score < score_threshold:
            continue

        box = item["box"]
        label = item["label"]
        text = f"{label} ({score:.2f})"

        left, top, right, bottom = font.getbbox(text)
        text_width = right - left
        text_height = bottom - top

        # The label strip normally sits just above the box. Clamp it to the
        # top of the image so it stays visible for boxes touching that edge.
        label_top = max(box["ymin"] - text_height, 0)

        draw.rectangle(
            [(box["xmin"], box["ymin"]), (box["xmax"], box["ymax"])],
            outline="red", width=3
        )
        draw.rectangle(
            [(box["xmin"], label_top),
             (box["xmin"] + text_width, label_top + text_height)],
            fill="red"
        )
        draw.text(
            (box["xmin"], label_top),
            text, fill="white", font=font
        )
    return annotated_image
def detect_image(image):
    """Detect objects in *image* and return the annotated image and narration.

    Args:
        image: The image supplied by the Gradio input, or ``None`` when
            nothing was selected.

    Returns:
        A ``(annotated_image, audio_path)`` tuple: the image with detections
        drawn on it and the path of the WAV file describing them.

    Raises:
        gr.Error: If no image was supplied.
    """
    if image is None:
        # Submitting the form without choosing an image hands over None;
        # report that to the user instead of failing inside the detector.
        raise gr.Error("Please select an image first.")

    detections = obj_detector(image)
    annotated_image = draw_detected_objects(image, detections)
    audio_path = generate_audio(read_objects(detections))
    return annotated_image, audio_path
# Close any Gradio interfaces that are already running before starting this one.
gr.close_all()

# One image in; the annotated image and its spoken description out.
demo = gr.Interface(
    fn=detect_image,
    inputs=[gr.Image(label="Select Image", type="pil")],
    outputs=[
        gr.Image(label="Processed Image", type="pil"),
        gr.Audio(label="Generated Audio"),
    ],
    title="@GenAI Project 7: Object Detector with Audio",
    description="THIS APPLICATION IS USED TO DETECT, HIGHLIGHT THE IMAGE AND ALSO GIVES AUDIO DESCRIPTION.",
)
demo.launch()
|