Spaces:

Miraj3
/

Object_Detector_with_Audio

Sleeping

App Files Files Community

Miraj3 committed 4 days ago

Commit

00e82af

verified ·

1 Parent(s): 5782799

Create app.py

Browse files

Files changed (1) hide show

app.py +112 -0

app.py ADDED Viewed

	@@ -0,0 +1,112 @@

+import gradio as gr  # builds the web UI (Interface, Image, Audio) at the bottom of this file
+from PIL import Image, ImageDraw, ImageFont
+import scipy.io.wavfile as wavfile
+import numpy as np
+from transformers import pipeline
+from collections import Counter
+import inflect
+# Models are loaded once at import time and shared by the functions below.
+# The commented-out *_path variants load the same models from local snapshot
+# directories instead of by hub id (presumably for offline use -- confirm).
+#
+# Text-to-speech pipeline used by generate_audio().
+# tts_model_path = ("../Models/models--kakao-enterprise--vits-ljs/snapshots/"
+#                   "3bcb8321394f671bd948ebf0d086d694dda95464")
+# narrator = pipeline("text-to-speech", model=tts_model_path)
+narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
+# Object-detection pipeline used by detect_image().
+# obj_detector_path = ("../Models/models--facebook--detr-resnet-50/snapshots/"
+#                      "1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b")
+# obj_detector = pipeline("object-detection", model=obj_detector_path)
+obj_detector = pipeline("object-detection", model="facebook/detr-resnet-50")
+def generate_audio(text, output_path="finetuned_output.wav"):
+    narrated = narrator(text)
+    audio = narrated["audio"]
+    sampling_rate = narrated["sampling_rate"]
+    # Convert to int16 if needed
+    if audio.dtype != np.int16:
+        audio = (audio * 32767).astype(np.int16)
+    wavfile.write(output_path, sampling_rate, audio)
+    return output_path
+def read_objects(detections: list[dict]) -> str:
+    if not detections:
+        return "No objects were detected in this picture."
+    labels = [det['label'] for det in detections]
+    label_counts = Counter(labels)
+    p = inflect.engine()
+    phrases = []
+    for label, count in label_counts.items():
+        word = p.plural(label, count)
+        phrases.append(f"{count} {word}")
+    if len(phrases) == 1:
+        result = phrases[0]
+    else:
+        result = ", ".join(phrases[:-1]) + " and " + phrases[-1]
+    return f"This picture contains {result}."
+def draw_detected_objects(image, detections, score_threshold=0.5):
+    annotated_image = image.copy()
+    draw = ImageDraw.Draw(annotated_image)
+    try:
+        font = ImageFont.truetype("arial.ttf", size=14)
+    except:
+        font = ImageFont.load_default()
+    for item in detections:
+        score = item["score"]
+        if score < score_threshold:
+            continue
+        box = item["box"]
+        label = item["label"]
+        text = f"{label} ({score:.2f})"
+        text_bbox = font.getbbox(text)
+        text_width = text_bbox[2] - text_bbox[0]
+        text_height = text_bbox[3] - text_bbox[1]
+        draw.rectangle(
+            [(box["xmin"], box["ymin"]), (box["xmax"], box["ymax"])],
+            outline="red", width=3
+        )
+        draw.rectangle(
+            [(box["xmin"], box["ymin"] - text_height),
+             (box["xmin"] + text_width, box["ymin"])],
+            fill="red"
+        )
+        draw.text(
+            (box["xmin"], box["ymin"] - text_height),
+            text, fill="white", font=font
+        )
+    return annotated_image
+def detect_image(image):
+    raw_image = image
+    output = obj_detector(raw_image)
+    processed_image = draw_detected_objects(raw_image, output)
+    natural_text = read_objects(output)
+    processed_audio = generate_audio(natural_text)
+    return processed_image, processed_audio
+gr.close_all()
+demo = gr.Interface(fn=detect_image,
+                    inputs=[gr.Image(label="Select Image", type="pil")],
+                    outputs=[gr.Image(label="Processed Image", type="pil"), gr.Audio(label="Generated Audio")],
+                    title="@GenAI Project 7: Object Detector with Audio",
+                    description="THIS APPLICATION IS USED TO DETECT, HIGHLIGHT THE IMAGE AND ALSO GIVES AUDIO DESCRIPTION.")
+demo.launch()