Spaces:

Miraj3
/

Object_Detector_with_Audio

Sleeping

App Files Files Community

Object_Detector_with_Audio / app.py

Miraj3

Update app.py

2041b7b verified 20 days ago

raw

history blame

3.38 kB

	import gradio as gr
	from PIL import Image, ImageDraw, ImageFont
	import scipy.io.wavfile as wavfile
	import numpy as np
	import tempfile
	from transformers import pipeline
	from collections import Counter
	import inflect

	# Load models
	# Both pipelines are built once at module level and reused by every request.
	TTS_MODEL_ID = "kakao-enterprise/vits-ljs"
	DETECTION_MODEL_ID = "facebook/detr-resnet-50"

	narrator = pipeline(task="text-to-speech", model=TTS_MODEL_ID)
	obj_detector = pipeline(task="object-detection", model=DETECTION_MODEL_ID)

	# Generate audio and save as temporary .wav
	def generate_audio(text):
	    """Narrate ``text`` and return the path of a temporary WAV file.

	    The module-level ``narrator`` pipeline is called with ``text`` and its
	    ``"audio"`` / ``"sampling_rate"`` entries are written out as 16-bit PCM
	    using ``scipy.io.wavfile``.

	    Args:
	        text: The sentence to synthesize.

	    Returns:
	        Path of the generated ``.wav`` file. The file is created with
	        ``delete=False``, so it is not removed automatically.
	    """
	    narrated = narrator(text)
	    sampling_rate = narrated["sampling_rate"]

	    # wavfile.write interprets a 2-D array as (samples, channels). A waveform
	    # carrying a leading singleton axis, e.g. shape (1, N), would therefore be
	    # written as a single sample with N channels, so drop singleton axes.
	    # NOTE(review): assumes the narrator output is mono -- confirm.
	    audio = np.atleast_1d(np.squeeze(np.asarray(narrated["audio"])))

	    if audio.dtype != np.int16:
	        if np.issubdtype(audio.dtype, np.floating):
	            # Saturate out-of-range samples; without this the int16 cast
	            # wraps around and produces loud clicks.
	            audio = np.clip(audio, -1.0, 1.0)
	        audio = (audio * 32767).astype(np.int16)

	    # Reserve a unique path, then close the handle before writing: opening
	    # a NamedTemporaryFile by name while it is still open is not portable.
	    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
	        wav_path = f.name
	    wavfile.write(wav_path, int(sampling_rate), audio)
	    return wav_path

	# Turn detections into human-friendly text
	def read_objects(detections):
	    """Summarize detections as one English sentence.

	    Args:
	        detections: Iterable of dicts, each with a ``'label'`` key.

	    Returns:
	        A sentence listing how many of each label were found, with labels
	        pluralized by ``inflect`` according to their count, or a fixed
	        message when ``detections`` is empty or ``None``.
	    """
	    if not detections:
	        return "No objects were detected in this picture."

	    # Counter keeps first-seen order, so labels are listed in detection order.
	    tally = Counter(det['label'] for det in detections)

	    engine = inflect.engine()
	    parts = [f"{n} {engine.plural(name, n)}" for name, n in tally.items()]

	    # Join as "a, b and c"; a single entry is used unchanged.
	    *leading, final = parts
	    if leading:
	        summary = ", ".join(leading) + " and " + final
	    else:
	        summary = final

	    return f"This picture contains {summary}."

	# Annotate the image with bounding boxes and labels
	def draw_detected_objects(image, detections, score_threshold=0.5):
	    """Return a copy of ``image`` with a red box and caption per detection.

	    Args:
	        image: PIL image to annotate; it is copied, not modified.
	        detections: Iterable of dicts with ``"score"``, ``"label"`` and a
	            ``"box"`` dict holding ``xmin``/``ymin``/``xmax``/``ymax``.
	            ``None`` is treated as no detections.
	        score_threshold: Detections scoring below this value are skipped.

	    Returns:
	        The annotated copy of ``image``.
	    """
	    annotated_image = image.copy()
	    draw = ImageDraw.Draw(annotated_image)

	    try:
	        font = ImageFont.truetype("arial.ttf", size=14)
	    except OSError:
	        # truetype raises OSError when the font file cannot be opened;
	        # fall back to Pillow's built-in font.
	        font = ImageFont.load_default()

	    for item in detections or []:
	        score = item["score"]
	        if score < score_threshold:
	            continue

	        box = item["box"]
	        label = item["label"]
	        text = f"{label} ({score:.2f})"

	        # getbbox is measured from the drawing anchor, so the right/bottom
	        # values are the full extent the glyphs occupy when drawn at (x, y).
	        # Sizing the background from them keeps the text from spilling
	        # below the red strip.
	        text_bbox = font.getbbox(text)
	        text_width = text_bbox[2]
	        text_height = text_bbox[3]

	        draw.rectangle(
	            [(box["xmin"], box["ymin"]), (box["xmax"], box["ymax"])],
	            outline="red", width=3
	        )

	        # Put the caption above the box; when that would fall off the top
	        # of the image, place it just inside the box instead.
	        label_top = box["ymin"] - text_height
	        if label_top < 0:
	            label_top = box["ymin"]

	        draw.rectangle(
	            [(box["xmin"], label_top),
	             (box["xmin"] + text_width, label_top + text_height)],
	            fill="red"
	        )
	        draw.text(
	            (box["xmin"], label_top),
	            text, fill="white", font=font
	        )

	    return annotated_image

	# Gradio function
	def detect_image(image):
	    """Gradio callback: detect objects, annotate the image, narrate the result.

	    Args:
	        image: The uploaded PIL image, or ``None`` when nothing was uploaded.

	    Returns:
	        ``(annotated_image, audio_path)`` on success, or ``(None, None)``
	        when there is no input image or any processing step fails.
	    """
	    import traceback

	    if image is None:
	        # Submitting without an upload: nothing to process, so skip the
	        # models instead of relying on the catch-all handler below.
	        return None, None

	    try:
	        output = obj_detector(image)
	        processed_image = draw_detected_objects(image, output)
	        natural_text = read_objects(output)
	        processed_audio = generate_audio(natural_text)
	        return processed_image, processed_audio
	    except Exception as e:
	        # Boundary of the UI callback: report the failure and return empty
	        # outputs rather than propagating. The traceback is printed as well
	        # because the message alone rarely identifies the failing step.
	        print("❌ Error:", e)
	        traceback.print_exc()
	        return None, None

	# Launch Gradio app
	# One PIL image in; the annotated image and the narration audio out.
	image_input = gr.Image(label="Upload an Image", type="pil")
	annotated_output = gr.Image(label="Image with Detected Objects", type="pil")
	audio_output = gr.Audio(label="Audio Description")

	demo = gr.Interface(
	    fn=detect_image,
	    inputs=[image_input],
	    outputs=[annotated_output, audio_output],
	    title="@GenAI Project 7: Object Detector with Audio",
	    description="This app detects objects in images, highlights them, and generates a natural language audio description.",
	)

	demo.launch()