import gradio as gr
from PIL import Image, ImageDraw, ImageFont
import scipy.io.wavfile as wavfile
import numpy as np
import tempfile
from transformers import pipeline
from collections import Counter
import inflect

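# Hugging Face pipelines: VITS for text-to-speech and DETR (ResNet-50 backbone)
# for object detection. Both models are downloaded from the Hub on first run.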
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
obj_detector = pipeline("object-detection", model="facebook/detr-resnet-50")

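# Narrate the description text and save it as a temporary WAV file.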
def generate_audio(text):
    narrated = narrator(text)
    audio = narrated["audio"]
    sampling_rate = narrated["sampling_rate"]

    # The TTS pipeline may return a 2-D (1, n_samples) float array;
    # flatten it and convert to 16-bit PCM before writing the WAV file.
    audio = np.squeeze(audio)
    if audio.dtype != np.int16:
        audio = (audio * 32767).astype(np.int16)

    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        wavfile.write(f.name, int(sampling_rate), audio)
    return f.name

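# Turn the raw detections into a readable sentence,
# e.g. "This picture contains 2 cats and 1 dog."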
def read_objects(detections):
    if not detections:
        return "No objects were detected in this picture."

    labels = [det["label"] for det in detections]
    label_counts = Counter(labels)

    p = inflect.engine()
    phrases = []
    for label, count in label_counts.items():
        word = p.plural(label, count)
        phrases.append(f"{count} {word}")

    if len(phrases) == 1:
        result = phrases[0]
    else:
        result = ", ".join(phrases[:-1]) + " and " + phrases[-1]

    return f"This picture contains {result}."

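# Draw bounding boxes and labelled score tags on a copy of the input image.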
def draw_detected_objects(image, detections, score_threshold=0.5):
    annotated_image = image.copy()
    draw = ImageDraw.Draw(annotated_image)

    try:
        font = ImageFont.truetype("arial.ttf", size=14)
    except OSError:
        # Fall back to PIL's built-in bitmap font if arial.ttf is unavailable
        font = ImageFont.load_default()

    for item in detections:
        score = item["score"]
        if score < score_threshold:
            continue

        box = item["box"]
        label = item["label"]
        text = f"{label} ({score:.2f})"

        # Measure the label so its background strip fits snugly above the box
        text_bbox = font.getbbox(text)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]

        # Bounding box, label background, and label text
        draw.rectangle(
            [(box["xmin"], box["ymin"]), (box["xmax"], box["ymax"])],
            outline="red", width=3
        )
        draw.rectangle(
            [(box["xmin"], box["ymin"] - text_height),
             (box["xmin"] + text_width, box["ymin"])],
            fill="red"
        )
        draw.text(
            (box["xmin"], box["ymin"] - text_height),
            text, fill="white", font=font
        )

    return annotated_image

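# Main Gradio callback: detect objects, describe them in text and audio,
# and return the annotated image alongside the narration.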
def detect_image(image):
    try:
        output = obj_detector(image)
        print("Detection output:", output)
        natural_text = read_objects(output)
        print("Generated text:", natural_text)
        audio_path = generate_audio(natural_text)
        print("Audio path:", audio_path)
        processed_image = draw_detected_objects(image, output)
        return processed_image.convert("RGB"), audio_path
    except Exception as e:
        # On any failure, return a plain gray placeholder image and no audio
        print("Error:", str(e))
        return Image.new("RGB", (512, 512), color="gray"), None

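# Gradio UI: image in, annotated image and spoken description out.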
demo = gr.Interface(
    fn=detect_image,
    inputs=[gr.Image(label="Upload an Image", type="pil")],
    outputs=[
        gr.Image(label="Image with Detected Objects", type="pil"),
        gr.Audio(label="Audio Description")
    ],
    title="@GenAI Project 7: Object Detector with Audio",
    description="This app detects objects in images, highlights them, and generates a natural language audio description."
)

demo.launch(share=True)