import gradio as gr
from PIL import Image, ImageDraw, ImageFont
import scipy.io.wavfile as wavfile
import numpy as np
from transformers import pipeline
from collections import Counter
import inflect
# Optionally load the models from local snapshot paths instead of the Hub:
# tts_model_path = ("../Models/models--kakao-enterprise--vits-ljs/snapshots/"
#                   "3bcb8321394f671bd948ebf0d086d694dda95464")
# narrator = pipeline("text-to-speech", model=tts_model_path)
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")

# obj_detector_path = ("../Models/models--facebook--detr-resnet-50/snapshots/"
#                      "1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b")
# obj_detector = pipeline("object-detection", model=obj_detector_path)
obj_detector = pipeline("object-detection", model="facebook/detr-resnet-50")
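
# The object-detection pipeline returns a list of dicts shaped like
# {"score": 0.99, "label": "cat", "box": {"xmin": ..., "ymin": ..., "xmax": ..., "ymax": ...}}
# (the post-processed DETR output as exposed by transformers); the helpers
# below all consume this shape.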

def generate_audio(text, output_path="finetuned_output.wav"):
    """Synthesize `text` to speech and write it to a WAV file."""
    narrated = narrator(text)
    audio = narrated["audio"]
    sampling_rate = narrated["sampling_rate"]
    # The pipeline may return a (1, n) batch; flatten to 1-D so wavfile
    # doesn't misread it as an n-channel, one-frame file.
    audio = np.squeeze(audio)
    # Convert float waveform to int16, clipping to avoid integer overflow.
    if audio.dtype != np.int16:
        audio = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    wavfile.write(output_path, sampling_rate, audio)
    return output_path
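
# Usage sketch (illustrative sentence; writes to the function's default path):
# generate_audio("This picture contains 2 cats and 1 dog.")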

def read_objects(detections: list[dict]) -> str:
    """Summarize detections as a natural-language sentence."""
    if not detections:
        return "No objects were detected in this picture."

    labels = [det["label"] for det in detections]
    label_counts = Counter(labels)

    p = inflect.engine()
    phrases = []
    for label, count in label_counts.items():
        word = p.plural(label, count)  # inflect keeps the singular when count == 1
        phrases.append(f"{count} {word}")

    if len(phrases) == 1:
        result = phrases[0]
    else:
        result = ", ".join(phrases[:-1]) + " and " + phrases[-1]
    return f"This picture contains {result}."

def draw_detected_objects(image, detections, score_threshold=0.5):
    """Return a copy of `image` with boxes and labels drawn for confident detections."""
    annotated_image = image.copy()
    draw = ImageDraw.Draw(annotated_image)

    try:
        font = ImageFont.truetype("arial.ttf", size=14)
    except OSError:
        # Fall back to PIL's built-in font if Arial is not available.
        font = ImageFont.load_default()

    for item in detections:
        score = item["score"]
        if score < score_threshold:
            continue

        box = item["box"]
        label = item["label"]
        text = f"{label} ({score:.2f})"

        # Measure the label so its background strip fits the text exactly.
        text_bbox = font.getbbox(text)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]

        # Bounding box around the detected object.
        draw.rectangle(
            [(box["xmin"], box["ymin"]), (box["xmax"], box["ymax"])],
            outline="red", width=3
        )
        # Filled strip above the box as a background for the label text.
        draw.rectangle(
            [(box["xmin"], box["ymin"] - text_height),
             (box["xmin"] + text_width, box["ymin"])],
            fill="red"
        )
        draw.text(
            (box["xmin"], box["ymin"] - text_height),
            text, fill="white", font=font
        )
    return annotated_image
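
# Standalone sketch, assuming a local test image "sample.jpg" (hypothetical path):
# img = Image.open("sample.jpg")
# draw_detected_objects(img, obj_detector(img), score_threshold=0.7).save("annotated.jpg")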

def detect_image(image):
    """Run detection, then return the annotated image and a narrated summary."""
    output = obj_detector(image)
    processed_image = draw_detected_objects(image, output)
    natural_text = read_objects(output)
    processed_audio = generate_audio(natural_text)
    return processed_image, processed_audio
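
# Quick check without the UI, again assuming a local "sample.jpg" (hypothetical path):
# processed_image, audio_path = detect_image(Image.open("sample.jpg"))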

gr.close_all()
demo = gr.Interface(
    fn=detect_image,
    inputs=[gr.Image(label="Select Image", type="pil")],
    outputs=[gr.Image(label="Processed Image", type="pil"),
             gr.Audio(label="Generated Audio")],
    title="@GenAI Project 7: Object Detector with Audio",
    description="Detects objects in an uploaded image, highlights them with bounding boxes, and generates a spoken description.",
)
demo.launch()