Miraj3 committed
Commit 00e82af · verified · 1 Parent(s): 5782799

Create app.py

Files changed (1)
  1. app.py +112 -0
app.py ADDED
@@ -0,0 +1,112 @@
import gradio as gr  # used below to build the web UI
from PIL import ImageDraw, ImageFont
import scipy.io.wavfile as wavfile
import numpy as np

from transformers import pipeline
from collections import Counter
import inflect

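# Note: both pipelines below fetch their weights from the Hugging Face Hub on
# first run; the commented-out paths show how to point at local snapshots.
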
# Paths for your models (local snapshots)
# tts_model_path = ("../Models/models--kakao-enterprise--vits-ljs/snapshots/"
#                   "3bcb8321394f671bd948ebf0d086d694dda95464")
# narrator = pipeline("text-to-speech", model=tts_model_path)

narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")

# obj_detector_path = ("../Models/models--facebook--detr-resnet-50/snapshots/"
#                      "1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b")
# obj_detector = pipeline("object-detection", model=obj_detector_path)

obj_detector = pipeline("object-detection", model="facebook/detr-resnet-50")

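# Each detection from the object-detection pipeline is a dict shaped like:
#   {"score": 0.99, "label": "cat",
#    "box": {"xmin": 10, "ymin": 20, "xmax": 150, "ymax": 200}}
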
def generate_audio(text, output_path="finetuned_output.wav"):
    narrated = narrator(text)

    # The pipeline returns the waveform with a leading batch dimension;
    # flatten it so scipy writes mono samples rather than one wide frame.
    audio = np.squeeze(narrated["audio"])
    sampling_rate = narrated["sampling_rate"]

    # Convert float audio in [-1.0, 1.0] to int16 PCM if needed
    if audio.dtype != np.int16:
        audio = (audio * 32767).astype(np.int16)

    wavfile.write(output_path, sampling_rate, audio)
    return output_path

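# Example (illustrative): generate_audio("This picture contains 2 cats.")
# writes finetuned_output.wav in the working directory and returns its path.
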
def read_objects(detections: list[dict]) -> str:
    if not detections:
        return "No objects were detected in this picture."

    labels = [det["label"] for det in detections]
    label_counts = Counter(labels)

    p = inflect.engine()
    phrases = []
    for label, count in label_counts.items():
        word = p.plural(label, count)  # "cat" -> "cats" when count > 1
        phrases.append(f"{count} {word}")

    if len(phrases) == 1:
        result = phrases[0]
    else:
        result = ", ".join(phrases[:-1]) + " and " + phrases[-1]

    return f"This picture contains {result}."

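# Worked example: labels ["cat", "cat", "dog"] -> Counter({"cat": 2, "dog": 1})
# -> "This picture contains 2 cats and 1 dog."
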
def draw_detected_objects(image, detections, score_threshold=0.5):
    annotated_image = image.copy()
    draw = ImageDraw.Draw(annotated_image)

    try:
        font = ImageFont.truetype("arial.ttf", size=14)
    except OSError:  # font file not available on this system
        font = ImageFont.load_default()

    for item in detections:
        score = item["score"]
        if score < score_threshold:
            continue

        box = item["box"]
        label = item["label"]
        text = f"{label} ({score:.2f})"

        text_bbox = font.getbbox(text)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]

        # Keep the label readable even when the box touches the top edge
        text_y = max(box["ymin"] - text_height, 0)

        draw.rectangle(
            [(box["xmin"], box["ymin"]), (box["xmax"], box["ymax"])],
            outline="red", width=3
        )
        draw.rectangle(
            [(box["xmin"], text_y),
             (box["xmin"] + text_width, text_y + text_height)],
            fill="red"
        )
        draw.text((box["xmin"], text_y), text, fill="white", font=font)

    return annotated_image

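# Example: draw_detected_objects(img, detections, score_threshold=0.9) draws
# only high-confidence boxes; the default 0.5 matches detect_image below.
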
def detect_image(image):
    output = obj_detector(image)
    # Describe only the detections that are confident enough to be drawn,
    # so the narration matches the annotated image (same 0.5 threshold).
    confident = [det for det in output if det["score"] >= 0.5]
    processed_image = draw_detected_objects(image, confident)
    natural_text = read_objects(confident)
    processed_audio = generate_audio(natural_text)
    return processed_image, processed_audio

gr.close_all()

demo = gr.Interface(
    fn=detect_image,
    inputs=[gr.Image(label="Select Image", type="pil")],
    outputs=[gr.Image(label="Processed Image", type="pil"),
             gr.Audio(label="Generated Audio")],
    title="@GenAI Project 7: Object Detector with Audio",
    description=("Detects objects in an uploaded image, highlights them with "
                 "bounding boxes, and generates a spoken description."),
)
demo.launch()
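
# To run locally (assumed environment): pip install gradio transformers torch
# pillow scipy numpy inflect timm (timm backs the DETR model), then run
# `python app.py` and open the URL Gradio prints.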