Miraj3 commited on
Commit
d85cc90
·
verified ·
1 Parent(s): ec23bd9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -32
app.py CHANGED
@@ -1,38 +1,30 @@
1
- import gradio as gr # not used here, but kept if needed later
2
  from PIL import Image, ImageDraw, ImageFont
3
  import scipy.io.wavfile as wavfile
4
  import numpy as np
5
-
6
  from transformers import pipeline
7
  from collections import Counter
8
  import inflect
9
 
10
- # # Paths for your models
11
- # tts_model_path = ("../Models/models--kakao-enterprise--vits-ljs/snapshots/"
12
- # "3bcb8321394f671bd948ebf0d086d694dda95464")
13
- # narrator = pipeline("text-to-speech", model=tts_model_path)
14
-
15
  narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
16
-
17
- # obj_detector_path = ("../Models/models--facebook--detr-resnet-50/snapshots/"
18
- # "1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b")
19
- # obj_detector = pipeline("object-detection", model=obj_detector_path)
20
-
21
  obj_detector = pipeline("object-detection", model="facebook/detr-resnet-50")
22
 
23
def generate_audio(text, output_path="finetuned_output.wav"):
    """Narrate *text* with the TTS pipeline and write it to *output_path*.

    Returns the path of the WAV file that was written.
    """
    speech = narrator(text)
    samples, rate = speech["audio"], speech["sampling_rate"]

    # WAV writing expects integer PCM; scale float output into 16-bit range.
    if samples.dtype != np.int16:
        samples = (samples * 32767).astype(np.int16)

    wavfile.write(output_path, rate, samples)
    return output_path
 
35
 
 
36
  def read_objects(detections: list[dict]) -> str:
37
  if not detections:
38
  return "No objects were detected in this picture."
@@ -53,6 +45,7 @@ def read_objects(detections: list[dict]) -> str:
53
 
54
  return f"This picture contains {result}."
55
 
 
56
  def draw_detected_objects(image, detections, score_threshold=0.5):
57
  annotated_image = image.copy()
58
  draw = ImageDraw.Draw(annotated_image)
@@ -92,21 +85,24 @@ def draw_detected_objects(image, detections, score_threshold=0.5):
92
 
93
  return annotated_image
94
 
 
95
def detect_image(image):
    """Detect objects in *image*; return (annotated image, narration audio path)."""
    detections = obj_detector(image)
    annotated = draw_detected_objects(image, detections)
    description = read_objects(detections)
    audio_path = generate_audio(description)
    return annotated, audio_path
102
-
 
 
 
 
 
103
# Tear down any interface left over from a previous run before launching.
gr.close_all()

# UI: one image in, annotated image + narration audio out.
app_inputs = [gr.Image(label="Select Image", type="pil")]
app_outputs = [gr.Image(label="Processed Image", type="pil"), gr.Audio(label="Generated Audio")]

demo = gr.Interface(
    fn=detect_image,
    inputs=app_inputs,
    outputs=app_outputs,
    title="@GenAI Project 7: Object Detector with Audio",
    description="THIS APPLICATION IS USED TO DETECT, HIGHLIGHT THE IMAGE AND ALSO GIVES AUDIO DESCRIPTION.",
)
demo.launch()
 
1
+ import gradio as gr
2
  from PIL import Image, ImageDraw, ImageFont
3
  import scipy.io.wavfile as wavfile
4
  import numpy as np
5
+ import tempfile
6
  from transformers import pipeline
7
  from collections import Counter
8
  import inflect
9
 
10
+ # Load models
 
 
 
 
11
  narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
 
 
 
 
 
12
  obj_detector = pipeline("object-detection", model="facebook/detr-resnet-50")
13
 
14
# Generate audio and save it as a temporary .wav file
def generate_audio(text):
    """Synthesize speech for *text* and return the path to a temporary WAV file.

    The file is created with ``delete=False`` so it survives this function;
    the caller (Gradio) serves it afterwards.
    """
    narrated = narrator(text)
    audio = narrated["audio"]
    sampling_rate = narrated["sampling_rate"]

    # The TTS pipeline may return a (1, n_samples) batch-shaped array — TODO
    # confirm against the pipeline output. scipy would interpret that as
    # n_samples channels of a single frame, so drop singleton axes (no-op
    # for an already 1-D signal).
    audio = np.squeeze(np.asarray(audio))

    # scipy writes int16 PCM. Clip before scaling: float samples slightly
    # outside [-1, 1] would otherwise wrap around after the int16 cast.
    if audio.dtype != np.int16:
        audio = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)

    # Close the handle before scipy reopens the path by name — reopening a
    # still-open NamedTemporaryFile fails on Windows.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    tmp.close()
    wavfile.write(tmp.name, int(sampling_rate), audio)
    return tmp.name
26
 
27
+ # Turn detections into human-friendly text
28
  def read_objects(detections: list[dict]) -> str:
29
  if not detections:
30
  return "No objects were detected in this picture."
 
45
 
46
  return f"This picture contains {result}."
47
 
48
+ # Annotate the image with bounding boxes and labels
49
  def draw_detected_objects(image, detections, score_threshold=0.5):
50
  annotated_image = image.copy()
51
  draw = ImageDraw.Draw(annotated_image)
 
85
 
86
  return annotated_image
87
 
88
# Gradio callback: detect objects, annotate the image, narrate the result
def detect_image(image):
    """Return (annotated image, audio file path) for *image*.

    Errors are swallowed deliberately so the Gradio UI degrades gracefully
    instead of crashing; on any failure the full traceback is printed and
    ``(None, None)`` is returned.
    """
    try:
        detections = obj_detector(image)
        annotated = draw_detected_objects(image, detections)
        description = read_objects(detections)
        audio_path = generate_audio(description)
        return annotated, audio_path
    except Exception as e:
        # str(e) alone hides where the failure happened — print the
        # traceback too so failures are debuggable from the server log.
        import traceback
        traceback.print_exc()
        print("❌ Error:", e)
        return None, None
101
+ # Launch Gradio app
102
  gr.close_all()
103
 
104
+ demo = gr.Interface(
105
+ fn=detect_image,
106
+ inputs=[gr.Image(label="Upload an Image", type="pil")],
107
+ outputs=[
108
+ gr.Image(label="Image with Detecte