snehasanjana committed on
Commit
b5d2f15
·
verified ·
1 Parent(s): cef9e35

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -2
app.py CHANGED
@@ -2,22 +2,29 @@ import gradio as gr
2
  from transformers import AutoProcessor, AutoModelForVision2Seq
3
  from PIL import Image
4
  import torch
 
5
 
6
  # Load model and processor
7
  processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
8
  model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224")
9
  model.eval()
10
 
 
 
 
 
11
  def grounding(image, prompt):
12
  inputs = processor(text=prompt, images=image, return_tensors="pt")
13
  with torch.no_grad():
14
  generated_ids = model.generate(**inputs, max_new_tokens=256)
15
  generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
16
- return generated_text
17
 
 
18
  gr.Interface(
19
  fn=grounding,
20
  inputs=[gr.Image(type="pil"), gr.Textbox(label="Text Prompt")],
21
  outputs="text",
22
- title="Image to text Generation"
 
23
  ).launch()
 
2
  from transformers import AutoProcessor, AutoModelForVision2Seq
3
  from PIL import Image
4
  import torch
5
+ import re
6
 
# Load the Kosmos-2 processor and model once, when the module is imported.
MODEL_ID = "microsoft/kosmos-2-patch14-224"
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForVision2Seq.from_pretrained(MODEL_ID)
model.eval()  # switch the module to evaluation mode before serving requests
11
 
def clean_caption(caption):
    """Return a tidied, display-ready version of a generated caption.

    Punctuation and underscores are removed, every run of whitespace is
    collapsed to a single space, surrounding whitespace is dropped, and the
    first character is upper-cased. The case of the remaining characters is
    left untouched so that proper nouns survive.

    Args:
        caption: Raw caption text. ``None`` or an empty string is accepted.

    Returns:
        The cleaned caption, or ``""`` when there is nothing to clean.
    """
    if not caption:
        return ""

    # ``\w`` also matches "_", so it has to be removed explicitly to end up
    # with letters, digits and whitespace only.
    without_punctuation = re.sub(r"[^\w\s]|_", "", caption)

    # Deleting punctuation can leave doubled spaces behind ("a , b");
    # split/join collapses them and trims both ends in one step.
    collapsed = " ".join(without_punctuation.split())

    # ``str.capitalize`` would lower-case everything after the first
    # character; only the first character should change.
    return collapsed[:1].upper() + collapsed[1:]
15
+
def grounding(image, prompt):
    """Generate a caption for ``image`` conditioned on ``prompt``.

    Args:
        image: The image received from the Gradio image input
            (configured with ``type="pil"``).
        prompt: Text prompt passed to the processor together with the image.

    Returns:
        The generated caption after ``clean_caption`` has been applied.

    Raises:
        gr.Error: If no image was supplied.
    """
    if image is None:
        # Gradio passes None when the image input is left empty; fail with a
        # readable message instead of an error from inside the processor.
        raise gr.Error("Please upload an image.")

    inputs = processor(text=prompt, images=image, return_tensors="pt")
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=256)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # The decoded Kosmos-2 text still contains grounding markup
    # (<grounding>, <phrase>, <object>, <patch_index_*>). Stripping only the
    # angle brackets would leave those tag names inside the caption, so let
    # the processor remove the markup; it returns (caption, entities).
    caption, _entities = processor.post_process_generation(generated_text)
    return clean_caption(caption)
22
 
# Gradio interface: an image plus a text prompt in, the generated caption out.
demo = gr.Interface(
    fn=grounding,
    inputs=[gr.Image(type="pil"), gr.Textbox(label="Text Prompt")],
    outputs="text",
    title="Image to Text Generation",
    description="Kosmos-2: Upload an image and provide a text prompt for grounded captioning.",
)
demo.launch()