obichimav committed on
Commit e957e17 · verified · 1 Parent(s): 3aab296

Update app.py

Files changed (1)
  1. app.py +365 -256
app.py CHANGED
@@ -1,179 +1,194 @@
- import os
- import openai
  import gradio as gr
  import numpy as np
  import torch
  from PIL import Image
  import matplotlib.pyplot as plt
- import importlib.util
  from transformers import pipeline
- import requests

- # Set your OpenAI API key (ensure the environment variable is set or replace with your key)
- openai.api_key = os.getenv("OPENAI_API_KEY", "your-openai-api-key-here")

  def install_sam2_if_needed():
-     """
-     Check if SAM2 is installed, and install it if needed.
-     """
      if importlib.util.find_spec("sam2") is not None:
          print("SAM2 is already installed.")
-         return

      try:
-         import pip
          print("Installing SAM2 from GitHub...")
-         pip.main(['install', 'git+https://github.com/facebookresearch/sam2.git'])
          print("SAM2 installed successfully.")
      except Exception as e:
          print(f"Error installing SAM2: {e}")
-         print("You may need to manually install SAM2: !pip install git+https://github.com/facebookresearch/sam2.git")
-         raise
-
- def detect_objects_owlv2(text_query, image, threshold=0.1):
-     """
-     Detect objects in an image using OWLv2 model.
-
-     Args:
-         text_query (str): Text description of objects to detect
-         image (PIL.Image or numpy.ndarray): Input image
-         threshold (float): Detection threshold
-
-     Returns:
-         list: List of detections with bbox, label, and score
-     """
-     # Initialize the OWL-ViT model
-     detector = pipeline(model="google/owlv2-base-patch16-ensemble", task="zero-shot-object-detection")
-
-     # Convert numpy array to PIL Image if needed
-     if isinstance(image, np.ndarray):
-         image = Image.fromarray(image)
-
-     # Run detection
-     predictions = detector(image, candidate_labels=[text_query])
-
-     # Filter by threshold and format results
-     detections = []
-     for pred in predictions:
-         if pred['score'] >= threshold:
-             bbox = pred['box']
-             # Normalize bbox coordinates (OWL-ViT returns absolute coordinates)
-             width, height = image.size
-             normalized_bbox = [
-                 bbox['xmin'] / width,
-                 bbox['ymin'] / height,
-                 bbox['xmax'] / width,
-                 bbox['ymax'] / height
-             ]
-
-             detection = {
-                 'label': pred['label'],
-                 'bbox': normalized_bbox,
-                 'score': pred['score']
-             }
-             detections.append(detection)
-
-     return detections

- def generate_masks_from_detections(detections, image, model_name="facebook/sam2-hiera-large"):
-     """
-     Generate segmentation masks for objects detected by OWLv2 using SAM2 from Hugging Face.
-
-     Args:
-         detections (list): List of detections [{'label': str, 'bbox': [x1, y1, x2, y2], 'score': float}, ...]
-         image (PIL.Image.Image or str): The image or path to the image to analyze
-         model_name (str): Hugging Face model name for SAM2.
-
-     Returns:
-         list: List of detections with added 'mask' arrays.
-     """
-     install_sam2_if_needed()
-     from sam2.sam2_image_predictor import SAM2ImagePredictor
-
-     # Load image
-     if isinstance(image, str):
-         image = Image.open(image)
-     elif isinstance(image, np.ndarray):
-         image = Image.fromarray(image)
-
-     image_np = np.array(image.convert("RGB"))
-     H, W = image_np.shape[:2]
-
-     device = "cuda" if torch.cuda.is_available() else "cpu"
-     print(f"Using device: {device}")
-     print(f"Loading SAM2 model from Hugging Face: {model_name}")
-     predictor = SAM2ImagePredictor.from_pretrained(model_name)
-     predictor.model.to(device)

-     # Convert normalized bboxes to pixels
-     input_boxes = []
-     for det in detections:
-         x1, y1, x2, y2 = det['bbox']
-         input_boxes.append([int(x1 * W), int(y1 * H), int(x2 * W), int(y2 * H)])
-     input_boxes = np.array(input_boxes)

-     print(f"Processing image and predicting masks for {len(input_boxes)} boxes...")
-     with torch.inference_mode():
-         predictor.set_image(image_np)
-         if device == "cuda":
-             with torch.autocast("cuda", dtype=torch.bfloat16):
-                 masks, scores, _ = predictor.predict(
-                     point_coords=None, point_labels=None,
-                     box=input_boxes, multimask_output=False
                  )
-         else:
-             masks, scores, _ = predictor.predict(
-                 point_coords=None, point_labels=None,
-                 box=input_boxes, multimask_output=False
-             )
-
-     # Attach masks to detections, handling both (1,H,W) and (H,W) outputs
-     results = []
-     for i, det in enumerate(detections):
-         raw = masks[i]
-         if raw.ndim == 3:
-             mask = raw[0]
-         else:
-             mask = raw
-         mask = mask.astype(np.uint8)
-
-         new_det = det.copy()
-         new_det['mask'] = mask
-         results.append(new_det)
-
-     print(f"Successfully generated {len(results)} masks.")
-     return results

- def overlay_detections_on_image(image, detections_with_masks, show_masks=True, show_boxes=True, show_labels=True):
      """
-     Overlay detections (boxes and/or masks) on the image and return as numpy array.
-
-     Args:
-         image: Input image (PIL.Image or numpy array)
-         detections_with_masks: List of detections with masks
-         show_masks: Whether to show segmentation masks
-         show_boxes: Whether to show bounding boxes
-         show_labels: Whether to show labels
-
-     Returns:
-         numpy.ndarray: Image with overlaid detections
      """
-     # Convert to PIL Image if needed
      if isinstance(image, np.ndarray):
          image = Image.fromarray(image)
-
      image_np = np.array(image.convert("RGB"))
      height, width = image_np.shape[:2]

-     # Create figure without displaying
-     fig, ax = plt.subplots(1, 1, figsize=(12, 8))
-     ax.imshow(image_np)

      # Define colors for different instances
      colors = plt.cm.tab10(np.linspace(0, 1, 10))

      # Plot each detection
      for i, detection in enumerate(detections_with_masks):
          bbox = detection['bbox']
          label = detection['label']
          score = detection['score']
@@ -186,170 +201,264 @@ def overlay_detections_on_image(image, detections_with_masks, show_masks=True, show_boxes=True, show_labels=True):
          # Color for this instance
          color = colors[i % len(colors)]

-         # Display mask if available and requested
-         if show_masks and 'mask' in detection:
              mask = detection['mask']
              mask_color = np.zeros((height, width, 4), dtype=np.float32)
              mask_color[mask > 0] = [color[0], color[1], color[2], 0.5]
-             ax.imshow(mask_color)

          # Draw bounding box if requested
          if show_boxes:
              rect = plt.Rectangle((x1_px, y1_px), x2_px - x1_px, y2_px - y1_px,
                                   fill=False, edgecolor=color, linewidth=2)
-             ax.add_patch(rect)

          # Add label and score if requested
          if show_labels:
-             ax.text(x1_px, y1_px - 5, f"{label}: {score:.2f}",
                      color='white', bbox=dict(facecolor=color, alpha=0.8), fontsize=10)

-     ax.axis('off')
-
-     # Convert plot to numpy array
-     fig.canvas.draw()
-     result_array = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
-     result_array = result_array.reshape(fig.canvas.get_width_height()[::-1] + (3,))

-     plt.close(fig)  # Important: close the figure to free memory

-     return result_array

- def get_single_prompt(user_input):
      """
-     Uses OpenAI to rephrase the user's chatter into a single, concise prompt for object detection.
-     The generated prompt will not include any question marks.
      """
-     if not user_input.strip():
-         user_input = "Detect objects in the image"

-     prompt_instruction = (
-         f"Based on the following user input, generate a single, concise prompt for object detection. "
-         f"Do not include any question marks in the output. "
-         f"User input: \"{user_input}\""
-     )

-     response = openai.chat.completions.create(
-         model="gpt-4o",  # adjust model name if needed
-         messages=[{"role": "user", "content": prompt_instruction}],
-         temperature=0.3,
-         max_tokens=50,
-     )

-     generated_prompt = response.choices[0].message.content.strip()
-     # Ensure no question marks remain
-     generated_prompt = generated_prompt.replace("?", "")
-     return generated_prompt

- def is_count_query(user_input):
-     """
-     Check if the user's input indicates a counting request.
-     Looks for common keywords such as "count", "how many", "number of", etc.
-     """
-     keywords = ["count", "how many", "number of", "total", "get me a count"]
-     for kw in keywords:
-         if kw.lower() in user_input.lower():
-             return True
-     return False

- def process_question_and_detect(user_input, image, threshold, use_sam):
-     """
-     1. Uses OpenAI to generate a single, concise prompt (without question marks) from the user's input.
-     2. Feeds that prompt to the custom detection function.
-     3. Optionally generates segmentation masks using SAM2.
-     4. Overlays the detection results on the image.
-     5. If the user's input implies a counting request, it also returns the count of detected objects.
-     """
      if image is None:
-         return None, "Please upload an image."

      try:
-         # Generate the concise prompt from the user's input
-         generated_prompt = get_single_prompt(user_input)

-         # Run object detection using the generated prompt
-         detections = detect_objects_owlv2(generated_prompt, image, threshold=threshold)

-         # Generate masks if SAM is enabled
-         if use_sam and len(detections) > 0:
-             try:
-                 detections_with_masks = generate_masks_from_detections(detections, image)
-             except Exception as e:
-                 print(f"SAM2 failed, using detections without masks: {e}")
-                 detections_with_masks = detections
          else:
-             detections_with_masks = detections

-         # Overlay results on the image
-         viz = overlay_detections_on_image(image, detections_with_masks,
-                                           show_masks=use_sam,
-                                           show_boxes=True,
-                                           show_labels=True)

-         # If the user's input implies a counting request, include the count
-         count_text = ""
-         if is_count_query(user_input):
-             count = len(detections)
-             count_text = f"Detected {count} objects."

-         output_text = f"Generated prompt: {generated_prompt}\n{count_text}"
-         if len(detections) == 0:
-             output_text += f"\nNo objects detected with threshold {threshold}. Try lowering the threshold."

-         print(output_text)
-         return viz, output_text

      except Exception as e:
-         error_msg = f"Error during detection: {str(e)}"
-         print(error_msg)
          return image, error_msg

- # Gradio interface
- with gr.Blocks() as demo:
-     gr.Markdown("# Custom Object Detection and Counting App")
-     gr.Markdown(
-         """
-         Enter your input (for example:
-         - "What is the number of fruit in my image?"
-         - "How many bicycles can you see?"
-         - "Get me a count of my bottles")
-         and upload an image.
-         The app uses OpenAI to generate a single, concise prompt for object detection (without question marks),
-         then runs the detection using OWL-ViT. Optionally, SAM2 can generate precise segmentation masks.
-         """
-     )

      with gr.Row():
-         with gr.Column():
-             user_input = gr.Textbox(label="Enter your input", placeholder="Type your input here...")
-             image_input = gr.Image(label="Upload Image", type="numpy")

              with gr.Row():
                  threshold_slider = gr.Slider(
-                     minimum=0.01,
-                     maximum=1.0,
-                     value=0.1,
                      step=0.01,
-                     label="Detection Threshold",
-                     info="Lower values detect more objects but may include false positives"
                  )
-                 use_sam_checkbox = gr.Checkbox(
-                     label="Use SAM2 for Segmentation",
                      value=False,
-                     info="Enable to generate precise segmentation masks (requires additional computation)"
                  )

-             submit_btn = gr.Button("Detect and Count")
-
-         with gr.Column():
-             output_image = gr.Image(label="Detection Result")
-             output_text = gr.Textbox(label="Output Details", lines=3)

-     submit_btn.click(
-         fn=process_question_and_detect,
-         inputs=[user_input, image_input, threshold_slider, use_sam_checkbox],
          outputs=[output_image, output_text]
      )

  if __name__ == "__main__":
-     demo.launch()
  import gradio as gr
  import numpy as np
  import torch
  from PIL import Image
  import matplotlib.pyplot as plt
  from transformers import pipeline
+ import warnings
+ from io import BytesIO
+ import importlib.util
+
+ # Suppress warnings
+ warnings.filterwarnings("ignore")
+
+ # Global variables for models
+ detector = None
+ sam_predictor = None

+ def load_detector():
+     """Load the OWL-ViT detector once and cache it."""
+     global detector
+     if detector is None:
+         print("Loading OWL-ViT model...")
+         detector = pipeline(
+             model="google/owlv2-base-patch16-ensemble",
+             task="zero-shot-object-detection",
+             device=0 if torch.cuda.is_available() else -1
+         )
+         print("OWL-ViT model loaded successfully!")

  def install_sam2_if_needed():
+     """Check if SAM2 is installed, and install it if needed."""
      if importlib.util.find_spec("sam2") is not None:
          print("SAM2 is already installed.")
+         return True

      try:
+         import subprocess
+         import sys
          print("Installing SAM2 from GitHub...")
+         subprocess.check_call([sys.executable, "-m", "pip", "install", "git+https://github.com/facebookresearch/sam2.git"])
          print("SAM2 installed successfully.")
+         return True
      except Exception as e:
          print(f"Error installing SAM2: {e}")
+         return False

+ def load_sam_predictor():
+     """Load SAM2 predictor if available."""
+     global sam_predictor
+     if sam_predictor is None:
+         if install_sam2_if_needed():
+             try:
+                 from sam2.sam2_image_predictor import SAM2ImagePredictor
+                 print("Loading SAM2 model...")
+                 sam_predictor = SAM2ImagePredictor.from_pretrained("facebook/sam2-hiera-large")
+                 device = "cuda" if torch.cuda.is_available() else "cpu"
+                 sam_predictor.model.to(device)
+                 print(f"SAM2 model loaded successfully on {device}!")
+                 return True
+             except Exception as e:
+                 print(f"Error loading SAM2: {e}")
+                 return False
+     return sam_predictor is not None

+ def detect_objects_owlv2(text_query, image, threshold=0.1):
+     """Detect objects using OWL-ViT."""
+     try:
+         load_detector()
+
+         if isinstance(image, np.ndarray):
+             image = Image.fromarray(image)
+
+         # Clean up the text query
+         query_terms = [term.strip() for term in text_query.split(',') if term.strip()]
+         if not query_terms:
+             query_terms = ["object"]
+
+         print(f"Detecting: {query_terms}")
+         predictions = detector(image, candidate_labels=query_terms)
+
+         detections = []
+         for pred in predictions:
+             if pred['score'] >= threshold:
+                 bbox = pred['box']
+                 width, height = image.size
+                 normalized_bbox = [
+                     bbox['xmin'] / width,
+                     bbox['ymin'] / height,
+                     bbox['xmax'] / width,
+                     bbox['ymax'] / height
+                 ]
+
+                 detection = {
+                     'label': pred['label'],
+                     'bbox': normalized_bbox,
+                     'score': pred['score']
+                 }
+                 detections.append(detection)
+
+         return detections, image
+     except Exception as e:
+         print(f"Detection error: {e}")
+         return [], image

+ def generate_masks_sam2(detections, image):
+     """Generate segmentation masks using SAM2."""
+     try:
+         if not load_sam_predictor():
+             print("SAM2 not available, skipping mask generation")
+             return detections
+
+         if isinstance(image, np.ndarray):
+             image = Image.fromarray(image)
+
+         image_np = np.array(image.convert("RGB"))
+         H, W = image_np.shape[:2]
+
+         # Set image for SAM2
+         sam_predictor.set_image(image_np)
+
+         # Convert normalized bboxes to pixel coordinates
+         input_boxes = []
+         for det in detections:
+             x1, y1, x2, y2 = det['bbox']
+             input_boxes.append([int(x1 * W), int(y1 * H), int(x2 * W), int(y2 * H)])
+
+         if not input_boxes:
+             return detections
+
+         input_boxes = np.array(input_boxes)
+
+         print(f"Generating masks for {len(input_boxes)} detections...")
+
+         with torch.inference_mode():
+             device = "cuda" if torch.cuda.is_available() else "cpu"
+             if device == "cuda":
+                 with torch.autocast("cuda", dtype=torch.bfloat16):
+                     masks, scores, _ = sam_predictor.predict(
+                         point_coords=None,
+                         point_labels=None,
+                         box=input_boxes,
+                         multimask_output=False
+                     )
+             else:
+                 masks, scores, _ = sam_predictor.predict(
+                     point_coords=None,
+                     point_labels=None,
+                     box=input_boxes,
+                     multimask_output=False
                  )
+
+         # Add masks to detections
+         results = []
+         for i, det in enumerate(detections):
+             new_det = det.copy()
+             mask = masks[i]
+             if mask.ndim == 3:
+                 mask = mask[0]  # Remove batch dimension if present
+             new_det['mask'] = mask.astype(np.uint8)
+             results.append(new_det)
+
+         print(f"Successfully generated {len(results)} masks")
+         return results
+
+     except Exception as e:
+         print(f"SAM2 mask generation error: {e}")
+         return detections

+ def visualize_detections_with_masks(image, detections_with_masks, show_labels=True, show_boxes=True):
      """
+     Visualize the detections with their segmentation masks.
+     Returns PIL Image instead of showing plot.
      """
+     # Load the image
      if isinstance(image, np.ndarray):
          image = Image.fromarray(image)
      image_np = np.array(image.convert("RGB"))
+
+     # Get image dimensions
      height, width = image_np.shape[:2]

+     # Create figure
+     fig = plt.figure(figsize=(12, 8))
+     plt.imshow(image_np)

      # Define colors for different instances
      colors = plt.cm.tab10(np.linspace(0, 1, 10))

      # Plot each detection
      for i, detection in enumerate(detections_with_masks):
+         # Get bbox, mask, label, and score
          bbox = detection['bbox']
          label = detection['label']
          score = detection['score']

          # Color for this instance
          color = colors[i % len(colors)]

+         # Display mask with transparency if available
+         if 'mask' in detection:
              mask = detection['mask']
              mask_color = np.zeros((height, width, 4), dtype=np.float32)
              mask_color[mask > 0] = [color[0], color[1], color[2], 0.5]
+             plt.imshow(mask_color)

          # Draw bounding box if requested
          if show_boxes:
              rect = plt.Rectangle((x1_px, y1_px), x2_px - x1_px, y2_px - y1_px,
                                   fill=False, edgecolor=color, linewidth=2)
+             plt.gca().add_patch(rect)

          # Add label and score if requested
          if show_labels:
+             plt.text(x1_px, y1_px - 5, f"{label}: {score:.2f}",
                      color='white', bbox=dict(facecolor=color, alpha=0.8), fontsize=10)

+     plt.axis('off')
+     plt.tight_layout()

+     # Convert to PIL Image using the correct method
+     buf = BytesIO()
+     plt.savefig(buf, format='png', bbox_inches='tight', dpi=150)
+     plt.close(fig)
+     buf.seek(0)

+     result_image = Image.open(buf)
+     return result_image

+ def visualize_detections(image, detections, show_labels=True):
      """
+     Visualize object detections with bounding boxes only.
+     Returns PIL Image instead of showing plot.
      """
+     # Load the image
+     if isinstance(image, np.ndarray):
+         image = Image.fromarray(image)
+     image_np = np.array(image.convert("RGB"))

+     # Get image dimensions
+     height, width = image_np.shape[:2]

+     # Create figure
+     fig = plt.figure(figsize=(12, 8))
+     plt.imshow(image_np)
+
+     # Define colors for different instances
+     colors = plt.cm.tab10(np.linspace(0, 1, 10))
+
+     # Plot each detection
+     for i, detection in enumerate(detections):
+         # Get bbox, label, and score
+         bbox = detection['bbox']
+         label = detection['label']
+         score = detection['score']
+
+         # Convert normalized bbox to pixel coordinates
+         x1, y1, x2, y2 = bbox
+         x1_px, y1_px = int(x1 * width), int(y1 * height)
+         x2_px, y2_px = int(x2 * width), int(y2 * height)
+
+         # Color for this instance
+         color = colors[i % len(colors)]
+
+         # Draw bounding box
+         rect = plt.Rectangle((x1_px, y1_px), x2_px - x1_px, y2_px - y1_px,
+                              fill=False, edgecolor=color, linewidth=2)
+         plt.gca().add_patch(rect)
+
+         # Add label and score if requested
+         if show_labels:
+             plt.text(x1_px, y1_px - 5, f"{label}: {score:.2f}",
+                      color='white', bbox=dict(facecolor=color, alpha=0.8), fontsize=10)
+
+     plt.axis('off')
+     plt.tight_layout()

+     # Convert to PIL Image
+     buf = BytesIO()
+     plt.savefig(buf, format='png', bbox_inches='tight', dpi=150)
+     plt.close(fig)
+     buf.seek(0)
+
+     result_image = Image.open(buf)
+     return result_image

+ def is_count_query(text):
+     """Check if the query is asking for counting."""
+     count_keywords = ["how many", "count", "number of", "total"]
+     return any(keyword in text.lower() for keyword in count_keywords)

+ def detection_pipeline(query_text, image, threshold, use_sam):
+     """Main detection pipeline."""
      if image is None:
+         return None, "⚠️ Please upload an image first!"

      try:
+         # Extract object name from query
+         query_lower = query_text.lower()

+         # Simple keyword extraction
+         if "people" in query_lower or "person" in query_lower:
+             search_terms = "person"
+         elif "car" in query_lower or "vehicle" in query_lower:
+             search_terms = "car"
+         elif "apple" in query_lower:
+             search_terms = "apple"
+         elif "bottle" in query_lower:
+             search_terms = "bottle"
+         elif "phone" in query_lower:
+             search_terms = "phone"
+         elif "dog" in query_lower:
+             search_terms = "dog"
+         elif "cat" in query_lower:
+             search_terms = "cat"
+         else:
+             # Extract last word as potential object
+             words = query_text.strip().split()
+             search_terms = words[-1] if words else "object"

+         print(f"Processing query: '{query_text}' -> searching for: '{search_terms}'")
+
+         # Run object detection
+         detections, processed_image = detect_objects_owlv2(search_terms, image, threshold)
+
+         # Generate masks if requested
+         if use_sam and detections:
+             detections = generate_masks_sam2(detections, processed_image)
+
+         # Create visualization using your proven functions
+         if use_sam and detections:
+             result_image = visualize_detections_with_masks(
+                 processed_image,
+                 detections,
+                 show_labels=True,
+                 show_boxes=True
+             )
          else:
+             result_image = visualize_detections(
+                 processed_image,
+                 detections,
+                 show_labels=True
+             )
+
+         # Generate summary
+         count = len(detections)

+         summary_parts = []
+         summary_parts.append(f"🔍 **Search Query**: '{query_text}'")
+         summary_parts.append(f"🎯 **Detected Object Type**: '{search_terms}'")
+         summary_parts.append(f"⚙️ **Threshold**: {threshold}")
+         summary_parts.append(f"🤖 **SAM2 Segmentation**: {'Enabled' if use_sam else 'Disabled'}")

+         if count > 0:
+             if is_count_query(query_text):
+                 summary_parts.append(f"🔢 **Answer: {count} {search_terms}(s) found**")
+             else:
+                 summary_parts.append(f"✅ **Found {count} {search_terms}(s)**")
+
+             # Show detection details
+             for i, det in enumerate(detections[:5]):  # Show first 5
+                 summary_parts.append(f"  • Detection {i+1}: {det['score']:.3f} confidence")
+             if count > 5:
+                 summary_parts.append(f"  • ... and {count-5} more detections")
+         else:
+             summary_parts.append(f"❌ **No {search_terms}(s) detected**")
+             summary_parts.append("💡 Try lowering the threshold or using different terms")

+         summary_text = "\n".join(summary_parts)

+         return result_image, summary_text

      except Exception as e:
+         error_msg = f"❌ **Error**: {str(e)}"
          return image, error_msg

+ # ----------------
+ # GRADIO INTERFACE
+ # ----------------
+ with gr.Blocks(title="🔍 Object Detection & Segmentation") as demo:
+     gr.Markdown("""
+     # 🔍 Object Detection & Segmentation App
+
+     **Simple and powerful object detection using OWL-ViT + SAM2**
+
+     1. **Enter your query** (e.g., "How many people?", "Find cars", "Count apples")
+     2. **Upload an image**
+     3. **Adjust detection sensitivity**
+     4. **Toggle SAM2 segmentation** for precise masks
+     5. **Click Detect!**
+     """)

      with gr.Row():
+         with gr.Column(scale=1):
+             query_input = gr.Textbox(
+                 label="🗣️ What do you want to detect?",
+                 placeholder="e.g., 'How many people are in the image?'",
+                 value="How many people are in the image?",
+                 lines=2
+             )
+
+             image_input = gr.Image(
+                 label="📸 Upload your image",
+                 type="numpy"
+             )

              with gr.Row():
                  threshold_slider = gr.Slider(
+                     minimum=0.01,
+                     maximum=0.9,
+                     value=0.1,
                      step=0.01,
+                     label="🎚️ Detection Sensitivity"
                  )
+
+             sam_checkbox = gr.Checkbox(
+                 label="🎭 Enable SAM2 Segmentation",
                  value=False,
+                 info="Generate precise pixel masks"
              )

+             detect_button = gr.Button("🔍 Detect Objects!", variant="primary", size="lg")

+         with gr.Column(scale=1):
+             output_image = gr.Image(label="🎯 Detection Results")
+             output_text = gr.Textbox(
+                 label="📊 Detection Summary",
+                 lines=12,
+                 show_copy_button=True
+             )
+
+     # Event handlers
+     detect_button.click(
+         fn=detection_pipeline,
+         inputs=[query_input, image_input, threshold_slider, sam_checkbox],
          outputs=[output_image, output_text]
      )
+
+     # Also trigger on Enter in text box
+     query_input.submit(
+         fn=detection_pipeline,
+         inputs=[query_input, image_input, threshold_slider, sam_checkbox],
+         outputs=[output_image, output_text]
+     )
+
+     # Examples section
+     gr.Examples(
+         examples=[
+             ["How many people are in the image?", None, 0.1, False],
+             ["Find all cars", None, 0.15, True],
+             ["Count the bottles", None, 0.1, True],
+             ["Detect dogs", None, 0.2, False],
+             ["How many phones?", None, 0.15, True],
+         ],
+         inputs=[query_input, image_input, threshold_slider, sam_checkbox],
+     )

+ # Launch
  if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0", server_port=7860, share=True)