ScreenCoder / screencoder /image_box_detection.py
Jimmyzheng-10's picture
Retrieve
95b965e
import argparse, asyncio, cv2, json, os, sys
from pathlib import Path
import numpy as np
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
# ---------- Fallback HTML parsing method ----------
def extract_bboxes_from_html_fallback(html_path: Path):
"""
Fallback method to extract bboxes from HTML without using Playwright.
This is a simplified version that may not be as accurate but will allow the pipeline to continue.
"""
try:
with open(html_path, 'r', encoding='utf-8') as f:
html_content = f.read()
soup = BeautifulSoup(html_content, 'html.parser')
# Extract region bboxes from CSS styles
region_bboxes = []
region_containers = soup.find_all('div', class_='box')
for i, container in enumerate(region_containers):
container_id = container.get('id', f'region_{i}')
style = container.get('style', '')
# Parse CSS style to extract position and size
# This is a simplified parser - in real scenarios, you might need a more robust CSS parser
left = 0
top = 0
width = 100
height = 100
if 'left:' in style:
left_str = style.split('left:')[1].split('%')[0].strip()
left = float(left_str)
if 'top:' in style:
top_str = style.split('top:')[1].split('%')[0].strip()
top = float(top_str)
if 'width:' in style:
width_str = style.split('width:')[1].split('%')[0].strip()
width = float(width_str)
if 'height:' in style:
height_str = style.split('height:')[1].split('%')[0].strip()
height = float(height_str)
# Convert percentage to pixels (assuming 1280x720 viewport)
x = int(left * 12.8) # 1280 / 100
y = int(top * 7.2) # 720 / 100
w = int(width * 12.8)
h = int(height * 7.2)
region_bboxes.append({
'id': container_id,
'x': x,
'y': y,
'w': w,
'h': h
})
# Extract placeholder bboxes
placeholder_bboxes = []
placeholder_images = soup.find_all(class_='bg-gray-400')
for i, img in enumerate(placeholder_images):
# For fallback, we'll use a simple approach
# In a real scenario, you'd need to parse the actual layout
placeholder_bboxes.append({
'id': f'ph{i}',
'x': 100 + i * 50,
'y': 100 + i * 50,
'w': 100,
'h': 100,
'region_id': region_bboxes[0]['id'] if region_bboxes else '1'
})
return region_bboxes, placeholder_bboxes, 1280, 720
except Exception as e:
print(f"Error in fallback HTML parsing: {e}")
return [], [], 1280, 720
# ---------- Main logic ----------
async def extract_bboxes_from_html(html_path: Path):
async with async_playwright() as p:
try:
# Try to launch browser with headless mode for HF Spaces compatibility
browser = await p.chromium.launch(headless=True)
except Exception as e:
print(f"Error launching browser: {e}")
print("Attempting to install browser dependencies...")
try:
# Try to install browser dependencies
import subprocess
result = subprocess.run(["playwright", "install", "chromium"],
capture_output=True, text=True, timeout=300)
if result.returncode == 0:
print("Browser dependencies installed successfully, retrying...")
browser = await p.chromium.launch(headless=True)
else:
print(f"Failed to install browser dependencies: {result.stderr}")
# Return empty results to continue the pipeline
return [], [], 1280, 720
except Exception as install_error:
print(f"Failed to install browser dependencies: {install_error}")
# Return empty results to continue the pipeline
return [], [], 1280, 720
try:
ctx = await browser.new_context(
viewport={"width": 1280, "height": 720},
)
page = await ctx.new_page()
await page.goto(html_path.resolve().as_uri())
metrics = await page.evaluate("""
() => {
const region_containers = Array.from(document.querySelectorAll('.box[id]'));
const region_bboxes = region_containers.map(el => {
const rect = el.getBoundingClientRect();
return { id: el.id, x: rect.x, y: rect.y, w: rect.width, h: rect.height };
});
const placeholder_bboxes = [];
let ph_id_counter = 0;
const all_potential_placeholders = document.querySelectorAll('.bg-gray-400');
for (const el of all_potential_placeholders) {
// Apply the same filters as before
if (el.tagName === 'SVG') continue;
if (el.innerText && el.innerText.trim() !== '') continue;
const el_rect = el.getBoundingClientRect();
const el_center = { x: el_rect.left + el_rect.width / 2, y: el_rect.top + el_rect.height / 2 };
// Find which region this placeholder is inside
let containing_region_id = null;
for (const region_el of region_containers) {
const region_rect = region_el.getBoundingClientRect();
if (el_center.x >= region_rect.left && el_center.x <= region_rect.right &&
el_center.y >= region_rect.top && el_center.y <= region_rect.bottom) {
containing_region_id = region_el.id;
break; // Assume non-overlapping regions
}
}
if (containing_region_id) {
placeholder_bboxes.push({
id: 'ph' + ph_id_counter++,
x: el_rect.x,
y: el_rect.y,
w: el_rect.width,
h: el_rect.height,
region_id: containing_region_id
});
}
}
const layout_rect = document.documentElement.getBoundingClientRect();
return {
region_bboxes,
placeholder_bboxes,
layout_width: layout_rect.width,
layout_height: layout_rect.height
};
}
""")
await browser.close()
return metrics['region_bboxes'], metrics['placeholder_bboxes'], metrics['layout_width'], metrics['layout_height']
except Exception as e:
print(f"Error during browser operation: {e}")
await browser.close()
# Return empty results to continue the pipeline
return [], [], 1280, 720
def draw_bboxes_on_image(img, region_bboxes, placeholder_bboxes):
"""Draw region (green) and placeholder (red) boxes with labels on img."""
boxed = img.copy()
H, W = img.shape[:2]
# --- Helper to draw a single box with label ---
def draw_box_with_label(b, color, label_text):
x, y, w, h = b["x"], b["y"], b["w"], b["h"]
# Boundary correction
x_draw, y_draw = max(0, x), max(0, y)
w_draw, h_draw = min(w, W - x_draw), min(h, H - y_draw)
cv2.rectangle(boxed, (x_draw, y_draw), (x_draw + w_draw, y_draw + h_draw), color, 3) # Thicker lines
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 0.8
font_thickness = 2
text_color = (255, 255, 255)
(text_width, text_height), baseline = cv2.getTextSize(label_text, font, font_scale, font_thickness)
# Position for the label background. Put it just above the box.
label_y_start = y - text_height - baseline - 5
if label_y_start < 0: # Adjust if the label goes off the top of the image
label_y_start = y + 5
label_x_start = x
label_y_end = label_y_start + text_height + baseline
cv2.rectangle(boxed, (label_x_start, label_y_start), (label_x_start + text_width, label_y_end), color, cv2.FILLED)
cv2.putText(boxed, label_text, (label_x_start + 2, label_y_start + text_height), font, font_scale, text_color, font_thickness)
# --- Draw Regions (Green) ---
for b in region_bboxes:
draw_box_with_label(b, color=(0, 255, 0), label_text=f'Area_{b.get("id", "")}')
# --- Draw Placeholders (Red) ---
for b in placeholder_bboxes:
draw_box_with_label(b, color=(0, 0, 255), label_text=f'{b.get("region_id")}_{b.get("id")}')
return boxed
def main():
args = get_args()
run_id = args.run_id
# --- Dynamic Path Construction ---
base_dir = Path(__file__).parent.resolve()
tmp_dir = base_dir / 'data' / 'tmp' / run_id
output_dir = base_dir / 'data' / 'output' / run_id
html_path = output_dir / f"{run_id}_layout.html"
screenshot_path = tmp_dir / f"{run_id}.png"
output_json_path = tmp_dir / f"{run_id}_bboxes.json"
debug_image_path = tmp_dir / f"debug_gray_bboxes_{run_id}.png"
if not html_path.exists():
sys.exit(f"Error: HTML file not found at {html_path}")
if not screenshot_path.exists():
sys.exit(f"Error: Screenshot not found at {screenshot_path}")
print(f"--- Starting Image Box Detection for run_id: {run_id} ---")
# Read original screenshot
img = cv2.imread(str(screenshot_path))
if img is None:
sys.exit(f"Error: Cannot read image {screenshot_path}")
if img.std() < 5:
print("Warning: The screenshot is almost pure color, it may not be the original screenshot with real thumbnails.")
H, W = img.shape[:2]
# Parse HTML → Get bboxes
try:
region_bboxes, placeholder_bboxes, layout_width, layout_height = asyncio.run(
extract_bboxes_from_html(html_path)
)
print("Successfully extracted bboxes using Playwright")
except Exception as e:
print(f"Playwright failed: {e}")
print("Falling back to HTML parsing method...")
region_bboxes, placeholder_bboxes, layout_width, layout_height = extract_bboxes_from_html_fallback(html_path)
print("Successfully extracted bboxes using fallback method")
if not placeholder_bboxes:
# This is not necessarily an error; some UIs might not have placeholders.
print("Info: No gray placeholder blocks found.")
# Calculate separate scale factors for X and Y to handle aspect ratio differences
scale_x = W / layout_width if layout_width > 0 else 1
scale_y = H / layout_height if layout_height > 0 else 1
if abs(scale_x - scale_y) > 0.05:
print(f"[*] Detected different X/Y scales. X: {scale_x:.2f}, Y: {scale_y:.2f}")
elif abs(scale_x - 1.0) > 0.05:
print(f"[*] Detected uniform scale: {scale_x:.2f}")
# Scale all bboxes to the original image coordinate system
scaled_regions = []
for b in region_bboxes:
scaled_regions.append({
**b,
"x": int(b['x'] * scale_x), "y": int(b['y'] * scale_y),
"w": int(b['w'] * scale_x), "h": int(b['h'] * scale_y)
})
scaled_placeholders = []
for b in placeholder_bboxes:
scaled_placeholders.append({
**b,
"x": int(b['x'] * scale_x), "y": int(b['y'] * scale_y),
"w": int(b['w'] * scale_x), "h": int(b['h'] * scale_y)
})
# Draw boxes using the now-scaled data
overlay = draw_bboxes_on_image(img, scaled_regions, scaled_placeholders)
# Save debug image
debug_image_path.parent.mkdir(parents=True, exist_ok=True)
cv2.imwrite(str(debug_image_path), overlay)
print(f"Success: BBox overlay saved to {debug_image_path}")
# Convert absolute pixel coordinates to proportions for the final JSON output
proportional_regions = []
for b in scaled_regions:
proportional_regions.append({
**b,
"x": b["x"] / W, "y": b["y"] / H,
"w": b["w"] / W, "h": b["h"] / H
})
proportional_placeholders = []
for b in scaled_placeholders:
proportional_placeholders.append({
**b,
"x": b["x"] / W, "y": b["y"] / H,
"w": b["w"] / W, "h": b["h"] / H
})
# Print/save bbox array
print("\n=== BBox (proportional to image dimensions) ===")
output_data = {
"regions": proportional_regions,
"placeholders": proportional_placeholders
}
output_json = json.dumps(output_data, indent=2, ensure_ascii=False)
print(output_json)
output_json_path.parent.mkdir(parents=True, exist_ok=True)
output_json_path.write_text(output_json)
print(f"Success: BBox list saved to {output_json_path}")
print(f"--- Image Box Detection Complete for run_id: {run_id} ---")
def get_args():
parser = argparse.ArgumentParser(
description="Extracts placeholder bounding boxes from an HTML file and maps them to a screenshot."
)
parser.add_argument('--run_id', required=True, type=str,
help="A unique identifier for the processing run.")
return parser.parse_args()
# ---------- CLI ----------
if __name__ == "__main__":
main()