Commit 9c4a163 · dung-vpt-uney committed · Parent: c3e1463

Deploy latest CoRGI Gradio demo

Files changed:
- PROGRESS_LOG.md +1 -1
- README.md +1 -0
- corgi/__pycache__/gradio_app.cpython-313.pyc +0 -0
- corgi/__pycache__/pipeline.cpython-313.pyc +0 -0
- corgi/__pycache__/types.cpython-313.pyc +0 -0
- corgi/gradio_app.py +20 -0
- corgi/pipeline.py +26 -0
- corgi/types.py +22 -0
PROGRESS_LOG.md CHANGED

```diff
@@ -14,7 +14,7 @@
 - Introduced structured logging for the app (`app.py`) and pipeline execution to trace model loads, cache hits, and Gradio lifecycle events on Spaces.
 - Reworked the Gradio UI to show per-step panels with annotated evidence galleries, giving each CoRGI reasoning step its own window alongside the final synthesized answer.
 - Preloaded the default Qwen3-VL model/tokenizer at import so Spaces load the GPU weights before serving requests.
-- Switched inference to bfloat16, tightened defaults (max steps/regions = 3), and moved the @spaces.GPU decorator down to the raw `_chat` call so each generation stays within the 120 s ZeroGPU budget.
+- Switched inference to bfloat16, tightened defaults (max steps/regions = 3), added per-stage timers, and moved the @spaces.GPU decorator down to the raw `_chat` call so each generation stays within the 120 s ZeroGPU budget.
 
 ## 2024-10-21
 - Updated default checkpoints to `Qwen/Qwen3-VL-8B-Thinking` and verified CLI/Gradio/test coverage.
```
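The log entry above describes scoping the ZeroGPU allocation to the lowest-level generation call. A minimal sketch of that pattern, assuming a helper named `_chat` with prebuilt model inputs (the argument shapes are assumptions, not the repo's actual signature):

```python
# Sketch only: keep @spaces.GPU on the raw generation call so prompt building
# and output parsing run outside the 120 s ZeroGPU window.
import spaces
import torch


@spaces.GPU(duration=120)  # ZeroGPU attaches a GPU only while this call runs
def _chat(model, inputs, max_new_tokens: int = 512):
    with torch.inference_mode():
        return model.generate(**inputs, max_new_tokens=max_new_tokens)
```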
README.md CHANGED

```diff
@@ -47,3 +47,4 @@ python app.py
 - **ROI Extraction**: Shows the source image with every grounded bounding box plus per-evidence crops, and lists the prompts used for each verification step.
 - **Evidence Descriptions**: Summarises each grounded region (bbox, description, confidence) with the associated ROI prompts.
 - **Answer Synthesis**: Highlights the final answer, supporting context, and the synthesis prompt/response pair.
+- **Performance**: Reports per-stage timings (reasoning, ROI extraction, synthesis) plus overall latency so you can monitor ZeroGPU runtime limits.
```
corgi/__pycache__/gradio_app.cpython-313.pyc CHANGED
Binary files a/corgi/__pycache__/gradio_app.cpython-313.pyc and b/corgi/__pycache__/gradio_app.cpython-313.pyc differ

corgi/__pycache__/pipeline.cpython-313.pyc CHANGED
Binary files a/corgi/__pycache__/pipeline.cpython-313.pyc and b/corgi/__pycache__/pipeline.cpython-313.pyc differ

corgi/__pycache__/types.cpython-313.pyc CHANGED
Binary files a/corgi/__pycache__/types.cpython-313.pyc and b/corgi/__pycache__/types.cpython-313.pyc differ
corgi/gradio_app.py CHANGED

```diff
@@ -158,6 +158,7 @@ def _empty_ui_payload(message: str) -> Dict[str, object]:
         "evidence_prompt": placeholder_prompt,
         "answer_process_markdown": message,
         "answer_prompt": placeholder_prompt,
+        "timing_markdown": message,
     }
 
 
@@ -270,6 +271,20 @@ def _prepare_ui_payload(
     ]
     answer_process_markdown = "\n".join(answer_process_lines)
 
+    timing_lines: List[str] = []
+    if result.timings:
+        total_entry = next((t for t in result.timings if t.name == "total_pipeline"), None)
+        if total_entry:
+            timing_lines.append(f"**Total pipeline:** {total_entry.duration_ms/1000:.2f} s")
+        for timing in result.timings:
+            if timing.name == "total_pipeline":
+                continue
+            label = timing.name.replace("_", " ")
+            if timing.step_index is not None:
+                label += f" (step {timing.step_index})"
+            timing_lines.append(f"- {label}: {timing.duration_ms/1000:.2f} s")
+    timing_markdown = "\n".join(timing_lines) if timing_lines else "_No timing data available._"
+
     return {
         "answer_markdown": answer_text,
         "chain_markdown": chain_markdown,
@@ -281,6 +296,7 @@
         "evidence_prompt": evidence_prompt_md,
         "answer_process_markdown": answer_process_markdown,
         "answer_prompt": answer_prompt_md,
+        "timing_markdown": timing_markdown,
     }
 
 
@@ -456,6 +472,8 @@ def build_demo(
            with gr.Tab("Answer Synthesis"):
                answer_process_markdown = gr.Markdown("_No answer generated yet._")
                answer_prompt_markdown = gr.Markdown("```text\nAwaiting answer prompt...\n```")
+           with gr.Tab("Performance"):
+               timing_markdown = gr.Markdown("_No timing data available._")
 
        def _on_submit(state_data, image, question, model_id, max_steps, max_regions):
            pipeline_state = state_data if isinstance(state_data, PipelineState) else None
@@ -479,6 +497,7 @@
                payload["evidence_prompt"],
                payload["answer_process_markdown"],
                payload["answer_prompt"],
+               payload["timing_markdown"],
            ]
 
        output_components = [
@@ -493,6 +512,7 @@
            evidence_prompt_markdown,
            answer_process_markdown,
            answer_prompt_markdown,
+           timing_markdown,
        ]
 
        run_button.click(
```
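For reference, the markdown the new timing block produces for the Performance tab would look roughly like the output below. This standalone snippet reuses the same formatting rules as the diff above; the `StageTiming` stand-in mirrors the dataclass added in corgi/types.py, and the durations are made up, not from a real run.

```python
# Illustration of the Performance tab markdown, using the diff's formatting rules
# on fabricated durations.
from dataclasses import dataclass
from typing import List, Optional


@dataclass(frozen=True)
class StageTiming:  # mirrors the dataclass added in corgi/types.py
    name: str
    duration_ms: float
    step_index: Optional[int] = None


timings = [
    StageTiming("structured_reasoning", 12400.0),
    StageTiming("roi_step_1", 9800.0, step_index=1),
    StageTiming("answer_synthesis", 11300.0),
    StageTiming("total_pipeline", 33500.0),
]

lines: List[str] = []
total = next((t for t in timings if t.name == "total_pipeline"), None)
if total:
    lines.append(f"**Total pipeline:** {total.duration_ms/1000:.2f} s")
for t in timings:
    if t.name == "total_pipeline":
        continue
    label = t.name.replace("_", " ")
    if t.step_index is not None:
        label += f" (step {t.step_index})"
    lines.append(f"- {label}: {t.duration_ms/1000:.2f} s")

print("\n".join(lines))
# **Total pipeline:** 33.50 s
# - structured reasoning: 12.40 s
# - roi step 1 (step 1): 9.80 s
# - answer synthesis: 11.30 s
```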
corgi/pipeline.py CHANGED

```diff
@@ -3,14 +3,18 @@ from __future__ import annotations
 from dataclasses import dataclass, field
 from typing import List, Optional, Protocol
 
+import time
+
 from PIL import Image
 
 from .types import (
     GroundedEvidence,
     PromptLog,
     ReasoningStep,
+    StageTiming,
     evidences_to_serializable,
     prompt_logs_to_serializable,
+    stage_timings_to_serializable,
     steps_to_serializable,
 )
 
@@ -58,6 +62,8 @@
     reasoning_log: Optional[PromptLog] = None
     grounding_logs: List[PromptLog] = field(default_factory=list)
     answer_log: Optional[PromptLog] = None
+    timings: List[StageTiming] = field(default_factory=list)
+    total_duration_ms: float = 0.0
 
     def to_json(self) -> dict:
         payload = {
@@ -65,6 +71,7 @@
             "steps": steps_to_serializable(self.steps),
             "evidence": evidences_to_serializable(self.evidence),
             "answer": self.answer,
+            "total_duration_ms": self.total_duration_ms,
         }
         reasoning_entries = (
             prompt_logs_to_serializable([self.reasoning_log]) if self.reasoning_log else []
@@ -73,6 +80,7 @@
             payload["reasoning_log"] = reasoning_entries[0]
 
         payload["grounding_logs"] = prompt_logs_to_serializable(self.grounding_logs)
+        payload["timings"] = stage_timings_to_serializable(self.timings)
 
         answer_entries = prompt_logs_to_serializable([self.answer_log]) if self.answer_log else []
         if answer_entries:
@@ -97,21 +105,37 @@ class CoRGIPipeline:
         max_regions: int = 3,
     ) -> PipelineResult:
         self._vlm.reset_logs()
+        timings: List[StageTiming] = []
+        total_start = time.monotonic()
+
+        reasoning_start = time.monotonic()
         steps = self._vlm.structured_reasoning(image=image, question=question, max_steps=max_steps)
+        reasoning_duration = (time.monotonic() - reasoning_start) * 1000.0
+        timings.append(StageTiming(name="structured_reasoning", duration_ms=reasoning_duration))
+
         evidences: List[GroundedEvidence] = []
         for step in steps:
             if not step.needs_vision:
                 continue
+            stage_name = f"roi_step_{step.index}"
+            grounding_start = time.monotonic()
             step_evs = self._vlm.extract_step_evidence(
                 image=image,
                 question=question,
                 step=step,
                 max_regions=max_regions,
             )
+            grounding_duration = (time.monotonic() - grounding_start) * 1000.0
+            timings.append(StageTiming(name=stage_name, duration_ms=grounding_duration, step_index=step.index))
             if not step_evs:
                 continue
             evidences.extend(step_evs[:max_regions])
+        answer_start = time.monotonic()
         answer = self._vlm.synthesize_answer(image=image, question=question, steps=steps, evidences=evidences)
+        answer_duration = (time.monotonic() - answer_start) * 1000.0
+        timings.append(StageTiming(name="answer_synthesis", duration_ms=answer_duration))
+        total_duration = (time.monotonic() - total_start) * 1000.0
+        timings.append(StageTiming(name="total_pipeline", duration_ms=total_duration))
         return PipelineResult(
             question=question,
             steps=steps,
@@ -120,6 +144,8 @@
             reasoning_log=self._vlm.reasoning_log,
             grounding_logs=list(self._vlm.grounding_logs),
             answer_log=self._vlm.answer_log,
+            timings=timings,
+            total_duration_ms=total_duration,
         )
 
 
```
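With these changes, downstream code can read per-stage timings straight off the result. A hedged sketch of what that looks like; the `PipelineResult` fields mirror the diff, but the field defaults and the example values are assumptions, not real measurements:

```python
# Sketch: what callers can read from the new timing fields on PipelineResult.
from corgi.pipeline import PipelineResult
from corgi.types import StageTiming

result = PipelineResult(
    question="What is on the table?",
    steps=[],
    evidence=[],
    answer="A coffee mug.",
    timings=[
        StageTiming(name="structured_reasoning", duration_ms=12400.0),
        StageTiming(name="total_pipeline", duration_ms=33500.0),
    ],
    total_duration_ms=33500.0,
)

payload = result.to_json()
print(payload["total_duration_ms"])  # 33500.0
print(payload["timings"][0])         # {'name': 'structured_reasoning', 'duration_ms': 12400.0}
```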
corgi/types.py CHANGED

```diff
@@ -38,6 +38,15 @@ class PromptLog:
     stage: Optional[str] = None
 
 
+@dataclass(frozen=True)
+class StageTiming:
+    """Timing metadata for a pipeline stage or sub-step."""
+
+    name: str
+    duration_ms: float
+    step_index: Optional[int] = None
+
+
 def steps_to_serializable(steps: List[ReasoningStep]) -> List[Dict[str, object]]:
     """Helper to convert steps into JSON-friendly dictionaries."""
 
@@ -85,3 +94,16 @@ def prompt_logs_to_serializable(logs: List[PromptLog]) -> List[Dict[str, object]
             item["stage"] = log.stage
         serializable.append(item)
     return serializable
+
+
+def stage_timings_to_serializable(timings: List[StageTiming]) -> List[Dict[str, object]]:
+    serializable: List[Dict[str, object]] = []
+    for timing in timings:
+        item: Dict[str, object] = {
+            "name": timing.name,
+            "duration_ms": timing.duration_ms,
+        }
+        if timing.step_index is not None:
+            item["step_index"] = timing.step_index
+        serializable.append(item)
+    return serializable
```
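Note that the serializer above only emits `step_index` when it is set, which keeps stage-level and per-step entries distinguishable in the JSON payload. A quick illustration with made-up durations:

```python
# Illustration of stage_timings_to_serializable as defined in the diff:
# step_index appears only for per-step ROI timings.
from corgi.types import StageTiming, stage_timings_to_serializable

rows = stage_timings_to_serializable([
    StageTiming(name="structured_reasoning", duration_ms=12400.0),
    StageTiming(name="roi_step_1", duration_ms=9800.0, step_index=1),
])
print(rows)
# [{'name': 'structured_reasoning', 'duration_ms': 12400.0},
#  {'name': 'roi_step_1', 'duration_ms': 9800.0, 'step_index': 1}]
```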