phase 9
116
detect/stages/aggregator.py
Normal file
@@ -0,0 +1,116 @@
"""
Stage 8 — Report compilation

Groups all detections by brand, merges contiguous appearances,
and builds the final DetectionReport.
"""

from __future__ import annotations

import logging

from detect import emit
from detect.models import BrandDetection, BrandStats, DetectionReport, PipelineStats

logger = logging.getLogger(__name__)


def _merge_contiguous(detections: list[BrandDetection], gap_threshold: float = 2.0) -> list[BrandDetection]:
    """
    Merge detections of the same brand that are close in time.

    If two detections of the same brand are within gap_threshold seconds,
    they're merged into one detection spanning the full range.
    """
    if not detections:
        return []

    sorted_dets = sorted(detections, key=lambda d: (d.brand, d.timestamp))
    merged: list[BrandDetection] = []
    current = sorted_dets[0]

    for det in sorted_dets[1:]:
        if (det.brand == current.brand
                and det.timestamp <= current.timestamp + current.duration + gap_threshold):
            end = max(current.timestamp + current.duration,
                      det.timestamp + det.duration)
            current = BrandDetection(
                brand=current.brand,
                timestamp=current.timestamp,
                duration=end - current.timestamp,
                confidence=max(current.confidence, det.confidence),
                source=current.source,
                bbox=current.bbox,
                frame_ref=current.frame_ref,
                content_type=current.content_type,
            )
        else:
            merged.append(current)
            current = det

    merged.append(current)
    return merged


def compile_report(
    detections: list[BrandDetection],
    stats: PipelineStats,
    video_source: str = "",
    content_type: str = "",
    duration_seconds: float = 0.0,
    job_id: str | None = None,
) -> DetectionReport:
    """
    Build the final detection report from all accumulated detections.

    Merges contiguous detections, computes per-brand stats,
    and emits the job_complete event.
    """
    merged = _merge_contiguous(detections)

    brands: dict[str, BrandStats] = {}
    for d in merged:
        if d.brand not in brands:
            brands[d.brand] = BrandStats()
        s = brands[d.brand]
        s.total_appearances += 1
        s.total_screen_time += d.duration
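        # Running (incremental) mean: with n = total_appearances after this detection,
        # new_avg = (old_avg * (n - 1) + confidence) / n. For example, an average of
        # 0.80 over 2 detections followed by a 0.90 detection becomes
        # (0.80 * 2 + 0.90) / 3 ≈ 0.833.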
        s.avg_confidence = (
            (s.avg_confidence * (s.total_appearances - 1) + d.confidence)
            / s.total_appearances
        )
        if s.first_seen == 0.0 or d.timestamp < s.first_seen:
            s.first_seen = d.timestamp
        if d.timestamp > s.last_seen:
            s.last_seen = d.timestamp

    report = DetectionReport(
        video_source=video_source,
        content_type=content_type,
        duration_seconds=duration_seconds,
        brands=brands,
        timeline=sorted(merged, key=lambda d: d.timestamp),
        pipeline_stats=stats,
    )

    emit.log(job_id, "Aggregator", "INFO",
             f"Report: {len(brands)} brands, {len(merged)} detections "
             f"(merged from {len(detections)} raw)")

    emit.job_complete(job_id, {
        "video_source": report.video_source,
        "content_type": report.content_type,
        "duration_seconds": report.duration_seconds,
        "brands": {
            k: {
                "total_appearances": v.total_appearances,
                "total_screen_time": v.total_screen_time,
                "avg_confidence": round(v.avg_confidence, 3),
                "first_seen": v.first_seen,
                "last_seen": v.last_seen,
            }
            for k, v in brands.items()
        },
    })

    return report
168
detect/stages/vlm_cloud.py
Normal file
@@ -0,0 +1,168 @@
"""
Stage 7 — Cloud LLM escalation

Last resort for crops the local VLM couldn't resolve.
Provider-agnostic — switch via CLOUD_LLM_PROVIDER env var.
Each provider has its own file under detect/providers/.

Tracks token usage and cost.
"""

from __future__ import annotations

import base64
import io
import logging

import numpy as np
from PIL import Image

from detect import emit
from detect.models import BrandDetection, PipelineStats, TextCandidate
from detect.profiles.base import CropContext
from detect.providers import get_provider, has_api_key

logger = logging.getLogger(__name__)

ESTIMATED_TOKENS_PER_CROP = 500


def _encode_crop(crop: np.ndarray) -> str:
    img = Image.fromarray(crop)
    buf = io.BytesIO()
    img.save(buf, format="JPEG", quality=85)
    return base64.b64encode(buf.getvalue()).decode()


def _crop_image(candidate: TextCandidate) -> np.ndarray:
    frame = candidate.frame
    box = candidate.bbox
    h, w = frame.image.shape[:2]
    x1 = max(0, box.x)
    y1 = max(0, box.y)
    x2 = min(w, box.x + box.w)
    y2 = min(h, box.y + box.h)
    return frame.image[y1:y2, x1:x2]


def _parse_response(answer: str, total_tokens: int) -> dict:
    """Parse LLM free-text response into structured output."""
    parts = [p.strip() for p in answer.split(",", 2)]

    brand = parts[0] if parts else ""
    confidence = 0.5
    reasoning = answer

    if len(parts) >= 2:
        try:
            confidence = float(parts[1])
            confidence = max(0.0, min(1.0, confidence))
        except ValueError:
            pass

    if len(parts) >= 3:
        reasoning = parts[2]

    return {
        "brand": brand,
        "confidence": confidence,
        "reasoning": reasoning,
        "tokens": total_tokens or ESTIMATED_TOKENS_PER_CROP,
    }


def _call_cloud_api(image_b64: str, prompt: str) -> dict:
    """Route to the configured provider and parse the response."""
    provider = get_provider()
    result = provider.call(image_b64, prompt)
    return _parse_response(result.answer, result.total_tokens)


def escalate_cloud(
    candidates: list[TextCandidate],
    vlm_prompt_fn,
    stats: PipelineStats,
    min_confidence: float = 0.4,
    content_type: str = "",
    job_id: str | None = None,
) -> list[BrandDetection]:
    """
    Send remaining unresolved crops to cloud LLM.

    Provider is selected via CLOUD_LLM_PROVIDER env var (groq, gemini, openai).
    Updates stats with call count and cost.
    """
    if not candidates:
        return []

    if not has_api_key():
        emit.log(job_id, "CloudLLM", "WARNING",
                 f"No API key set for cloud provider, skipping {len(candidates)} crops")
        return []

    provider = get_provider()
    emit.log(job_id, "CloudLLM", "INFO",
             f"Escalating {len(candidates)} crops to {provider.name}")

    matched: list[BrandDetection] = []
    total_cost = 0.0

    for candidate in candidates:
        crop = _crop_image(candidate)
        if crop.size == 0:
            continue

        crop_context = CropContext(
            image=b"",
            surrounding_text=candidate.text,
            position_hint=f"frame {candidate.frame.sequence}",
        )
        prompt = vlm_prompt_fn(crop_context)
        image_b64 = _encode_crop(crop)

        try:
            result = _call_cloud_api(image_b64, prompt)
        except Exception as e:
            logger.warning("Cloud LLM failed for '%s': %s", candidate.text, e)
            continue

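        # Cost accounting is an estimate: the token count falls back to
        # ESTIMATED_TOKENS_PER_CROP when the provider reports no usage, and
        # 0.00001 $/token is a rough default for unknown models, so a typical
        # crop is on the order of 500 * $0.00001 = $0.005.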
stats.cloud_llm_calls += 1
|
||||
model_info = provider.models.get(provider.model)
|
||||
cost_per_token = model_info.cost_per_input_token if model_info else 0.00001
|
||||
call_cost = result["tokens"] * cost_per_token
|
||||
total_cost += call_cost
|
||||
|
||||
brand = result["brand"]
|
||||
confidence = result["confidence"]
|
||||
|
||||
if brand and confidence >= min_confidence:
|
||||
detection = BrandDetection(
|
||||
brand=brand,
|
||||
timestamp=candidate.frame.timestamp,
|
||||
duration=0.5,
|
||||
confidence=confidence,
|
||||
source="cloud_llm",
|
||||
bbox=candidate.bbox,
|
||||
frame_ref=candidate.frame.sequence,
|
||||
content_type=content_type,
|
||||
)
|
||||
matched.append(detection)
|
||||
|
||||
emit.detection(
|
||||
job_id,
|
||||
brand=brand,
|
||||
confidence=confidence,
|
||||
source="cloud_llm",
|
||||
timestamp=candidate.frame.timestamp,
|
||||
content_type=content_type,
|
||||
frame_ref=candidate.frame.sequence,
|
||||
)
|
||||
|
||||
stats.estimated_cloud_cost_usd += total_cost
|
||||
stats.regions_escalated_to_cloud_llm = len(candidates)
|
||||
|
||||
emit.log(job_id, "CloudLLM", "INFO",
|
||||
f"Cloud resolved {len(matched)}/{len(candidates)} — "
|
||||
f"cost ${total_cost:.4f} ({stats.cloud_llm_calls} calls total)")
|
||||
|
||||
return matched
|
||||
124
detect/stages/vlm_local.py
Normal file
@@ -0,0 +1,124 @@
"""
Stage 6 — Local VLM escalation (moondream2)

Processes unresolved text candidates by sending crop images + prompt
to the local VLM on the inference server. Produces BrandDetection
objects for crops the VLM can identify.
"""

from __future__ import annotations

import logging

import numpy as np

from detect import emit
from detect.models import BrandDetection, TextCandidate
from detect.profiles.base import CropContext

logger = logging.getLogger(__name__)


def _crop_image(candidate: TextCandidate) -> np.ndarray:
    frame = candidate.frame
    box = candidate.bbox
    h, w = frame.image.shape[:2]
    x1 = max(0, box.x)
    y1 = max(0, box.y)
    x2 = min(w, box.x + box.w)
    y2 = min(h, box.y + box.h)
    return frame.image[y1:y2, x1:x2]


def escalate_vlm(
    candidates: list[TextCandidate],
    vlm_prompt_fn,
    inference_url: str | None = None,
    min_confidence: float = 0.5,
    content_type: str = "",
    job_id: str | None = None,
) -> tuple[list[BrandDetection], list[TextCandidate]]:
    """
    Send unresolved crops to local VLM for brand identification.

    Returns:
        - matched: BrandDetections the VLM confirmed
        - still_unresolved: candidates the VLM couldn't resolve (→ cloud escalation)
    """
    if not candidates:
        return [], []

    emit.log(job_id, "VLMLocal", "INFO",
             f"Processing {len(candidates)} unresolved crops with moondream2")

    matched: list[BrandDetection] = []
    still_unresolved: list[TextCandidate] = []

    if inference_url:
        from detect.inference import InferenceClient
        client = InferenceClient(base_url=inference_url)

    for candidate in candidates:
        crop = _crop_image(candidate)
        if crop.size == 0:
            still_unresolved.append(candidate)
            continue

        crop_context = CropContext(
            image=b"",  # not used for prompt generation
            surrounding_text=candidate.text,
            position_hint=f"frame {candidate.frame.sequence}",
        )
        prompt = vlm_prompt_fn(crop_context)

        try:
            if inference_url:
                result = client.vlm(image=crop, prompt=prompt)
                brand = result.brand
                confidence = result.confidence
                reasoning = result.reasoning
            else:
                brand, confidence, reasoning = _vlm_local(crop, prompt)
        except Exception as e:
            logger.warning("VLM failed for candidate '%s': %s", candidate.text, e)
            still_unresolved.append(candidate)
            continue

        if brand and confidence >= min_confidence:
            detection = BrandDetection(
                brand=brand,
                timestamp=candidate.frame.timestamp,
                duration=0.5,
                confidence=confidence,
                source="local_vlm",
                bbox=candidate.bbox,
                frame_ref=candidate.frame.sequence,
                content_type=content_type,
            )
            matched.append(detection)

            emit.detection(
                job_id,
                brand=brand,
                confidence=confidence,
                source="local_vlm",
                timestamp=candidate.frame.timestamp,
                content_type=content_type,
                frame_ref=candidate.frame.sequence,
            )

            logger.debug("VLM matched: %s (%.2f) — %s", brand, confidence, reasoning)
        else:
            still_unresolved.append(candidate)

    emit.log(job_id, "VLMLocal", "INFO",
             f"VLM resolved {len(matched)}, unresolved {len(still_unresolved)} → cloud")

    return matched, still_unresolved


def _vlm_local(crop: np.ndarray, prompt: str) -> tuple[str, float, str]:
    """Run moondream2 in-process (single-box mode)."""
    from gpu.models.vlm import query
    result = query(crop, prompt)
    return result["brand"], result["confidence"], result["reasoning"]