Skip to main content
Open In Colab

Faster RCNN Inference Pipeline

Notebook 6 of 6 in the Faster RCNN from-scratch series. We load the checkpoint saved in notebook 05, run inference on COCO validation images streamed from Hugging Face, and visualise detections. Topics covered:
  • Loading and verifying a checkpoint
  • Running model.eval() forward pass (proposal generation + postprocessing)
  • Visualising class-agnostic proposals and final detections
  • Measuring per-image inference latency
  • Pseudo-evaluation: counting detections above a confidence threshold
Expected outcome after 5-step training: the checkpoint from notebook 05 is a sanity-check model, not a trained detector. Expect zero or very few final detections above the 0.05 score threshold — the ROI classifier has seen only 5 images and its weights are essentially random. What you should see is a healthy set of RPN proposals (cyan boxes), confirming the proposal pipeline works end-to-end. A properly trained COCO model (≥12 epochs) would show labelled red boxes on every image.
import sys, os, pathlib

# Locate frcnn_common.py — works whether run via papermill or interactively.
_nb_candidates = [
    pathlib.Path.cwd().parent,  # interactive: cwd is the notebook dir
    pathlib.Path.cwd() / 'notebooks' / 'scene-understanding' / 'object-detection' / 'faster-rcnn' / 'pytorch',  # papermill: cwd is repo root
]
# First candidate directory that actually contains the module wins.
_found = next((_p for _p in _nb_candidates if (_p / 'frcnn_common.py').exists()), None)
if _found is not None:
    sys.path.insert(0, str(_found))

import torch
import torchvision.transforms.functional as TF
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from datasets import load_dataset
import time

from frcnn_common import (
    IMG_SIZE, NUM_CLASSES, DEVICE,
    IMAGENET_MEAN, IMAGENET_STD,
    COCO_NAMES,
    FasterRCNN,
)

# Report the compute device selected in frcnn_common (cuda when available).
print(f"Device: {DEVICE}")
/workspaces/eng-ai-agents/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
Device: cuda
# All model components are imported from frcnn_common.
# FasterRCNN assembles: ResNet50 + FPN + RPN + ROI heads.
# NUM_CLASSES includes the background class (80 COCO classes + 1 = 81).
print(f"Model classes imported. NUM_CLASSES={NUM_CLASSES}, IMG_SIZE={IMG_SIZE}")
Model classes imported. NUM_CLASSES=81, IMG_SIZE=400
# ─── Load checkpoint ──────────────────────────────────────────────────────────

CKPT_PATH = 'checkpoints/faster_rcnn_demo.pth'
# Raise explicitly instead of using `assert` — asserts are stripped under
# `python -O`, which would turn a missing checkpoint into a confusing
# torch.load failure further down.
if not os.path.exists(CKPT_PATH):
    raise FileNotFoundError(
        f"Checkpoint not found at {CKPT_PATH}. Run notebook 05 first."
    )

model = FasterRCNN(num_classes=NUM_CLASSES).to(DEVICE)
# NOTE(review): torch.load unpickles arbitrary objects; the checkpoint is
# trusted here (we wrote it in notebook 05). Consider weights_only=True on
# torch >= 1.13 if the file could ever come from an untrusted source.
ckpt  = torch.load(CKPT_PATH, map_location=DEVICE)
model.load_state_dict(ckpt['model_state_dict'])
model.eval()  # inference mode: disables dropout / uses running BN stats

print(f"Checkpoint loaded: {CKPT_PATH}")
print(f"  Trained for {ckpt['steps_trained']} steps")
print(f"  Final losses: { {k: f'{v:.4f}' for k,v in ckpt['final_losses'].items()} }")
total = sum(p.numel() for p in model.parameters())
print(f"  Parameters: {total/1e6:.1f}M")
Checkpoint loaded: checkpoints/faster_rcnn_demo.pth
  Trained for 5 steps
  Final losses: {'rpn_cls': '0.6011', 'rpn_box': '0.1212', 'roi_cls': '1.6939', 'roi_box': '0.0611', 'total': '2.4773'}
  Parameters: 41.8M
# ─── Inference on 4 validation images ─────────────────────────────────────────

val_ds = load_dataset('detection-datasets/coco', split='val', streaming=True)

images_pil, results_list, latencies = [], [], []
NUM_IMAGES = 4

with torch.no_grad():
    for i, sample in enumerate(val_ds):
        if i >= NUM_IMAGES:
            break
        img_pil = sample['image'].convert('RGB')
        images_pil.append(img_pil)
        # Resize to the square training resolution and apply the same
        # ImageNet normalisation used during training.
        t = ((TF.to_tensor(img_pil.resize((IMG_SIZE, IMG_SIZE))) - IMAGENET_MEAN) / IMAGENET_STD)
        t = t.unsqueeze(0).to(DEVICE)

        # Synchronise BEFORE starting the timer: CUDA ops (including the
        # .to(DEVICE) copy above) are asynchronous, so without this the first
        # timed forward pass would also be billed for the pending H2D copy.
        if DEVICE.type == 'cuda':
            torch.cuda.synchronize()
        t0 = time.perf_counter()
        dets, _proposals = model(t)  # proposals unused here; recomputed for plots
        if DEVICE.type == 'cuda':
            torch.cuda.synchronize()  # wait for the kernel queue so timing is real
        latencies.append((time.perf_counter() - t0) * 1000)
        results_list.append(dets[0])

print(f"Mean latency: {sum(latencies)/len(latencies):.1f} ms  ({IMG_SIZE}x{IMG_SIZE} input)")
print("Detections per image:", [len(r['boxes']) for r in results_list])
print("(Detections are random — model trained only 5 steps)")
Mean latency: 262.9 ms  (400x400 input)
Detections per image: [0, 0, 0, 0]
(Detections are random — model trained only 5 steps)
# ─── Visualise proposals + detections ─────────────────────────────────────────
# Reuse the PIL images already fetched in the inference cell instead of
# re-streaming the whole COCO split over the network a second time — the
# images (and their order) are identical.

fig, axes = plt.subplots(2, NUM_IMAGES, figsize=(5*NUM_IMAGES, 10))
if NUM_IMAGES == 1:
    # plt.subplots returns a 1-D array when one dimension is 1; normalise
    # so axes[row][col] indexing below works for any NUM_IMAGES.
    axes = np.asarray(axes).reshape(2, 1)
TOP_K_PROPS = 30

with torch.no_grad():
    for col, img_pil in enumerate(images_pil):
        img_res = img_pil.resize((IMG_SIZE, IMG_SIZE))
        t = ((TF.to_tensor(img_res) - IMAGENET_MEAN) / IMAGENET_STD).unsqueeze(0).to(DEVICE)

        # Forward pass again purely to recover the RPN proposals — the first
        # loop kept only the final detections (results_list).
        _, proposals = model(t)
        props = proposals[0].cpu()[:TOP_K_PROPS]
        dets  = results_list[col]

        # Row 0: RPN proposals (class-agnostic candidate boxes)
        ax = axes[0][col]
        ax.imshow(img_res); ax.axis('off')
        ax.set_title(f'Image {col+1}: top-{TOP_K_PROPS} proposals', fontsize=9)
        for box in props.tolist():
            x1,y1,x2,y2=box
            ax.add_patch(patches.Rectangle((x1,y1),x2-x1,y2-y1,
                                            linewidth=1,edgecolor='cyan',facecolor='none'))

        # Row 1: final detections (after ROI classification + NMS)
        ax = axes[1][col]
        ax.imshow(img_res); ax.axis('off')
        n_det = len(dets['boxes'])
        ax.set_title(f'Image {col+1}: {n_det} detections', fontsize=9)
        for box,score,label in zip(dets['boxes'].tolist(),
                                   dets['scores'].tolist(),
                                   dets['labels'].tolist()):
            x1,y1,x2,y2=box
            # Guard against out-of-range labels from the barely-trained head.
            cls_name = COCO_NAMES[label] if label < len(COCO_NAMES) else str(label)
            ax.add_patch(patches.Rectangle((x1,y1),x2-x1,y2-y1,
                                            linewidth=1.5,edgecolor='red',facecolor='none'))
            ax.text(x1,y1-2,f'{cls_name} {score:.2f}',
                    color='white',fontsize=6,backgroundcolor='red')

plt.suptitle('Row 1: RPN proposals  |  Row 2: Final detections (5-step model)', y=1.01)
plt.tight_layout()
os.makedirs('images', exist_ok=True)
plt.savefig('images/inference_results.png', dpi=100, bbox_inches='tight')
plt.show()
Output from cell 5
# ─── Latency bar chart ─────────────────────────────────────────────────────────

fig, ax = plt.subplots(figsize=(7, 4))
# One bar per image, plus a dashed reference line at the mean latency.
mean_latency = sum(latencies) / len(latencies)
image_ids = range(1, NUM_IMAGES + 1)
ax.bar(image_ids, latencies, color='steelblue', edgecolor='white')
ax.axhline(mean_latency, color='red', linestyle='--', label='mean')
ax.set_xlabel('Image index')
ax.set_ylabel('Latency (ms)')
ax.set_title(f'Per-image inference latency on {str(DEVICE).upper()} ({IMG_SIZE}x{IMG_SIZE})')
ax.legend()
plt.tight_layout()
plt.savefig('images/latency.png', dpi=100, bbox_inches='tight')
plt.show()
print("\nSeries complete. Faster RCNN from scratch — all 6 notebooks executed.")
Output from cell 6

Series complete. Faster RCNN from scratch — all 6 notebooks executed.

Pseudo-evaluation: detection counts above threshold

This is not mAP evaluation. It counts detections that survive the score threshold (0.05) and NMS — a rough proxy for “did the model produce any confident predictions?” on a small sample. For real evaluation use the COCO evaluation API with a fully trained checkpoint.
# ─── Pseudo-evaluation: detection counts (NOT mAP) ────────────────────────────

SCORE_THRESHOLDS = [0.05, 0.1, 0.3, 0.5]
EVAL_IMAGES = NUM_IMAGES  # reuse the 4 images already run above

print(f"Detection counts across {EVAL_IMAGES} validation images")
print(f"{'Threshold':>12}  {'Total dets':>12}  {'Images with ≥1 det':>20}")
print("-" * 50)

for thr in SCORE_THRESHOLDS:
    # Per-image count of detections whose score clears this threshold.
    per_image = [(r['scores'] >= thr).sum().item() for r in results_list]
    total_dets = sum(per_image)
    images_with_dets = sum(1 for c in per_image if c > 0)
    print(f"{thr:>12.2f}  {total_dets:>12d}  {images_with_dets:>20d}/{EVAL_IMAGES}")

print()
print("Note: 0 detections at all thresholds is expected for a 5-step checkpoint.")
print("A converged model typically produces 10–50 detections per image on COCO val.")
Detection counts across 4 validation images
   Threshold    Total dets    Images with ≥1 det
--------------------------------------------------
        0.05             0                     0/4
        0.10             0                     0/4
        0.30             0                     0/4
        0.50             0                     0/4

Note: 0 detections at all thresholds is expected for a 5-step checkpoint.
A converged model typically produces 10–50 detections per image on COCO val.
Key references: (Redmon & Farhadi, 2016; Ren et al., 2015; Redmon et al., 2015; Szegedy et al., 2016; Faster R-CNN, 2022)

References

  • (2022). Faster R-CNN.
  • Redmon, J., Farhadi, A. (2016). YOLO9000: Better, Faster, Stronger.
  • Redmon, J., Divvala, S., Girshick, R., Farhadi, A. (2015). You only look once: Unified, real-time object detection.
  • Ren, S., He, K., Girshick, R., Sun, J. (2015). Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks.
  • Szegedy, C., Ioffe, S., Vanhoucke, V., Alemi, A. (2016). Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning.