Skip to main content
Open In Colab

COCO Data Pipeline for Faster RCNN

Notebook 1 of 6 in the Faster RCNN from-scratch series. Dataset: COCO 2017, streamed from Hugging Face — no local download required.
# Install dependencies (run once in container)
!pip install datasets --quiet
/usr/bin/sh: 1: pip: not found
import sys, os, pathlib

# Locate frcnn_common.py — works whether run via papermill or interactively.
_nb_candidates = [
    pathlib.Path.cwd().parent,  # interactive: cwd is the notebook dir
    pathlib.Path.cwd() / 'notebooks' / 'scene-understanding' / 'object-detection' / 'faster-rcnn' / 'pytorch',  # papermill: cwd is repo root
]
# First candidate directory that actually contains the module wins.
_found = next((_c for _c in _nb_candidates if (_c / 'frcnn_common.py').exists()), None)
if _found is not None:
    sys.path.insert(0, str(_found))

import random
from collections import Counter
from typing import List, Tuple

import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches

import torch
from torch.utils.data import DataLoader

from frcnn_common import (
    IMG_SIZE, NUM_CLASSES, DEVICE,
    IMAGENET_MEAN, IMAGENET_STD,
    COCO_NAMES,
    COCOStreamDataset, frcnn_collate_fn,
    box_iou,
)

# Aliases used in visualization cells below: (3,) numpy arrays for denormalizing.
MEAN, STD = (_t.squeeze().numpy() for _t in (IMAGENET_MEAN, IMAGENET_STD))

# Echo the shared configuration so the executed notebook records it.
print(f"Device: {DEVICE}")
print(f"IMG_SIZE={IMG_SIZE}, NUM_CLASSES={NUM_CLASSES}")
print(f"COCO categories: {len(COCO_NAMES)} (including background)")
/workspaces/eng-ai-agents/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
Device: cuda
IMG_SIZE=400, NUM_CLASSES=81
COCO categories: 81 (including background)

Dataset

We stream COCO 2017 directly from detection-datasets/coco on the Hugging Face Hub — no local download or annotation files required. The HF dataset provides bounding boxes in COCO format [x, y, w, h] (pixels, top-left corner) with 0-indexed category labels. Faster RCNN requires:
  • Boxes in [x1, y1, x2, y2] pixel coordinates, scaled to the resized image
  • 1-indexed labels (0 = background, 1–80 = COCO categories)
  • ImageNet-normalized image tensors of shape (3, 400, 400) — i.e. (3, IMG_SIZE, IMG_SIZE)
# COCOStreamDataset is imported from frcnn_common — inspect its signature:
help(COCOStreamDataset.__init__)
Help on function __init__ in module frcnn_common:

__init__(self, split: str = 'train', max_samples: Optional[int] = None)
    Initialize the streaming COCO dataset reader and optional sample cap for quick experiments.
# frcnn_collate_fn is imported from frcnn_common — inspect its signature:
help(frcnn_collate_fn)
Help on function frcnn_collate_fn in module frcnn_common:

frcnn_collate_fn(batch)
    Stack images but keep target dicts in a Python list for variable lengths.
class AnchorTargetGenerator:
    """Assign GT labels and regression targets to pre-computed anchors.

    For each image in a batch:
    - Positive anchor: IoU with any GT >= 0.7 (or highest-IoU anchor per GT)
    - Negative anchor: IoU with all GTs < 0.3
    - Neutral anchor: everything else (ignored during loss)

    Samples 256 anchors per image at 1:1 pos/neg ratio.
    """

    def __init__(self, pos_iou: float = 0.7, neg_iou: float = 0.3,
                 total_samples: int = 256, pos_fraction: float = 0.5):
        """
        Args:
            pos_iou: IoU threshold at/above which an anchor is positive.
            neg_iou: IoU threshold below which an anchor is negative.
            total_samples: number of anchors kept per image for the loss.
            pos_fraction: desired fraction of positives in the sample.
        """
        self.pos_iou = pos_iou
        self.neg_iou = neg_iou
        self.total_samples = total_samples
        self.n_pos = int(total_samples * pos_fraction)

    def compute_iou(self, anchors: torch.Tensor, gt_boxes: torch.Tensor) -> torch.Tensor:
        """Compute IoU matrix: (N_anchors, N_gt). Boxes are [x1, y1, x2, y2]."""
        ax1, ay1, ax2, ay2 = anchors.unbind(1)
        gx1, gy1, gx2, gy2 = gt_boxes.unbind(1)

        # Intersection rectangle via broadcasting: (N, 1) against (1, M).
        inter_x1 = torch.max(ax1[:, None], gx1[None, :])
        inter_y1 = torch.max(ay1[:, None], gy1[None, :])
        inter_x2 = torch.min(ax2[:, None], gx2[None, :])
        inter_y2 = torch.min(ay2[:, None], gy2[None, :])

        # clamp(min=0) zeroes the area of non-overlapping pairs.
        inter_w = (inter_x2 - inter_x1).clamp(min=0)
        inter_h = (inter_y2 - inter_y1).clamp(min=0)
        inter = inter_w * inter_h

        area_a = (ax2 - ax1) * (ay2 - ay1)
        area_g = (gx2 - gx1) * (gy2 - gy1)
        union = area_a[:, None] + area_g[None, :] - inter

        # clamp guards against division by zero for degenerate (zero-area) boxes.
        return inter / union.clamp(min=1e-6)

    def __call__(self, anchors: torch.Tensor, gt_boxes: torch.Tensor
                 ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            anchors: (N_anchors, 4) boxes in [x1, y1, x2, y2].
            gt_boxes: (N_gt, 4) ground-truth boxes in [x1, y1, x2, y2].

        Returns:
            labels: (N_anchors,) — 1=positive, 0=negative, -1=neutral/ignore
            matched_gt: (N_anchors, 4) — GT box matched to each anchor
        """
        # FIX: allocate on the anchors' device — previously these were always
        # CPU tensors, which crashes with a device mismatch when anchors/GT
        # boxes live on CUDA (this notebook runs with Device: cuda).
        device = anchors.device

        # Start with every anchor marked as "ignore" and no regression targets.
        N = len(anchors)
        labels = torch.full((N,), -1, dtype=torch.int64, device=device)
        matched_gt = torch.zeros((N, 4), dtype=torch.float32, device=device)

        # If an image has zero GT boxes, randomly sample negatives and bail out early.
        if len(gt_boxes) == 0:
            neg_idx = torch.randperm(N, device=device)[:self.total_samples]
            labels[neg_idx] = 0
            return labels, matched_gt

        # Pairwise IoU over anchors × GT boxes drives both classification and regression.
        iou = self.compute_iou(anchors, gt_boxes)  # (N, M)

        # Track best GT per anchor and best anchor per GT (to enforce at least one positive).
        max_iou_per_anchor, best_gt_idx = iou.max(dim=1)
        _, best_anchor_per_gt = iou.max(dim=0)

        # Thresholds carve up positive / negative regions; anything in between stays neutral.
        labels[max_iou_per_anchor >= self.pos_iou] = 1
        labels[max_iou_per_anchor < self.neg_iou] = 0
        labels[best_anchor_per_gt] = 1  # force-positive best anchors

        pos_idx = torch.where(labels == 1)[0]
        neg_idx = torch.where(labels == 0)[0]

        # Subsample to the requested 256 anchors with the desired pos/neg ratio.
        n_pos = min(len(pos_idx), self.n_pos)
        n_neg = min(len(neg_idx), self.total_samples - n_pos)

        pos_idx = pos_idx[torch.randperm(len(pos_idx), device=device)[:n_pos]]
        neg_idx = neg_idx[torch.randperm(len(neg_idx), device=device)[:n_neg]]

        # Everything not selected for the mini-batch goes back to "ignore" (label = -1).
        keep = torch.zeros(N, dtype=torch.bool, device=device)
        keep[pos_idx] = True
        keep[neg_idx] = True
        labels[~keep] = -1

        # Matched GT coordinates provide the regression targets for every anchor.
        matched_gt = gt_boxes[best_gt_idx]
        return labels, matched_gt
# Stream a small batch for inspection (no local data needed)
dataset = COCOStreamDataset(split='train', max_samples=64)
loader = DataLoader(
    dataset,
    batch_size=2,
    num_workers=0,
    collate_fn=frcnn_collate_fn,
)

# Pull one batch to confirm tensor shapes and the per-image target dicts.
imgs, targets = next(iter(loader))
print(f"Image batch : {imgs.shape}")
print(f"First target: boxes={targets[0]['boxes'].shape}, labels={targets[0]['labels'].shape}")
Image batch : torch.Size([2, 3, 400, 400])
First target: boxes=torch.Size([8, 4]), labels=torch.Size([8])
# Inspection: visualize 2 images with GT boxes
cat_names = {i: name for i, name in enumerate(COCO_NAMES)}  # 0=background, 1+=categories

fig, axes = plt.subplots(1, 2, figsize=(18, 9))
for idx, ax in enumerate(axes):
    # Undo ImageNet normalization so colors render naturally.
    frame = imgs[idx].permute(1, 2, 0).numpy() * STD + MEAN
    frame = np.clip(frame, 0, 1)
    ax.imshow(frame)
    gt_boxes = targets[idx]['boxes']
    gt_labels = targets[idx]['labels']
    for box, lbl in zip(gt_boxes, gt_labels):
        x1, y1, x2, y2 = box.tolist()
        ax.add_patch(patches.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                       linewidth=2, edgecolor='lime', facecolor='none'))
        ax.text(x1, y1 - 4, cat_names.get(lbl.item(), '?'),
                fontsize=8, color='lime', weight='bold')
    ax.set_title(f"Image {idx}: {len(gt_boxes)} objects")
    ax.axis('off')
plt.tight_layout()
plt.show()
Output from cell 7
# Inspection: anchor label distribution on first image in batch
atg = AnchorTargetGenerator()
# Random top-left corners plus random extents up to 200 px give dummy anchors.
dummy_anchors = torch.rand(200, 4) * IMG_SIZE
dummy_anchors[:, 2:] = dummy_anchors[:, :2] + torch.rand(200, 2) * 200
anchor_labels, _ = atg(dummy_anchors, targets[0]['boxes'])

# IMPORTANT: This histogram reflects raw labels before the 256-anchor subsample.
# Positives are naturally rare because most randomly placed anchors miss GT boxes.
# In the RPN training loop we sample ~50/50 pos/neg; to visualize that distribution,
# move the counting logic inside AnchorTargetGenerator after the subsampling "keep" mask.
label_counts = {value: (anchor_labels == value).sum().item() for value in (1, 0, -1)}
pos, neg, neu = label_counts[1], label_counts[0], label_counts[-1]

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(['positive', 'negative', 'neutral'], [pos, neg, neu],
       color=['green', 'red', 'gray'])
ax.set_title('Anchor sampling (200 dummy anchors, sample image)')
ax.set_ylabel('Count')
plt.tight_layout()
plt.savefig('images/anchor_stats.png', dpi=100, bbox_inches='tight')
plt.show()
print(f"Pos: {pos}, Neg: {neg}, Neutral: {neu}")
Output from cell 8
Pos: 8, Neg: 187, Neutral: 5
# Inspection: class distribution across 200 streamed samples
sample_ds = COCOStreamDataset(split='train', max_samples=200)
# Flatten every image's label tensor into one list of category ids.
all_labels = [lbl for _, t in sample_ds for lbl in t['labels'].tolist()]

counter = Counter(all_labels)
# Stable sort on descending count keeps insertion order among ties.
top20 = sorted(counter.items(), key=lambda kv: kv[1], reverse=True)[:20]
names = [cat_names.get(cid, str(cid)) for cid, _ in top20]
counts = [cnt for _, cnt in top20]

fig, ax = plt.subplots(figsize=(14, 4))
ax.bar(names, counts)
ax.set_title('Top-20 categories by annotation count (200 COCO train samples)')
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.savefig('images/class_distribution.png', dpi=100, bbox_inches='tight')
plt.show()
Output from cell 9
Key references: (waspinator, 2018; Szegedy et al., 2016; Zimmermann & Siems, 2018; Ren et al., 2015; Chen et al., 2017)

References

  • Chen, L., Papandreou, G., Schroff, F., Adam, H. (2017). Rethinking Atrous Convolution for Semantic Image Segmentation.
  • waspinator (2018). Create your own COCO-style datasets.
  • Ren, S., He, K., Girshick, R., Sun, J. (2015). Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks.
  • Szegedy, C., Ioffe, S., Vanhoucke, V., Alemi, A. (2016). Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning.
  • Zimmermann, R., Siems, J. (2018). Faster Training of Mask R-CNN by Focusing on Instance Boundaries.