COCO Data Pipeline for Faster RCNN
Notebook 1 of 6 in the Faster RCNN from-scratch series. Dataset: COCO 2017 streamed from Hugging Face — no local download required. Copy
# Install dependencies (run once in container)
!pip install datasets --quiet
Copy
/usr/bin/sh: 1: pip: not found
Copy
import sys, os, pathlib
# Locate frcnn_common.py regardless of how the notebook is launched:
# interactively the cwd is the notebook directory (so look one level up),
# under papermill the cwd is the repo root (so look in the notebook subtree).
_nb_candidates = [
    pathlib.Path.cwd().parent,  # interactive: cwd is the notebook dir
    pathlib.Path.cwd() / 'notebooks' / 'scene-understanding' / 'object-detection' / 'faster-rcnn' / 'pytorch',  # papermill: cwd is repo root
]
_found = next((_p for _p in _nb_candidates if (_p / 'frcnn_common.py').exists()), None)
if _found is not None:
    # Prepend so this copy of frcnn_common shadows any other on sys.path.
    sys.path.insert(0, str(_found))
import random
from collections import Counter
from typing import List, Tuple
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import torch
from torch.utils.data import DataLoader
from frcnn_common import (
IMG_SIZE, NUM_CLASSES, DEVICE,
IMAGENET_MEAN, IMAGENET_STD,
COCO_NAMES,
COCOStreamDataset, frcnn_collate_fn,
box_iou,
)
# Aliases used in visualization cells below.
# NOTE(review): IMAGENET_MEAN/STD look like broadcastable normalization tensors
# (e.g. shape (1, 3, 1, 1)); squeeze() should leave a (3,) per-channel vector for
# denormalizing HWC images — confirm the actual shapes in frcnn_common.
MEAN = IMAGENET_MEAN.squeeze().numpy()
STD = IMAGENET_STD.squeeze().numpy()
# Sanity printout of the shared constants imported from frcnn_common.
print(f"Device: {DEVICE}")
print(f"IMG_SIZE={IMG_SIZE}, NUM_CLASSES={NUM_CLASSES}")
print(f"COCO categories: {len(COCO_NAMES)} (including background)")
Copy
/workspaces/eng-ai-agents/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
Copy
Device: cuda
IMG_SIZE=400, NUM_CLASSES=81
COCO categories: 81 (including background)
Dataset
We stream COCO 2017 directly from detection-datasets/coco on the Hugging Face Hub — no local download or annotation files required. The HF dataset provides bounding boxes in COCO format `[x, y, w, h]` (pixels, top-left corner) with 0-indexed category labels. Faster RCNN requires:
- Boxes in `[x1, y1, x2, y2]` pixel coordinates, scaled to the resized image
- 1-indexed labels (0 = background, 1–80 = COCO categories)
- ImageNet-normalized image tensors of shape `(3, IMG_SIZE, IMG_SIZE)` — here `(3, 400, 400)`, matching the `IMG_SIZE=400` printed above
Copy
# COCOStreamDataset is imported from frcnn_common — inspect its signature:
# (the help output below documents the split name and optional sample cap)
help(COCOStreamDataset.__init__)
Copy
Help on function __init__ in module frcnn_common:
__init__(self, split: str = 'train', max_samples: Optional[int] = None)
Initialize the streaming COCO dataset reader and optional sample cap for quick experiments.
Copy
# frcnn_collate_fn is imported from frcnn_common — inspect its signature:
# (it stacks images into one tensor while keeping per-image target dicts in a list)
help(frcnn_collate_fn)
Copy
Help on function frcnn_collate_fn in module frcnn_common:
frcnn_collate_fn(batch)
Stack images but keep target dicts in a Python list for variable lengths.
Copy
class AnchorTargetGenerator:
    """Assign GT labels and regression targets to pre-computed anchors.

    For each image in a batch:
    - Positive anchor: IoU with any GT >= 0.7 (or highest-IoU anchor per GT)
    - Negative anchor: IoU with all GTs < 0.3
    - Neutral anchor: everything else (ignored during loss)
    Samples 256 anchors per image at 1:1 pos/neg ratio.
    """

    def __init__(self, pos_iou: float = 0.7, neg_iou: float = 0.3,
                 total_samples: int = 256, pos_fraction: float = 0.5):
        # IoU thresholds that carve anchors into positive / negative regions.
        self.pos_iou = pos_iou
        self.neg_iou = neg_iou
        # Per-image mini-batch size and the number of positives requested from it.
        self.total_samples = total_samples
        self.n_pos = int(total_samples * pos_fraction)

    def compute_iou(self, anchors: torch.Tensor, gt_boxes: torch.Tensor) -> torch.Tensor:
        """Compute IoU matrix of shape (N_anchors, N_gt).

        Both inputs are [x1, y1, x2, y2] boxes; broadcasting via [:, None] /
        [None, :] produces all anchor-GT pairs at once.
        """
        ax1, ay1, ax2, ay2 = anchors.unbind(1)
        gx1, gy1, gx2, gy2 = gt_boxes.unbind(1)
        inter_x1 = torch.max(ax1[:, None], gx1[None, :])
        inter_y1 = torch.max(ay1[:, None], gy1[None, :])
        inter_x2 = torch.min(ax2[:, None], gx2[None, :])
        inter_y2 = torch.min(ay2[:, None], gy2[None, :])
        # clamp(min=0) zeroes the "intersection" of disjoint boxes.
        inter_w = (inter_x2 - inter_x1).clamp(min=0)
        inter_h = (inter_y2 - inter_y1).clamp(min=0)
        inter = inter_w * inter_h
        area_a = (ax2 - ax1) * (ay2 - ay1)
        area_g = (gx2 - gx1) * (gy2 - gy1)
        union = area_a[:, None] + area_g[None, :] - inter
        # Guard against division by zero for degenerate (zero-area) boxes.
        return inter / union.clamp(min=1e-6)

    def __call__(self, anchors: torch.Tensor, gt_boxes: torch.Tensor
                 ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Returns:
            labels: (N_anchors,) — 1=positive, 0=negative, -1=neutral/ignore
            matched_gt: (N_anchors, 4) — GT box matched to each anchor
        """
        # FIX: allocate every bookkeeping tensor on the anchors' device.
        # The previous version created them on CPU, which raised device-mismatch
        # errors when anchors (and hence the IoU-derived masks) lived on CUDA.
        device = anchors.device
        # Start with every anchor marked as "ignore" and no regression targets.
        N = len(anchors)
        labels = torch.full((N,), -1, dtype=torch.int64, device=device)
        matched_gt = torch.zeros((N, 4), dtype=torch.float32, device=device)
        # If an image has zero GT boxes, randomly sample negatives and bail out early.
        if len(gt_boxes) == 0:
            neg_idx = torch.randperm(N, device=device)[:self.total_samples]
            labels[neg_idx] = 0
            return labels, matched_gt
        # Pairwise IoU over anchors × GT boxes drives both classification and regression.
        iou = self.compute_iou(anchors, gt_boxes)  # (N, M)
        # Track best GT per anchor and best anchor per GT (to enforce at least one positive).
        max_iou_per_anchor, best_gt_idx = iou.max(dim=1)
        _, best_anchor_per_gt = iou.max(dim=0)
        # Thresholds carve up positive / negative regions; anything in between stays neutral.
        labels[max_iou_per_anchor >= self.pos_iou] = 1
        labels[max_iou_per_anchor < self.neg_iou] = 0
        labels[best_anchor_per_gt] = 1  # force-positive best anchors
        pos_idx = torch.where(labels == 1)[0]
        neg_idx = torch.where(labels == 0)[0]
        # Subsample to the requested 256 anchors with the desired pos/neg ratio.
        n_pos = min(len(pos_idx), self.n_pos)
        n_neg = min(len(neg_idx), self.total_samples - n_pos)
        pos_idx = pos_idx[torch.randperm(len(pos_idx), device=device)[:n_pos]]
        neg_idx = neg_idx[torch.randperm(len(neg_idx), device=device)[:n_neg]]
        # Everything not selected for the mini-batch goes back to "ignore" (label = -1).
        keep = torch.zeros(N, dtype=torch.bool, device=device)
        keep[pos_idx] = True
        keep[neg_idx] = True
        labels[~keep] = -1
        # Matched GT coordinates provide the regression targets for every anchor.
        matched_gt = gt_boxes[best_gt_idx]
        return labels, matched_gt
Copy
# Stream a small batch for inspection (no local data needed).
# max_samples=64 caps the stream so this cell stays quick; num_workers=0 keeps
# everything in-process, which is the safe choice for a streaming HF dataset.
dataset = COCOStreamDataset(split='train', max_samples=64)
# frcnn_collate_fn keeps per-image target dicts in a list because each image
# has a variable number of boxes (see its help output above).
loader = DataLoader(dataset, batch_size=2, collate_fn=frcnn_collate_fn,
num_workers=0)
# Pull one batch eagerly; `imgs` and `targets` are reused by the cells below.
imgs, targets = next(iter(loader))
print(f"Image batch : {imgs.shape}")
print(f"First target: boxes={targets[0]['boxes'].shape}, labels={targets[0]['labels'].shape}")
Copy
Image batch : torch.Size([2, 3, 400, 400])
First target: boxes=torch.Size([8, 4]), labels=torch.Size([8])
Copy
# Inspection: visualize 2 images with GT boxes
# Index -> name lookup; index 0 is background, 1+ are the COCO categories.
cat_names = {i: name for i, name in enumerate(COCO_NAMES)}
fig, axes = plt.subplots(1, 2, figsize=(18, 9))
for idx, ax in enumerate(axes):
    # Undo ImageNet normalization and clip so the image renders correctly.
    pixels = np.clip(imgs[idx].permute(1, 2, 0).numpy() * STD + MEAN, 0, 1)
    ax.imshow(pixels)
    boxes, lbls = targets[idx]['boxes'], targets[idx]['labels']
    for box, lbl in zip(boxes, lbls):
        x1, y1, x2, y2 = box.tolist()
        # Rectangle takes (top-left, width, height); boxes are [x1, y1, x2, y2].
        ax.add_patch(patches.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                       linewidth=2, edgecolor='lime', facecolor='none'))
        ax.text(x1, y1 - 4, cat_names.get(lbl.item(), '?'),
                fontsize=8, color='lime', weight='bold')
    ax.set_title(f"Image {idx}: {len(boxes)} objects")
    ax.axis('off')
plt.tight_layout()
plt.show()

Copy
# Inspection: anchor label distribution on first image in batch
atg = AnchorTargetGenerator()
# 200 random boxes: top-left corner anywhere in the image, width/height up to 200 px.
dummy_anchors = torch.rand(200, 4) * IMG_SIZE
dummy_anchors[:, 2:] = dummy_anchors[:, :2] + torch.rand(200, 2) * 200
anchor_labels, _ = atg(dummy_anchors, targets[0]['boxes'])
# IMPORTANT: this histogram reflects raw labels before the 256-anchor subsample.
# Positives are naturally rare because most randomly placed anchors miss GT boxes.
# The RPN training loop samples ~50/50 pos/neg; to see that distribution instead,
# count after the subsampling "keep" mask inside AnchorTargetGenerator.
pos, neg, neu = ((anchor_labels == v).sum().item() for v in (1, 0, -1))
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(['positive', 'negative', 'neutral'], [pos, neg, neu],
       color=['green', 'red', 'gray'])
ax.set_title('Anchor sampling (200 dummy anchors, sample image)')
ax.set_ylabel('Count')
plt.tight_layout()
plt.savefig('images/anchor_stats.png', dpi=100, bbox_inches='tight')
plt.show()
print(f"Pos: {pos}, Neg: {neg}, Neutral: {neu}")

Copy
Pos: 8, Neg: 187, Neutral: 5
Copy
# Inspection: class distribution across 200 streamed samples
sample_ds = COCOStreamDataset(split='train', max_samples=200)
# Fold every annotation label into one frequency table incrementally —
# avoids materializing a throwaway list of all labels first.
counter = Counter()
for _, t in sample_ds:
    counter.update(t['labels'].tolist())
# Idiom: Counter.most_common(20) replaces the manual sort-by-negative-count;
# it is documented as equivalent to sorted(..., reverse=True)[:20], including
# stable ordering among ties.
top20 = counter.most_common(20)
names = [cat_names.get(k, str(k)) for k, _ in top20]
counts = [v for _, v in top20]
fig, ax = plt.subplots(figsize=(14, 4))
ax.bar(names, counts)
ax.set_title('Top-20 categories by annotation count (200 COCO train samples)')
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.savefig('images/class_distribution.png', dpi=100, bbox_inches='tight')
plt.show()

References
- Chen, L., Papandreou, G., Schroff, F., Adam, H. (2017). Rethinking Atrous Convolution for Semantic Image Segmentation.
- (2018). Create your own COCO-style datasets. waspinator.
- Ren, S., He, K., Girshick, R., Sun, J. (2015). Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks.
- Szegedy, C., Ioffe, S., Vanhoucke, V., Alemi, A. (2016). Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning.
- Zimmermann, R., Siems, J. (2018). Faster Training of Mask R-CNN by Focusing on Instance Boundaries.

