Skip to main content
Open In ColabOpen Slides

Backbone: ResNet50 + Feature Pyramid Network

Notebook 2 of 6 in the Faster RCNN from-scratch series. We build the feature extractor from scratch: ResNet50 bottleneck blocks, followed by an FPN that produces P2–P6 feature maps at strides 4, 8, 16, 32, 64 (P6 is obtained from a stride-2 max-pool).
import sys, os, pathlib

# Locate frcnn_common.py — works whether run via papermill or interactively.
_nb_candidates = [
    pathlib.Path.cwd().parent,  # interactive: cwd is the notebook dir
    pathlib.Path.cwd() / 'notebooks' / 'scene-understanding' / 'object-detection' / 'faster-rcnn' / 'pytorch',  # papermill: cwd is repo root
]
for _p in _nb_candidates:
    if (_p / 'frcnn_common.py').exists():
        # Guard against duplicate sys.path entries when the cell is re-executed
        # (notebook cells often run more than once per session).
        if str(_p) not in sys.path:
            sys.path.insert(0, str(_p))
        break

import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np

from frcnn_common import (
    Bottleneck, ResNet50, FPN,
    IMG_SIZE, DEVICE,
)

print(f"Device: {DEVICE}")
/workspaces/eng-ai-agents/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
Device: cuda
# Bottleneck is imported from frcnn_common — build one instance and inspect it:
sample_block = Bottleneck(64, 64)
print(sample_block)
Bottleneck(
  (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
  (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
# ResNet50 is imported from frcnn_common — inspect its structure:
backbone = ResNet50()
# Plain string: the original used an f-string with no placeholders (ruff F541).
print("ResNet50 stages: stem, layer1-4")
print(f"Total params: {sum(p.numel() for p in backbone.parameters())/1e6:.1f}M")
ResNet50 stages: stem, layer1-4
Total params: 23.5M
# Trace a dummy batch through the backbone to confirm the C2–C5 resolutions.
backbone = ResNet50()
dummy = torch.randn(1, 3, IMG_SIZE, IMG_SIZE)
with torch.no_grad():
    feats = backbone(dummy)
c2, c3, c4, c5 = feats
print(f"C2: {c2.shape}  C3: {c3.shape}  C4: {c4.shape}  C5: {c5.shape}")
C2: torch.Size([1, 256, 100, 100])  C3: torch.Size([1, 512, 50, 50])  C4: torch.Size([1, 1024, 25, 25])  C5: torch.Size([1, 2048, 13, 13])
# FPN is imported from frcnn_common — print the module tree to inspect it:
fpn = FPN()
print(repr(fpn))
FPN(
  (lateral): ModuleList(
    (0): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (2): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (3): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
  )
  (output): ModuleList(
    (0-3): 4 x Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  )
  (p6): MaxPool2d(kernel_size=1, stride=2, padding=0, dilation=1, ceil_mode=False)
)
# Push the backbone features through the FPN and report every pyramid level.
fpn = FPN()
with torch.no_grad():
    pyramid = fpn((c2, c3, c4, c5))
p2, p3, p4, p5, p6 = pyramid
for name, feat in zip(('P2', 'P3', 'P4', 'P5', 'P6'), pyramid):
    print(f"{name}: {feat.shape}")
P2: torch.Size([1, 256, 100, 100])
P3: torch.Size([1, 256, 50, 50])
P4: torch.Size([1, 256, 25, 25])
P5: torch.Size([1, 256, 13, 13])
P6: torch.Size([1, 256, 7, 7])
# Load one real image and pass through backbone + FPN
# (reuse COCODataset from notebook 01 or load a single image)
# img should be a normalized tensor of shape [1, 3, H, W]; we use noise as a placeholder
img = torch.randn(1, 3, 800, 800)  # replace with real image tensor once dataset wiring is in place

# Inference only: no_grad avoids building an autograd graph that would keep
# every intermediate activation of the backbone alive in memory.
with torch.no_grad():
    # Run the backbone to obtain C2–C5 feature maps at multiple strides
    c2, c3, c4, c5 = backbone(img)
    # Feed the multi-scale backbone outputs through the FPN to get pyramid levels P2–P6
    p2, p3, p4, p5, p6 = fpn((c2, c3, c4, c5))

# Visualize the mean activation of each pyramid level to sanity-check spatial resolution
fig, axes = plt.subplots(1, 5, figsize=(20, 4))
# Iterate over each level, squeeze to spatial dimensions, and plot the channel-wise mean heatmap
for ax, (name, feat) in zip(axes, [('P2', p2), ('P3', p3), ('P4', p4), ('P5', p5), ('P6', p6)]):
    fmap = feat[0].mean(dim=0).cpu().numpy()  # mean over channels to a single 2D map (no_grad ⇒ no detach needed)
    ax.imshow(fmap, cmap='viridis')
    ax.set_title(f"{name}\n{feat.shape[-2]}x{feat.shape[-1]}")
    ax.axis('off')
# Persist the visualization for later inspection and display inline
plt.suptitle("FPN Feature Maps (mean activation)", fontsize=13)
os.makedirs("images", exist_ok=True)  # savefig does not create missing directories
plt.savefig("images/fpn_features.png", dpi=100, bbox_inches='tight')
plt.show()
Output from cell 7
# Per-level activation histograms: sanity-check that P2–P5 value ranges look sane.
fig, axes = plt.subplots(1, 4, figsize=(16, 3))
for ax, (name, feat) in zip(axes, [('P2', p2), ('P3', p3), ('P4', p4), ('P5', p5)]):
    vals = feat[0].detach().cpu().numpy().flatten()
    ax.hist(vals, bins=50, color='steelblue', alpha=0.7)
    ax.set_title(f"{name} activations")
    ax.set_xlabel("Value")
plt.tight_layout()
os.makedirs("images", exist_ok=True)  # savefig does not create missing directories
plt.savefig("images/activation_histograms.png", dpi=100, bbox_inches='tight')
plt.show()
Output from cell 8
# Sum parameter counts across both modules in a single pass.
total = sum(p.numel() for module in (backbone, fpn) for p in module.parameters())
print(f"Total backbone+FPN parameters: {total:,}")
# Expected: ~25M backbone + ~1.5M FPN ~= 26.5M
Total backbone+FPN parameters: 26,852,416
Key references: (Zagoruyko & Komodakis, 2016; Lin et al., 2016; Tan & Le, 2019; Wightman et al., 2021)

References

  • Lin, T., Dollár, P., Girshick, R., He, K., Hariharan, B., et al. (2016). Feature Pyramid Networks for Object Detection.
  • Tan, M., Le, Q. (2019). EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks.
  • Wightman, R., Touvron, H., Jégou, H. (2021). ResNet strikes back: An improved training procedure in timm.
  • Zagoruyko, S., Komodakis, N. (2016). Wide Residual Networks.