초보 개발자의 이야기, 릿허브

3D Point Cloud Processing 익히기

릿99 — Thu, 18 Dec 2025 22:47:27 +0900

1. 주어진 point cloud 분류 데이터 (각 샘플은 (N, 3) 포인트, 라벨은 K 클래스)로 다음을 구현하라.

전처리: 중심화 (centering) + 스케일 정규화 (scale normalization) + 고정 포인트 수 N으로 샘플링 (random sampling)
Custom Dataset 및 DataLoader
kNN 기반 local feature (edge feature) 생성
간단한 DGCNN 스타일 분류 모델 구현
loss (cross entropy) 및 학습/검증 루프 구현 후, validation accuracy 출력

# Colab cell 1: imports, seed
import math
import random
from dataclasses import dataclass
from typing import Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

def set_seed(seed: int = 42) -> None:
    """Seed Python's `random` module and PyTorch (CPU generator and all CUDA devices).

    Args:
        seed: Value handed to every random number generator.
    """
    # Same order as always: stdlib RNG, torch CPU RNG, then every CUDA device.
    for seeder in (random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

# Seed the RNGs once up front so later random draws are repeatable.
set_seed(42)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device  # bare expression: a notebook cell echoes the selected device

# Colab cell 2: point cloud preprocessing utilities

def center_and_scale(points: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    """
    Move a point cloud to the origin and shrink it to (roughly) the unit ball.

    points: (N, 3) for one cloud, or (B, N, 3) for a batch
    eps: lower bound on the divisor, so a cloud of identical points does not divide by zero
    - centering: subtract the mean point
    - scaling: divide by the largest distance from the origin (computed per sample)
    Raises ValueError for any other rank.
    """
    rank = points.dim()
    if rank not in (2, 3):
        raise ValueError("points must have shape (N,3) or (B,N,3)")

    if rank == 2:
        shifted = points - points.mean(dim=0, keepdim=True)            # (N, 3)
        radius = shifted.norm(dim=-1).max().clamp(min=eps)             # scalar
        return shifted / radius

    shifted = points - points.mean(dim=1, keepdim=True)                # (B, N, 3)
    radius = shifted.norm(dim=-1).max(dim=1, keepdim=True).values      # (B, 1)
    return shifted / radius.unsqueeze(-1).clamp(min=eps)               # divisor is (B, 1, 1)

def random_sample(points: torch.Tensor, n_points: int) -> torch.Tensor:
    """
    Bring a cloud to exactly n_points rows.

    points: (N, 3)
    n_points: target number of points
    - N == n_points: the input tensor itself is returned
    - N > n_points: a random subset, drawn without replacement
    - N < n_points: all original points followed by randomly repeated ones
    """
    n_have = points.size(0)
    shortfall = n_points - n_have

    if shortfall == 0:
        return points

    if shortfall < 0:
        # too many points: keep the first n_points of a random permutation
        keep = torch.randperm(n_have)[:n_points]
        return points[keep]

    # too few points: append duplicates picked uniformly at random
    repeats = torch.randint(low=0, high=n_have, size=(shortfall,))
    return torch.cat([points, points[repeats]], dim=0)

@torch.no_grad()
def pairwise_sq_dist(x: torch.Tensor) -> torch.Tensor:
    """
    Squared Euclidean distance between every pair of points of each batch item.

    x: (B, N, C)
    return: (B, N, N), clamped at 0 so rounding error never yields a negative value
    - materialises a dense N x N matrix per batch item, so memory grows quadratically in N
    - decorated with no_grad: the result carries no autograd history
    """
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * <a, b>
    sq_norm = torch.mul(x, x).sum(dim=-1, keepdim=True)     # (B, N, 1)
    gram = torch.matmul(x, x.transpose(1, 2))               # (B, N, N)
    return (sq_norm + sq_norm.transpose(1, 2) - 2.0 * gram).clamp(min=0.0)

def knn_indices(x: torch.Tensor, k: int) -> torch.Tensor:
    """
    Indices of the k nearest neighbours of every point, never the point itself.

    x: (B, N, C), floating point
    k: neighbours per point, must be smaller than N
    return: idx (B, N, k), ordered from nearest to farthest

    Raises RuntimeError when k >= N (the same exception type torch.topk raised
    for that case before).

    The point itself is excluded by masking the diagonal of the distance matrix
    rather than by dropping the first top-k hit: with duplicated points or
    rounding in the distance computation, "self" is not guaranteed to be the
    first hit, and dropping by position could discard a real neighbour while
    keeping self.
    """
    dist = pairwise_sq_dist(x)  # (B, N, N)
    n_pts = dist.size(-1)
    if k >= n_pts:
        raise RuntimeError(
            f"k={k} neighbours requested but each cloud has only {n_pts} points"
        )

    # dist is a freshly computed tensor, so masking it in place touches nothing else.
    dist.diagonal(dim1=-2, dim2=-1).fill_(float("inf"))
    return dist.topk(k=k, dim=-1, largest=False).indices  # (B, N, k)

# Colab cell 3: edge feature (DGCNN style)

def batched_index_select(feat: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
    """
    Look up neighbour features for every point of every batch item.

    feat: (B, N, C)
    idx: (B, N, k) integer indices into the N axis of feat
    return: gathered (B, N, k, C) with gathered[b, n, j] == feat[b, idx[b, n, j]]

    Uses advanced indexing with a broadcast batch index. The earlier approach
    gathered from a (B, N, N, C) expanded view of feat, which makes autograd
    allocate a dense (B, N, N, C) gradient buffer during backward.
    """
    n_batch = feat.size(0)
    # (B, 1, 1) batch index broadcasts against idx (B, N, k).
    batch_idx = torch.arange(n_batch, device=feat.device).view(n_batch, 1, 1)
    return feat[batch_idx, idx]

def edge_feature(x: torch.Tensor, k: int) -> torch.Tensor:
    """
    Build DGCNN-style edge features from the k nearest neighbours of each point.

    x: (B, N, C)
    return: (B, N, k, 2C), last axis laid out as [x_center, x_neighbor - x_center]
    """
    neighbor_idx = knn_indices(x, k=k)                        # (B, N, k)
    neighbors = batched_index_select(x, neighbor_idx)         # (B, N, k, C)
    anchors = x[:, :, None, :].expand_as(neighbors)           # (B, N, k, C)
    offsets = neighbors - anchors                             # (B, N, k, C)
    return torch.cat((anchors, offsets), dim=-1)              # (B, N, k, 2C)

# Colab cell 4: synthetic dataset (classification)

def make_synthetic_cloud(label: int, n_raw: int = 1024) -> torch.Tensor:
    """
    Draw one random toy cloud of n_raw points whose shape depends on label.

    label 0: noisy unit-sphere surface
    label 1: flat sheet (z close to 0)
    any other label: noisy cylinder of radius ~1 around the z axis
    return: (n_raw, 3) float32
    - stand-in for clouds read from files; only meant to exercise the pipeline
    """
    if label == 0:
        # random directions on the sphere, radius jittered around 1
        direction = torch.randn(n_raw, 3)
        direction = direction / (direction.norm(dim=-1, keepdim=True) + 1e-6)
        radius = 1.0 + 0.05 * torch.randn(n_raw, 1)
        return (direction * radius).float()

    if label == 1:
        # spread out in x/y, almost no thickness in z
        in_plane = torch.randn(n_raw, 2)
        height = 0.02 * torch.randn(n_raw, 1)
        return torch.cat([in_plane, height], dim=-1).float()

    # cylinder: unit ring in x/y, gaussian z, then a little noise on every axis
    angle = 2 * math.pi * torch.rand(n_raw, 1)
    ring_x, ring_y = torch.cos(angle), torch.sin(angle)
    height = 0.5 * torch.randn(n_raw, 1)
    cloud = torch.cat([ring_x, ring_y, height], dim=-1)
    cloud = cloud + 0.02 * torch.randn_like(cloud)
    return cloud.float()

class PointCloudClsDataset(Dataset):
    """
    In-memory synthetic classification set.

    Labels and raw clouds are generated once in the constructor. Normalisation
    and resampling to n_points run on every item access, so the same index may
    return a different random subset of points each time.
    """

    def __init__(self, n_samples: int, n_points: int, n_classes: int = 3):
        self.n_samples, self.n_points, self.n_classes = n_samples, n_points, n_classes

        # Draw labels first, then one raw cloud per label
        # (with real data: load the files and keep them in a list).
        self.labels = torch.randint(low=0, high=n_classes, size=(n_samples,))
        self.clouds = []
        for label in self.labels:
            self.clouds.append(make_synthetic_cloud(int(label), n_raw=1024))

    def __len__(self) -> int:
        return self.n_samples

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return (points (n_points, 3), label ()) for sample idx."""
        normalised = center_and_scale(self.clouds[idx])         # (N_raw, 3)
        fixed = random_sample(normalised, self.n_points)        # (n_points, 3)
        return fixed, self.labels[idx].long()

def cls_collate(batch):
    """Stack a list of (points (N, 3), label ()) samples into (B, N, 3) and (B,)."""
    clouds, labels = [], []
    for sample in batch:
        clouds.append(sample[0])
        labels.append(sample[1])
    return torch.stack(clouds, dim=0), torch.stack(labels, dim=0)

# Colab cell 5: DGCNN-style classifier (minimal)

class EdgeConvBlock(nn.Module):
    """Shared per-edge MLP: 1x1 Conv2d (no bias) -> BatchNorm2d -> ReLU."""

    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()
        layers = [
            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        ]
        self.mlp = nn.Sequential(*layers)

    def forward(self, edge: torch.Tensor) -> torch.Tensor:
        """
        edge: (B, N, k, Cin), channels last
        return: (B, Cout, N, k)
        """
        # Conv2d expects channels on axis 1: (B, N, k, Cin) -> (B, Cin, N, k)
        channels_first = edge.movedim(-1, 1).contiguous()
        return self.mlp(channels_first)

class DGCNNClassifier(nn.Module):
    """
    Minimal DGCNN-style classifier.

    Three EdgeConv stages, each followed by a max over the k neighbours. Every
    stage's per-point features are max-pooled over all points, the three pooled
    vectors are concatenated, and an MLP head turns them into class logits.
    """

    def __init__(self, k: int = 16, num_classes: int = 3, emb_dim: int = 128):
        super().__init__()
        self.k = k

        # Stage 1 consumes xyz edges (2 * 3 = 6 channels); stages 2 and 3 consume
        # edges built on 64-channel features (2 * 64 = 128 channels).
        self.ec1 = EdgeConvBlock(in_channels=6, out_channels=64)
        self.ec2 = EdgeConvBlock(in_channels=128, out_channels=64)
        self.ec3 = EdgeConvBlock(in_channels=128, out_channels=emb_dim)

        head_in = 64 + 64 + emb_dim
        self.cls_head = nn.Sequential(
            nn.Linear(head_in, 256, bias=False),
            nn.BatchNorm1d(256),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.2),
            nn.Linear(256, num_classes),
        )

    def forward(self, pts: torch.Tensor) -> torch.Tensor:
        """
        pts: (B, N, 3)
        return logits: (B, num_classes)
        """
        _batch, _n_pts, _xyz = pts.shape  # unpacking fails unless pts has exactly three axes

        per_point = None   # (B, Cout, N) output of the previous stage
        pooled = []        # one (B, Cout) global descriptor per stage
        for stage in (self.ec1, self.ec2, self.ec3):
            if per_point is None:
                graph_input = pts                                        # kNN in xyz space
            else:
                graph_input = per_point.permute(0, 2, 1).contiguous()    # kNN in feature space, (B, N, Cout)

            edges = edge_feature(graph_input, k=self.k)                  # (B, N, k, 2C)
            per_point = stage(edges).max(dim=-1).values                  # max over neighbours -> (B, Cout, N)
            pooled.append(per_point.max(dim=-1).values)                  # max over points -> (B, Cout)

        return self.cls_head(torch.cat(pooled, dim=-1))                  # (B, num_classes)

# Colab cell 6: train / eval loop

@dataclass
class TrainConfig:
    """Hyperparameters for the DGCNN-style classification run."""
    n_points: int = 512    # points per cloud after fixed-size sampling
    k: int = 16            # neighbours per point passed to the classifier
    num_classes: int = 3   # number of target classes
    batch_size: int = 16   # samples per DataLoader batch
    lr: float = 1e-3       # learning rate given to the optimizer
    epochs: int = 8        # number of train + validation passes

def accuracy(logits: torch.Tensor, y: torch.Tensor) -> float:
    """Fraction of entries whose arg-max over the last axis of logits equals y, as a Python float."""
    hits = torch.eq(logits.argmax(dim=-1), y)
    return hits.float().mean().item()

def train_one_epoch(model, loader, optim):
    """
    Run one optimisation pass over loader and report sample-weighted metrics.

    model: maps points (B, N, 3) to logits (B, num_classes)
    loader: iterable of (points, labels (B,)) batches
    optim: optimizer over the model's parameters
    return: (mean loss per sample, accuracy over all samples)

    - Batches are moved to the device of the model's parameters, so the function
      does not rely on a module-level device variable.
    - Loss and hits are accumulated per sample. Averaging per-batch means would
      over-weight a smaller final batch.
    - Returns (nan, nan) when the loader yields no samples.
    """
    model.train()
    first_param = next(model.parameters(), None)
    target = None if first_param is None else first_param.device

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for pts, y in loader:
        if target is not None:
            pts, y = pts.to(target), y.to(target)

        logits = model(pts)
        loss = F.cross_entropy(logits, y)  # mean over the batch

        optim.zero_grad()
        loss.backward()
        optim.step()

        n_batch = y.size(0)
        loss_sum += loss.item() * n_batch  # undo the batch mean before summing
        n_correct += (logits.detach().argmax(dim=-1) == y).sum().item()
        n_seen += n_batch

    if n_seen == 0:
        return float("nan"), float("nan")
    return loss_sum / n_seen, n_correct / n_seen

@torch.no_grad()
def eval_one_epoch(model, loader):
    """
    Evaluate model on loader without gradients and report sample-weighted metrics.

    model: maps points (B, N, 3) to logits (B, num_classes)
    loader: iterable of (points, labels (B,)) batches
    return: (mean loss per sample, accuracy over all samples)

    - Batches are moved to the device of the model's parameters, so the function
      does not rely on a module-level device variable.
    - Loss and hits are accumulated per sample. Averaging per-batch means would
      bias the result whenever the last batch is smaller than the others.
    - Returns (nan, nan) when the loader yields no samples.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    target = None if first_param is None else first_param.device

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for pts, y in loader:
        if target is not None:
            pts, y = pts.to(target), y.to(target)

        logits = model(pts)
        loss = F.cross_entropy(logits, y)  # mean over the batch

        n_batch = y.size(0)
        loss_sum += loss.item() * n_batch  # undo the batch mean before summing
        n_correct += (logits.argmax(dim=-1) == y).sum().item()
        n_seen += n_batch

    if n_seen == 0:
        return float("nan"), float("nan")
    return loss_sum / n_seen, n_correct / n_seen

# Classification experiment: all hyperparameters come from TrainConfig's defaults.
cfg = TrainConfig()

# Two independently generated synthetic sets: 400 training and 120 validation clouds.
train_ds = PointCloudClsDataset(n_samples=400, n_points=cfg.n_points, n_classes=cfg.num_classes)
val_ds   = PointCloudClsDataset(n_samples=120, n_points=cfg.n_points, n_classes=cfg.num_classes)

# Only the training batches are shuffled; num_workers=0 loads data in the main process.
train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, collate_fn=cls_collate, num_workers=0)
val_loader   = DataLoader(val_ds, batch_size=cfg.batch_size, shuffle=False, collate_fn=cls_collate, num_workers=0)

# Model on the selected device, optimised with Adam.
model = DGCNNClassifier(k=cfg.k, num_classes=cfg.num_classes).to(device)
optim = torch.optim.Adam(model.parameters(), lr=cfg.lr)

# One training pass and one validation pass per epoch, with both sets of metrics printed.
for epoch in range(1, cfg.epochs + 1):
    tr_loss, tr_acc = train_one_epoch(model, train_loader, optim)
    va_loss, va_acc = eval_one_epoch(model, val_loader)
    print(f"epoch {epoch:02d} | train loss {tr_loss:.4f} acc {tr_acc:.3f} | val loss {va_loss:.4f} acc {va_acc:.3f}")

샘플마다 포인트 수가 다를 때 dataloader에서 터지는 문제 (fixed N sampling으로 해결)
knn 이후 gather shape 정리 (B, N, k, C) 만들기
edge feature 차원 변환 (Conv2d 입력은 (B, C, N, k))
segmentation이면 CE 입력 shape (B, C, N) 또는 flatten (B·N, C)로 바꿔야 함

2. 주어진 point cloud segmentation 데이터 (각 샘플 (N, 3), 각 포인트 라벨 (N,) )에 대해 다음을 구현하라.

전처리 및 고정 포인트 수 처리
Custom Dataset 및 DataLoader
PointNet 스타일 segmentation 모델 구현 (global feature를 per-point feature에 concat)
per-point cross entropy loss 구현 (shape 정리 포함)
학습 후 validation per-point accuracy 출력

# Colab cell 7: synthetic segmentation dataset

def make_synthetic_seg_cloud(n_raw: int = 1024) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Toy segmentation sample: a noisy unit sphere split into two hemispheres.

    return: (points (n_raw, 3) float32, labels (n_raw,) int64)
    - label 1 where z > 0, label 0 otherwise
    """
    direction = torch.randn(n_raw, 3)
    unit = direction / (direction.norm(dim=-1, keepdim=True) + 1e-6)
    radius = 1.0 + 0.02 * torch.randn(n_raw, 1)
    cloud = (unit * radius).float()

    upper_half = cloud[:, 2] > 0          # (n_raw,) bool
    return cloud, upper_half.long()

class PointCloudSegDataset(Dataset):
    """
    In-memory synthetic segmentation set.

    Each stored sample is a (points, per-point labels) pair generated once in
    the constructor. Every item access normalises the cloud and picks n_points
    indices, applying the same indices to points and labels so they stay aligned.
    """

    def __init__(self, n_samples: int, n_points: int):
        self.n_samples = n_samples
        self.n_points = n_points
        self.data = []
        for _ in range(n_samples):
            self.data.append(make_synthetic_seg_cloud(1024))

    def __len__(self) -> int:
        return self.n_samples

    def __getitem__(self, idx: int):
        """Return (points (n_points, 3), labels (n_points,)) for sample idx."""
        cloud, point_labels = self.data[idx]       # (N_raw, 3), (N_raw,)
        cloud = center_and_scale(cloud)

        n_have = cloud.size(0)
        if n_have < self.n_points:
            # too few points: keep all of them and repeat random ones to fill up
            filler = torch.randint(low=0, high=n_have, size=(self.n_points - n_have,))
            chosen = torch.cat([torch.arange(n_have), filler], dim=0)
        else:
            # enough points: random subset without replacement
            chosen = torch.randperm(n_have)[:self.n_points]

        # one index tensor for both, so point i still carries label i
        return cloud[chosen], point_labels[chosen]

def seg_collate(batch):
    """Stack a list of (points (N, 3), labels (N,)) samples into (B, N, 3) and (B, N)."""
    clouds, label_rows = [], []
    for sample in batch:
        clouds.append(sample[0])
        label_rows.append(sample[1])
    return torch.stack(clouds, dim=0), torch.stack(label_rows, dim=0)

# Colab cell 8: PointNet segmentation model

class PointNetSeg(nn.Module):
    """
    PointNet-style per-point classifier.

    A shared MLP produces a 128-d feature per point; a further shared layer plus
    a max over all points gives a 256-d global descriptor; the descriptor is
    copied to every point, concatenated with the local feature (384-d) and
    classified per point.
    """

    @staticmethod
    def _conv_bn_relu(c_in: int, c_out: int) -> list:
        """1x1 Conv1d (no bias) + BatchNorm1d + ReLU, as a list ready for nn.Sequential."""
        return [
            nn.Conv1d(c_in, c_out, 1, bias=False),
            nn.BatchNorm1d(c_out),
            nn.ReLU(inplace=True),
        ]

    def __init__(self, num_classes: int = 2):
        super().__init__()
        block = self._conv_bn_relu

        # per-point feature extractor (shared MLP): 3 -> 64 -> 128
        self.mlp1 = nn.Sequential(*block(3, 64), *block(64, 128))
        # lifts local features to 256 channels before global pooling
        self.mlp_global = nn.Sequential(*block(128, 256))
        # segmentation head on concat(local 128, global 256) = 384 channels
        self.seg_head = nn.Sequential(
            *block(384, 256),
            nn.Dropout(p=0.2),
            nn.Conv1d(256, num_classes, 1),
        )

    def forward(self, pts: torch.Tensor) -> torch.Tensor:
        """
        pts: (B, N, 3)
        return logits: (B, N, num_classes)
        """
        channels_first = pts.transpose(1, 2).contiguous()            # (B, 3, N)
        per_point = self.mlp1(channels_first)                         # (B, 128, N)

        # global descriptor: max over the point axis, then repeated for every point
        pooled = self.mlp_global(per_point).max(dim=-1, keepdim=True).values   # (B, 256, 1)
        tiled = pooled.expand(-1, -1, per_point.size(-1))             # (B, 256, N)

        scores = self.seg_head(torch.cat([per_point, tiled], dim=1))  # (B, C, N)
        return scores.transpose(1, 2).contiguous()                    # (B, N, C)

# Colab cell 9: segmentation train / eval

def per_point_accuracy(logits: torch.Tensor, y: torch.Tensor) -> float:
    """Share of points whose arg-max class matches the label; logits (B, N, C), y (B, N)."""
    hits = torch.eq(logits.argmax(dim=-1), y)
    return hits.float().mean().item()

def train_one_epoch_seg(model, loader, optim):
    """
    Run one optimisation pass of a segmentation model and report point-weighted metrics.

    model: maps points (B, N, 3) to logits (B, N, C)
    loader: iterable of (points, labels (B, N)) batches
    optim: optimizer over the model's parameters
    return: (mean loss per point, accuracy over all points)

    - Batches are moved to the device of the model's parameters, so the function
      does not rely on a module-level device variable.
    - Logits and labels are flattened to (B*N, C) and (B*N,) for cross entropy;
      reshape is used so non-contiguous model outputs work as well.
    - Loss and hits are accumulated per point. Averaging per-batch means would
      over-weight a smaller final batch.
    - Returns (nan, nan) when the loader yields no points.
    """
    model.train()
    first_param = next(model.parameters(), None)
    target = None if first_param is None else first_param.device

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for pts, y in loader:
        if target is not None:
            pts, y = pts.to(target), y.to(target)

        logits = model(pts)                                     # (B, N, C)
        flat_logits = logits.reshape(-1, logits.size(-1))       # (B*N, C)
        flat_y = y.reshape(-1)                                  # (B*N,)
        loss = F.cross_entropy(flat_logits, flat_y)             # mean over the batch's points

        optim.zero_grad()
        loss.backward()
        optim.step()

        n_pts = flat_y.numel()
        loss_sum += loss.item() * n_pts                         # undo the mean before summing
        n_correct += (flat_logits.detach().argmax(dim=-1) == flat_y).sum().item()
        n_seen += n_pts

    if n_seen == 0:
        return float("nan"), float("nan")
    return loss_sum / n_seen, n_correct / n_seen

@torch.no_grad()
def eval_one_epoch_seg(model, loader):
    """
    Evaluate a segmentation model without gradients and report point-weighted metrics.

    model: maps points (B, N, 3) to logits (B, N, C)
    loader: iterable of (points, labels (B, N)) batches
    return: (mean loss per point, accuracy over all points)

    - Batches are moved to the device of the model's parameters, so the function
      does not rely on a module-level device variable.
    - Logits and labels are flattened to (B*N, C) and (B*N,) for cross entropy;
      reshape is used so non-contiguous model outputs work as well.
    - Loss and hits are accumulated per point. Averaging per-batch means would
      bias the result whenever the last batch is smaller than the others.
    - Returns (nan, nan) when the loader yields no points.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    target = None if first_param is None else first_param.device

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for pts, y in loader:
        if target is not None:
            pts, y = pts.to(target), y.to(target)

        logits = model(pts)                                     # (B, N, C)
        flat_logits = logits.reshape(-1, logits.size(-1))       # (B*N, C)
        flat_y = y.reshape(-1)                                  # (B*N,)
        loss = F.cross_entropy(flat_logits, flat_y)             # mean over the batch's points

        n_pts = flat_y.numel()
        loss_sum += loss.item() * n_pts                         # undo the mean before summing
        n_correct += (flat_logits.argmax(dim=-1) == flat_y).sum().item()
        n_seen += n_pts

    if n_seen == 0:
        return float("nan"), float("nan")
    return loss_sum / n_seen, n_correct / n_seen

# Segmentation experiment: 300 training and 100 validation clouds, 512 points each.
# These assignments rebind the same module-level names used by the classification run.
train_ds = PointCloudSegDataset(n_samples=300, n_points=512)
val_ds   = PointCloudSegDataset(n_samples=100, n_points=512)

# Only the training batches are shuffled; num_workers=0 loads data in the main process.
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True, collate_fn=seg_collate, num_workers=0)
val_loader   = DataLoader(val_ds, batch_size=16, shuffle=False, collate_fn=seg_collate, num_workers=0)

# Two-class per-point model on the selected device, optimised with Adam.
model = PointNetSeg(num_classes=2).to(device)
optim = torch.optim.Adam(model.parameters(), lr=1e-3)

# Six epochs: one training pass and one validation pass each, metrics printed per epoch.
for epoch in range(1, 7):
    tr_loss, tr_acc = train_one_epoch_seg(model, train_loader, optim)
    va_loss, va_acc = eval_one_epoch_seg(model, val_loader)
    print(f"epoch {epoch:02d} | train loss {tr_loss:.4f} acc {tr_acc:.3f} | val loss {va_loss:.4f} acc {va_acc:.3f}")

[Python] 백준 2563번 색종이

릿99 — Tue, 3 Sep 2024 21:38:01 +0900

1. 문제이해

https://www.acmicpc.net/problem/2563

가로, 세로의 크기가 각각 100인 정사각형 모양의 도화지가 존재한다. 이 도화지 위에 가로, 세로의 크기가 각각 10인 정사각형 모양의 검은색 색종이를 색종이의 변과 도화지의 변이 평행하도록 붙인다. 이러한 방식으로 색종이를 한 장 또는 여러 장 붙인 후 색종이가 붙은 검은 영역의 넓이를 구하는 것이 목표이다.

2. 문제 풀이

실버 5의 단순 구현 문제이다.

문제를 처음 접근하면서, 단순히 색종이를 하나씩 붙인다는 생각으로 접근하면 매우 복잡한 문제가 되어버린다.

하나씩 종이를 붙여가면서 겹치는 영역을 빼줄 생각을 하면 어렵다는 거다.

몇개의 색종이를 붙일지도, 몇개의 색종이가 겹칠지도, 겹치는 영역이 몇개일지도 정해진 것이 없다.

예를 들어, 10개의 종이를 붙인다 했을때, 10개의 겹치는 영역이 생긴다고 하면 다 빼주고 9번 더해줄것인가?

그렇게 되면 문제가 너무 복잡해진다는거다.

정해진 것, 주어진 것 위주로 생각하자.

우리에게 주어진 것은 전체 종이가 가로, 세로의 크기가 각각 100인 정사각형 모양의 도화지이며,

가로, 세로의 크기가 각각 10인 정사각형 모양의 검은색 색종이를 계속해서 붙여간다는거다.

그렇다면 반대로 생각해보자. 붙이는게 아니고 빼주는거다.

전체 100x100 크기의 도화지에서, 10x10 크기의 종이들에 해당하는 영역을 오려내준다고 생각하면 문제가 편해진다.

한번 오려낸 영역에 대해서는 다시 겹치는 것을 생각해 줄 필요가 없다.

이를 코드로 구현해보면, 전체 100x100 크기의 도화지를 100x100 크기의 행렬로 두고,

N개의 색종이의 각 시작 좌표 (x, y)에 대해 10x10 크기의 종이에 해당하는 영역의 값을 바꿔서(오려냈다고 표시해서) 기록해주면 된다.

자세한 코드는 아래와 같다.

3. 소스코드

# 100 x 100 canvas; a cell turns True once at least one sheet covers it
paper = [[False] * 100 for _ in range(100)]

N = int(input())  # number of coloured sheets
for _ in range(N):
    x, y = map(int, input().split())
    # mark every cell of the 10 x 10 square that starts at (x, y)
    for row in range(x, x + 10):
        for col in range(y, y + 10):
            paper[row][col] = True  # overlaps simply re-mark the same cell

# covered area = number of cells marked at least once
count_region = sum(line.count(True) for line in paper)

print(count_region)

[CV] Bilateral Filter (양방향 필터)

릿99 — Wed, 6 Dec 2023 19:16:47 +0900

Bilateral Filter

1. Introduction

우리는 일상생활에서 수많은 이미지를 접하게됩니다.

이러한 이미지들은 모두 '픽셀(pixel)'이라는 색상 정보를 포함하는 요소로 구성되어있습니다.

각 픽셀은 RGB 값으로 색상을 나타내며 각 색상의 강도(intensity)를 나타냅니다.

이 강도는 일반적으로 0~255 범위의 단일 바이트로 개별적으로 표시됩니다.

Example of color image patch

오늘 소개할 Bilateral filter(양방향 필터)는 image transformation technique의 일종으로 그 중에서도 filtering의 일종입니다.

Filtering이란 이미지의 각 픽셀 값을 변경하는 것을 의미합니다.

이 "intensity" 값을 변경하는 것은 이미지의 "range"를 변경하는 것으로 표현되기도 합니다.

아래 그림은 image filtering의 예시로, 다양한 이미지 필터를 보여줍니다.

임의의 이미지 'F' 에 임의의 filter 'h'를 적용한 결과, 'G'와 같이 filtering된 이미지가 출력됩니다.

보시다시피 이미지의 높이나 너비에는 변화가 없으나, pixel intensity, range가 변경된 것을 볼 수 있습니다.

Example of the image filtering

다음은 다양한 이미지 필터와 시각화 결과입니다.

(a)의 경우 중앙에 1, 주변은 '0'으로 이루어진 filter로, 이를 이미지에 적용하면 중앙 부분이 강조된 이미지가 출력됩니다.

이러한 filter를 Edge detection filter라고 하며 이는 이미지의 중요한 정보 중 하나인 edge, 가장자리를 감지하는 데 사용됩니다.

다음 (b)와 (c)는 Gaussian filter의 일종으로, 이미지를 부드럽게 하거나 노이즈를 줄이는 데 사용됩니다.

이미지의 중앙 부분은 유지되고 주변 부분은 흐려집니다.

Example of filter kernels

오늘 소개할 Bilateral filter는 앞서 언급했듯 위와 같은 이미지 filtering의 일종입니다.

Bilateral filtering은 아래 그림과 같이 가장자리를 유지하면서 이미지를 매끄럽게(smoothing) 만드는 기술입니다.

이는 영상의 spatial한 부분과 intensity(range) 부분을 모두 고려하기 때문인데요,

제안하는 방법에 대해서는 아래에 더 자세하게 기술하도록 하겠습니다.

Application example of bilateral filter

2. Related work

Bilateral filter를 이해하기 위해서는 먼저 Gaussian filter(가우시안 필터)에 대한 이해가 필요합니다.

Gaussian filter는 아래와 같이 Gaussian kernel을 이용하여 영상을 처리하는 필터입니다.

Image smoothing, noise reduction, blurring 등에 사용됩니다.

아래 그림은 Gaussian filter를 적용한 결과로,
왼쪽의 입력 영상에 Gaussian filter를 적용하면 오른쪽 그림과 같이 전체적으로 흐릿한 영상이 나타납니다.

Application example of Gaussian filter

Gaussian filter는 아래 식과 같이 표현됩니다.

Gaussian filter: $GC[I]_p = \sum_{q \in \mathcal{S}} G_{\sigma}(\lVert p - q \rVert)\, I_q$

이해를 위해 식을 단순화하여, 입력 이미지가 Grayscale 이미지라고 가정하겠습니다.

다음 방정식에서 I_p는 픽셀 위치 p의 이미지 값입니다.

||p − q|| 는 픽셀 위치 p와 q 사이의 Euclidean distance이고 G_sigma는 이미지에 적용된 Gaussian kernel입니다.

Gaussian kernel은 아래와 같은 형태로 표현됩니다: $G_{\sigma}(x) = \frac{1}{2\pi\sigma^{2}} \exp\left(-\frac{x^{2}}{2\sigma^{2}}\right)$

Gaussian distribution and result according to sigma value

앞서 설명했듯, Gaussian filtering은 중심 위치 p까지의 spatial(공간적) distance에 따라 weight가 변화합니다.

즉, Gaussian filtering은 인접한 위치의 intensity에 대한 weighted average입니다.
픽셀 q의 가중치는 Gaussian G_σ로 정의됩니다.

σ는 neighbor size를 정의하는 값으로, 픽셀의 값(intensity)이 아닌 픽셀 사이의 spatial distance에만 의존합니다.

예를 들어, 밝은 픽셀은 인접한 어두운 픽셀에 큰 영향을 미칩니다.

비록 이 두 픽셀 값이 다르더라도 말입니다.

즉, smoothing degree는 spatial degree(sigma)에 의해 조정됩니다.

결과적으로 불연속적인 픽셀의 평균이 함께 계산되므로 이미지 가장자리가 흐려지게(blurred)됩니다.
위의 그림과 같이 왼쪽으로 갈수록, 시그마 값이 클수록 중심보다 이웃 값을 더 많이 고려하게됩니다.

그래서 시그마의 값이 커질수록 원본에 비해 부드러워진 것을 알 수 있습니다.

아래 그림은 앞서 설명한 시그마 값에 따른 smoothing 변화의 또 다른 예입니다.

그림의 윗 행은 Gaussian kernel을 보여주고 아래 행은 해당 Gaussian kernel의 filtering으로 얻은 결과입니다.

σ 값이 높으면 훨씬 더 넓은 영역에 걸쳐 평균화가 수행되기 때문에 edge 정보가 손실됩니다.

Gaussian distribution and result according to sigma value

원본 이미지에서 noise를 최대한 제거하고 edge 정보를 보존하는 것은 주요한 challenge 중 하나입니다.

일반적으로 noise는 대부분 high frequency(고주파) 성분으로 구성됩니다.

궁극적으로 우리의 목표는 이러한 high frequency 성분을 제거하는 것입니다.

이러한 고주파 성분을 제거하기 위해 주로 low-pass filter가 사용되었는데,

그 대표적인 예가 앞서 말씀드린 Gaussian filter입니다.

그러나 앞선 결과를 통해 확인했듯, Gaussian filter를 사용하면 이미지 전체가 흐려지게됩니다.

즉, noise는 제거되나 edge 정보 또한 손실되게 됩니다.

이러한 이유는 Gaussian filter가 픽셀의 값(intensity)이 아닌 픽셀 사이의 spatial distance만 고려하기 때문입니다.

따라서, 이미지 edge 부근의 픽셀 값을 평탄화시키는 단점이 있습니다.

이러한 문제점은 Edge-aware filter를 사용하여 해결할 수 있습니다.

Application example of Gaussian filter

3. Bilateral filter

Bilateral filter는 Edge-aware filter의 대표적인 예 중 하나입니다.

Bilateral filter는 아래와 같이 이미지 패치의 특성을 고려하여 커널의 모양을 달리해 필터링하는 방법입니다.

Application example of bilateral filter

첫 번째 패치를 보면 Edge 요소가 없어 Gaussian 커널을 적용할 수 있습니다.

문제는 2, 3번째 패치입니다.

이러한 패치의 경우 Gaussian kernel의 개념을 취하되, 각 픽셀의 intensity value를 고려하여 커널을 구성합니다.

두 번째 패치에서는 상대적으로 왼쪽의 픽셀값이 더 밝고 오른쪽의 픽셀값이 어둡기때문에, 다음과 같은 커널을 사용하게 됩니다.

Bilateral filter는 대략 다음과 같은 계산을 통해 출력 이미지를 생성합니다.

Bilateral filter는 neighbor 픽셀의 weighted average로 정의되는데, 이는 Gaussian convolution과 매우 유사합니다.

두 방식간의 가장 큰 차이점이자 Bilateral filter의 가장 큰 특징은, edge를 보존하기 위해 intensity 차이를 고려한다는 것입니다.

Bilateral filter는 위와 같이 정의됩니다. 전체적인 구조는 이전 Gaussian filter와 매우 유사합니다.

Bilateral filter는 다음과 같이 크게 3가지 종류로 구성됩니다.

Normalization factor W_p, Spatial weighting G_sigma_s, Range weighting G_sigma_r

기존 Gaussian filter에서 normalization factor와 range weighting이 추가된 형태입니다.

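Definition of bilateral filter: $BF[I]_p = \frac{1}{W_p} \sum_{q \in \mathcal{S}} G_{\sigma_s}(\lVert p - q \rVert)\, G_{\sigma_r}(\lvert I_p - I_q \rvert)\, I_q$, where $W_p = \sum_{q \in \mathcal{S}} G_{\sigma_s}(\lVert p - q \rVert)\, G_{\sigma_r}(\lvert I_p - I_q \rvert)$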

이제 두 필터의 정의가 어떻게 다른지, 각 구성요소가 어떤 역할을 하는지 살펴보겠습니다.

1. Spatial weighting
Gaussian filter에서는 동일한 spatial weighting function이 모든 픽셀에 곱해집니다.

이 방정식은 convolution 연산의 정의를 따릅니다.

Bilateral filter 또한 spatial weighting function이 수행됩니다.

이 기능은 대상 픽셀의 근처(spatial한 관점에서)에서 큰 비율로 발생합니다.

Gaussian filter의 목적은 값을 주변 픽셀과 유사하도록 blending하는 것입니다.

따라서 spatial weighting function은 대상 픽셀에 가까울수록 더 큰 값을 곱해주게 됩니다.

2. Range weighting

Bilateral filter의 intensity range weighting은 매우 중요한 개념인 동시에, 가우시안 필터와의 가장 큰 차이점이기도 합니다.

Range weighting function의 역할은 대상 픽셀과 intensity가 유사한 픽셀에 더 큰 가중치를 할당하는 것입니다.

이 함수는 Gaussian의 spatial weighting function과 유사해 보이지만 상당히 다릅니다.

Spatial weighting function은 픽셀 간의 거리를 기반으로 하고, range weighting function은 intensity 차이를 기반으로 합니다.

따라서 비슷한 intensity 값을 가진 픽셀에 더 큰 가중치가 부여됩니다.

3. Normalization factor

Bilateral filter와 Gaussian filter의 또 다른 차이점은 normalization factor입니다.

이는 출력 이미지를 정규화하는 역할을 합니다.

Gaussian filter는 대부분의 spatial weight의 합이 1이기 때문에 이러한 정규화 과정이 필요하지 않습니다.
반면에 Bilateral filter에는 두 가지 가중치가 있어, 그 합이 더 이상 1이 되지 않기 때문에 해당 인자가 필요합니다.

Definition and comparison of Gaussian filter and bilateral filter

앞서 설명한 Gaussian과 Bilateral filter를 시각화한 결과입니다.

먼저 Gaussian의 경우, 아래 그림의 오른편의 noisy한 계단 형태의 입력 I_q가 들어오면
가운데 Gaussian filter를 사용하여 맨 왼쪽 형태와 같은 결과가 도출됩니다.

출력을 보면 입력에 비해 가장자리가 기울어지고 detail이 사라져, 전체적으로 부드러워지는 것을 볼 수 있습니다.

Visualization of Gaussian filter

반면 Bilateral filter의 경우, Gaussian filter와 달리 각 픽셀의 intensity 값인 range weight도 고려합니다.

따라서 그림을 보면 동일한 입력이 들어왔을 때, 각 픽셀 값에 적합한 spatial, range weight를 고려하여 필터가 생성되며,

edge 부분의 디테일은 살아있되, 전체적으로 매끄러워진 결과를 볼 수 있습니다.

Visualization of bilateral filter

최종적으로, Gaussian filter와 Bilateral filter의 차이에 대해 다시 한번 정리해봅시다.

Gaussian은 spatial distance만을 고려하여 전체 영상을 매끄럽게 만드는 기술로, 결과적으로 아래와 같이 전체적으로 흐릿한 영상이 됩니다.
반면, Bilateral은 spatial, range(intensity) distance를 모두 고려하여 픽셀의 공간과 강도에 따라 스무딩을 수행합니다.

따라서 Gaussian filter를 적용했을 때보다 모서리가 더 잘 보존되는 것을 볼 수 있습니다.

Comparison of result of gaussian filter and bilateral filter

4. Experiment result

Bilateral filter는 두 개의 매개변수 σs 및 σr를 갖습니다.

σs는 spatial weight parameter, σr은 range weight parameter입니다.

이러한 Bilateral filter의 spatial, range parameter를 통해 Gaussian에 비해 더 다양한 컨트롤이 가능합니다.

아래 그림은 필자가 구현한 Bilateral filter 코드의 결과입니다.

사용된 이미지 크기는 512x512이며, 각 이미지당 약 1분 정도 소요되었습니다.

(built-in function을 사용하지 않고 구현했기에, 시간이 다소 소요되었습니다)

σs 값과 σr 값을 변경하여 출력을 확인한 결과, 각 값이 0에 가까워질수록 원래 값에 가까워지는 것을 확인할 수 있습니다.

그리고 각 값이 증가할수록 흐려지는 정도가 증가합니다.

Images according to changes in sigma value

Gaussian, Bilateral filter를 적용 및 비교한 결과는 다음과 같습니다.

σr 에는 10을, σs에는 50을 사용했습니다.

앞서 언급했듯, 동일한 입력 및 parameter 값에 대해 Gaussian은 전체적으로 흐릿한 이미지를 생성하는 반면

Bilateral은 상대적으로 edge를 보존하면서 노이즈가 제거된 이미지를 생성합니다.

Comparison of result of gaussian filter and bilateral filter

5. Conclusion

정리하자면, Bilateral filter는 spatial weight와 range weight를 사용하여
edge와 디테일을 유지하면서 이미지를 효과적으로 매끄럽게 만드는 기술입니다.
대표적인 low-pass filter인 Gaussian filter는 spatial distance에만 초점을 맞추고 전체적으로 흐릿해집니다.

반면, Bilateral filter 의 경우 spatial, range distance를 모두 고려하여
노이즈를 효과적으로 제거하면서도 edge가 선명한 영상을 얻을 수 있습니다.

[논문리뷰] NeRF (Representing Scenes as Neural Radiance Fields for View Synthesis)

릿99 — Mon, 19 Jun 2023 18:02:05 +0900

NeRF

https://www.matthewtancik.com/nerf

NeRF: Neural Radiance Fields

A method for synthesizing novel views of complex scenes by optimizing an underlying continuous volumetric scene function using a sparse set of input views.

www.matthewtancik.com

1. Introduction

NeRF, Representing Scenes as Neural Radiance Fields for View Synthesis는

3D 장면을 재구성하는 새로운 방법 중 하나로 떠오른 View Synthesis 모델입니다.

사실, 대부분 사람들이 NeRF는 2D 이미지를 3D로 바꾸는 모델로 알고 있지만, 엄밀히 이야기하자면 다릅니다.

NeRF는 객체를 여러 각도에서 촬영한 여러 이미지들을 입력으로 사용하여

새로운 시점에서의 이미지를 생성하는 모델입니다.

Various results of NeRF

위와 같이, NeRF에 다양한 방향에서의 드럼의 이미지를 입력으로 부여하면,

입력 세트에 없는 새로운 시점에서의 드럼 이미지를 생성할 수 있습니다.

즉, 예를 들어, 한 객체의 앞, 뒤, 좌, 우 방향에서 촬영한 몇 장의 이미지를 제공하면,

NeRF는 나머지 시점에서의 이미지를 생성할 수 있습니다.

이러한 모든 이미지를 결합하면, 객체를 3D로 보는 것과 같은 효과가 나타나는 것입니다.

Input and Output of NeRF

NeRF의 가장 핵심적인 아이디어는 3D 장면을 연속적인 5차원의 함수로 표현하는 것입니다.

이 함수는 5차원의 좌표 spatial location (x,y,z), viewing direction (θ,ϕ) 를 입력으로 받아

해당 지점에서의 RGB color (r,g,b), and volume density (σ)를 반환하는 Fully-connected network입니다.

여기서, 입력에 해당하는 3D spatial location (x, y, z)는 장면(scene) 안에서 값을 조회할 3D 점의 위치(카메라 ray 위에서 샘플링된 점)를,

2D viewing direction (θ,ϕ)는 객체를 바라보는 각도를 나타냅니다.

2. NeRF

2.1. Neural radiance field scene representation

앞서 언급한대로, NeRF는 3D spatial location과 2D viewing direction으로 구성된 5차원 좌표를 입력으로 받아,

RGB color와 volume density를 출력합니다.

NeRF는 neural network architecture를 이용하여 이 5차원 함수를 모델링하며,

서로 다른 시점에서 촬영한 대규모 이미지 세트를 사용하여 훈련됩니다.

Input and output of NeRF

아래 그림은 NeRF의 네트워크 구조를 나타낸 그림으로, 총 9개의 레이어로 구성됩니다.

이 그림에서 주목해야 할 것은, spatial location 'x'와 viewing direction 'd'가 함께 입력으로 제공되지 않는다는 점입니다.

NeRF는 먼저 객체의 spatial location (x, y, z)만을 8개의 레이어에 통과시켜, 객체의 volume density 'σ'를 예측합니다.

아래 그림에서 왜 입력 layer의 차원이 각각 60, 24인지는 차후 설명드리겠습니다.

객체의 spatial location이 이러한 8개의 레이어를 통과하면서,

객체의 volume density 'σ'와 함께 256개의 feature vector가 생성됩니다.

Fully-connected network architecture of NeRF (1)

8개의 레이어를 통과한 feature vector는 이후 viewing direction 'd'와 결합하여 최종 레이어의 input으로 사용됩니다.

아래 그림이 이를 나타낸 그림으로, viewing direction 'd'와 feature vector의 결합 정보를 활용하여 color 'c'를 예측합니다.

Fully-connected network architecture of NeRF (2)

이렇듯 입력을 2가지로 분리하여 서로 다른 레이어에 넣어 학습하는 이유는 바로, 비램버시안 효과(Non-Lambertian Effect) 때문입니다.

비램버시안 효과란, 관찰 방향에 따라 색상과 반사율이 달라지는 현상을 말합니다.

우리가 보는 대부분의 물체들은 비램버시안 효과를 가지고 있습니다.

아래 그림에서 좌측은 실제 비램버시안 표면인 물체를 나타내고, 우측은 램버시안 표면을 나타냅니다.

실제 세계에서는, 비램버시안 효과와 같이 관찰 각도를 변경하면 특정 부분의 색상이 변화합니다.

따라서, 이를 NeRF의 구조로 돌아와 생각해보면, density 값은 spatial location 'x'에,

RGB 값은 spatial location 'x', viewing direction 'd', density 'σ'에 종속되기 때문에

이전 그림과 같이 분리된 입력을 가진 구조를 갖게됩니다.

Lambertian effect

2.2. Volume rendering with radiance fields

NeRF의 네트워크 구조를 살펴보았으니, 이제 NeRF의 렌더링 과정과 네트워크에 대해 살펴보겠습니다.

Volume rendering은 네트워크에서 얻은 RGB 색상 'c'와 density 'σ'를 결합하여, 이를 단일 픽셀로 변환하는 과정입니다.

이 과정을 이해하기 전에 알아야 할 개념이 있는데, 바로 'ray'입니다.

volume rendering

ray는 카메라의 위치에서 3D 객체의 한 점을 바라볼 때 형성되는 직선입니다.

ray는 카메라의 지점 'o'로부터 특정 방향 'd'으로 't'만큼 이동한 점들의 집합입니다.

아래 이미지의 검은색 선이 바로 camera ray입니다.

이렇듯, 카메라의 위치 'o'와 viewing direction 'd'가 결정된다면, 아래 그림과 같은 방정식을 통해 ray를 계산할 수 있습니다.

Camera ray: $\mathbf{r}(t) = \mathbf{o} + t\,\mathbf{d}$

앞서 설명한 ray를 이해했다면, 밀도 density와 ray와의 관계에 대해 이해해야합니다.

density는 간단히 말하자면, 투명도(transparency)의 반대 개념입니다.

density(밀도)가 높다면, 해당 물체는 투명도가 낮아 불투명할 것이고,

density(밀도)가 낮다면, 반대로 투명도가 높아 투명할 것입니다.

아래 그림과 같이, 나무 'A'를 촬영하려고 할 때 물체 'B'가 그 앞을 막고 있다고 가정해봅시다.

여기서 우리가 촬영하려고 하는 나무 'A'는, 'B'의 밀도에 따라 명확하게 촬영될 수도, 그렇지 않을 수도 있습니다.

만약 'B'의 밀도가 그림과 같이 낮다면, 즉 투명하다면, A 나무는 깔끔하게 촬영될 것입니다.

Density and weight (1)

반면, 아래 그림처럼 'B'가 높은 밀도를 가진다면, 즉 'B'가 투명하지 않다는 것을 의미합니다.

따라서 나무 A는 잘 보이지 않을 것입니다.

Density and weight (2)

다시 ray로 돌아가 보겠습니다.

아래 그림과 같이 ray 위의 모든 각각의 점 't'는 B와 같은 역할을 합니다.

이러한 하나의 ray 위의 모든 점들의 density에 따라, 결과 이미지의 픽셀 값은 증가하거나 감소할 수 있습니다.

본 논문의 저자들은 ray 위의 모든 점들의 RGB 값과 density의 가중합(weighted sum)을 계산했습니다.

다시 말해, 점 't'보다 앞쪽(카메라 쪽)에 있는 점들의 밀도가 높다는 것은 그 점들이 불투명하여 't'가 가려져 잘 보이지 않는다는 의미이므로,

점 't'에 대한 가중치는 작은 값을 갖습니다.

반대로, 't' 앞쪽 점들의 밀도가 낮다는 것은, 투명하기 때문에 뒤에 있는 't'를 쉽게 볼 수 있다는 의미이므로,

해당 점 't'의 가중치는 높게 설정됩니다.

Weight of points on the ray

앞선 내용을 모두 정리한 식이 바로 아래 그림의 식입니다.

'r'은 하나의 ray, 'tn', 'tf'는 각각 ray가 물체를 통과할 때의 시작점과 끝점을 나타냅니다.

'c'와 'σ'는 각각 점 't'에서의 색상과 밀도(density)를 나타냅니다.

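T(t)는 ray가 시작점 tn에서 점 't'까지 가려지지 않고 도달할 확률(누적 투과도, accumulated transmittance)을 나타내며, 't' 앞에 있는 점들의 밀도 σ(시그마)를 적분한 값에 exp(−·)를 취해 얻어집니다: $T(t) = \exp\left(-\int_{t_n}^{t} \sigma(\mathbf{r}(s))\,ds\right)$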

여기서, (-) 기호는 이 값이 크면 원하는 물체가 점 't' 앞에 있는 점들에 의해 가려진다는 것을 나타냅니다.

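Volume rendering formula: $C(\mathbf{r}) = \int_{t_n}^{t_f} T(t)\,\sigma(\mathbf{r}(t))\,\mathbf{c}(\mathbf{r}(t), \mathbf{d})\,dt$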

ray는 직선이고, 사실 이 직선 상에는 무수히 많은 점들이 존재합니다.

만약 이 무수히 많은 점들에 대해 앞선 계산을 모두 수행한다면, 너무 시간이 많이 소요될 것입니다.

따라서 본 논문의 저자들은 ray 상에서 몇 개의 점을 샘플링하고 추출하는 방법을 사용했습니다.

그러나 ray 위의 점들을 무작위로 선택할 경우, 한 부분에서 너무 많은 점들이 선택되거나 하나의 점도 선택되지 않을 수 있습니다.

따라서 저자들은 ray를 'n'개의 동일한 부분으로 분할하고, uniform 하게 sampling 되도록 했습니다.

이에 따라 변형된 식은 아래와 같습니다.

Stratified sampling approach: $t_i \sim \mathcal{U}\left[t_n + \frac{i-1}{N}(t_f - t_n),\; t_n + \frac{i}{N}(t_f - t_n)\right]$

2.3. Optimizing a neural network radiance field

앞선 설명들을 통해 NeRF의 volume rendering 과정에 대해 알아보았으니,

이제 NeRF에서 사용되는 추가적인 구조 및 기법에 대해 알아보겠습니다.

2.3.1. Positional encoding

NeRF에서 사용한 첫 번째 추가적인 기법은 바로 Positional encoding 입니다.

Positional encoding은 NeRF의 네트워크가 high frequency(detail) 영역까지 학습할 수 있도록 하는 기법입니다.

사실 이 기법은 새로운 것은 아니고, low dimension 입력 값을 high dimensional space로 mapping 할 때,

네트워크 입력 전에 high frequency function을 이용하면 효과적이라는 이전 연구 결과를 차용한 것입니다.

예를 들어, 값 1과 2가 MLP(Multi-Layer Perceptron) 레이어를 통과하면 매우 유사한 값이 나오게 됩니다.

이는 레이어가 가중치 합인데, 값의 차이가 크지 않으면 결과 값도 매우 유사해지기 때문입니다.

이를 해결하기 위해 Positional encoding을 사용하여 고차원으로 변환합니다.

아래 그림은 Positional encoding의 식과 결과로,

그림에서 볼 수 있듯, Positional encoding이 포함되지 않은 경우 고해상도를 잘 표현할 수 없지만,

Positional encoding을 사용하면 고해상도 부분이 잘 표현되는 것을 확인할 수 있습니다.

Result of positional encoding

앞서 설명한대로, Positional encoding에는 사인(sin)과 코사인(cos) 함수가 사용되며,
이로 인해 dimension이 2L배로 증가합니다.

본 논문에서는 위치 'x'와 viewing direction 'd'에 대해 다른 크기의 L을 적용했습니다.

위치에는 viewing direction보다 더 많은 정보가 있기 때문에 위치에는 L=10을 사용하고,

viewing direction에는 L=4를 사용했습니다.(이 값은 논문의 저자들에 의해 결정)

아래 그림의 빨간 상자를 살펴보면, Positional encoding을 통해 각각 60과 24의 차원으로 변환된 것을 볼 수 있습니다.

Positional encoding of NeRF

2.3.2. Hierarchical volume sampling

NeRF에서 사용한 두 번째 추가적인 기법은 Hierarchical volume sampling 입니다.

이전 과정을 살펴보면 하나의 네트워크만 필요한 것처럼 보이지만, 사실 논문에서는 두 개의 네트워크를 사용합니다.

바로 coarse 네트워크와 fine 네트워크 입니다.

각각의 네트워크는 coarse한(전체적인) 정보와 fine한(세부적인) 정보를 다루며,

ray 위의 점 t를 어떻게 샘플링하는지에 따라 차이가 있습니다.

샘플링을 수행할 때, ray 위의 한 점이 객체에서 선택될 수도 있고, 아무것도 없는 지점이 선택될 수도 있습니다.

저자들은 이 아이디어에서 출발하여, 아무것도 없는 지점보다는

객체가 존재하는 지점에서 추가적인 학습을 수행하는 것이 효과가 더 좋을 것이라고 생각했습니다.

따라서, 앞서 이야기한 두 개의 네트워크가 사용됩니다.

첫 번째로, coarse 네트워크는 전체 레이에서 샘플링을 통해 학습을 수행합니다.

그리고 fine 네트워크는 학습된 결과 중에서 밀도 값이 큰 부분만 선택하여 추가적인 학습을 위해 재샘플링을 수행합니다.

최종 결과는 이 두 개의 네트워크를 결합하여 얻어집니다.

Hierarchical volume sampling

Hierarchical volume sampling에서 coarse 네트워크는 이전에 설명한대로 모든 레이에서 균일하게 t를 샘플링하여 학습을 수행합니다.

이렇게 샘플링된 점들은 coarse 네트워크를 통과하게 됩니다.

그리고 이 coarse 네트워크에서 얻은 가중치 값은 다시 샘플링을 위한 확률 분포로 사용됩니다.

이렇게 재샘플링된 점들은 fine 네트워크의 학습에 사용됩니다.

loss function 및 동작과정은 아래와 같습니다.

아래 식의 첫 번째 부분(연두색)은 coarse 네트워크, 두 번째 부분(주황색)은 fine 네트워크의 loss에 해당합니다.

NeRF는 일부 학습 데이터로부터 학습을 수행하고, 즉시 새로운 시점에서 이미지를 생성합니다.

이 새롭게 생성된 이미지는 나머지 학습 데이터와 비교하게되는데,

이 때, 나머지 데이터는 Ground Truth와 동일한 역할을 하며, 두 이미지 간의 손실을 비교하여 역전파를 통한 학습을 수행합니다.

Hierarchical volume sampling

3. Experiment results

다음으로 NeRF의 실험 결과입니다.

NeRF의 저자들은 두 개의 데이터셋을 기반으로 모델을 평가했습니다.

첫 번째로, synthetic dataset입니다.

해당 데이터셋은 DeepVoxels dataset과 generated dataset으로 나뉘며, 아래 그림과 같습니다.

Synthetic dataset

두 번째로, real-world dataset입니다.

해당 데이터셋은 말 그대로 실제 환경에서 찍힌 이미지를 이용한 데이터셋으로, 아래 그림과 같습니다.

Real world dataset

이전에 언급한 바와 같이, NeRF의 저자들은 2개의 데이터셋(세부적으로는 3개)을 테스트했습니다.

Diffuse Synthetic는 DeepVoxels 데이터셋을 사용한 synthetic dataset이며,

Realistic Synthetic은 generated synthetic dataset입니다.

마지막으로, Real Forward-Facing은 real-world dataset입니다.

아래 표와 같이 거의 모든 데이터셋과 평가 지표에서 NeRF가 기존의 방법들보다 우수한 성능을 보였으며,

그림에서도 볼 수 있듯, 다른 기법들에 비해 high frequency 성분을 잘 나타내는 것을 확인 할 수 있습니다.

Comparison with other methods (1)

Comparison with other methods (2)

Comparison with other methods (3)

4. Conclusion

NeRF는 다양한 방향에서 촬영된 이미지를 입력으로 받아, 다른 시점에서의 이미지를 반환하는 기술로,

현실적인 3D 모양을 생성 및 렌더링하는데 자주 사용됩니다.

NeRF는 이러한 고품질의 3D 재구성 및 렌더링 기술을 제공하지만, 명백한 단점이 있습니다.

NeRF는 단 하나의 장면을 최적화하기 위해 약 10만에서 30만 번(100k~300k)의 iteration을 필요로 하며, 실제로는 단일 GPU 기준 1~2일이 소요됩니다.

이렇게 NeRF는 매우 큰 계산 비용과 메모리를 요구합니다.

최근에는 이러한 단점을 극복하기 위해 FastNeRF 등의 방법들이 제안되기도 했습니다.

Reference

https://modulabs.co.kr/blog/nerf-from-2d-to-3d/

NeRF: 2D 이미지를 3D로 바꿔준다고요?

요즘 인공지능 분야에서 핫한 분야가 무엇일까요? 아마도 NERF가 아닐까 싶습니다. NeRF(Neural radiance Fields)는 2D 이미지를 3D로 변환해주는 모델입니다. 이번 콘텐츠에서는 NeRF에 대해 알아보겠습니

modulabs.co.kr

https://cobslab.com/%EC%B5%9C%EC%B4%88%EC%9D%98-%EC%97%B0%EC%86%8D%EC%A0%81%EC%9D%B8-%EC%8B%A0%EA%B2%BD-%EC%9E%A5%EB%A9%B4-%ED%91%9C%ED%98%84-nerf-representing-scenes-as-neural-radiance-fields-for-view-synthesis/

최초의 연속적인 신경 장면 표현- NeRF: Representing Scenes as Neural Radiance Fields for View Synthesis - cobslab

안녕하세요 콥스랩(COBS LAB)입니다. 오늘 소개해 드릴 논문은 ‘NeRF: Representing Scenes as Neural Radiance Fields for View Synthesis’입니다. 해당 내용은 유튜브 ‘딥러닝 논문 읽기 모임’ 중 ‘NeRF’ 영상

cobslab.com

https://woochan-autobiography.tistory.com/933#3.1

Paper Review - NeRF (Representing Scenes as Neural Radiance Fields for View Synthesis)

NeRF 논문 리뷰 포스팅입니다. 3D Vision 공부를 시작한 이후 첫 논문 리뷰 및 정리인 만큼 잘못된 부분이 많을 수 있으니, 잘못된 부분은 언제든 코멘트 해주시기 바랍니다. Introduction NeRF는 어떤 물

woochan-autobiography.tistory.com

[논문구현] DenseNet (Densely Connected Convolutional Networks) 구현

릿99 — Fri, 27 Jan 2023 17:24:55 +0900

DenseNet 에 대한 논문 리뷰

https://beginnerdeveloper-lit.tistory.com/161

[논문리뷰] DenseNet (Densely Connected Convolutional Networks)

DenseNet https://arxiv.org/abs/1608.06993 Densely Connected Convolutional Networks Recent work has shown that convolutional networks can be substantially deeper, more accurate, and efficient to train if they contain shorter connections between layers close

beginnerdeveloper-lit.tistory.com

DenseNet

출처 : Densely Connected Convolutional Networks

지난번 리뷰한 "Densely Connected Convolutional Networks" 논문의

DenseNet을 구현해보고자 한다.

DenseNet이란, 모든 Layer들을 densely하게 연결하는 dense connectivity pattern을 사용한 네트워크로,

다른 구조들에 비해 적은 파라미터 수를 가지고도 뛰어난 결과와 낮은 연산량을 가진 것이 특징이다.

DenseNet의 자세한 구조는 아래 Table을 참고하자.

출처 : Densely Connected Convolutional Networks

이 중에서 필자는 비교적 간단한 DenseNet 121-Layer를 구현했다.

다른 모델에 비해 필터의 개수가 적은 것이 특징이다.

모든 DenseNet 모델들은 위와 같은 구조를 갖는다.

down-sampling을 용이하게 하기 위해, 네트워크를 다음과 같이 3개의 Dense Block으로 나누었다.

DenseNet은 이렇듯 여러 개의 Dense Block(위 그림에서는 3개, 아래에서 구현할 DenseNet-121에서는 4개)과 각 Dense Block 사이의 Transition Layer로 구성되며,

각 Transition Layer는 Convolution과 Pooling을 수행한다.

DenseNet은 앞서 언급했듯, 단순히 모든 Layer를 연결하는 방식이다.

ResNet과의 연결 방식의 차이점은 summation이 아닌 concatenation을 사용한다는 점이다.

이는 하단의 코드에서 자세히 살펴보도록 하자.

Environment & Parameter

❗ 해당 논문의 DenseNet-121 모델의 구조에 초점을 맞춰 구현하였으며,

그 외 세부적인 사항까지 완벽하게 구현하지는 못했습니다.

❗ 또한, 논문에 사용된 Dataset과 다른 Dataset을 사용했으므로,

Parameter들 또한 상이하다는 점 양해 부탁드립니다.

실험에 사용한 환경은 아래와 같습니다.

Language : Python

Framework : Tensorflow (GPU)

Dataset : Kaggle Dog & Cat 중 일부 사용 (train : Dog 5000, Cat 5000 / validation : Dog 2000, Cat 2000)

(https://www.kaggle.com/datasets/tongpython/cat-and-dog?select=training_set)

(Dataset 중 일부 훼손된 이미지가 있어, 해당 이미지들 필수 삭제 후 훈련 필요)

Image Size : 224 x 224 x 3

Batch Size : 32

Epoch : 50

Learning Rate : 0.001

DenseNet Code

<DenseNet Layers Code>

코드를 보면서 DenseNet Model의 각 Layer들을 하나씩 살펴보도록 하자.

먼저 다시 DenseNet-121의 구조를 보면 아래 Table과 같다.

출처 : Densely Connected Convolutional Networks

본격적인 구현에 앞서, 중요한 Parameter들을 하나씩 설정하고 넘어가자.

바로, Growth Rate와 Compression이다.

Growth Rate란, 각 Layer에서 몇 개의 feature map을 뽑을지 결정하는 Parameter이며,

각 Layer가 전체 output에 어느정도 기여할지를 결정하는 Parameter이기도 하다.

모델의 compactness를 향상시키기 위해, transition Layer의 feature map의 개수를 줄일 수 있다.

만약 dense block이 m개의 feature map을 가지고 있다면,

그 뒤의 transition Layer는 θm개의 output feature map을 반환할 것이다.

여기서, θ가 바로 compression factor이다. (단, 0 < θ <= 1)

본 논문의 실험에서는 θ = 0.5 로 설정했기에, 필자도 동일하게 설정했다.

def DenseNet(x):
    # Expected input: a 224 x 224 x 3 image tensor.
    k = 32  # growth rate: number of feature maps each dense layer produces
    compression = 0.5   # compression factor (theta) for the transition layers

[ Convolution ]

첫 번째 Layer는 Convolution Layer로, 위 Table과 같이 kernel size = 7 x 7 (stride = 2) 연산을 수행한다.

output size = 112 x 112 이다.

# 1. Convolution
# Stem: 7x7 convolution with 2k filters and stride 2, then BN and ReLU.
# The trailing annotation gives the resulting feature-map size.
x = layers.Conv2D(k * 2, (7, 7), strides=2, padding='same', input_shape=(224, 224, 3))(x)    # 112x112x64
x = layers.BatchNormalization()(x)
x = layers.Activation('relu')(x)

[ Pooling ]

다음은 Pooling Layer로, 위 Table과 같이 kernel size = 3 x 3 (stride = 2) max pooling 연산을 수행한다.

output size = 56 x 56이다.

# 2. Pooling
# 3x3 max pooling with stride 2; 'same' padding halves the spatial size.
x = layers.MaxPool2D((3, 3), 2, padding='same')(x)  # 56x56x64

[ Dense Block (1) ]

다음은 Dense Block이다. model마다 Dense Block의 구조는 약간씩 다르다.

DenseNet-121에서는 1 x 1 Convolution과 3 x 3 Convolution을 각각 6, 12, 24, 16회 반복한다.

첫 번째 Dense Block은 위와 같은 연산을 6번 반복하면 된다.

단, 여기서 한 가지 중요한 점이 있다.

바로, Densely Connectivity 이다.

DenseNet이란 모든 Layer들을 연결한다는 개념에서 시작한 Network이다.

각 Layer가 이전의 모든 feature map들에 접근할 수 있어야 하며, ResNet과는 달리 이를 더하지 않고 concat하여 구현함에 주의해야 한다.

DenseNet의 중심이자 주요 알고리즘인 이 connectivity를 필자는 다음과 같이 구현했다.

# 3. Dense Block (1)
# Six bottleneck layers; each one reads the running feature map `x`,
# produces k new feature maps and concatenates them back onto `x`.
for i in range(6) :
    # Bottleneck: 1x1 convolution to 4k channels before the 3x3 convolution.
    x_l = layers.Conv2D(k * 4, (1, 1), strides=1, padding='same')(x)    # 56x56x128
    x_l = layers.BatchNormalization()(x_l)
    x_l = layers.Activation('relu')(x_l)

    # 3x3 convolution producing the k feature maps contributed by this layer.
    x_l = layers.Conv2D(k, (3, 3), strides=1, padding='same')(x_l)  # 56x56x32
    x_l = layers.BatchNormalization()(x_l)
    x_l = layers.Activation('relu')(x_l)

    # Dense connectivity: concatenate (not add); the annotation lists the
    # channel count of `x` after each of the six iterations.
    x = layers.Concatenate()([x, x_l])  # 96 -> 128 -> 160 -> 192 -> 224 -> 256

n번째 Layer는 이전의 모든 Layer들의 feature map(X0, ... ,Xn-1)을 input으로 받게 된다.

이를 수식으로 나타내면 위와 같다.

위 식에 따라 구현된 코드를 살펴보면,

이전 Layer의 output인 x를 현재 Dense Block 내의 연산을 거친 feature map x_l과 concat한다.

(여기서, 저번 ResNet의 경우 Concatenate이 아닌, Add 를 사용했다.)

이렇게 연결된 x는 다시 해당 Dense Block의 input으로 들어가게 되고,

이러한 과정을 Table에 적힌 횟수만큼 반복하면 된다.

이 과정을 반복하게 되면, 결국 모든 feature map들은 위 식과 같이 이어지게 되고,

이것이 해당 Layer, Dense Block의 결과가 된다.

[ Transition Layer (1) ]

다음은 Transition Layer 이다.

Transition Layer는 Convolution과 Pooling 연산을 하는 Layer로,

Table과 같은 연산을 적용해주기만 하면 된다.

단, Convolution 과정에서 위에서 언급한 compression, 압축률이 사용되며

이를 통해 filter의 개수가 조절된다.

    # 4. Transition Layer (1)
    # Read the channel count (last axis) of the dense-block output.
    current_shape = int(x.shape[-1]) # 56x56x256
    # 1x1 convolution keeps only `compression` times as many channels.
    x = layers.Conv2D(int(current_shape * compression), (1, 1), strides=1, padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    # 2x2 average pooling with stride 2 halves the spatial size.
    x = layers.AveragePooling2D((2, 2), strides=2, padding='same')(x)   # 28x28

이후 나머지 Dense Block과 Transition Layer의 구조는 모두 동일하므로,

설명을 생략하도록 하겠다.

[ Classification Layer ]

네트워크 가장 끝단의 Classification Layer에 대해 살펴보자.

이 Layer에서는 Table과 같이 global average pooling 및 1000D fully-connected, softmax 연산이 이루어진다.

단, 필자는 2개의 class로 분류되는 dataset을 이용했으며, dataset에 따라 변경이 필요하다.

# 10. Classification Layer
# Global average pooling collapses each feature map to a single value.
x = layers.GlobalAveragePooling2D()(x)
# classes = 2 (softmax); change the unit count to match the dataset.
x = layers.Dense(2, activation='softmax')(x)

<DenseNet Model Code>

'''
< DenseNet Architecture>
- Dense Connectivity pattern
- Dense-121, Dense-169, Dense-201, Dense-264
- Implement Dense-121 (6, 12, 24, 16)
'''

def _dense_block(x, num_layers, growth_rate):
    """Stack ``num_layers`` bottleneck layers with dense connectivity.

    Each layer applies 1x1 conv (4 * growth_rate filters) -> BN -> ReLU ->
    3x3 conv (growth_rate filters) -> BN -> ReLU to the running feature map
    and concatenates the result onto it, so the channel count grows by
    ``growth_rate`` per layer.
    """
    for _ in range(num_layers):
        # Bottleneck: the 1x1 convolution bounds the input width of the 3x3.
        x_l = layers.Conv2D(growth_rate * 4, (1, 1), strides=1, padding='same')(x)
        x_l = layers.BatchNormalization()(x_l)
        x_l = layers.Activation('relu')(x_l)

        x_l = layers.Conv2D(growth_rate, (3, 3), strides=1, padding='same')(x_l)
        x_l = layers.BatchNormalization()(x_l)
        x_l = layers.Activation('relu')(x_l)

        # Dense connectivity: concatenate instead of ResNet-style addition.
        x = layers.Concatenate()([x, x_l])
    return x


def _transition_layer(x, compression):
    """Compress channels with a 1x1 conv and halve the spatial size."""
    channels = int(x.shape[-1])
    x = layers.Conv2D(int(channels * compression), (1, 1), strides=1, padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    return layers.AveragePooling2D((2, 2), strides=2, padding='same')(x)


def DenseNet(x, num_classes=2, growth_rate=32, compression=0.5,
             block_layers=(6, 12, 24, 16)):
    """Build a DenseNet-style classifier on top of the Keras tensor ``x``.

    Layout: 7x7/stride-2 stem convolution, 3x3/stride-2 max pooling, one
    dense block per entry of ``block_layers`` with a compressing transition
    layer between consecutive blocks, then global average pooling and a
    softmax ``Dense`` layer.

    With the defaults and a 224x224x3 input this is the DenseNet-121 layout
    and the feature map evolves as
    112x112x64 -> 56x56x64 -> 56x56x256 -> 28x28x128 -> 28x28x512
    -> 14x14x256 -> 14x14x1024 -> 7x7x512 -> 7x7x1024.

    NOTE: every unit here is ordered Conv -> BN -> ReLU, while the paper's
    composite function is described as BN -> ReLU -> Conv.

    Args:
        x: Input image tensor (the stem is annotated for 224 x 224 x 3).
        num_classes: Number of softmax outputs; defaults to the two-class
            cat/dog setup used in this post.
        growth_rate: Feature maps added by each dense layer (``k``).
        compression: Transition-layer compression factor (theta).
        block_layers: Number of bottleneck layers in each dense block.

    Returns:
        The output tensor of the final softmax layer.
    """
    # 1. Convolution
    x = layers.Conv2D(growth_rate * 2, (7, 7), strides=2, padding='same', input_shape=(224, 224, 3))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)

    # 2. Pooling
    x = layers.MaxPool2D((3, 3), 2, padding='same')(x)

    # 3-9. Dense blocks, with a transition layer after every block but the last
    last_block = len(block_layers) - 1
    for block_index, num_layers in enumerate(block_layers):
        x = _dense_block(x, num_layers, growth_rate)
        if block_index < last_block:
            x = _transition_layer(x, compression)

    # 10. Classification Layer
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dense(num_classes, activation='softmax')(x)

    return x

<Entire Code>

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
import numpy as np
import cv2
import matplotlib.pyplot as plt
from tensorflow.python.client import device_lib

# Restrict TensorFlow to the first GPU. This has to be set before anything
# initializes CUDA, otherwise the setting is ignored.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# list_physical_devices only queries the hardware; it does not initialize it,
# so memory growth can still be configured afterwards.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

# Report the available devices last: device_lib.list_local_devices()
# initializes the GPUs, after which the settings above can no longer change.
print(device_lib.list_local_devices())

'''
< DenseNet Architecture>
- Dense Connectivity pattern
- Dense-121, Dense-169, Dense-201, Dense-264
- Implement Dense-121 (6, 12, 24, 16)
'''

def _dense_block(x, num_layers, growth_rate):
    """Stack ``num_layers`` bottleneck layers with dense connectivity.

    Each layer applies 1x1 conv (4 * growth_rate filters) -> BN -> ReLU ->
    3x3 conv (growth_rate filters) -> BN -> ReLU to the running feature map
    and concatenates the result onto it, so the channel count grows by
    ``growth_rate`` per layer.
    """
    for _ in range(num_layers):
        # Bottleneck: the 1x1 convolution bounds the input width of the 3x3.
        x_l = layers.Conv2D(growth_rate * 4, (1, 1), strides=1, padding='same')(x)
        x_l = layers.BatchNormalization()(x_l)
        x_l = layers.Activation('relu')(x_l)

        x_l = layers.Conv2D(growth_rate, (3, 3), strides=1, padding='same')(x_l)
        x_l = layers.BatchNormalization()(x_l)
        x_l = layers.Activation('relu')(x_l)

        # Dense connectivity: concatenate instead of ResNet-style addition.
        x = layers.Concatenate()([x, x_l])
    return x


def _transition_layer(x, compression):
    """Compress channels with a 1x1 conv and halve the spatial size."""
    channels = int(x.shape[-1])
    x = layers.Conv2D(int(channels * compression), (1, 1), strides=1, padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    return layers.AveragePooling2D((2, 2), strides=2, padding='same')(x)


def DenseNet(x, num_classes=2, growth_rate=32, compression=0.5,
             block_layers=(6, 12, 24, 16)):
    """Build a DenseNet-style classifier on top of the Keras tensor ``x``.

    Layout: 7x7/stride-2 stem convolution, 3x3/stride-2 max pooling, one
    dense block per entry of ``block_layers`` with a compressing transition
    layer between consecutive blocks, then global average pooling and a
    softmax ``Dense`` layer.

    With the defaults and a 224x224x3 input this is the DenseNet-121 layout
    and the feature map evolves as
    112x112x64 -> 56x56x64 -> 56x56x256 -> 28x28x128 -> 28x28x512
    -> 14x14x256 -> 14x14x1024 -> 7x7x512 -> 7x7x1024.

    NOTE: every unit here is ordered Conv -> BN -> ReLU, while the paper's
    composite function is described as BN -> ReLU -> Conv.

    Args:
        x: Input image tensor (the stem is annotated for 224 x 224 x 3).
        num_classes: Number of softmax outputs; defaults to the two-class
            cat/dog setup used in this post.
        growth_rate: Feature maps added by each dense layer (``k``).
        compression: Transition-layer compression factor (theta).
        block_layers: Number of bottleneck layers in each dense block.

    Returns:
        The output tensor of the final softmax layer.
    """
    # 1. Convolution
    x = layers.Conv2D(growth_rate * 2, (7, 7), strides=2, padding='same', input_shape=(224, 224, 3))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)

    # 2. Pooling
    x = layers.MaxPool2D((3, 3), 2, padding='same')(x)

    # 3-9. Dense blocks, with a transition layer after every block but the last
    last_block = len(block_layers) - 1
    for block_index, num_layers in enumerate(block_layers):
        x = _dense_block(x, num_layers, growth_rate)
        if block_index < last_block:
            x = _transition_layer(x, compression)

    # 10. Classification Layer
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dense(num_classes, activation='softmax')(x)

    return x


# Training hyper-parameters
batch_size, epoch, learning_rate = 32, 50, 0.001

# Dataset (Kaggle Cat and Dog Dataset): one sub-directory per split
dataset_path = os.path.join('/home/kellybjs/Cat_Dog_Dataset')
train_dataset_path = dataset_path + '/train_set'
valid_dataset_path = dataset_path + '/validation_set'

# Options shared by the training and validation directory iterators
flow_options = dict(shuffle=True,
                    target_size=(224, 224),
                    batch_size=batch_size,
                    class_mode='categorical')

# Both splits only rescale pixel values by 1/255
train_data_generator = ImageDataGenerator(rescale=1. / 255)
valid_data_generator = ImageDataGenerator(rescale=1. / 255)

train_dataset = train_data_generator.flow_from_directory(train_dataset_path, **flow_options)
valid_dataset = valid_data_generator.flow_from_directory(valid_dataset_path, **flow_options)


# Train
# NOTE: despite its name, `input_shape` holds the symbolic Keras Input tensor
# that is fed through DenseNet() to build the functional model.
input_shape = layers.Input(shape=(224, 224, 3), dtype='float32', name='input')
model = tf.keras.Model(input_shape, DenseNet(input_shape))
# 'acc' as the metric name makes the history keys 'acc' / 'val_acc'.
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),
              loss='categorical_crossentropy',
              metrics=['acc'])
model.summary()
# Model.fit accepts directory iterators directly; Model.fit_generator is
# deprecated and no longer available in recent TensorFlow/Keras releases.
train = model.fit(train_dataset, epochs=epoch, validation_data=valid_dataset)

# Accuracy graph (figure 1) and Loss graph (figure 2): each figure overlays
# the training curve and the validation curve recorded in `train.history`.
curve_specs = [
    (1, 'acc', 'val_acc', 'Accuracy', 'accuracy', 'DenseNet_Accuracy_1.png'),
    (2, 'loss', 'val_loss', 'Loss', 'loss', 'DenseNet_Loss_1.png'),
]
for figure_id, train_key, valid_key, heading, y_label, out_file in curve_specs:
    plt.figure(figure_id)
    for history_key in (train_key, valid_key):
        plt.plot(train.history[history_key])
    plt.title(heading)
    plt.ylabel(y_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.savefig(out_file)

Result

DenseNet Accuracy

위 코드를 적용한 Train 및 Validation Accuracy 결과이다.

Train 시에는 최대 약 90~100 %, Validation 시에는 약 80~90%의 정확도가 나오는 것을 볼 수 있다.

DenseNet Loss

위 코드를 적용한 Train 및 Validation Loss 결과이다.

두 그래프 모두 점차 Loss 가 줄어드는 것이 보이나, Validation의 경우 후반에 많이 진동하는 점이 아쉽다.

[논문리뷰] DenseNet (Densely Connected Convolutional Networks)

릿99 — Fri, 27 Jan 2023 14:51:06 +0900

DenseNet

https://arxiv.org/abs/1608.06993

Densely Connected Convolutional Networks

Recent work has shown that convolutional networks can be substantially deeper, more accurate, and efficient to train if they contain shorter connections between layers close to the input and those close to the output. In this paper, we embrace this observa

arxiv.org

1. INTRODUCTION

CNN(Convolutional Neural Network)은 객체 검출 및 인식을 위한 주요한 머신러닝 기법으로 자리잡았다.

수십년이 지난 지금, 컴퓨터 하드웨어와 네트워크 구조 등의 발전으로 인해

더 깊은 CNN을 구축하고 학습할 수 있게 되었다.

특히 ResNet 같은 경우, 100-Layer의 장벽을 뛰어넘기도 했다.

CNN이 점차 깊은 구조를 갖게되면서, 여러 문제들이 대두되기 시작됐다.

그 중 하나가 input이 수많은 Layer를 거치게 되면서, 네트워크 끝단에 다다를수록 정보가 소실된다는 것이다.

이러한 문제들을 위해 다양한 연구들이 진행되었으나, ^[각주:1] ^[각주:2] ^[각주:3]

모두 앞 Layer에서 뒷 Layer로의 short path를 사용한다는 공통점이 있었다.

이러한 아이디어들을 바탕으로, 본 논문에서는 간단한 아이디어의 connectivity pattern을 하나 제안했다.

바로, 네트워크의 레이어들 사이의 maximum information flow를 보장하기 위해,

모든 레이어들을 서로 연결하는 것이다.

아래 그림을 보자.

출처 : Densely Connected Convolutional Networks

위 그림이 바로 DenseNet의 구조를 가장 잘 나타내는 그림이다.

저번에 리뷰한 ResNet과의 가장 큰 차이점은,
ResNet이 Feature들이 Layer로 전달되기 전, summation을 통해 결합되는 반면

DenseNet은 Feature들을 concatenation하여 결합한다는 것이다.

다시 말해, ResNet의 경우 자기자신인 Identity를 더해 다음 Layer의 input으로 부여했다면,

DenseNet은 단순 덧셈이 아닌, feature들을 "연결"하여 부여하게 된다.

따라서 n번째 Layer에는 이전의 모든 convolution block들의 feature map으로 이루어진 n개의 input이 있다.

이러한 feature map들은 (L - n)개의 subsequent Layer를 지나게 되고,

결국 L-layer network는 L(L + 1)/2 개의 connection을 갖게 된다.

(기존 구조들의 경우(ex) ResNet), L개의 connection을 갖는다.)

이렇듯 모든 Layer들을 촘촘하게(densely) 연결하는 dense connectivity pattern을 사용한 것이

바로 Dense Convolutional Network, DenseNet이다.

본 논문에서 소개한 DenseNet은 기준으로 삼은 다른 알고리즘들보다 더 놀라운 성과를 보였다.

그렇다면, 이러한 DenseNet의 장점에 대해서 간단히 알아보자.

1. Fewer Parameter

DenseNet은 모든 Layer를 연결한다는 점에서 기존 방식들보다 많은 Parameter가 필요할 것 같지만,

사실 DenseNet은 불필요한 feature map들을 재학습할 필요가 없기 때문에 더 적은 Parameter를 갖는다.

2. Information Preserved

기존 방식 중 하나인 ResNet의 경우 identity transformation을 통해 정보를 보존한다. ^[각주:4]

또한, ResNet을 변형한 다양한 연구들의 경우 training 중 random하게 정보가 누락되기도 한다. ^[각주:5]

하지만 DenseNet은 네트워크에 추가되는 정보와 보존되는 정보를 명확히 구분하여 정보를 보존한다.

DenseNet의 Layer들은 매우 좁은 형태를 갖는데, (Layer당 12개의 필터)

네트워크의 "collective knowledge"에 작은 세트의 feature map만 더하며,

나머지 feature map들은 변경되지 않은 상태로 유지한다.

마지막 classifier는 네트워크의 모든 feature-map에 기반하여 decision한다.

3. Improve Flow of Information and Gradient

DenseNet의 가장 큰 장점은 training에 용이하도록 정보와 gradient의 flow를 개선시켰다는 점이다.

각 Layer들은 Loss function과 original input signal의 gradient에 직접적으로 접근가능하다.

(모든 Layer들이 서로 연결되어 있기 때문)

위와 같은 접근은 더 깊은 네트워크 구조에서의 training을 용이하게 해줄 뿐만 아니라,

Overfitting을 방지하는 Regularization 효과까지 얻을 수 있다.

2. RELATED WORK

DenseNet과 유사한 cascade 구조는 1980년대 신경망 문헌에서 이미 연구된 바 있으며, ^[각주:6]

비교적 최근에도 batch gradient descent 방식을 통해 train된

fully-connected cascade network가 연구된 바가 있다. ^[각주:7]

이는 소규모 Dataset에는 효과적이었지만, 수백 개의 Parameter를 가진 네트워크에만 적용할 수 있었다.

다른 여러 연구들을 통해 skip-connection을 통해 CNN의 multi-level feature를 활용하는 것이

다양한 vision 작업에 효과적인 것으로 드러났다. ^[각주:8] ^[각주:9] ^[각주:10] ^[각주:11]

특히 DenseNet과 유사한 cross-layer connection을 가진 네트워크 프레임워크를 도출한 연구도 존재했다. ^[각주:12]

Highway Network는 처음으로 100 Layer가 넘는 end-to-end 네트워크를 효과적으로 학습한 구조이다.

gate unit과 bypassing path를 이용해 최적화했는데, bypassing path가 이 네트워크의 핵심이다.

ResNet의 경우, pure identity mapping이 바로 ResNet의 bypassing path이다. ^[각주:13]

ResNet 또한 깊은 Layer에도 불구하고, ImageNet과 COCO 객체 검출에서 뛰어난 성과를 보여주었다.

최근에는 1202-Layer ResNet을 성공적으로 학습시키기 위해 stochastic depth 라는 것이 제안되었는데, ^[각주:14]

stochastic depth는 학습 중 랜덤으로 Layer를 drop하면서, 깊은 residual network의 학습을 용이하게 해준다.

이러한 기법은 학습 시 모든 Layer가 필요하지 않을 수도 있다는 것과 함께,

깊은 네트워크에 엄청난 양의 중복(Redundancy)이 존재함을 보여주었다.

네트워크를 더 깊게 만드는 것과는 다른(orthogonal) 접근 방법으로, 네트워크의 폭을 늘리는 방법이 있다.

GoogleNet의 경우, 다양한 크기의 필터에 의해 생성된 feature map들을 concatenate한
"Inception module"을 사용했다. ^[각주:15] ^[각주:16]

사실 깊이가 충분하다면, ResNet의 각 Layer를 구성하는 필터 개수를 늘리기만하면 성능은 향상된다.

FractalNet 또한 넓은 네트워크 구조를 이용해 여러 Dataset에서 좋은 성능을 거두었다.

DenseNet은 깊거나 넓은 구조를 이용해 성능을 높이는 것과는 달리,

feature 재사용을 통해 parameter 효율성이 높고 훈련하기 쉬운 간단한 모델을 생성한다.

서로 다른 Layer들에 의해 학습된 feature map들을 concatenating 함으로써,

subsequent Layer들의 input 변화가 증가하고, 효율성 또한 향상된다.

이는 앞서 언급한 DenseNet과 ResNet 사이의 가장 큰 차이점과 동일하다.

(ResNet은 비교적 깊고 넓은 구조인 반면, DenseNet은 좁은 구조이다.)

ResNet뿐만 아니라, 다른 Layer들을 concatenate하는 Inception Network ^[각주:17] ^[각주:18]의 경우와 비교했을때에도

DenseNet은 더 단순하고 효율적인 구조를 갖는다.

마지막으로, 주목할만한 결과를 낸 혁신적인 네트워크 구조들에 대해서 살펴보자.

º Network in Network(NIN) ^[각주:19]구조는 더 복잡한 feature들을 도출하기위해

convolutional Layer의 filter에 micro multi-layer perceptron(다층 퍼셉트론)을 추가했다.

º Deeply Supervised Network(DSN) ^[각주:20]에서 내부 Layer들은 auxiliary classifier들을 통해 supervised되며,

이를 통해 이전 Layer들로부터 받은 gradient를 강화했다.

º Ladder Network ^[각주:21] ^[각주:22]는 autoencoder에 lateral connection을 도입하여,

semi-supervised learning에서 높은 정확도를 제공한다.

º Deeply-Fused Net(DFN) ^[각주:23]은 서로 다른 base 네트워크들의 중간 Layer를 combining 함으로써,

information flow를 보다 향상시켰다.

º Reconstruction Loss를 최소화하는 pathway를 사용해 네트워크를 확대함으로서,

이미지 분류 모델 또한 개선되었다. ^[각주:24]

3. DenseNets

단일 이미지 X0가 convolutional network를 통과한다고 하자.

네트워크는 L개의 Layer로 구성되어있고, 각 Layer에서는 non-linear transformation H(n)를 수행한다.

(여기서 n은 Layer의 index를 의미)

H는 Batch Normalization, RELU, Pooling, Convolution과 같은 동작들로 이루어져있으며,

n번째 Layer의 결과를 Xn이라 하자.

< ResNets >

기존의 Convolutional feed-forward network는

n번째 Layer의 output을 그 다음 Layer인 (n + 1)번째 Layer의 input으로 연결한다.

식으로 나타내면 다음과 같다.

Traditional Net

ResNet은 identity function을 통해 non-linear transformation을 건너뛰는 skip connection을 추가한다.

이 또한 식으로 나타내면 다음과 같다.

ResNet

ResNet의 장점은 identity function을 통해 Later Layer에서 Earlier Layer로
gradient가 직접적으로 흐를 수 있다는 점이다.

하지만 identity function과 H의 output이 summation을 통해 합쳐지면서
네트워크의 information flow를 방해할 수도 있다.

< Dense Connectivity >

Layer들 간의 information flow를 향상시키기 위해, 본 논문에서는 다른 connectivity pattern을 제안했다.
어떤 Layer든지 모든 subsequent Layer로 직접 연결하는, 즉 모든 Layer를 서로 연결하는 방식이다.

아래 그림을 통해 자세히 살펴보자.

출처 : Densely Connected Convolutional Networks

Figure2.는 DenseNet의 구조를 도식화한 그림이다.

결과적으로, n번째 Layer는 이전의 모든 Layer들의 feature map(X0, ... ,Xn-1)을 input으로 받게 된다.

이를 수식으로 나타내면 다음과 같다.

위 식에서 [X0, X1, ...]은 각 Layer에서 만들어진 feature map들의 concatenation이다.

이러한 dense connectivity 때문에, 이러한 구조를 DenseNet이라 명명했다.

< Composite Function >

다른 연구^[각주:25]에서 영감을 받아, H(n)을 다음 3가지 operation의 composite function으로 정의했다.
Batch Normalization, RELU, 3 x 3 Convolution

< Pooling Layers >

위의 식(2)에서 사용된 concatenation operation은 feature map의 사이즈가 바뀌면 사용할 수 없다.

하지만, Convolutional network의 down-sampling Layer를 통해 feature map의 사이즈를 바꿀 수 있다.

down-sampling을 용이하게 하기 위해, 네트워크를 서로 밀집하게 연결된 여러개의 dense block으로 나누었다.

위의 Figure 2.를 다시 보자.

각 Block 사이에서 Convolution과 Pooling을 수행하는 Layer를 Transition Layer라고 한다.

본 논문에서의 Transition Layer는 Batch Normalization Layer, 1 x 1 Convolutional Layer,

그리고 2 x 2 Pooling Layer로 구성되어있다.

< Growth Rate >

만약 각각의 function H(n)이 K개의 feature map을 만든다고 하면,

n번째 Layer가 입력으로 받는 feature map의 개수는 K0 + K(n - 1)개가 될 것이다.

(여기서 K0은 input Layer의 채널 수)

DenseNet과 기존 네트워크 구조들 간의 차이점은, DenseNet이 비교적 좁은 Layer를 가졌다는 것이다. (K = 12)

여기서 이 K를 네트워크의 Growth Rate라고 한다.

Growth Rate란, 다시 말해, 각 Layer에서 몇 개의 feature map을 뽑을지 결정하는 Parameter이며,

각 Layer가 전체 output에 어느정도 기여할지를 결정하는 Parameter이기도 하다.

< Bottleneck Layers >

각 Layer들은 K개의 output feature map을 생성하지만, 일반적으로 그에 비해 더 많은 input이 필요하다.

저번 ResNet 논문 리뷰에서도 언급했듯이, bottleneck 구조는 1 x 1 convolution을 통해
3 x 3 convolution에서의 input을 줄임으로서 computational efficiency를 높일 수 있다.

DenseNet에서도 이러한 Bottleneck 구조를 사용했다.

< Compression >

모델의 compactness를 향상시키기 위해, transition Layer의 feature map의 개수를 줄일 수 있다.

만약 dense block이 m개의 feature map을 가지고 있다면,

그 뒤의 transition Layer는 θm개의 output feature map을 반환할 것이다.

여기서, θ가 바로 compression factor이다. (단, 0 < θ <= 1)

만약 θ < 1 인 경우, 이러한 DenseNet은 DenseNet-C라고 하며,

본 논문의 실험에서는 θ = 0.5 로 설정했다.

bottleneck Layer와 θ < 1 인 transition Layer를 모두 사용한 경우, DenseNet-BC라고 명명했다.

< Implementation Details >

ImageNet dataset을 제외한 모든 dataset에서 사용된 DenseNet은

모두 동일한 개수의 Layer를 가진 3개의 Dense Block으로 구성된다.

첫 번째 Dense Block을 들어가기 전, input image에 대해 convolution with 16 output channel을 수행했다.

feature map size를 고정하기 위해, 커널 사이즈가 3 x 3 인 convolution Layer에서의 input은

모두 1 pixel씩 zero-padding 했으며,

인접한 두 Dense Block 사이의 Transition Layer로는 1 x 1 Convolution, 2 x 2 average pooling 을 사용했다.

마지막 Dense Block의 끝에는 global average pooling, softmax classifier가 실행된다.

3개의 Dense Block의 feature map size는 각각 32× 32, 16×16, 8×8이다.

ImageNet dataset에 대한 DenseNet은 DenseNet-BC를 사용했다.

224 x 224 이미지를 input으로 하며, Dense Block의 개수는 4개이다.

자세한 네트워크 구성은 아래 Table 1. 을 참고하자.

출처 : Densely Connected Convolutional Networks

4. EXPERIMENTS

DenseNet의 효과를 입증하기 위해 여러 데이터셋에 대해 실험을 진행했으며,

ResNet과 같은 최신 구조와 성능을 비교했다.

4.1. DATASETS

< CIFAR >

두 개의 CIFAR dataset(32 x 32 pixel의 color 이미지)을 사용했으며, (CIFAR-10(C10), CIFAR-100(C100))

training, test set은 각각 50,000, 10,000개이며, training set 중 5,000개는 validation에 사용했다.

data augmentation 방식으로는 가장 표준적인 mirroring/shifting 을 사용했다.

data augmentation을 거친 dataset의 이름에는 "+" 기호를 붙였다.(ex) C10+)

전처리로는 channel 평균과 표준편차를 이용하여 데이터를 정규화했으며,

최종 실행 시 50,000개의 data를 모두 사용하고 training이 끝나면 최종 test error를 report 했다.

< SVHN >

Street View House Numbers (SVHN) dataset(32 x 32 pixel의 color 이미지)을 사용했으며,

training, test set은 각각 73,257, 26,032개이며, 추가적인 training을 위해 531,131개의 이미지를 사용했다.

data augmentation 없이 training을 진행했으며, training set 중 6,000개는 validation에 이용했다.

training 중 가장 낮은 validation error를 보인 모델을 선택하고 test error를 report 했다.

추가적으로, 픽셀값이 [0, 1] 범위 내에 있도록 255로 나눠주었다.

< ImageNet >

ILSVRC 2012 classification dataset을 사용했으며,

training, validation set은 각각 1.2million, 50,000개이며, 1,000개의 class로 나뉜다.

모든 training image에 대해 동일한 data augmentation을 적용했으며,

test 시 single-crop 또는 10-crop(with size 224 x 224) 를 적용했다.

classification error는 validation set의 결과를 바탕으로 했다.

4.2. Training

Optimization : SGD

Batch size : 64(CIFAR & SVHN), 256(ImageNet)

Epoch : 300(CIFAR), 40(SVHN), 90(ImageNet)

Learning rate : 0.1

(CIFAR & SVHN : divided by 10 at 50% and 75% of the total number of training epochs)

(ImageNet : lowered by 10 times at epoch 30 and 60)

Weight Decay : 0.0001, Nesterov Momentum : 0.9

Dropout : 0.2 (C10, C100, SVHN)

4.3. Classification Result on CIFAR and SVHN

서로 다른 깊이(L)및 growth rate(k)로 DenseNet 훈련을 진행했다.
CIFAR, SVHN에 대한 결과는 Table2. 를 참고하자.

general trends를 나타내기 위해 기존 기술보다 높은 성능을 보인 결과는 굵은 글씨로 표기하였으며,

전체적으로 최상의 결과를 보인 네트워크는 파란색으로 표기했다.

출처 : Densely Connected Convolutional Networks

< Accuracy >

가장 주목할만한 trend는 Table 2.의 가장 아래줄에서 확인할 수 있다.

DenseNet-BC, 그 중에서도 특히 k = 40의 경우 CIFAR dataset에 대해 기존보다 훨씬 높은 성능을 보였다.

C10+, C100+에서 각각 3.46, 17.18%의 error rate를 보였으며,

이는 wide ResNet 구조에서 달성한 error rate보다 훨씬 낮은 수치이다.

C10과 C100(without data augmentation)에 대한 결과는 더욱 놀랍다.

두 경우에 대한 결과 모두 drop-path regularization을 사용한 Fractal Net보다 30% 가까이 낮았다.

dropout을 사용한 SVHN dataset에서는 k = 24인 DenseNet이 가장 좋은 결과를 보였다.

하지만, 250-Layer DenseNet-BC는 더 얕은 모델(shorter counterpart)에 비해 더 좋은 성능을 내지 못했다.

SVHN이 비교적 쉬운 dataset에 속하며, 너무 깊은 모델의 경우 training set에 overfit되기 쉽기 때문에

이러한 현상이 발생한 것으로 보고 있다.

< Capacity >

compression이나 bottleneck layer가 없는 DenseNet의 경우, L과 k가 증가할수록 높은 성능을 보였다.

이는 model capacity의 growth와 주로 일치하는 것으로 보였는데, C10+와 C100+에서 이를 확인할 수 있다.

C10+의 경우, parameter의 개수가 증가할수록, error가 점차 줄어들었다. C100+ 또한 비슷한 양상을 보였다.

이를 통해 DenseNet이 크고 깊은 모델의 representational power를 이용할 수 있으며,

residual network에서 발생하는 overfitting이나 optimization문제가 발생하지 않음을 알 수 있다.

< Parameter Efficiency >

Table 2. 에서, DenseNet이 다른 구조들에 비해 parameter를 더 효과적으로 사용할 수 있음을 알 수 있다.

bottleneck 구조와 transition layer에서의 dimension reduction을 수행한 DenseNet-BC가 특히 그렇다.

예를 들어, 15.3M개의 parameter를 가진 250-Layer 모델이

다른 30M개 이상의 parameter를 갖는 여러 모델들(FractalNet, Wide ResNet)보다 높은 성능을 보였다.

또한, 1001-Layer의 ResNet 모델과 비교했을때,

DenseNet-BC가 90% 정도 적은 parameter를 가졌음에도 비슷한 성능을 보였다.

출처 : Densely Connected Convolutional Networks

위 그림은 C10+의 2개의 네트워크에 대한 training loss와 test error를 보여준다.

1001-Layer의 깊은 ResNet의 training loss값은 낮게 수렴하지만, test error는 비슷하다.

이에 대해서는 차후 자세히 분석 예정이다.

< Overfitting >

parameter를 더 효율적으로 사용함에 따라 발생하는 긍정적인 부작용 중 하나는

DenseNet이 overfitting 되는 경향이 적다는 것이다.

본 논문에서는 data augmentation을 하지않은 dataset에서

DenseNet 구조의 improvement가 더 잘 두드러진다는 것을 확인했으며,

single setting에서의 잠재적인 overfitting을 확인했다.

이를 해결하기 위해서는 DenseNet-BC bottleneck and compression layer가 효과적일 것으로 보인다.

4.4. Classification Results on ImageNet

이제 ImageNet dataset에 대한 결과를 확인해보자.

DenseNet-BC의 depth와 growth rate를 바꿔가며 진행했으며, 기존 ResNet구조와 비교했다.

두 구조간의 공정한 비교를 위해, Torch implementation 된 ResNet을 사용함으로서

데이터 전처리나 최적화 세팅 같은 다를 수 있는 요소들을 모두 제거했다.

단순히 ResNet 모델을 DenseNet-BC 네트워크로 바꿨으며, 실험 환경은 ResNet과 동일하게 설정했다.

출처 : Densely Connected Convolutional Networks

ImageNet에 DenseNet을 적용한 single-crop과 10-crop validation error 결과는 위와 같다.

Figure 3.은 DenseNet과 ResNet의 single-crop top-1 validation error를 비교한 그림으로,

왼편은 parameter 수, 오른편은 FLOPs에 따른 결과이다.

결과를 보면 알 수 있듯, DenseNet의 parameter 수와 computation이 훨씬 적음에도 불구하고,

ResNet과 비슷한 성능을 내는 것을 확인할 수 있다.

본 논문에서, 실험 설정이 ResNet에는 최적화되었지만,

DenseNet에는 최적화되어 있지 않다는 점 또한 주목할 필요가 있다.

만약 ImageNet에 맞는 DenseNet의 parameter를 보다 최적화한다면,

더 높은 성능을 낼 수 있을 것으로 기대된다.

5. DISCUSSION

표면적으로 보았을 때, DenseNet의 구조는 ResNet과 상당히 유사하다.

하지만, 두 구조간의 작은 차이는 실질적으로 두 네트워크 구조 간의 아주 큰 차이로 이어진다.

< Model Compactness >

input concatenation의 직접적인 결과로,

DenseNet에서 학습된 feature map들은 모든 subsequent Layer에 접근 가능하다.

이는 네트워크 전체의 feature 재사용성을 높이고, 보다 compact한 모델이 되도록 한다.

출처 : Densely Connected Convolutional Networks

위 두 그래프를 살펴보자.

왼쪽 그래프는 DenseNet의 parameter efficiency를 비교하는 그래프,

오른쪽 그래프는 ResNet 구조와 DenseNet을 비교하는 그래프이다.

C10+에서 다양한 깊이의 여러 소규모 네트워크들을 train하고,

network parameter들의 function으로 test 정확도를 plot했다.

AlexNet이나 VGG-net과 같은 여러 유명한 네트워크 구조와 비교했을 때,

pre-activation된 ResNet은 적은 parameter에도 일반적으로 더 나은 결과를 냈다.

그렇다면 본 논문에서 제안된 DenseNet은 어떨까?

결론부터 이야기하자면, 앞서 언급했던 것과 동일하게,

DenseNet이 ResNet에 비해 parameter대비 더 좋은 성능을 냈다.

그래프상으로 보았을 때, DenseNet-BC가 parameter 측면에서 가장 효율적인 variant였다.

또한, 동일한 수준의 정확도를 달성하는데 ResNet의 약 1/3 정도의 parameter만 이용했다.

즉, DenseNet의 model이 더 compact하고, fewer parameter를 가진다.

< Implicit Deep Supervision >

dense convolutional network의 accuracy가 향상되는 이유는

각각의 Layer들이 short connection을 통해 loss function의 추가적인 supervision을 받기 때문이다.

DenseNet을 이용해 일종의 "Deep supervision"을 수행할 수 있다.

Deep supervision의 이점은 이미 deeply-supervised nets(DSN)^[각주:26]에서 증명된 바 있다.

(모든 hidden Layer에 classifier가 더해지며, intermediate Layer들이 보다 차별적인 특징을 학습하도록 한다)

DenseNet은 implicit한 방식으로 위와 유사한 Deep supervision을 수행한다.

네트워크 상단의 single classifier는 최대 2~3개의 transition Layer를 통해

모든 Layer를 직접적으로 supervision한다.

하지만 DenseNet의 loss function과 gradient의 경우,

모든 Layer에서 같은 loss function을 공유하기 때문에 훨씬 덜 복잡하다.

< Stochastic vs. deterministic connection >

dense convolutional network와 residual network의 stochastic depth regularization사이에는

흥미로운 connection이 하나 있다.

stochastic depth에서, residual network의 Layer들은 랜덤하게 drop되어

주변 Layer들 간의 direct connection이 생성된다.

pooling Layer는 drop하지 않기 때문에 네트워크는 DenseNet과 유사한 connectivity pattern을 갖는다.

방법은 궁극적으로 다를지 몰라도, stochastic depth에 대한 DenseNet interpretation은

이러한 정규화의 성공에 대한 통찰력을 제공한다.

< Feature Reuse >

DenseNet은 현재 Layer가 이전의 모든 Layer들의 feature map에 접근할 수 있도록 설계되었다.

본 논문에서는 train된 네트워크가 이러한 opportunity를

잘 활용하는지 조사하기 위해 다음과 같은 실험을 수행했다.

먼저, C10+, k = 12, L = 40인 DenseNet을 훈련시켰다.

각 블록 내의 convolutional Layer 'l'에 대해, Layer 's'와의 연결에 할당된 average weight를 계산했다.

출처 : Densely Connected Convolutional Networks

위 Figure 5. 는 모든 3개의 dense block에 대한 heat-map을 나타낸 그림이다.

average absolute weight는 해당 Convolution Layer가 이전 Layer에 얼마나 의존하는지를 나타내는 척도이다.

위 그림에서 빨간색 점(l, s)은 Layer 'l'이 이전 Layer 's'에서 생성된 feature-map을

많이 사용한다는 것을 나타낸다.

자세한 내용은 위 그림을 더 참고하도록 하자.

6. CONCLUSION

본 논문에서는 Dense Convolutional Network, 일명 DenseNet이라고 불리는

새로운 Convolutional Network 구조에 대해 소개했다.

같은 feature map size를 가진 어떤 2개의 Layer들에 대한 direct connection에 대해 소개했으며,

optimization difficulty없이 Layer의 규모를 늘려나갈 수 있다는 것 또한 증명했다.

본 논문의 실험에서, DenseNet은

성능 저하나 overfitting 없이 parameter의 개수가 증가할수록 정확도가 향상되는 모습을 보였다.

또한 다양한 setting에서도 다른 기존 결과들에 비해 놀라운 성과를 보여주었으며,

심지어는 다른 구조들에 비해 적은 parameter 개수와 연산량으로 더 좋은 결과를 냈다.

본 논문에서 DenseNet의 parameter들은 ResNet환경에 맞게 setting되었는데,

이를 개선하면 더 좋은 결과가 나올 것으로 예상하고 있다.

DenseNet은 모든 Layer들을 연결한다는 간단한 connectivity rule을 따르면서도,

identity mapping, deep supervision, diversified depth 등의 특징을 모두 실현시켰다.

이러한 DenseNet 구조는 네트워크 전체에서

feature reuse를 더 용이하게 하고, model을 더 compact하게 만들었다.

DenseNet은 compact한 internal representations, feature redundancy 감소라는 장점을 통해

다양한 컴퓨터 비전 분야에서 훌륭한 feature extractor로 자리잡을 것이다.

< 논문 구현 >

https://beginnerdeveloper-lit.tistory.com/162

[논문구현] DenseNet (Densely Connected Convolutional Networks) 구현

DenseNet 에 대한 논문 리뷰 https://beginnerdeveloper-lit.tistory.com/161 [논문리뷰] DenseNet (Densely Connected Convolutional Networks) DenseNet https://arxiv.org/abs/1608.06993 Densely Connected Convolutional Networks Recent work has shown that

beginnerdeveloper-lit.tistory.com

REFERENCE

https://deep-learning-study.tistory.com/528

[논문 읽기] DenseNet(2017) 리뷰, Densely Connected Convolutional Networks

이번에 읽어볼 논문은 DenseNet, 'Densely Connected Convolutional Networks'입니다. DenseNet은 ResNet과 Pre-Activation ResNet보다 적은 파라미터 수로 더 높은 성능을 가진 모델입니다. DensNet은 모든 레이어의 피쳐맵

deep-learning-study.tistory.com

https://aijyh0725.tistory.com/2

Dense Net(2018)논문 정리

Dense Net 논문 정리 안녕하세요, 블로그에 처음 공부 내용을 정리하게 되었습니다. 저도 아직 가야할 길이 멀게만 느껴지지만, 열심히 공부하는 누군가가 저의 글을 보고 함께 달릴 수 있으시길

aijyh0725.tistory.com

https://ysbsb.github.io/cnn/2020/02/12/DenseNet.html

DenseNet 논문 리뷰 | mocha's machine learning

Densely Connected Convolutional Networks, Gao Huang, CVPR2017 Paper DenseNet 방법 요약 DenseNet은 이러한 short connection 방법을 바탕으로, 연속적으로 나오는 각각의 layer를 모두 연결하는 방법을 사용한다. 기존의 con

ysbsb.github.io

K. He, X. Zhang, S. Ren, and J. Sun. Deep residual learning for image recognition. In CVPR, 2016. [본문으로]
G. Huang, Y. Sun, Z. Liu, D. Sedra, and K. Q. Weinberger. Deep networks with stochastic depth. In ECCV, 2016. [본문으로]
G. Larsson, M. Maire, and G. Shakhnarovich. Fractalnet: Ultra-deep neural networks without residuals. arXiv preprint arXiv:1605.07648, 2016. [본문으로]
K. He, X. Zhang, S. Ren, and J. Sun. Deep residual learning for image recognition. In CVPR, 2016. [본문으로]
G. Huang, Y. Sun, Z. Liu, D. Sedra, and K. Q. Weinberger. Deep networks with stochastic depth. In ECCV, 2016. [본문으로]
S. E. Fahlman and C. Lebiere. The cascade-correlation learning architecture. In NIPS, 1989. [본문으로]
B. M. Wilamowski and H. Yu. Neural network learning without backpropagation. IEEE Transactions on Neural Networks, 21(11):1793–1803, 2010 [본문으로]
B. Hariharan, P. Arbeláez, R. Girshick, and J. Malik. Hyper columns for object segmentation and fine-grained localization. In CVPR, 2015. [본문으로]
J. Long, E. Shelhamer, and T. Darrell. Fully convolutional networks for semantic segmentation. In CVPR, 2015. [본문으로]
P. Sermanet, K. Kavukcuoglu, S. Chintala, and Y. LeCun. Pedestrian detection with unsupervised multi-stage feature learning. In CVPR, 2013. [본문으로]
S. Yang and D. Ramanan. Multi-scale recognition with dagcnns. In ICCV, 2015. [본문으로]
J. Deng, W. Dong, R. Socher, L.-J. Li, K. Li, and L. FeiFei. Imagenet: A large-scale hierarchical image database. In CVPR, 2009. [본문으로]
K. He, X. Zhang, S. Ren, and J. Sun. Deep residual learning for image recognition. In CVPR, 2016. [본문으로]
G. Huang, Y. Sun, Z. Liu, D. Sedra, and K. Q. Weinberger. Deep networks with stochastic depth. In ECCV, 2016. [본문으로]
C. Szegedy, W. Liu, Y. Jia, P. Sermanet, S. Reed, D. Anguelov, D. Erhan, V. Vanhoucke, and A. Rabinovich. Going deeper with convolutions. In CVPR, 2015. [본문으로]
C. Szegedy, V. Vanhoucke, S. Ioffe, J. Shlens, and Z. Wojna. Rethinking the inception architecture for computer vision. In CVPR, 2016. [본문으로]
C. Szegedy, W. Liu, Y. Jia, P. Sermanet, S. Reed, D. Anguelov, D. Erhan, V. Vanhoucke, and A. Rabinovich. Going deeper with convolutions. In CVPR, 2015. [본문으로]
C. Szegedy, V. Vanhoucke, S. Ioffe, J. Shlens, and Z. Wojna. Rethinking the inception architecture for computer vision. In CVPR, 2016. [본문으로]
M. Lin, Q. Chen, and S. Yan. Network in network. In ICLR, 2014. [본문으로]
C.-Y. Lee, S. Xie, P. Gallagher, Z. Zhang, and Z. Tu. Deeply supervised nets. In AISTATS, 2015. [본문으로]
M. Pezeshki, L. Fan, P. Brakel, A. Courville, and Y. Bengio. Deconstructing the ladder network architecture. In ICML, 2016. [본문으로]
A. Rasmus, M. Berglund, M. Honkala, H. Valpola, and T. Raiko. Semi-supervised learning with ladder networks. In NIPS, 2015. [본문으로]
J. Wang, Z. Wei, T. Zhang, and W. Zeng. Deeply-fused nets. arXiv preprint arXiv:1605.07716, 2016. [본문으로]
Y. Zhang, K. Lee, and H. Lee. Augmenting supervised neural networks with unsupervised objectives for large-scale image classification. In ICML, 2016. [본문으로]
K. He, X. Zhang, S. Ren, and J. Sun. Identity mappings in deep residual networks. In ECCV, 2016. [본문으로]
C.-Y. Lee, S. Xie, P. Gallagher, Z. Zhang, and Z. Tu. Deeply supervised nets. In AISTATS, 2015. [본문으로]

[논문구현] ResNet (Deep Residual Learning for Image Recognition) 구현

릿99 — Thu, 26 Jan 2023 12:01:55 +0900

ResNet에 대한 논문 리뷰

https://beginnerdeveloper-lit.tistory.com/159

[논문리뷰] ResNet (Deep Residual Learning for Image Recognition)

ResNet https://arxiv.org/abs/1512.03385 Deep Residual Learning for Image Recognition Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used

beginnerdeveloper-lit.tistory.com

ResNet50

출처 : Deep Residual Learning for Image Recognition

지난번 리뷰한 "Deep Residual Learning for Image Recognition" 논문의

ResNet 네트워크 중 ResNet50을 구현해보고자 한다.

ResNet50의 자세한 구조는 아래 Table을 참고하자.

출처 : Deep Residual Learning for Image Recognition

논문에서 주로 소개 및 비교하는 모델은 34-Layer지만, 필자는 50-Layer를 구현했다.

34-Layer 모델과의 차이점은 각 Convolution Layer의 앞 뒤로 1 x 1 convolution이 추가된 것인데,

이는 저번 논문 리뷰에서도 언급한 Bottleneck 구조이다.

Bottleneck 구조란 말 그대로 "병목 구조"인데,

다음과 같이 차원이 줄었다가 늘어나는 현상을 병목 현상이라 한다.

출처 : https://nearhome.tistory.com/129

ResNet에서 이야기하는 BottleNeck 구조 또한 이와 동일하다.

여기서 1 x 1 convolution은 Dimension을 줄였다가 키우는 역할을 하는데,

이는 3x3 Layer의 Input/Output Dimension을 줄이기 위해서이다.

출처 : Deep Residual Learning for Image Recognition

Environment & Parameter

❗ 해당 논문의 ResNet50 모델의 구조에 초점을 맞춰 구현하였으며,

그 외 세부적인 사항까지 완벽하게 구현하지는 못했습니다.

(image crop, mean subtract 등)

❗ 또한, 논문에 사용된 Dataset과 다른 Dataset을 사용했으므로,

Parameter들 또한 상이하다는 점 양해 부탁드립니다.

실험에 사용한 환경은 아래와 같습니다.

Language : Python

Framework : Tensorflow (GPU)

Dataset : Kaggle Dog & Cat 중 일부 사용 (train : Dog 5000, Cat 5000 / validation : Dog 2000, Cat 2000)

(https://www.kaggle.com/datasets/tongpython/cat-and-dog?select=training_set)

(Dataset 중 일부 훼손된 이미지가 있어, 해당 이미지들 필수 삭제 후 훈련 필요)

Image Size : 224 x 224 x 3

Batch Size : 32

Epoch : 50

Learning Rate : 0.001 (momentum = 0.9)

ResNet50 Code

<ResNet50 Layers Code>

코드를 보면서 ResNet50 Model의 각 Layer들을 하나씩 살펴보도록 하자.

먼저 다시 50-Layer ResNet의 구조를 보면 아래 Table과 같다.

출처 : Deep Residual Learning for Image Recognition

[ Conv 1 ]

첫번째 Convolution Layer부터 살펴보자.

이미지 input size는 224 x 224로, 64개의 filter, kernel size = 7 x 7, stride = 2를 사용한다.

stride = 2를 사용함으로서, output size = 112 x 112 가 된다.

ResNet 논문의 implementation에서

각 convolution 연산 뒤, activation 전에 Batch Normalization을 수행했기에 필자도 동일하게 구현했다.

# Expected input: 224 x 224 x 3

# Conv1 -> 1 layer: a 7x7 convolution with stride 2 halves the spatial size
conv1 = layers.Conv2D(
    filters=64,
    kernel_size=(7, 7),
    strides=2,
    padding='same',
    input_shape=(224, 224, 3),
)
x = conv1(x)  # 112x112x64
# Batch Normalization sits right after the convolution and before the activation
x = layers.BatchNormalization()(x)
x = layers.Activation('relu')(x)

[ Conv 2 ]

두번째 Convolution Layer이다.

두번째 Convolution 연산을 시작하기 전, 먼저 3 x 3 maxpooling(stride = 2)을 적용한다.

maxpooling 이후 output size는 56 x 56 이다.

# 3x3 max pooling with stride 2: 112x112x64 -> 56x56x64
x = layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same')(x)

이후 본격적인 Convolution 연산을 진행한다.

Table에 나와 있는 값에 따라 1 x 1, 3 x 3, 1 x 1 kernel size의 연산을

각각 64, 64, 256개의 filter를 이용해 계산한다.

위와 같은 계산을 3번 반복한다.

여기서 한 가지, ResNet에서 기억하고 구현해야할 중요한 요소가 하나있다.

바로 shortcut connection이다.

output에 input값을 다시 더해 residual한 구조를 갖게 하는 요소로,

input x에 대해 Layer를 거쳐 F(x) +x 라는 결과가 나오도록 해야한다.

# Remember the block input so it can be added back to the convolution output later
shortcut = x

먼저, 본격적인 Convolution 연산에 앞서,

이전 Layer의 output이자 현재 Layer의 input이 될 x를 shortcut으로 두자.

# Conv2_x -> 3 bottleneck blocks (excerpt: only the first block, i == 0, is shown here)
for i in range(3) :
    if i==0 :
        # Main path F(x): 1x1 (64) -> 3x3 (64) -> 1x1 (256), each followed by BN and ReLU
        x = layers.Conv2D(64, (1, 1), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(64, (3, 3), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(256, (1, 1), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)

        # First block only: the input has 64 channels but F(x) has 256,
        # so project the shortcut with a 1x1 convolution to make the shapes match
        shortcut = layers.Conv2D(256, (1, 1), strides=1, padding='same')(shortcut)
        shortcut = layers.BatchNormalization()(shortcut)
        shortcut = layers.Activation('relu')(shortcut)

        # F(x) + x, then the sum becomes the shortcut of the next block
        x = layers.Add()([x, shortcut])
        shortcut = x    # 56x56x256

for문은 앞서 이야기했던 convolution 연산을 3번 반복하기 위한 장치로,

ResNet의 경우, 각 Layer마다 3, 4, 6, 3번 반복한다.

우선, i == 0인 경우, 즉, 처음으로 해당 Layer의 연산을 시작할 경우를 살펴보자.

input x에 대해 위 Table대로 Convolution, BN, Activation 연산을 거친 x는 최종적으로 F(x)라 하자.

이렇게 연산을 거친 output x이자 F(x)의 dimension은 기존 input x(=shortcut)과 달라졌으므로,

동일한 dimension으로 맞춰주는 연산이 필요하다.

따라서, input, 즉, shortcut에 output F(x)에 마지막으로 적용한 연산을 적용하여 dimension을 맞춰준다.

이렇게 shortcut(input)과 x(=F(x)/output)의 dimension이 맞춰졌다면,

input인 shortcut에 대해 Layer를 거쳐 나온 F(x)라는 결과에

입력인 shortcut을 그대로 다시 더해준다. (F(x) + x( = shortcut))

그리고 이렇게 생성된 output x는 다시 다음 Layer의 input으로 전해지게 되고,

다음 Layer의 shortcut(=input)은 현재의 x값으로 초기화해준다.

# Conv2_x -> 3 bottleneck blocks (excerpt: the i == 0 branch is elided, see the previous snippet)
for i in range(3) :
    if i==0 :
        ...

    else :
        # Main path F(x): 1x1 (64) -> 3x3 (64) -> 1x1 (256), each followed by BN and ReLU
        x = layers.Conv2D(64, (1, 1), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(64, (3, 3), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(256, (1, 1), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        # The shortcut already has 256 channels here, so it is added without a projection
        x = layers.Add()([x, shortcut])
        shortcut = x    # 56x56x256

이제 i == 0이 아닌 경우를 살펴보자.

해당 경우에도 사실 마찬가지이다.

단, 여기서는 shortcut의 dimension이 output인 F(x)(=x)의 dimension과 동일하므로,

위와 같이 shortcut의 dimension을 맞춰주는 연산이 불필요하다.

이 외에는 위 작성한 코드와 동일하다.

2번째 Convolution Layer에 대한 전체 코드는 다음과 같다.

# 3x3 max pooling with stride 2 precedes the Conv2_x stage
x = layers.MaxPool2D((3, 3), 2, padding='same')(x)  # 56x56x64
# Keep the stage input; it is added back to the output of the first block
shortcut = x

# Conv2_x -> 3 bottleneck blocks, each 1x1 (64) -> 3x3 (64) -> 1x1 (256)
# NOTE(review): ReLU is applied to both branches before Add here; the reference
# ResNet design adds first and applies ReLU afterwards - confirm this is intended.
for i in range(3) :
    if i==0 :
        x = layers.Conv2D(64, (1, 1), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(64, (3, 3), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(256, (1, 1), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)

        # First block only: project the shortcut so its shape matches F(x),
        # 56x56x64 -> 56x56x256
        shortcut = layers.Conv2D(256, (1, 1), strides=1, padding='same')(shortcut)
        shortcut = layers.BatchNormalization()(shortcut)
        shortcut = layers.Activation('relu')(shortcut)

        # F(x) + x, then the sum becomes the shortcut of the next block
        x = layers.Add()([x, shortcut])
        shortcut = x    # 56x56x256

    else :
        x = layers.Conv2D(64, (1, 1), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(64, (3, 3), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(256, (1, 1), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        # Shapes already match, so the shortcut is added as-is
        x = layers.Add()([x, shortcut])
        shortcut = x    # 56x56x256

[ Conv 3 ]

세번째 Convolution Layer이다.

세번째 Convolution Layer부터 5번째 Convolution Layer까지는 사실 2번째 Layer에서 작성한 코드에서

Table에 있는 filter개수 및 kernel size, 반복횟수만 바꿔주면 된다.

위와 동일한 설명이므로 Convolution 3 ~5 Layer까지의 설명은 생략하겠다.

# Conv3_x -> 4 bottleneck blocks, each 1x1 (128) -> 3x3 (128) -> 1x1 (512)
for i in range(4) :
    if i==0 :
        # The first 1x1 convolution uses stride 2 to halve the spatial size (56 -> 28)
        x = layers.Conv2D(128, (1, 1), strides=2, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(128, (3, 3), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(512, (1, 1), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)

        # First block only: project the shortcut so its shape matches F(x),
        # 56x56x256 -> 28x28x512
        shortcut = layers.Conv2D(512, (1, 1), strides=2, padding='same')(shortcut)
        shortcut = layers.BatchNormalization()(shortcut)
        shortcut = layers.Activation('relu')(shortcut)

        x = layers.Add()([x, shortcut])
        shortcut = x    # 28x28x512

    else :
        x = layers.Conv2D(128, (1, 1), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(128, (3, 3), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(512, (1, 1), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        # Shapes already match, so the shortcut is added as-is
        x = layers.Add()([x, shortcut])
        shortcut = x    # 28x28x512

[ Conv 4 ]

# Conv4_x -> 6 bottleneck blocks, each 1x1 (256) -> 3x3 (256) -> 1x1 (1024)
for i in range(6) :
    if i==0 :
        # The first 1x1 convolution uses stride 2 to halve the spatial size (28 -> 14)
        x = layers.Conv2D(256, (1, 1), strides=2, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(256, (3, 3), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(1024, (1, 1), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)

        # First block only: project the shortcut so its shape matches F(x),
        # 28x28x512 -> 14x14x1024
        shortcut = layers.Conv2D(1024, (1, 1), strides=2, padding='same')(shortcut)
        shortcut = layers.BatchNormalization()(shortcut)
        shortcut = layers.Activation('relu')(shortcut)

        x = layers.Add()([x, shortcut])
        shortcut = x    # 14x14x1024

    else :
        x = layers.Conv2D(256, (1, 1), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(256, (3, 3), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(1024, (1, 1), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        # Shapes already match, so the shortcut is added as-is
        x = layers.Add()([x, shortcut])
        shortcut = x    # 14x14x1024

[ Conv 5 ]

# Conv5_x -> 3 bottleneck blocks, each 1x1 (512) -> 3x3 (512) -> 1x1 (2048)
for i in range(3) :
    if i==0 :
        # The first 1x1 convolution uses stride 2 to halve the spatial size (14 -> 7)
        x = layers.Conv2D(512, (1, 1), strides=2, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(512, (3, 3), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(2048, (1, 1), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)

        # First block only: project the shortcut so its shape matches F(x),
        # 14x14x1024 -> 7x7x2048
        shortcut = layers.Conv2D(2048, (1, 1), strides=2, padding='same')(shortcut)
        shortcut = layers.BatchNormalization()(shortcut)
        shortcut = layers.Activation('relu')(shortcut)

        x = layers.Add()([x, shortcut])
        shortcut = x    # 7x7x2048

    else :
        x = layers.Conv2D(512, (1, 1), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(512, (3, 3), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(2048, (1, 1), strides=1, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        # Shapes already match, so the shortcut is added as-is
        x = layers.Add()([x, shortcut])
        shortcut = x    # 7x7x2048

[ FC Layer ]

마지막으로 네트워크의 끝단 연산부분에 대해 살펴보자.

여기서는 average pooling, 1000-d fc, softmax를 수행한다.

단, 필자는 class가 2개이므로, 이 부분은 dataset에 맞게 변경이 필요하다.

    # Global average pooling reduces the 7x7x2048 feature map to a 2048-d vector
    # (the counterpart of AdaptiveAvgPool in PyTorch)
    x = layers.GlobalAveragePooling2D()(x)
    # Classifier head: 2 classes with softmax (replaces the 1000-d fc used in the paper)
    x = layers.Dense(2, activation='softmax')(x)

    return x

<Entire ResNet50 Model Code>

'''
< ResNet Architecture>
- ResNet "50"-layer
- 5_x Layer (1,3,4,6,3)
- skip connection
- Sequential model X
- Batch Normalization right after each convolution and before activation
'''

def _bottleneck_block(x, filters, strides, project):
    """Apply one bottleneck residual block and return F(x) + shortcut.

    Args:
        x: Input Keras tensor; it also serves as the shortcut branch.
        filters: Filter count of the 1x1 and 3x3 convolutions; the closing
            1x1 convolution (and the projection, if any) uses 4 * filters.
        strides: Stride of the first 1x1 convolution and of the projection
            shortcut. 2 halves the spatial size, 1 keeps it.
        project: When True, the shortcut goes through a 1x1 convolution so
            that its shape matches the main path (first block of a stage).

    Returns:
        The Keras tensor produced by adding the main path and the shortcut.
    """
    shortcut = x

    # Main path: 1x1 -> 3x3 -> 1x1, with BN right after each convolution
    # and before the activation.
    main_path = (
        (filters, (1, 1), strides),
        (filters, (3, 3), 1),
        (4 * filters, (1, 1), 1),
    )
    for n_filters, kernel_size, stride in main_path:
        x = layers.Conv2D(n_filters, kernel_size, strides=stride, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)

    # The projection is created after the main path, which keeps the layer
    # creation order of the original unrolled implementation.
    # NOTE(review): ReLU is applied on both branches before the addition; the
    # reference ResNet design adds first and applies ReLU afterwards. Kept
    # as-is to preserve the behaviour of this implementation.
    if project:
        shortcut = layers.Conv2D(4 * filters, (1, 1), strides=strides, padding='same')(shortcut)
        shortcut = layers.BatchNormalization()(shortcut)
        shortcut = layers.Activation('relu')(shortcut)

    return layers.Add()([x, shortcut])


def ResNet(x):
    """Build the ResNet50-style graph on top of the input tensor ``x``.

    Args:
        x: Input Keras tensor; the code is written for 224 x 224 x 3 images.

    Returns:
        Output tensor of the final 2-class softmax layer.
    """
    # Conv1: 7x7/2 convolution -> BN -> ReLU, then 3x3/2 max pooling
    x = layers.Conv2D(64, (7, 7), strides=2, padding='same', input_shape=(224, 224, 3))(x)  # 112x112x64
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.MaxPool2D((3, 3), 2, padding='same')(x)  # 56x56x64

    # (filters, number of blocks, stride of the first block) for Conv2_x .. Conv5_x.
    # Output shapes per stage: 56x56x256, 28x28x512, 14x14x1024, 7x7x2048.
    stages = (
        (64, 3, 1),
        (128, 4, 2),
        (256, 6, 2),
        (512, 3, 2),
    )
    for filters, n_blocks, first_stride in stages:
        for i in range(n_blocks):
            is_first = i == 0
            # Only the first block of a stage changes the shape, so only it
            # downsamples and projects its shortcut.
            x = _bottleneck_block(
                x,
                filters,
                strides=first_stride if is_first else 1,
                project=is_first,
            )

    # 2048 (same with AdaptiveAvgPool in Pytorch)
    x = layers.GlobalAveragePooling2D()(x)
    # classes = 2
    x = layers.Dense(2, activation='softmax')(x)

    return x

<Entire Code>

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
import numpy as np
import cv2
import matplotlib.pyplot as plt
from tensorflow.python.client import device_lib

# Restrict TensorFlow to the first GPU. This is done before any device is
# queried so that the setting is in place when the GPUs get initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs,
        # so enable it on every visible GPU rather than only on gpus[0].
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

# Report the devices only after the configuration above: enumerating the local
# devices may initialise the GPUs, after which memory growth cannot be changed.
print(device_lib.list_local_devices())
# NOTE(review): tf.test.is_gpu_available() is deprecated in favour of
# tf.config.list_physical_devices('GPU'); kept so the script's calls stay the same.
tf.test.is_gpu_available()

'''
< ResNet Architecture>
- ResNet "50"-layer
- 5_x Layer (1,3,4,6,3)
- skip connection
- Sequential model X
- Batch Normalization right after each convolution and before activation
'''

def _bottleneck_block(x, filters, stride=1, project=False):
    """Apply one bottleneck residual block (1x1 -> 3x3 -> 1x1 convolutions).

    Args:
        x: input feature-map tensor of the block.
        filters: number of filters of the first two convolutions; the last
            1x1 convolution outputs ``4 * filters`` channels.
        stride: stride of the first 1x1 convolution and, when ``project`` is
            True, of the shortcut convolution.
        project: if True, the shortcut goes through a 1x1 convolution +
            batch normalization so its shape matches the main path;
            otherwise the block input is added unchanged.

    Returns:
        The tensor ``relu(F(x) + shortcut)``.
    """
    shortcut = x

    x = layers.Conv2D(filters, (1, 1), strides=stride, padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.Conv2D(filters, (3, 3), strides=1, padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.Conv2D(4 * filters, (1, 1), strides=1, padding='same')(x)
    x = layers.BatchNormalization()(x)

    if project:
        # Projection shortcut: match the channel count (and the spatial size
        # when stride > 1) of the main path. No ReLU here: the nonlinearity
        # is applied once, after the addition.
        shortcut = layers.Conv2D(4 * filters, (1, 1), strides=stride, padding='same')(shortcut)
        shortcut = layers.BatchNormalization()(shortcut)

    x = layers.Add()([x, shortcut])
    return layers.Activation('relu')(x)


def ResNet(x):
    """Build the ResNet-50 style graph on top of the input tensor ``x``.

    Stages use (3, 4, 6, 3) bottleneck blocks. The first block of every stage
    uses a projection shortcut; the remaining blocks use identity shortcuts.
    Batch normalization follows each convolution, and ReLU follows the
    residual addition.

    Args:
        x: Keras input tensor; the shape comments below assume 224 x 224 x 3.

    Returns:
        Output tensor of the final 2-unit softmax layer.
    """
    # input = 224 x 224 x 3

    # Conv1 -> 1
    # (the input shape comes from the Input tensor, so no input_shape kwarg)
    x = layers.Conv2D(64, (7, 7), strides=2, padding='same')(x)  # 112x112x64
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.MaxPool2D((3, 3), 2, padding='same')(x)  # 56x56x64

    # (filters, number of blocks, stride of the first block) per stage:
    #   Conv2_x -> 3 blocks, 56x56x256
    #   Conv3_x -> 4 blocks, 28x28x512
    #   Conv4_x -> 6 blocks, 14x14x1024
    #   Conv5_x -> 3 blocks, 7x7x2048
    stages = ((64, 3, 1), (128, 4, 2), (256, 6, 2), (512, 3, 2))
    for filters, num_blocks, first_stride in stages:
        for i in range(num_blocks):
            is_first = i == 0
            x = _bottleneck_block(x,
                                  filters,
                                  stride=first_stride if is_first else 1,
                                  project=is_first)

    # 2048 (same with AdaptiveAvgPool in Pytorch)
    x = layers.GlobalAveragePooling2D()(x)
    # classes = 2
    x = layers.Dense(2, activation='softmax')(x)

    return x


# Dataset (Kaggle Cat and Dog Dataset)
# Both splits live under the same root directory and are read with the same
# iterator settings, so the settings are declared once and reused.
dataset_path = os.path.join('/home/kellybjs/Cat_Dog_Dataset')
flow_kwargs = dict(shuffle=True,
                   target_size=(224, 224),
                   batch_size=32,
                   class_mode='categorical')

# Training split
train_dataset_path = dataset_path + '/train_set'
train_data_generator = ImageDataGenerator(rescale=1. / 255)
train_dataset = train_data_generator.flow_from_directory(train_dataset_path,
                                                         **flow_kwargs)

# Validation split
valid_dataset_path = dataset_path + '/validation_set'
valid_data_generator = ImageDataGenerator(rescale=1. / 255)
valid_dataset = valid_data_generator.flow_from_directory(valid_dataset_path,
                                                         **flow_kwargs)



# Symbolic input tensor of the functional model (224 x 224 x 3, float32)
input_shape = layers.Input(shape=(224, 224, 3), dtype='float32', name='input')
# Train
model = tf.keras.Model(input_shape, ResNet(input_shape))
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9),
              loss='categorical_crossentropy',
              metrics=['acc'])
model.summary()
# Model.fit accepts the directory iterators directly; fit_generator is
# deprecated in favor of fit.
train = model.fit(train_dataset, epochs=50, validation_data=valid_dataset)

# Accuracy graph (figure 1) and Loss graph (figure 0): each figure shows the
# training curve and the validation curve of one metric from the fit history.
for figure_id, metric, title, y_label, out_file in (
        (1, 'acc', 'Accuracy', 'accuracy', 'ResNet_Accuracy_1.png'),
        (0, 'loss', 'Loss', 'loss', 'ResNet_Loss_1.png'),
):
    plt.figure(figure_id)
    plt.plot(train.history[metric])
    plt.plot(train.history['val_' + metric])
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.savefig(out_file)

Result

ResNet Accuracy

위 코드를 적용한 Train 및 Validation Accuracy 결과이다.

Train 시에는 최대 약 90%, Validation 시에는 약 60~70%의 정확도가 나오는 것을 볼 수 있다.

Validation 정확도가 비교적 높지 않아, 파라미터 수정 및 재학습 예정이다.

ResNet Loss

위 코드를 적용한 Train 및 Validation Loss 결과이다.

두 그래프 모두 점차 Loss 가 줄어드는 것이 보이나, Validation의 경우 후반에 많이 진동하는 점이 아쉽다.

이 또한 위 Accuracy 와 함께 보완해야할 점으로 보인다.

[논문리뷰] ResNet (Deep Residual Learning for Image Recognition)

릿99 — Wed, 25 Jan 2023 18:49:47 +0900

ResNet

https://arxiv.org/abs/1512.03385

Deep Residual Learning for Image Recognition

Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with

arxiv.org

1. INTRODUCTION

Deep convolutional neural network(DCNN)는 이미지 분류 문제를 해결하는데 큰 기여를 해왔다.

최근에는 네트워크의 깊이(depth)가 중요한 포인트로 떠올랐으며,

ImageNet dataset과 같은 challenge에서 "매우 깊은" 모델들이 좋은 성능을 보여주었다. ^[각주:1]^[각주:2]

그렇다면, 이렇듯 여러 Layer를 깊게 쌓은 네트워크가 학습 시 더 용이할까?

결론부터 이야기하자면, 아니다.

ResNet의 전체적인 아이디어 및 구조는 바로 이 의문에서부터 시작한다.

깊게 Layer를 쌓는데 있어 많은 문제들이 있는데, 그 중 하나가 바로 vanishing/exploding gradient 문제^[각주:3]이다.

하지만 해당 문제는 그동안 많이 제기 되어왔고, normalization 기법 등으로 어느정도 해결이 가능했다.

출처 : Deep Residual Learning for Image Recognition

하지만, Layer가 일정 깊이보다 깊어질 경우 또 다른 문제가 발생하게 되는데,

바로 Degradation 문제다.

Degradation이란, 네트워크의 깊이가 깊어질수록 정확도가 점차 안정화되다가 빠르게 감소하는 현상이다.

상단의 그림을 보자.

Figure 1. 은 각각 20-Layer와 56-Layer의 training error와 test error를 나타낸 그림이다.

이 역시 degradation으로 인해 training과 test 부분 모두에서 더 깊은 Layer가 더 높은 error를 보였다.

overfitting(과적합)으로 인한 문제일 것 같지만, 사실은 깊은 Layer로 인해 발생하는 문제이다.

(overfitting이면, 깊은 Layer의 train accuracy는 높고 test accuracy는 낮아야 한다)

해당 논문에서는 이 Degradation 문제를 해결하기 위해

Deep residual learning이라는 framework를 제안했다.

몇몇 Layer가 기존 mapping으로 바로 적합되는 것이 아닌, residual mapping 하도록 했다.

여기서 residual mapping이라는 개념이 잘 이해가 가지 않을 텐데, 아래 그림을 자세히 보자.

출처 : https://gaussian37.github.io/dl-concept-resnet/

왼쪽 (a)가 기존 방식, 오른쪽 (b)가 Residual 방식이다.

왼쪽이 일반적인 구조(Plain Network)의 네트워크로,

input x에 대해 Layer를 거쳐 H(x)라는 결과가 나오게 된다.

반면, 오른쪽 Residual 구조에 따르면,

input x에 대해 Layer를 거쳐 나온 F(x)라는 결과에 입력인 x를 그대로 다시 더해준다.

즉, output에 input 값을 다시 더해주는 residual한 구조를 가지며,

input x에 대해 Layer를 거쳐 F(x) + x 라는 결과가 나오게 된다.

여기서, 하나 이상의 Layer를 건너뛰는 것을 "shortcut connection"이라 하는데,

identity x는 이러한 shortcut connection을 통해 mapping 되었다고 보면 된다.

여기서 한 가지 더 설명을 덧붙이자면, residual이란 "잔여, 잔차"를 뜻하는데,

잔여인 input 값을 output에 한번 더 더해준다는 느낌으로 받아들이면 편하겠다.

이렇듯 Residual mapping 구조는 input $x$ 의 값을 그대로 전달 받아 작은 변화에도 더 민감하게 반응하며,

ImageNet dataset에 해당 구조를 적용함으로서 다음과 같은 결과를 입증했다.

1. 제안된 Deep Residual Networks은 Optimize하기 쉬운 반면,

기존의 Plain Networks는 깊이가 깊어질수록 더 높은 train error가 나타난다.

2. 제안된 Deep Residual Networks은 기존 네트워크와 비교했을 때,

네트워크의 깊이가 깊고, 보다 높은 정확도를 보였다.

2. RELATED WORK

2.1. RESIDUAL REPRESENTATIONS

이미지 인식분야에서, VLAD^[각주:4]란, Residual vector를 이용하여 encoding 하는 방법이다.

Low-level 비전 및 그래픽스 분야에서 Partial Differential Equation(PDEs)를 해결하기 위해

여러 개의 Scale로 Subproblems를 재구성하는 Multigrid Method를 사용했으며,

이 과정에서 Residual Vector를 사용했다.

2.2. SHORTCUT CONNECTIONS

이전 연구들 중에서도 shortcut connection을 이용한 방법들은 다양하게 존재했다.

MLP에서 Linear Layer에 Input과 Output을 연결해주는 방법,^[각주:5]^[각주:6]

Inception Module에서도 shortcut branch를 사용한 것^[각주:7] 등 여러 연구를 통해 이용되어 왔다.

3. DEEP RESIDUAL LEARNING

3.1. RESIDUAL LEARNING

H(x)를 몇개의 Layer에 대한 기존 mapping 방식의 결과라 하자.

multiple nonlinear layer가 복잡한 함수를 추정한다는 말은, Residual 함수를 추정할 수 있다는 것과 같다.

위에서 언급한 것처럼, Residual Function F(x) = H(x) - x라 하면, 기존 H(x) = F(x) + x 이다.

즉, Residual Function에 input x를 더한 것을 추정하는 과정이 되는 것이다.

이러한 재정의는 Degradation 문제에서 비롯되었다.

앞선 introduction에서 언급했듯 identity mapping을 이용해 Layer를 추가하면,

깊이가 깊은 model의 training error는 얕은 model의 training error보다 크지 않아야 한다.

또한 Degradation 문제는 multiple nonlinear layer를 통해

identity mapping을 추정하는 데 있어 어려움을 겪을 것을 암시한다.

하지만 identity mapping이 optimal 하다면, Residual Learning을 이용해 이를 해결할 수 있다.

(사실 현실적인 부분에서 identity Mapping이 적합한 경우는 많지않다)

적합한 함수가 Identity Function과 유사할 때,

즉 Zero Mapping을 하는 것이 아닐 때, 작은 변화에 더 쉽게 반응할 수 있을 것이다.

3.2. IDENTITY MAPPING BY SHORTCUTS

출처 : Deep Residual Learning for Image Recognition

앞선 그림에서의 building block을 식으로 정의하면 식(1)과 같다.

x, y는 각각 layer의 input, output vector, 함수 F(x, {W_i})는 residual mapping function이다.

F + x는 shortcut connection, element wise addition을 통해 계산되며, 이후 Nonlinear function을 거친다.

이러한 shortcut connection은 추가적인 Parameter를 생성하지 않으며,

computation complexity 또한 증가시키지 않는다.

단, 위 식(1)은 x 와 F의 dimension이 같아야 적용할 수 있다.

x, F의 dimension이 같지 않은 경우, linear projection W_s를 통해 dimension을 맞춰줄 수 있다.

출처 : Deep Residual Learning for Image Recognition

위 식 (2)는 (1)과 아이디어는 동일하지만, 입, 출력 Layer의 dimension이 다른 경우의 식이다.

위에서 언급한 것처럼, x와 F의 dimension이 다르므로 x에 linear projection W_s를 적용했다.

Residual Function F의 구조는 유연하다. (본 논문에서 F는 2~3 Layer 구조로 이루어져 있다)

다만 만약 F가 single Layer일 경우 y = W_1x +x로 정의할 수 있는데,

이 경우 식(1)은 단순한 linear Layer와 매우 유사해지며, 논문에서는 이에 대한 이점을 관찰하지 못했다.

3.3. NETWORK ARCHITECTURES

출처 : Deep Residual Learning for Image Recognition

위 그림은 각각 VGG-19, 34-layer plain model, 34-layer residual model을 나타낸 그림이다.

각각의 모델의 특징 및 차이점에 대해 간단히 알아보자.

< PLAIN NETWORK >

그림의 Plain baseline은 VGG network에서 영감을 받은 34개의 Layer를 가진 모델이다.

Convolution Layer는 거의 모두 3x3 Filter로 이루어져 있으며, 다음과 같은 2가지 design rule을 따른다.

1) output feature map size가 같은 Layer들은 모두 같은 개수의 필터를 가진다.

2) 만약 feature map size가 절반이 되면, 각 Layer의 시간 복잡도를 유지하기 위해 filter의 개수는 두배가 된다.

Stride=2 Convolution을 이용해 Downsampling을 진행하였으며,

네트워크 마지막 단에는 global average pooling Layer, 1000-way FC-Layer, softmax 를 수행한다.

따라서 총 weighted Layer는 34이다.

이 baseline model은 VGG network보다 복잡도도 낮고 Parameter 수도 적다.

< RESIDUAL NETWORK >

Residual network는 위 Plain network에 기반하여, shortcut connection을 추가했다.

input과 output의 dimension이 같은 경우, identity shortcut이 바로 적용되며,

만약 같지 않은 경우, 다음과 같은 2가지 옵션이 있다.

1) shortcut이 identity mapping을 그대로 진행하고, 증가된 dimension 만큼 추가적으로 제로 패딩한다.

2) 위 식(2)의 projection shortcut을 이용해 dimension을 맞춰준다.

1)의 방식은 추가적인 파라미터를 생성하지 않으며,

두 옵션 모두 shortcut이 서로 다른 두 size의 feature map을 가로지르는 경우에는 stride=2로 수행한다.

3.4. IMPLEMENTATION

Input : 224 x 224 RGB image

([256, 480] resize 후, random crop.

crop이미지 또는 Horizontal Flip 이미지 사용하며, 각 픽셀마다 평균을 subtract)

Batch Normalization : 각 convolution 이후, activation 전에 수행

Batch size : SGD with mini-batch size=256

Learning rate : 0.1 and divided by 10 when error plateaus

Weight Decay : 0.0001 (Momentum of 0.9)

Dropout : X

4. EXPERIMENTS

4.1. IMAGENET CLASSIFICATION

1000개의 class로 구성된 ImageNet 2012 classification dataset을 통해 평가했다.

(1.28 million training images, and evaluated on the 50k validation images)

최종 결과는 100k test images를 사용했다.

< PLAIN NETWORK >

출처 : Deep Residual Learning for Image Recognition

해당 논문에서는 18개 Layer, 34개 Layer의 Plain Network를 사용하였다. 위 사진은 각 모델의 구조이다.

출처 : Deep Residual Learning for Image Recognition

위 사진은 각각 train error(얇은선), validation error(굵은선)를 나타낸 사진이다.

그림을 보면 알 수 있듯, 34-Layer plain model이 18-Layer plain model보다 validation error가 높았다.

다만 논문에서는 이러한 Optimization의 어려움이 vanishing gradients 때문일 가능성은 낮다고 본다. (BN을 사용해 학습했으며, forward/backward signal이 사라지지 않는 것을 확인했다)

해당 논문에서는 더 깊은 Plain Network가 아주 작은 Convergence Rates를 가져서

Training Error가 줄어드는 것이 약하다고 추측하고 있다.

< RESIDUAL NETWORK >

출처 : Deep Residual Learning for Image Recognition

반면, ResNet의 경우, 34-Layer보다 18-Layer model이 validation error가 높았다.

즉, 모델이 깊어지면서 생기는 성능 악화 문제를 해결했다.

또한 위 표에 따르면, ResNet이 Plain Network보다 3.5% 낮은 Error를 보였으며,

ResNet이 Plain Network보다 초반 단계에서 빠르게 convergence 되는 것을 확인할 수 있었다.

< IDENTITY VS. PROJECTION SHORTCUTS >

출처 : Deep Residual Learning for Image Recognition

지금까지의 training에서 identity shortcut 방식을 사용했다면,

이번에는 projection shortcut에 대해 알아보자.

위 Table 3. 을 보면, 다음과 같은 3가지 옵션을 두고 비교했다.

(A) Zero Padding을 통해 Dimension 증가 (parameter free)

(B) Projection을 통해 Dimension 증가

(C) 모든 Shortcut을 Projection

위 3가지 경우 모두 plain network보다 성능이 좋았으며,

B가 A보다 조금 더 나은 성능을 보였다.

이는 A의 경우 zero-padding 과정에서 Residual Learning을 하지 않아서이다.

또한, C의 경우 B보다 성능이 조금 더 좋았는데, 이는 많은 Projection Shortcut(13개)에 의해 Extra Parameter가 생겼기 때문이다.

즉 성능은 A < B < C 의 순으로 좋았다.

하지만 이 세 가지 모두 성능 악화 문제에 대해서는 그닥 중요하지 않았다.

따라서 이 논문의 이후 실험에서는 memory/time complexity와 모델 크기를 줄이기 위해 C를 사용하지 않았다.

< DEEPER BOTTLENECK ARCHITECTURES >

다음으로, ImageNet을 위해 더 깊게 만든 모델에 대해 알아보자.

Training time에 대한 우려로, Bottleneck Design 이라는 building block을 사용했으며,

각 Residual Function F에 대해 3개 Layer를 쌓은 구조를 사용했다.

3개의 Layer들은 각각 1 x 1, 3 x 3, 그리고 다시 1 x 1의 Convolution을 수행한다.

여기서 1x1 Layer는 Dimension을 줄였다가 키우는 역할을 하는데,

이는 3x3 Layer의 Input/Output Dimension을 줄이기 위해서이다.

자세한 구조는 아래 그림을 보자. (두 그림 모두 비슷한 time-complexity를 가진다.)

출처 : Deep Residual Learning for Image Recognition

여기서 Identity Shortcuts은 BottleNeck 구조에서 아주 중요하다.

만약 오른편의 그림에서, Identity Shortcut이 Projection으로 바뀐다면

해당 모델의 Time Complexity와 크기는 두배가 된다.

따라서 identity shortcut은 Bottleneck 디자인의 효율적인 구조를 위해 필수적인 요소이다.

< 50-LAYER RESNET >

기존 34-Layer network의 2-layer block을 3-layer Bottleneck block으로 바꾸었다.

이렇게 바꾼 구조가 50-Layer ResNet이다.

Dimension 증가를 위해 옵션(B)를 선택했다.

< 101-LAYER AND 152-LAYER RESNETS >

더 많은 3-Layer block을 이용해 101, 152 Layer ResNet을 만들었다.

깊이를 많이 늘렸음에도 불구하고, 152-Layer ResNet은 VGG16/19 보다 낮은 complexity를 갖는다.

출처 : Deep Residual Learning for Image Recognition

위 Table 4.를 보면, 50/101/152-Layer ResNet은 34-Layer 보다 상당히 정확한 성능을 보인다.

Degradation 문제로 인한 성능 악화는 발생하지 않았으며,

모든 evaluation metrics에서 깊이가 깊어질수록 좋은 성능이 나타나는 것을 확인했다.

< COMPARISONS WITH STATE-OF-ART METHODS >

Table 4.에 따르면, 이전의 좋은 성능을 보였던 모델보다 ResNet이 더 좋은 성능을 보인 것을 알 수 있다.

해당 논문의 baseline인 34-Layer ResNet의 경우, 훨씬 더 정확한 성능을 보였으며,

152-Layer ResNet은 아주 적은 오차율을 보였다.

출처 : Deep Residual Learning for Image Recognition

또한 Ensemble Learning을 이용하여 더 낮은 Error를 달성했다.

4.2. CIFAR-10 AND ANALYSIS

ImageNet Dataset뿐만 아니라 CIFAR-10 Dataset에 대해서도 실험을 진행했다.

(50k training images, 10k testing images in 10 classes)

앞선 모델과 비슷한 구조로 모델을 구성했다.

모델 구조 순서는 다음과 같다.

1. input은 32 x 32 이미지로, 각 픽셀에서 평균값을 subtract

2. 첫 번째 Layer는 3 x 3 Convolution을 수행 이어서 6n개의 Layer들 stack

3. 6n개의 Layer들에도 3 x 3 Convolution을 수행

(각각의 feature map size는 32, 16, 8으로 각각 2n개의 Layer. 필터 개수는 각각 16, 32, 64)

4. subsampling은 stride=2 convolution으로 수행

5. 네트워크 끝단에 global average pooling, 10-way FC layer, softmax 수행

위 과정을 통해 총 6n+2 stacked weighted layers 구조가 만들어지게 된다.

다음은 각 output map size에 대한 구조를 나타낸 표이다.

출처 : Deep Residual Learning for Image Recognition

shortcut connection을 이용하게 되면, 한 쌍의 3 x 3 Layer에 연결된다. (총 3n개의 shortcut)

해당 Dataset에서는 모든 경우에 identity shortcut을 사용했으며,

제안된 residual model들은 대응하는 plain model과 정확히 같은 depth, width, parameter 개수를 갖는다.

실험에 사용한 environment들은 다음과 같다.

Data augmentation : 4 pixel padded on each side

& 32 x 32 crop is randomly sampled from the padded image or its horizontal flip

Batch Normalization : O

Batch size : 128 on two GPUs

Learning rate : 0.1

(divided by 10 at 32k and 48k iterations, and terminate training at 64k)

Weight Decay : 0.0001 (Momentum of 0.9)

Dropout : X

본 논문에서는 n이 각각 3, 5, 7, 9일 때, 20, 32, 44, 56개의 Layer로 이루어진 모델들을 비교했다.

아래 그림을 보자.

출처 : Deep Residual Learning for Image Recognition

가장 왼쪽부터 차례대로 Plain network, ResNet, ResNet110, 1202 Layers이다.

먼저 왼쪽의 Plain Network 그림을 보자.

모델의 깊이가 깊어질수록 training error가 증가하며,

앞서 설명한 ImageNet과 MNIST 데이터 셋과 비슷한 현상이 나타나는 것을 확인할 수 있다.

가운데는 ResNet의 양상을 보여주는 그림이다.

이 또한 ImageNet의 결과와 비슷하며,

optimization difficulty와 깊이가 깊어질수록 정확도가 감소하는 문제를 해결했다.

마지막 그림은 ResNet 110, 1202 Layers의 양상을 보여주는 그림이다.

이처럼 굉장히 깊은 Layer(110-Layer)의 경우, 초기 learning rate 0.1은 수렴을 시작하기에는 다소 큰 값이기에,

Training error가 80% 미만이 될 때까지(약 400 iterations)는 0.01로 warm up 한 뒤, 다시 0.1로 되돌려 학습을 이어갔다.

(나머지 학습 환경은 이전과 동일)

그 결과 모델이 잘 수렴하는 것으로 나타났다.

이 모델의 경우 다른 FitNet, Highway보다 적은 파라미터 수를 가졌고, 더 높은 성능을 보였다.

< ANALYSIS OF LAYER RESPONSES >

출처 : Deep Residual Learning for Image Recognition

위 그림은 Layer responses의 standard deviations(std) 를 나타낸다.

여기서 response 는 각 3 x 3 Layer의 결과로, nonlinearity(RELU/addition) 이전이자 BN 이후 값이다.

ResNet에서, 이 분석값은 residual function의 response 강도를 나타낸다.

Figure 7. 을 보면, ResNet은 일반적으로 Plain Net보다 작은 response를 보였다.

이는 앞서 얘기했던 non-residual function보다 residual function이 0에 더 가깝다는 점과 동일하다.

ResNet 20, 56, 110을 비교해보면, 더 깊은 ResNet이 더 작은 response를 가진다는 것 또한 알 수 있다.

즉, Layer가 깊어질수록 signal이 적게 변화하는 경향이 나타났다.

< EXPLORING OVER 1000 LAYERS >

그렇다면 1000개 이상의 Layer를 가진 아주 깊은 모델은 어떨까?

앞선 Figure 6. 에서 언급된 n = 200 인 1202-Layer network를 보자.

비교적 더 얕은 모델인 110-Layer의 경우와 Training Error는 비슷하지만,

Testing Error는 더 깊은 모델인 1202-Layer가 더 안좋은 것을 확인할 수 있다.

분명 Residual Function을 사용하면, 깊은 모델일수록 더 잘 학습하는 것 같아 보였는데 왜일까?

논문에서는 해당 현상의 원인으로 Overfitting을 꼽고 있다.

1202-Layer는 작은 dataset에 비해 필요이상으로 크고,

maxout, dropout과 같은 강한 Regularization이 좋은 성능을 내는 것을 보였지만,^[각주:8]

해당 논문에서는 이를 사용하지 않고 비교적 심플한 구조를 사용했다.

이러한 현상에 대해서는 더 연구해 볼 예정이라고 한다.

4.3. OBJECT DETECTION ON PASCAL VOC AND MS COCO

출처 : Deep Residual Learning for Image Recognition

해당 method는 recognition에서도 좋은 성능을 보였다.

위 Table 7, Table 8.은 PASCAL VOC 2007, 2012, COCO Dataset에서의 객체 검출 결과이다.

검출 method로는 Faster R-CNN 모델을 사용했으며, VGG-16을 ResNet-101로 교체했다.

결과적으로 이는 다른 모델들보다 더 높은 성능을 보였고,

이를 통해 ResNet이 Representation을 잘 학습하여 성능이 향상된 것을 확인할 수 있다.

<논문 구현>

ResNet 논문 구현은 아래 링크를 참고하자.

https://beginnerdeveloper-lit.tistory.com/160

[논문구현] ResNet (Deep Residual Learning for Image Recognition) 구현

ResNet에 대한 논문 리뷰 https://beginnerdeveloper-lit.tistory.com/159 [논문리뷰] ResNet (Deep Residual Learning for Image Recognition) ResNet https://arxiv.org/abs/1512.03385 Deep Residual Learning for Image Recognition Deeper neural networks ar

beginnerdeveloper-lit.tistory.com

REFERENCE

https://codebaragi23.github.io/machine%20learning/3.-ResNet-paper-review/

ResNet 논문 리뷰

논문명: Deep Residual Learning for Image Recognition

codebaragi23.github.io

https://jxnjxn.tistory.com/22

[논문 리뷰] Deep Residual Learning for Image Recognition - ResNet(1)

ResNet ResNet 이라는 이름으로 더 유명한 논문을 리뷰해보겠습니다. 최고의 빅데이터 분석 동아리 '투빅스' 과제 겸사겸사 하는 리뷰입니다. (사실 이게 아니라 구현 과제를 해야되는데 어렵네요...

jxnjxn.tistory.com

https://developnote.tistory.com/112

[논문 리뷰] ResNet: Deep Residual Learning for Image Recognition

layer가 깊으면서 현재 가장 많이 쓰이는 ResNet을 알아보자. Introduction 오늘 리뷰 할 논문은 K. He가 마이크로소프트에 있을 때 발표한 ResNet의 첫 번째 논문이다. Convolutional Layer가 깊을수록 더 복잡

developnote.tistory.com

https://gaussian37.github.io/dl-concept-resnet/

ResNet (Residual Network)

gaussian37's blog

gaussian37.github.io

https://deep-math.tistory.com/18

[논문 리뷰] ResNet

Deep Residual Learning for Image Recognition Abstract 모델이 깊어질수록 학습을 하는 것도 더 어려워진다. 이를 해결하기 위해 이 논문에서는 Residual Learning Framework를 소개한다. 이를 통해 모델의 깊이가 깊

deep-math.tistory.com

S. Ioffe and C. Szegedy. Batch normalization: Accelerating deep network training by reducing internal covariate shift. In ICML, 2015. [본문으로]
K. Simonyan and A. Zisserman. Very deep convolutional networks for large-scale image recognition. In ICLR, 2015. [본문으로]
vanishing/exploding gradient 문제란, 기울기를 구하기 위해 손실함수의 미분을 오차역전파법(Backpropagation)으로 구하는 과정에서 발생하는 문제로, 해당 과정에서 활성화 함수의 편미분을 곱해줌으로서 발생한다. Layer가 깊을수록 vanishing 문제가 커지며, 미분값이 클수록 exploding 문제가 커지게 된다. [본문으로]
H. Jegou, F. Perronnin, M. Douze, J. Sanchez, P. Perez, and C. Schmid. Aggregating local image descriptors into compact codes. TPAMI, 2012. [본문으로]
B. D. Ripley. Pattern recognition and neural networks. Cambridge university press, 1996. [본문으로]
W. Venables and B. Ripley. Modern applied statistics with s-plus.1999 [본문으로]
C. Szegedy, W. Liu, Y. Jia, P. Sermanet, S. Reed, D. Anguelov, D. Erhan, V. Vanhoucke, and A. Rabinovich. Going deeper with convolutions. In CVPR, 2015. [본문으로]
I. J. Goodfellow, D. Warde-Farley, M. Mirza, A.Courville, and Y. Bengio. Maxout networks. arXiv:1302.4389, 2013. [본문으로]

[논문구현] VGG16 (Very Deep Convolutional Networks for Large-Scale Image Recognition) 구현

릿99 — Fri, 20 Jan 2023 11:55:33 +0900

VGG16에 대한 논문 리뷰

https://beginnerdeveloper-lit.tistory.com/157

[논문리뷰] VGG16 (Very Deep Convolutional Networks for Large-Scale Image Recognition)

VGG16 https://arxiv.org/abs/1409.1556 Very Deep Convolutional Networks for Large-Scale Image Recognition In this work we investigate the effect of the convolutional network depth on its accuracy in the large-scale image recognition setting. Our main contri

beginnerdeveloper-lit.tistory.com

VGG16

출처 : Very Deep Convolutional Networks for Large-Scale Image Recognition

저번에 리뷰한 "Very Deep Convolutional Networks for Large-Scale Image Recognition" 논문의

VGG16 네트워크를 구현해보고자 한다.

VGG16의 구조는 Table 1의 D와 같으며, 자세한 그림으로 살펴보면 아래와 같다.

출처 : https://neurohive.io/en/popular-networks/vgg16/

VGG16의 16은 16-Layer, 13개의 Convolution Layer + 3개의 Fully-Connected Layer를 의미한다.

224 x 224 x 3 (RGB) 이미지를 input으로 받아 위와 같은 과정을 거쳐 인식하게 된다.

자세한 내용은 상단 링크의 VGG16 논문 리뷰 글을 참고하자.

Environment & Parameter

❗ 해당 논문의 VGG16 모델의 구조에 초점을 맞춰 구현하였으며,

그 외 세부적인 사항까지 완벽하게 구현하지는 못했습니다.

(image crop, single/multi scale evaluation 등)

❗ 또한, 논문에 사용된 Dataset과 다른 Dataset을 사용했으므로,

Parameter들 또한 상이하다는 점 양해 부탁드립니다.

실험에 사용한 환경은 아래와 같습니다.

Language : Python

Framework : Tensorflow (GPU)

Dataset : Kaggle Dog & Cat 중 일부 사용 (train : Dog 5000, Cat 5000 / validation : Dog 2000, Cat 2000)

(https://www.kaggle.com/datasets/tongpython/cat-and-dog?select=training_set)

(Dataset 중 일부 훼손된 이미지가 있어, 해당 이미지들 필수 삭제 후 훈련 필요)

Image Size : 224 x 224 x 3

Batch Size : 32

Epoch : 50

VGG16 Code

<VGG16 Model Code>

'''
< VGG16 model Architecture>
- 13 convolution Layers + 3 fully-connected Layers
- 3x3 convolution filter, stride = 1
- 2x2 max pooling
- ReLU
'''

def VGG16():
    """Build and compile the VGG16-style Sequential classifier.

    The network stacks 13 3x3 convolutions (stride 1, 'same' padding, ReLU)
    in five groups, each followed by 2x2 max pooling, then the dense head.
    It is compiled with SGD (learning rate 0.01), categorical cross-entropy
    loss and the 'acc' metric.

    Returns:
        The compiled ``tf.keras`` Sequential model with a 2-way softmax output.
    """
    # tf.set_random_seed only exists in TensorFlow 1.x; the compat.v1 alias is
    # available in both 1.x and 2.x and sets the same global seed.
    tf.compat.v1.set_random_seed(2)
    model = tf.keras.models.Sequential([
        # input = 224 x 224 x 3

        # 224 x 224 x 64
        layers.Conv2D(64, (3, 3), strides=1, padding='same', activation='relu', input_shape=(224, 224, 3)),
        layers.Conv2D(64, (3, 3), strides=1, padding='same', activation='relu'),

        # 112 x 112 x 64
        layers.MaxPool2D((2, 2), padding='same'),

        # 112 x 112 x 128
        layers.Conv2D(128, (3, 3), strides=1, padding='same', activation='relu'),
        layers.Conv2D(128, (3, 3), strides=1, padding='same', activation='relu'),

        # 56 x 56 x 128
        layers.MaxPool2D((2, 2), padding='same'),

        # 56 x 56 x 256
        layers.Conv2D(256, (3, 3), strides=1, padding='same', activation='relu'),
        layers.Conv2D(256, (3, 3), strides=1, padding='same', activation='relu'),
        layers.Conv2D(256, (3, 3), strides=1, padding='same', activation='relu'),

        # 28 x 28 x 256
        layers.MaxPool2D((2, 2), padding='same'),

        # 28 x 28 x 512
        layers.Conv2D(512, (3, 3), strides=1, padding='same', activation='relu'),
        layers.Conv2D(512, (3, 3), strides=1, padding='same', activation='relu'),
        layers.Conv2D(512, (3, 3), strides=1, padding='same', activation='relu'),

        # 14 x 14 x 512
        layers.MaxPool2D((2, 2), padding='same'),

        # 14 x 14 x 512
        layers.Conv2D(512, (3, 3), strides=1, padding='same', activation='relu'),
        layers.Conv2D(512, (3, 3), strides=1, padding='same', activation='relu'),
        layers.Conv2D(512, (3, 3), strides=1, padding='same', activation='relu'),

        # 7 x 7 x 512
        layers.MaxPool2D((2, 2), padding='same'),

        # 1 x 1 x 25088
        layers.Flatten(),
        layers.Dropout(0.5),

        # 1 x 1 x 4096
        layers.Dense(4096, activation='relu'),

        # 1 x 1 x 4096
        layers.Dense(4096, activation='relu'),

        # 1 x 1 x 1000
        # NOTE(review): with the 2-way layer below this makes four dense
        # layers, while the architecture note lists three fully-connected
        # layers - confirm whether this 1000-unit layer is intended.
        layers.Dense(1000, activation='relu'),

        # 1 x 1 x 2
        layers.Dense(2, activation='softmax'),
    ])
    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.01),
                  loss='categorical_crossentropy',
                  metrics=['acc'])

    return model

<Entire Code>

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
import numpy as np
import cv2
import matplotlib.pyplot as plt
from tensorflow.python.client import device_lib

# Restrict TensorFlow to the first GPU. This has to happen before anything
# initializes the GPUs, otherwise the setting is ignored.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs,
        # so enable it on every visible GPU (not only on gpus[0]).
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

# Listing the local devices initializes them, so it is done only after the
# configuration above; doing it first makes set_memory_growth fail.
print(device_lib.list_local_devices())
tf.test.is_gpu_available()

'''
< VGG16 model Architecture>
- 13 convolution Layers + 3 fully-connected Layers
- 3x3 convolution filter, stride = 1
- 2x2 max pooling
- ReLU
'''

def VGG16():
    """Build and compile the VGG16-style cat/dog classifier.

    The convolutional part is five blocks of 3x3 convolutions
    (stride 1, 'same' padding, ReLU) with 64, 128, 256, 512 and 512
    filters, each block followed by a 2x2 max pooling layer; 13
    convolution layers in total on a 224 x 224 x 3 input.

    The classifier head differs from the paper in three ways that are
    kept as-is here:
      * a single Dropout(0.5) is applied to the flattened features
        instead of to the first two fully-connected layers,
      * the 1000-unit layer is a hidden ReLU layer rather than the output,
      * a final 2-unit softmax layer produces the class probabilities.

    Returns:
        A `tf.keras.models.Sequential` model compiled with SGD
        (learning rate 0.01), categorical cross-entropy loss and the
        'acc' metric.
    """
    # `tf.set_random_seed` only exists in the TF 1.x top-level namespace;
    # the compat.v1 alias sets the same global seed and also works on TF 2.x.
    tf.compat.v1.set_random_seed(2)
    model = tf.keras.models.Sequential([
        # input = 224 x 224 x 3

        # 224 x 224 x 64
        layers.Conv2D(64, (3, 3), strides=1, padding='same', activation='relu', input_shape=(224, 224, 3)),
        layers.Conv2D(64, (3, 3), strides=1, padding='same', activation='relu'),

        # 112 x 112 x 64
        layers.MaxPool2D((2, 2), padding='same'),

        # 112 x 112 x 128
        layers.Conv2D(128, (3, 3), strides=1, padding='same', activation='relu'),
        layers.Conv2D(128, (3, 3), strides=1, padding='same', activation='relu'),

        # 56 x 56 x 128
        layers.MaxPool2D((2, 2), padding='same'),

        # 56 x 56 x 256
        layers.Conv2D(256, (3, 3), strides=1, padding='same', activation='relu'),
        layers.Conv2D(256, (3, 3), strides=1, padding='same', activation='relu'),
        layers.Conv2D(256, (3, 3), strides=1, padding='same', activation='relu'),

        # 28 x 28 x 256
        layers.MaxPool2D((2, 2), padding='same'),

        # 28 x 28 x 512
        layers.Conv2D(512, (3, 3), strides=1, padding='same', activation='relu'),
        layers.Conv2D(512, (3, 3), strides=1, padding='same', activation='relu'),
        layers.Conv2D(512, (3, 3), strides=1, padding='same', activation='relu'),

        # 14 x 14 x 512
        layers.MaxPool2D((2, 2), padding='same'),

        # 14 x 14 x 512
        layers.Conv2D(512, (3, 3), strides=1, padding='same', activation='relu'),
        layers.Conv2D(512, (3, 3), strides=1, padding='same', activation='relu'),
        layers.Conv2D(512, (3, 3), strides=1, padding='same', activation='relu'),

        # 7 x 7 x 512
        layers.MaxPool2D((2, 2), padding='same'),

        # 1 x 1 x 25088 (= 7 * 7 * 512 flattened features)
        layers.Flatten(),
        layers.Dropout(0.5),

        # 1 x 1 x 4096
        layers.Dense(4096, activation='relu'),

        # 1 x 1 x 4096
        layers.Dense(4096, activation='relu'),

        # 1 x 1 x 1000
        layers.Dense(1000, activation='relu'),

        # 1 x 1 x 2 (two-class probabilities)
        layers.Dense(2, activation='softmax'),
    ])
    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.01),
                  loss='categorical_crossentropy',
                  metrics=['acc'])

    return model


# Dataset (Kaggle Cat and Dog Dataset)
# Both splits are loaded with identical options; only the sub-directory differs.
dataset_path = os.path.join('/home/kellybjs/Cat_Dog_Dataset')
_flow_options = dict(shuffle=True,
                     target_size=(224, 224),
                     batch_size=32,
                     class_mode='categorical')

# Training split: every pixel value is multiplied by 1/255 on load.
train_dataset_path = dataset_path + '/train_set'
train_data_generator = ImageDataGenerator(rescale=1. / 255)
train_dataset = train_data_generator.flow_from_directory(train_dataset_path,
                                                         **_flow_options)

# Validation split: same rescaling and batching as the training split.
valid_dataset_path = dataset_path + '/validation_set'
valid_data_generator = ImageDataGenerator(rescale=1. / 255)
valid_dataset = valid_data_generator.flow_from_directory(valid_dataset_path,
                                                         **_flow_options)

# Train
print("Start Train!")
model = VGG16()
model.summary()
train = model.fit_generator(train_dataset, validation_data=valid_dataset, epochs=50)

# Accuracy graph (figure 1) and Loss graph (figure 2).
# Each row: figure number, training key, validation key, title, y-label,
# output file name, and the message printed after saving.
_curves = (
    (1, 'acc', 'val_acc', 'Accuracy', 'accuracy',
     'VGG16_Accuracy_1.png', "Saved Accuracy graph"),
    (2, 'loss', 'val_loss', 'Loss', 'loss',
     'VGG16_Loss_1.png', "Saved Loss graph"),
)
for _fig, _train_key, _valid_key, _title, _ylabel, _png, _message in _curves:
    plt.figure(_fig)
    # Training curve first, validation curve second, matching the legend order.
    plt.plot(train.history[_train_key])
    plt.plot(train.history[_valid_key])
    plt.title(_title)
    plt.ylabel(_ylabel)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.savefig(_png)
    print(_message)

# Persist the trained model to an HDF5 file.
model.save('VGG16.h5')

Result

VGG16 Accuracy

위 코드를 적용한 Train 및 Validation Accuracy 결과이다.

Train 시에는 최대 약 90%, Validation 시에는 약 80%의 정확도가 나오는 것을 볼 수 있다.

추후 다룰 테지만, Fine Tuning 시 최대 99% 정도의 정확도가 도출될 정도로 우수한 성능을 보인다.

VGG16 Loss

위 코드를 적용한 Train 및 Validation Loss 결과이다.

두 그래프 모두 점차 Loss 가 줄어드는 것이 보이나, Validation의 경우 후반에 많이 진동하는 점이 아쉽다.

이 또한 Fine Tuning 시 0에 가까운 Loss가 도출되며, 보다 우수한 성능을 보인다.

[논문리뷰] VGG16 (Very Deep Convolutional Networks for Large-Scale Image Recognition)

릿99 — Thu, 19 Jan 2023 20:14:41 +0900

VGG16

https://arxiv.org/abs/1409.1556

Very Deep Convolutional Networks for Large-Scale Image Recognition

In this work we investigate the effect of the convolutional network depth on its accuracy in the large-scale image recognition setting. Our main contribution is a thorough evaluation of networks of increasing depth using an architecture with very small (3x

arxiv.org

1. INTRODUCTION

Convolution networks(ConvNets)는 large-scale 이미지 및 영상 인식에서 큰 성과를 이루었다.

이는 ImageNet과 같은 대용량 이미지 데이터셋과 GPU같은 고성능의 컴퓨팅 시스템,

그리고 ImageNet Large-Scale Visual Recognition Challenge(ILSVRC)과 같은 대회의 개최 등으로 인해 가능한 일이었다.

ConvNets가 컴퓨터 비전 분야에서 점차 주목받게 되면서,

original architecture를 개선하기 위한 수많은 시도들이 이루어졌는데,

그 예로 첫 번째 Convolution Layer에서 작은 receptive window size 와 작은 stride를 사용한 방법이 있다.

ILSVRC-2013에 제출된 위 방법은 뛰어난 성능을 보여주었으며,

이 외에도 전체 이미지와 multiple scales에 걸쳐 밀접하게 훈련 및 테스트하는 방법도 있다.

이 논문에서는, ConvNet의 디자인 구조 중 다른 중요한 면에 초점을 맞췄는데, 바로 깊이(depth)이다.

파라미터들을 고정시키고, 모든 Layer에 아주 작은 convolution filter(3 x 3) 을 적용함으로써,

Convolution Layer를 더해나가며 네트워크의 깊이를 점진적으로 늘려나갔다.

이는 결과적으로 더욱 정확한 ConvNet 구조를 실현시켰으며, 다른 이미지 인식 데이터셋에서도 우수한 성능을 보여주었다.

2. CONVNET CONFIGURATIONS

2.1. ARCHITECTURE

Input : 224 x 224 RGB image

Preprocessing : 각 픽셀에서 Train Set으로 계산한 평균 RGB 값을 subtract

Convolution Layer : filter size= 3 x 3, stride = 1 (model C의 경우, filter size= 1 x 1, stride = 1 존재)

Padding : 1 pixel for 3 x 3 conv.layers

Pooling : 5 max pooling layers, 2 x 2 pixel window with stride = 2

FC-Layer : 처음 2개의 Layer는 4096 채널, 3번째 Layer는 1000 채널(class 수), 마지막은 softmax

2.2. CONFIGURATIONS

출처 : Very Deep Convolutional Networks for Large-Scale Image Recognition

A, A-LRN, B, C, D, E까지 총 6개의 model을 나누어 비교했으며, conv1은 1 x 1 filter, conv3은 3 x 3 filter를 의미한다.

Table 1. ConvNet configurations의 model C를 보면, conv1, 즉 1 x 1 convolution layer를 볼 수 있는데,

이는 receptive field에 영향을 미치지 않고 모델의 비선형성을 증가시키는 방법 중 하나로 볼 수 있다.

위 표 중 model D에 해당하는 구조를 그림으로 나타내보면 다음과 같다.

출처 : https://neurohive.io/en/popular-networks/vgg16/

위와 같이 224 x 224 x 3 (RGB) 이미지를 input으로 받아 총 16개의 Layer에 걸쳐 인식하게 된다.

여기서 이야기하는 ConvNet, 즉 VGG16의 16은 16-Layer,

13개의 Convolution Layer + 3개의 Fully-Connected Layer를 의미한다.

2.3. DISCUSSION

해당 논문에서는 첫 번째 convolution layer에서 큰 receptive field와 stride

(11 x 11 with stride=4, 7 x 7 with stride=2)를 사용하지 않고,

전체 network에 걸쳐 비교적 작은 receptive field와 stride를 사용했다. (3 x 3 with stride=1)

그렇다면, 왜 이렇게 비교적 작은 사이즈의 3 x 3 convolution filter를 사용했을까?

하나의 7 x 7 filter를 사용하는 것보다 3개의 3 x 3 filter를 쌓아 사용하는 이유가 무엇일까?

그 이유는 아래와 같다.

1. 비선형성 (Non-linear)

단일 Layer 대신 3개의 3 x 3 convolution filter를 쌓아 사용하게 되면,

Decision function이 더욱 차이를 잘 분간하게 만들 수 있는

non-linear rectification layer 3개를 포함하는 것과 같다.

즉, 비선형성을 증가시켜, 해당 모델이 이미지를 더 잘 식별하도록 한다.

2. 파라미터 개수 감소 (Decrease number of parameters)

비교적 큰 사이즈의 단일 필터 하나를 사용하는 것이 파라미터의 개수가 적을 것 같지만,

실제 각각의 필터에 대한 파라미터의 개수를 계산해보면

입력과 출력의 채널 수를 모두 C라 할 때, 7 x 7 filter 1개에 대한 학습 파라미터 수는 49C^2 (7 x 7 x C x C), 3 x 3 filter 3개에 대한 학습 파라미터 수는 27C^2 (3 x (3 x 3 x C x C))이다.

즉, 3개의 3 x 3 filter가 하나의 7 x 7 filter보다 훨씬 적은 파라미터를 갖는다.

이 논문은 비교적 작은 크기의 Convolution Filter를 사용했다는 점, Network의 깊이를 늘렸다는 점이 특징이다.

그렇다면, 다른 유사한 아이디어를 가진 논문들과의 차별점은 무엇일까?

먼저, 비교적 작은 크기의 convolution filter를 사용한 방식은 이전에도 존재했다.

하지만, 해당 논문에서처럼 깊게 Layer를 쌓지 못했으며, 대용량 ILSVRC 데이터셋에도 적용하지 못했다.^[각주:1]

다른 연구에서는 깊은 ConvNet을 적용함으로써, 깊이를 깊게 만들면 더 나은 성능을 보일 수 있다는 점을 보였다.

하지만 이는 해당 논문에 비교하면 비교적 적은 Layer(11 weight layers)를 가졌다.^[각주:2]

GoogLeNet^[각주:3]은 ILSVRC-2014의 classification 부문에서 가장 좋은 성능을 낸 모델로,

깊은 ConvNet, 작은 convolution filter를 이용했다는 점에서 매우 유사하다.

(22 weight layers and use 3 x 3, 1 x 1, 5 x 5 convolutions)

3. CLASSIFICATION FRAMEWORK

3.1. TRAINING

Optimization: multinomial logistic regression using mini-batch gradient descent with momentum

(batch size = 256, momentum to 0.9)

Regularization : weight decay(L2 Norm with 5*10^-4), dropout = 0.5 (for first 2 FC-Layer)

Learning rate : 0.01 (단, validation set에 대한 accuracy가 증가하지 않을 시, 1/10로 감소

해당 실험에서는 3번 감소하였으며, 74 epoch 학습)

가중치가 잘못 초기화된 경우 학습이 불안정해질 수 있기 때문에, 깊이가 깊은 네트워크에서 초기 네트워크 가중치 설정은 매우 중요하다.

이러한 문제를 피하고자, 해당 논문에서는 Table1.의 모델 A를 선학습했다.

이렇듯 먼저 학습된 모델 A를 이용해, 더 깊은 구조의 모델들을 학습 시킬 경우,

처음 4개의 Convolution Layer와 마지막 3개의 Fully Connected Layer를 A의 Layer로 초기화했다.

또한, ConvNet의 input 이미지로 일정한 크기의 이미지(224 x 224)를 얻기 위해,

(256 x 256) ~ (512 x 512) 사이의 사이즈로 rescale된 이미지들에서 랜덤하게 crop하여 사용했다.

(SGD iteration 당 하나의 crop image)

이와 같은 방법은 한정적인 데이터의 양을 늘리고,

하나의 object에 대해 다양한 면을 학습시킴으로써 classification 정확도를 향상시켰다.

또한 Training set을 강화하기 위해, 랜덤으로 좌우 반전 및 RGB color shift를 적용하기도 했다.

3.2. TESTING

오버피팅을 방지하기 위해, Train과 Test의 구조가 조금 다르다.

가장 주요한 차이점은 Test시, Fully-Connected Layer는 먼저 Convolution Layer로 변환된다는 것이다.

(첫 번째 FC Layer는 7 x 7 Convolution Layer, 마지막 2개의 FC Layer는 1 x 1 Convolution Layer로 변환)

이렇게 변환된 네트워크를 Fully-Convolutional Net(FCN) 이라고 부르며,

FCN이 적용됨에 따라 input 이미지 사이즈에 대한 제약이 사라지며, 테스트 시 이미지를 crop 할 필요가 없어진다.

또한, 하나의 이미지를 다양한 크기로 변환 및 조합하여 이미지 classification 정확도를 개선할 수 있다.

3.3. IMPLEMENTATION DETAILS

해당 논문의 실험에서는 Multi-GPU training을 통해

병렬로 처리되는 여러 GPU 배치들로 각각의 training 이미지를 분할하여 사용했다.

GPU batch gradient를 계산 후 full batch의 gradient를 얻기 위해 평균을 계산했으며,

이는 평균화된 뒤 합쳐져서 full batch에 적용되었기 때문에 하나의 GPU를 사용한 것과 똑같은 결과가 나왔다.

사용한 GPU는 NVIDIA Titan black GPU 4개로, 하나의 네트워크를 훈련하는데 약 2~3주가 소요되었다.

4. CLASSIFICATION EXPERIMENTS

Dataset : 1000 class image 포함. 다음과 같이 3set로 나뉨

Training set (1.3M images)

Validation set (50K images)

Testing set (100K images)

Classification performance : 2 measures (top-1 error, top-5 error)

4.1. SINGLE SCALE EVALUATION

출처 : Very Deep Convolutional Networks for Large-Scale Image Recognition

Single Scale과 Multi-Scale은 이미지의 크기를 하나로 고정할지, 또는 여러개를 사용할지를 나타낸다.

위 표는 single-scale test 이미지를 적용했을 때의 결과를 나타낸 표이다.

상단의 표를 통해 첫 번째로, 모델 A와 A-LRN를 비교한 결과,

Local Response Normalisation(LRN)이 성능을 향상시키지 못한다는 것을 알 수 있다.

따라서, B~E 모델에서는 LRN을 적용하지 않았다.

두 번째로, 네트워크의 깊이가 깊어질수록 결과가 좋아진다는 것을 확인할 수 있다.

따라서 가장 깊은 모델인 E의 오차가 가장 적게 나왔으며, 얕은 모델인 A로 갈수록 오차가 커졌다.

마지막으로, 이미지 스케일에 변화를 주는 것(S = 256 ~ 512)이 성능이 보다 높다는 것을 확인할 수 있다.

이를 통해, 스케일 변화(scale jittering)를 사용한 데이터 증강이 훈련 시 도움이 된다는 것을 확인했다.

4.2. MULTI SCALE EVALUATION

출처 : Very Deep Convolutional Networks for Large-Scale Image Recognition

위 표는 multi-scale test 이미지를 적용했을 때의 결과를 나타낸 표이다.

jittering을 적용한 경우, 테스트 이미지의 크기는 [256, 384, 512]로 정했으며,

jittering을 적용하지 않은 것보다 결과가 좋음을 확인할 수 있다.

또한, single-scale 보다는 multi-scale의 결과가 좋다는 것을 확인할 수 있다.

4.3. MULTI CROP EVALUATION

출처 : Very Deep Convolutional Networks for Large-Scale Image Recognition

위 표는 Multi-crop evaluation과 Dense-evaluation, 그리고 이 두 기법을 합친 결과를 나타낸 표이다.

두 기법의 soft-max output을 평균화해서 complementarity를 평가했으며,

multi-crop 방식이 dense 방식보다 좋은 성능을 보이나 연산량이 많다는 점이 단점이다.

결과적으로는 두 기법의 조합이 각각의 기법보다 나은 성능을 보였다.

4.4. CONVNET FUSION

출처 : Very Deep Convolutional Networks for Large-Scale Image Recognition

앙상블을 통해 여러 모델을 조합한 결과를 나타낸 표이다.

Softmax class posterior을 평균화하여 여러 모델의 output을 조합, 실험한 결과,

모델의 보완성으로 인해 성능이 향상된 것을 확인 할 수 있다.

4.5. COMPARISON WITH THE STATE OF THE ART

출처 : Very Deep Convolutional Networks for Large-Scale Image Recognition

타 모델과의 성능을 비교한 결과를 나타낸 표이다.

다른 모델들에 비해 우수한 성능을 보여준 것을 확인할 수 있다.

5. CONCLUSION

논문에서는 깊은 Convolutional 네트워크(최대 19 weight layers)를 통해

대용량 이미지 classification을 수행했다.

이는 네트워크의 깊이가 classification 정확도에 효과적이며,

평범한 ConvNet 구조를 이용해 ImageNet challenge dataset과 같은 데이터셋을 분류할 수 있음을 입증했다.

< 논문 구현 >

논문 구현은 아래 링크를 참고하자.

https://beginnerdeveloper-lit.tistory.com/158

[논문구현] VGG16 (Very Deep Convolutional Networks for Large-Scale Image Recognition) Image Classification 구현

VGG16에 대한 논문 리뷰 https://beginnerdeveloper-lit.tistory.com/157 [논문리뷰] VGG16 (Very Deep Convolutional Networks for Large-Scale Image Recognition) VGG16 https://arxiv.org/abs/1409.1556 Very Deep Convolutional Networks for Large-Scale Ima

beginnerdeveloper-lit.tistory.com

REFERENCE

https://medium.com/@msmapark2/vgg16-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0-very-deep-convolutional-networks-for-large-scale-image-recognition-6f748235242a

VGG16 논문 리뷰 — Very Deep Convolutional Networks for Large-Scale Image Recognition

VGG-16 모델은 ImageNet Challenge에서 Top-5 테스트 정확도를 92.7% 달성하면서 2014년 컴퓨터 비전을 위한 딥러닝 관련 대표적 연구 중 하나로 자리매김하였다.

medium.com

https://mountain96.tistory.com/37

[논문 리뷰] VGG16(ICLR 2015) 요약 및 리뷰

서론 1. 배경 Convolutional networks(ConvNets)는 대용량 이미지 및 영상 인식에서 큰 발전을 이룩하였는데, 이는 ImageNet과 같은 대용량 공공 이미지 데이터셋이 가능해지고, GPU와 같은 고성능 컴퓨팅 시

mountain96.tistory.com

https://ahnty0122.tistory.com/72

[논문 리뷰] Very Deep Convolutional Networks for Large-Scale Image Recognition 리뷰, VGG Net

무려 1년전에 정리해놓은 논문 올리기 ㅎㅅㅎ Image Recognition에 입문할 때 좋은 논문이라고 생각한다. Very Deep Convolutional Networks for Large-Scale Image Recognition arxiv.org/abs/1409.1556 Very Deep Convolutional Networks

ahnty0122.tistory.com

https://codebaragi23.github.io/machine%20learning/1.-VGGNet-paper-review/

VGGNet 논문 리뷰

논문명: Very Deep Convolutional Networks for Large-Scale Image Recognition

codebaragi23.github.io

Ciresan, D. C., Meier, U., Masci, J., Gambardella, L. M., and Schmidhuber, J. Flexible, high performance convolutional neural networks for image classification. In IJCAI, pp. 1237–1242, 2011. [본문으로]
Goodfellow, I. J., Bulatov, Y., Ibarz, J., Arnoud, S., and Shet, V. Multi-digit number recognition from street view imagery using deep convolutional neural networks. In Proc. ICLR, 2014. [본문으로]
Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., and Rabinovich, A. Going deeper with convolutions. CoRR, abs/1409.4842, 2014. [본문으로]