This post was originally written in Notion. Some images and templates may not render correctly here, so please see https://western-hortensia-8e0.notion.site/ViTPose-a8b89326bd144dd09172039e7ce9eac3?pvs=4 for the full version.
https://github.com/JunkyByte/easy_ViTPose
The model splits into two parts: the backbone (a plain ViT encoder) and the head (TopdownHeatmapSimpleHead), which acts as the decoder.
model.py
class ViTPose(nn.Module):
    def __init__(self, cfg: dict) -> None:
        super(ViTPose, self).__init__()

        backbone_cfg = {k: v for k, v in cfg['backbone'].items() if k != 'type'}
        head_cfg = {k: v for k, v in cfg['keypoint_head'].items() if k != 'type'}

        # backbone: plain ViT encoder
        self.backbone = ViT(**backbone_cfg)
        # head: deconvolution decoder that predicts keypoint heatmaps
        self.keypoint_head = TopdownHeatmapSimpleHead(**head_cfg)

    def forward_features(self, x):
        return self.backbone(x)

    def forward(self, x):
        return self.keypoint_head(self.backbone(x))
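To make the wiring concrete, here is a minimal sketch of how the model could be driven end to end. The config values below are illustrative assumptions modeled after the published ViTPose-B settings (256x192 input, 17 COCO keypoints), not copied from the repo, so the exact kwargs may need adjusting.

import torch

# hypothetical minimal config; the real cfg comes from the easy_ViTPose config files
cfg = {
    'backbone': dict(type='ViT', img_size=(256, 192), patch_size=16,
                     embed_dim=768, depth=12, num_heads=12, ratio=1),
    'keypoint_head': dict(type='TopdownHeatmapSimpleHead', in_channels=768,
                          num_deconv_layers=2, num_deconv_filters=(256, 256),
                          num_deconv_kernels=(4, 4), out_channels=17),
}

model = ViTPose(cfg)
dummy = torch.randn(1, 3, 256, 192)   # one cropped person image
heatmaps = model(dummy)               # expected shape: (1, 17, 64, 48) keypoint heatmaps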
vit.py
class PatchEmbed(nn.Module):
    """ Image to Patch Embedding
    """
    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, ratio=1):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) * (ratio ** 2)
        self.patch_shape = (int(img_size[0] // patch_size[0] * ratio), int(img_size[1] // patch_size[1] * ratio))
        self.origin_patch_shape = (int(img_size[0] // patch_size[0]), int(img_size[1] // patch_size[1]))
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = num_patches

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size,
                              stride=(patch_size[0] // ratio), padding=4 + 2 * (ratio // 2 - 1))

    def forward(self, x, **kwargs):
        B, C, H, W = x.shape
        # downsampling: strided convolution splits the image into patches and projects them
        x = self.proj(x)
        Hp, Wp = x.shape[2], x.shape[3]
        # flatten the spatial grid into a token sequence: (B, embed_dim, Hp, Wp) -> (B, Hp*Wp, embed_dim)
        x = x.flatten(2).transpose(1, 2)
        return x, (Hp, Wp)
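As a sanity check on the shapes, here is a small standalone sketch of the same idea (strided convolution followed by flattening); the 256x192 input size is just an assumed example.

import torch
import torch.nn as nn

# a 256x192 crop with 16x16 patches gives a 16x12 grid of tokens
proj = nn.Conv2d(3, 768, kernel_size=16, stride=16)
x = torch.randn(1, 3, 256, 192)
feat = proj(x)                             # (1, 768, 16, 12)
tokens = feat.flatten(2).transpose(1, 2)   # (1, 192, 768): 192 tokens of dim 768
print(feat.shape, tokens.shape)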
vit.py
class Block(nn.Module):
    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None,
                 drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm, attn_head_dim=None
                 ):
        super().__init__()
        # norm layer
        self.norm1 = norm_layer(dim)
        # attention
        self.attn = Attention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
            attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim
        )
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        # mlp (FFN)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

    def forward(self, x):
        # skip connections around attention and FFN (pre-norm transformer block)
        x = x + self.drop_path(self.attn(self.norm1(x)))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x
class Attention(nn.Module):
    def __init__(
            self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
            proj_drop=0., attn_head_dim=None,):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.dim = dim
        if attn_head_dim is not None:
            head_dim = attn_head_dim
        all_head_dim = head_dim * self.num_heads
        self.scale = qk_scale or head_dim ** -0.5

        # query, key, value projections (a single linear layer producing all three)
        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(all_head_dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x)
        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
        # scaling
        q = q * self.scale
        # dot product of query and key matrices gives the attention scores
        attn = (q @ k.transpose(-2, -1))
        # softmax
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)
        # weight the value matrix by the attention scores
        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
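A quick shape check of the attention module, assuming ViT-B sized tokens; this instantiation is illustrative and not taken from the repo's configs.

import torch

attn = Attention(dim=768, num_heads=12, qkv_bias=True)
tokens = torch.randn(2, 192, 768)   # (batch, num_patches, embed_dim) from PatchEmbed
out = attn(tokens)
print(out.shape)                    # torch.Size([2, 192, 768]) -- the token shape is preserved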
topdown_heatmap_base_head.py
def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
    """Make deconv layers."""
    layers = []
    for i in range(num_layers):
        # look up kernel size, padding and output padding for this layer
        kernel, padding, output_padding = \
            self._get_deconv_cfg(num_kernels[i])
        planes = num_filters[i]
        # deconvolution (transposed convolution) layer: upsamples by a factor of 2
        layers.append(
            nn.ConvTranspose2d(in_channels=self.in_channels,
                               out_channels=planes,
                               kernel_size=kernel,
                               stride=2,
                               padding=padding,
                               output_padding=output_padding,
                               bias=False))
        # BatchNorm
        layers.append(nn.BatchNorm2d(planes))
        # ReLU
        layers.append(nn.ReLU(inplace=True))
        self.in_channels = planes

    return nn.Sequential(*layers)

@staticmethod
def _get_deconv_cfg(deconv_kernel):
    """Get configurations for deconv layers."""
    if deconv_kernel == 4:
        padding = 1
        output_padding = 0
    elif deconv_kernel == 3:
        padding = 1
        output_padding = 1
    elif deconv_kernel == 2:
        padding = 0
        output_padding = 0
    else:
        raise ValueError(f'Not supported num_kernels ({deconv_kernel}).')

    return deconv_kernel, padding, output_padding
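With kernel 4, stride 2, padding 1 and output_padding 0, each ConvTranspose2d exactly doubles the spatial resolution (out = (in - 1) * stride - 2 * padding + kernel + output_padding). A small check, where the 16x12 ViT feature map size is only an assumed example:

import torch
import torch.nn as nn

deconv = nn.ConvTranspose2d(768, 256, kernel_size=4, stride=2, padding=1,
                            output_padding=0, bias=False)
feat = torch.randn(1, 768, 16, 12)   # ViT feature map for a 256x192 input
print(deconv(feat).shape)            # torch.Size([1, 256, 32, 24]) -- resolution doubled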
train.py
train_model(
    model=model,
    datasets_train=datasets_train,
    datasets_valid=datasets_valid,
    cfg=cfg,
    distributed=distributed,
    validate=cfg.validate,
    timestamp=timestamp,
    meta=meta
)
train_valid_fn.py
def train_model(model: nn.Module, datasets_train: Dataset, datasets_valid: Dataset,
                cfg: dict, distributed: bool, validate: bool,
                timestamp: str, meta: dict) -> None:
    dataloaders_train = [DataLoader(ds, batch_size=cfg.data['samples_per_gpu'], shuffle=True,
                                    sampler=sampler, num_workers=cfg.data['workers_per_gpu'],
                                    pin_memory=False)
                         for ds, sampler in zip(datasets_train, samplers_train)]
    dataloaders_valid = [DataLoader(ds, batch_size=cfg.data['samples_per_gpu'], shuffle=False,
                                    sampler=sampler, num_workers=cfg.data['workers_per_gpu'],
                                    pin_memory=False)
                         for ds, sampler in zip(datasets_valid, samplers_valid)]

    model = DistributedDataParallel(
        module=model,
        device_ids=[torch.cuda.current_device()],
        broadcast_buffers=False,
        find_unused_parameters=find_unused_parameters)

    # Loss function
    criterion = JointsMSELoss(use_target_weight=cfg.model['keypoint_head']['loss_keypoint']['use_target_weight'])
    # Optimizer
    optimizer = AdamW(model.parameters(), lr=cfg.optimizer['lr'], betas=cfg.optimizer['betas'],
                      weight_decay=cfg.optimizer['weight_decay'])

    for dataloader in dataloaders_train:
        for epoch in range(cfg.total_epochs):
            model.train()
            train_pbar = tqdm(dataloader)
            total_loss = 0
            tic = time()

            for batch_idx, batch in enumerate(train_pbar):
                # layerwise_optimizer: layer-wise LR decay wrapper around the AdamW optimizer
                # (its construction is omitted in this excerpt)
                layerwise_optimizer.zero_grad()

                images, targets, target_weights, __ = batch
                images = images.to('cuda')
                targets = targets.to('cuda')
                target_weights = target_weights.to('cuda')

                # forward pass, loss and optimizer step (abridged in this excerpt)
                outputs = model(images)
                loss = criterion(outputs, targets, target_weights)
                loss.backward()
                layerwise_optimizer.step()

                total_loss += loss.item()
                train_pbar.set_description(f"🏋️> Epoch [{str(epoch).zfill(3)}/{str(cfg.total_epochs).zfill(3)}] | Loss {loss.item():.4f} | LR {optimizer.param_groups[0]['lr']:.6f} | Step")

            scheduler.step()

            avg_loss_train = total_loss / len(dataloader)
            logger.info(f"[Summary-train] Epoch [{str(epoch).zfill(3)}/{str(cfg.total_epochs).zfill(3)}] | Average Loss (train) {avg_loss_train:.4f} --- {time()-tic:.5f} sec. elapsed")

            ckpt_name = f"epoch{str(epoch).zfill(3)}.pth"
            ckpt_path = osp.join(cfg.work_dir, ckpt_name)
            torch.save(model.module.state_dict(), ckpt_path)

            # validation
            if validate:
                tic2 = time()
                avg_loss_valid = valid_model(model, dataloaders_valid, criterion, cfg)
                logger.info(f"[Summary-valid] Epoch [{str(epoch).zfill(3)}/{str(cfg.total_epochs).zfill(3)}] | Average Loss (valid) {avg_loss_valid:.4f} --- {time()-tic2:.5f} sec. elapsed")
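Since the loop saves model.module.state_dict() at every epoch, a checkpoint can later be reloaded into a bare (non-DDP) ViTPose for inference. A minimal sketch, assuming the same cfg as training and a hypothetical epoch010.pth file name:

import torch

model = ViTPose(cfg)                                     # same config as used for training
state = torch.load('epoch010.pth', map_location='cpu')   # hypothetical checkpoint file
model.load_state_dict(state)
model.eval()

with torch.no_grad():
    heatmaps = model(torch.randn(1, 3, 256, 192))        # (1, num_joints, 64, 48)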
mse_loss.py
class JointsMSELoss(nn.Module):
    """MSE loss for heatmaps.

    Args:
        use_target_weight (bool): Option to use weighted MSE loss.
            Different joint types may have different target weights.
        loss_weight (float): Weight of the loss. Default: 1.0.
    """

    def __init__(self, use_target_weight=False, loss_weight=1.):
        super().__init__()
        self.criterion = nn.MSELoss()
        self.use_target_weight = use_target_weight
        self.loss_weight = loss_weight

    def forward(self, output, target, target_weight):
        """Forward function."""
        batch_size = output.size(0)
        num_joints = output.size(1)
        heatmaps_pred = output.reshape(
            (batch_size, num_joints, -1)).split(1, 1)
        heatmaps_gt = target.reshape((batch_size, num_joints, -1)).split(1, 1)

        loss = 0.
        for idx in range(num_joints):
            heatmap_pred = heatmaps_pred[idx].squeeze(1)
            heatmap_gt = heatmaps_gt[idx].squeeze(1)
            if self.use_target_weight:
                loss += self.criterion(heatmap_pred * target_weight[:, idx],
                                       heatmap_gt * target_weight[:, idx])
            else:
                loss += self.criterion(heatmap_pred, heatmap_gt)

        return loss / num_joints * self.loss_weight
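To see how target_weight masks out invisible joints, here is a small sketch on dummy heatmaps; all shapes below are example values, not values from the repo.

import torch

criterion = JointsMSELoss(use_target_weight=True)

pred = torch.rand(2, 17, 64, 48)   # predicted heatmaps (B, joints, H, W)
gt = torch.rand(2, 17, 64, 48)     # ground-truth Gaussian heatmaps
weight = torch.ones(2, 17, 1)      # 1 = visible joint, 0 = ignore
weight[:, 0] = 0                   # e.g. mask out the first joint entirely

loss = criterion(pred, gt, weight)
print(loss.item())                 # per-joint MSE averaged over the joints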
https://colab.research.google.com/drive/1PxqnA97zgKbH8J8v8AVG_-Ha4zrCxCZQ?usp=sharing