diff --git a/build_dataset.py b/build_dataset.py
new file mode 100644
index 0000000..f965a7b
--- /dev/null
+++ b/build_dataset.py
@@ -0,0 +1,55 @@
+# %%
+# abstracts away the dataset loading and preprocessing
+import torch
+import torchvision
+import torchvision.transforms as transforms
+from torchvision.transforms import AutoAugment, AutoAugmentPolicy
+
+CLASS_REPR = ('plane', 'car', 'bird', 'cat',
+              'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
+
+def collate_fn(data):
+    """pass the collation function to the DataLoader to batch images
+    into a single tensor"""
+    images = [d[0] for d in data]
+    labels = [d[1] for d in data]
+    return torch.stack(images, dim=0), torch.tensor(labels)
+
+def get_dataset(batch_size, augment=True):
+    # some dataset transforms, to introduce some invariances to the model,
+    # such as scale invariance, rotation invariance, etc.
+    # finally, normalize the image for better training dynamics
+    transform = [AutoAugment(AutoAugmentPolicy.CIFAR10)] if augment else []
+    transform = transforms.Compose(
+        transform + [transforms.ToTensor(),
+                     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
+
+    # we don't want augmentations on the val set
+    transform_val = transforms.Compose(
+        [transforms.ToTensor(),
+         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
+
+    trainset = torchvision.datasets.CIFAR10(root='data', train=True,
+                                            download=True, transform=transform)
+    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
+                                              shuffle=True, num_workers=2, collate_fn=collate_fn)
+
+    # do not train on the test dataset, to better gauge generalization error
+    # instead of training error; the test dataset should not include any training
+    # images
+    testset = torchvision.datasets.CIFAR10(root='data', train=False,
+                                           download=True, transform=transform_val)
+    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
+                                             shuffle=False, num_workers=2, collate_fn=collate_fn)
+
+    return trainloader, testloader
+
+
+# some unit testing to see that it all works
+if __name__ == '__main__':
+    from matplotlib import pyplot as plt
+    train_loader, test_loader = get_dataset(4, augment=False)
+    images, labels = next(iter(train_loader))
+    plt.imshow(images[0].permute(1, 2, 0))
+    plt.show()
+    print("class:", CLASS_REPR[labels[0]])
diff --git a/final_accuracy.txt b/final_accuracy.txt
new file mode 100644
index 0000000..745c977
--- /dev/null
+++ b/final_accuracy.txt
@@ -0,0 +1 @@
+Accuracy of the network on the 10000 test images: 87.0 %
\ No newline at end of file
diff --git a/losses.txt b/losses.txt
new file mode 100644
index 0000000..17c69cd
--- /dev/null
+++ b/losses.txt
@@ -0,0 +1,18 @@
+[1, 500] train loss: 1.907
+[1, 500] eval loss: 1.870
+[1, 1000] train loss: 1.123
+[1, 1000] eval loss: 1.019
+[1, 1500] train loss: 0.806
+[1, 1500] eval loss: 0.681
+[2, 500] train loss: 0.652
+[2, 500] eval loss: 0.552
+[2, 1000] train loss: 0.600
+[2, 1000] eval loss: 0.481
+[2, 1500] train loss: 0.571
+[2, 1500] eval loss: 0.452
+[3, 500] train loss: 0.509
+[3, 500] eval loss: 0.426
+[3, 1000] train loss: 0.497
+[3, 1000] eval loss: 0.395
+[3, 1500] train loss: 0.484
+[3, 1500] eval loss: 0.387
diff --git a/main.py b/main.py
index dfe56fe..6f0eff9 100644
--- a/main.py
+++ b/main.py
@@ -1,17 +1,142 @@
-"""
-This is a starter file to get you going. You may also include other files if you feel it's necessary.
+# %%
+from itertools import cycle
-Make sure to follow the code convention described here:
-https://github.com/UWARG/computer-vision-python/blob/main/README.md#naming-and-typing-conventions
+import numpy as np
+import torch
+from torch import nn
+import torch.optim as optim
-Hints:
-* The internet is your friend! Don't be afraid to search for tutorials/intros/etc.
-* We suggest using a convolutional neural network.
-* TensorFlow Keras has the CIFAR-10 dataset as a module, so you don't need to manually download and unpack it.
-"""
+from matplotlib import pyplot as plt
-# Import whatever libraries/modules you need
+# I decided to use ConvNeXt instead, since it is a newer architecture
+# released with pretrained weights
+# although model.py still has my comments, I did not annotate
+# modelv2.py, since the architectural changes are already listed
+# in their paper
+from modelv2 import convnext_small
+from build_dataset import get_dataset, CLASS_REPR
-import numpy as np
+# parameters for training
+epochs = 3
+loss_file = 'losses.txt'
+plot_file = 'plots.png'
+
+batch_size = 32
+lr = 1e-5
+architecture = 'convnext_small'
+num_classes = len(CLASS_REPR)
+
+# I want to fine-tune the model, since this results in higher
+# total accuracy and is more energy efficient to train
+model = convnext_small(pretrained=True, num_classes=10).cuda()
+
+# building the dataset, see build_dataset for more details
+train_loader, test_loader = get_dataset(batch_size, augment=True)
+# make the test_loader an infinite cycle, so StopIteration never occurs
+test_loader_t = cycle(iter(test_loader))
+
+# construct the loss, in this case the cross-entropy (negative log-likelihood)
+# loss used for classification tasks: the negative log of the softmax
+# probability assigned to the correct class. I don't want to implement label
+# smoothing here, since it's overkill
+criterion = nn.CrossEntropyLoss()
+
+# Use the Adam optimizer, which needs fewer hyper-parameters to tune
+optimizer = optim.Adam(model.parameters(), lr=lr)
+
+# log the losses every log_step steps
+log_step = 500
+val_losses = []
+train_losses = []
+loss_msg = []
+# start the training loop
+# in a more complex project, I would usually separate this
+# into its own function or class
+
+total_steps = 0
+for epoch in range(epochs):  # loop over the dataset multiple times
+
+    running_loss = 0.0
+    running_val_loss = 0.0
+    for i, data in enumerate(train_loader, 0):
+        # set the model to train mode, since sometimes
+        # dropout has different behaviors
+        model.train()
+        # get the inputs; data is a list of [inputs, labels]
+        inputs, labels = data
+        # move data to gpu for acceleration
+        inputs = inputs.cuda()
+        labels = labels.cuda()
+
+        # zero the parameter gradients
+        optimizer.zero_grad()
+
+        # forward + backward + optimize
+        outputs = model(inputs)
+        loss = criterion(outputs, labels)
+        loss.backward()
+        optimizer.step()
+
+        # also compute the evaluation loss
+        with torch.no_grad():
+            e_x, e_y = next(test_loader_t)
+            model.eval()
+            eval_loss = criterion(model(e_x.cuda()), e_y.cuda())
+            # transfer back to cpu
+            running_val_loss += float(eval_loss)
+
+        # print statistics
+        running_loss += loss.item()
+        if i % log_step == (log_step - 1):  # print every log_step mini-batches
+            train_msg = f'[{epoch + 1}, {i + 1:5d}] train loss: {running_loss / log_step:.3f} \n'
+            eval_msg = f'[{epoch + 1}, {i + 1:5d}] eval loss: {running_val_loss / log_step:.3f} \n'
+            print(train_msg)
+            print(eval_msg)
+
+            loss_msg.append(train_msg)
+            loss_msg.append(eval_msg)
+            train_losses.append(running_loss / log_step)
+            val_losses.append(running_val_loss / log_step)
+
+            running_loss = 0.0
+            running_val_loss = 0.0
+        total_steps += 1
+
+
+print('Finished Training')
+
+# plot losses over time
+plt.plot(np.array(train_losses), label='train-loss')
+plt.plot(np.array(val_losses), label='val-loss')
+plt.xlabel("steps")
+plt.ylabel('mean-crossentropy-loss')
+plt.title('mean-crossentropy-loss over steps')
+plt.legend()
+plt.savefig(plot_file)
+plt.show()
+
+# %%
+# save losses in a file
+with open(loss_file, 'w') as f:
+    for t_msg in loss_msg:
+        f.write(t_msg)
+
+correct = 0
+total = 0
+# since we're not training, we don't need to calculate the gradients for our outputs
+with torch.no_grad():
+    for data in test_loader:
+        images, labels = data
+        # calculate outputs by running images through the network
+        outputs = model(images.cuda())
+        # the class with the highest energy is what we choose as prediction
+        _, predicted = torch.max(outputs.data, 1)
+        total += labels.size(0)
+        correct += float((predicted == labels.cuda()).sum().item())
+
+
+print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')
-# Your working code here
+# save accuracy in a file
+with open('final_accuracy.txt', 'w') as f:
+    f.write(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')
diff --git a/model.py b/model.py
new file mode 100644
index 0000000..37a088d
--- /dev/null
+++ b/model.py
@@ -0,0 +1,124 @@
+# %%
+# Mostly copy paste from https://github.com/facebookresearch/detr
+
+import torch
+import torch.nn.functional as F
+import torchvision
+from torch import nn
+from torchvision.models._utils import IntermediateLayerGetter
+import torchvision.models.resnet
+
+# We freeze batch norm for better fine-tuning
+class FrozenBatchNorm2d(torch.nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+    Copy-paste from torchvision.misc.ops with added eps before rsqrt,
+    without which any other models than torchvision.models.resnet[18,34,50,101]
+    produce nans.
+    """
+
+    def __init__(self, n):
+        super(FrozenBatchNorm2d, self).__init__()
+        self.register_buffer("weight", torch.ones(n))
+        self.register_buffer("bias", torch.zeros(n))
+        self.register_buffer("running_mean", torch.zeros(n))
+        self.register_buffer("running_var", torch.ones(n))
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        num_batches_tracked_key = prefix + 'num_batches_tracked'
+        if num_batches_tracked_key in state_dict:
+            del state_dict[num_batches_tracked_key]
+
+        super(FrozenBatchNorm2d, self)._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict,
+            missing_keys, unexpected_keys, error_msgs)
+
+    def forward(self, x):
+        # move reshapes to the beginning
+        # to make it fuser-friendly
+        w = self.weight.reshape(1, -1, 1, 1)
+        b = self.bias.reshape(1, -1, 1, 1)
+        rv = self.running_var.reshape(1, -1, 1, 1)
+        rm = self.running_mean.reshape(1, -1, 1, 1)
+        eps = 1e-5
+        scale = w * (rv + eps).rsqrt()
+        bias = b - rm * scale
+        return x * scale + bias
+
+
+# This class serves to restitch the pretrained resnet models
+# and remove the classification head for a custom one
+# this was a copy and paste from an object detection model I have been working on,
+# so extra parameters are not really necessary for classification.
+class BackboneBase(nn.Module):
+    def __init__(self, num_classes: int, backbone: nn.Module, train_backbone: bool, num_channels: int, return_interm_layers: bool):
+        super().__init__()
+        for name, parameter in backbone.named_parameters():
+            if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
+                parameter.requires_grad_(False)
+        # the resnet class of models has 4 stages, with each stage shrinking the
+        # spatial dimension by a factor of 2 (strided convolution).
+        # probably overkill for cifar10 though.
+        if return_interm_layers:
+            return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"}
+        else:
+            return_layers = {'layer4': "0"}
+        # since the layers are ordered, we can hook into it by merely providing the ordering
+        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
+        self.num_channels = num_channels
+        # finally, remove any spatial dimensions
+        self.avg = nn.AdaptiveAvgPool2d((1, 1))
+        # and project activations onto the output classes
+        self.out_proj = nn.Linear(num_channels, num_classes)
+
+    def forward(self, input):
+        xs = self.body(input)
+        act = xs["0"]
+        act = self.avg(act)
+        # now, act.shape = [batch_size, num_channels, 1, 1]
+        act = act.flatten(1, 3)
+        return self.out_proj(act)
+
+
+class Backbone(BackboneBase):
+    """ResNet backbone with frozen BatchNorm."""
+    def __init__(self,
+                 num_classes: int,
+                 name: str,
+                 train_backbone: bool,
+                 return_interm_layers: bool,
+                 dilation: bool):
+        backbone = getattr(torchvision.models, name)(
+            replace_stride_with_dilation=[False, False, dilation],
+            pretrained=True, norm_layer=FrozenBatchNorm2d)
+        num_channels = 512 if name in ('resnet18', 'resnet34') else 2048
+        super().__init__(num_classes, backbone, train_backbone, num_channels, return_interm_layers)
+
+
+class Net(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = torch.flatten(x, 1)  # flatten all dimensions except batch
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+
+if __name__ == '__main__':
+    x = torch.rand(4, 3, 32, 32)
+    model = Backbone(10, 'resnet18', True, False, False)
+    y = model(x)
+    print(y.shape)
diff --git a/modelv2.py b/modelv2.py
new file mode 100644
index 0000000..2677249
--- /dev/null
+++ b/modelv2.py
@@ -0,0 +1,206 @@
+# copied from https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext_isotropic.py
+# %%
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from timm.models.layers import trunc_normal_, DropPath
+from timm.models.registry import register_model
+
+class Block(nn.Module):
+    r""" ConvNeXt Block. There are two equivalent implementations:
+    (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
+    (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
+    We use (2) as we find it slightly faster in PyTorch
+
+    Args:
+        dim (int): Number of input channels.
+        drop_path (float): Stochastic depth rate. Default: 0.0
+        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
+ """ + def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): + super().__init__() + self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv + self.norm = LayerNorm(dim, eps=1e-6) + self.pwconv1 = nn.Linear(dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.pwconv2 = nn.Linear(4 * dim, dim) + self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)), + requires_grad=True) if layer_scale_init_value > 0 else None + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + def forward(self, x): + input = x + x = self.dwconv(x) + x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) + + x = input + self.drop_path(x) + return x + +class ConvNeXt(nn.Module): + r""" ConvNeXt + A PyTorch impl of : `A ConvNet for the 2020s` - + https://arxiv.org/pdf/2201.03545.pdf + Args: + in_chans (int): Number of input image channels. Default: 3 + num_classes (int): Number of classes for classification head. Default: 1000 + depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] + dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] + drop_path_rate (float): Stochastic depth rate. Default: 0. + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1. + """ + def __init__(self, in_chans=3, num_classes=1000, + depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], drop_path_rate=0., + layer_scale_init_value=1e-6, head_init_scale=1., + ): + super().__init__() + + self.downsample_layers = nn.ModuleList() # stem and 3 intermediate downsampling conv layers + stem = nn.Sequential( + nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4), + LayerNorm(dims[0], eps=1e-6, data_format="channels_first") + ) + self.downsample_layers.append(stem) + for i in range(3): + downsample_layer = nn.Sequential( + LayerNorm(dims[i], eps=1e-6, data_format="channels_first"), + nn.Conv2d(dims[i], dims[i+1], kernel_size=2, stride=2), + ) + self.downsample_layers.append(downsample_layer) + + self.stages = nn.ModuleList() # 4 feature resolution stages, each consisting of multiple residual blocks + dp_rates=[x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] + cur = 0 + for i in range(4): + stage = nn.Sequential( + *[Block(dim=dims[i], drop_path=dp_rates[cur + j], + layer_scale_init_value=layer_scale_init_value) for j in range(depths[i])] + ) + self.stages.append(stage) + cur += depths[i] + + self.norm = nn.LayerNorm(dims[-1], eps=1e-6) # final norm layer + self.head = nn.Linear(dims[-1], num_classes) + + self.apply(self._init_weights) + self.head.weight.data.mul_(head_init_scale) + self.head.bias.data.mul_(head_init_scale) + + def _init_weights(self, m): + if isinstance(m, (nn.Conv2d, nn.Linear)): + trunc_normal_(m.weight, std=.02) + nn.init.constant_(m.bias, 0) + + def forward_features(self, x): + for i in range(4): + x = self.downsample_layers[i](x) + x = self.stages[i](x) + return self.norm(x.mean([-2, -1])) # global average pooling, (N, C, H, W) -> (N, C) + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + +class LayerNorm(nn.Module): + r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. 
+    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
+    shape (batch_size, height, width, channels) while channels_first corresponds to inputs
+    with shape (batch_size, channels, height, width).
+    """
+    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(normalized_shape))
+        self.bias = nn.Parameter(torch.zeros(normalized_shape))
+        self.eps = eps
+        self.data_format = data_format
+        if self.data_format not in ["channels_last", "channels_first"]:
+            raise NotImplementedError
+        self.normalized_shape = (normalized_shape, )
+
+    def forward(self, x):
+        if self.data_format == "channels_last":
+            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
+        elif self.data_format == "channels_first":
+            u = x.mean(1, keepdim=True)
+            s = (x - u).pow(2).mean(1, keepdim=True)
+            x = (x - u) / torch.sqrt(s + self.eps)
+            x = self.weight[:, None, None] * x + self.bias[:, None, None]
+            return x
+
+
+model_urls = {
+    "convnext_tiny_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth",
+    "convnext_small_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth",
+    "convnext_base_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth",
+    "convnext_large_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth",
+    "convnext_tiny_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_224.pth",
+    "convnext_small_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_224.pth",
+    "convnext_base_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth",
+    "convnext_large_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth",
+    "convnext_xlarge_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth",
+}
+
+@register_model
+def convnext_tiny(pretrained=False, in_22k=False, num_classes=1000, **kwargs):
+    model = ConvNeXt(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], **kwargs)
+
+    if pretrained:
+        url = model_urls['convnext_tiny_22k'] if in_22k else model_urls['convnext_tiny_1k']
+        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
+        model.load_state_dict(checkpoint["model"], strict=False)
+    if num_classes != 1000:
+        model.head = nn.Linear(768, num_classes)
+    return model
+
+@register_model
+def convnext_small(pretrained=False, in_22k=False, num_classes=1000, **kwargs):
+    model = ConvNeXt(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs)
+    if pretrained:
+        url = model_urls['convnext_small_22k'] if in_22k else model_urls['convnext_small_1k']
+        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu")
+        model.load_state_dict(checkpoint["model"], strict=False)
+    if num_classes != 1000:
+        model.head = nn.Linear(768, num_classes)
+    return model
+
+@register_model
+def convnext_base(pretrained=False, in_22k=False, **kwargs):
+    model = ConvNeXt(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs)
+    if pretrained:
+        url = model_urls['convnext_base_22k'] if in_22k else model_urls['convnext_base_1k']
+        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu")
+        model.load_state_dict(checkpoint["model"], strict=False)
+    return model
+
+@register_model
+def convnext_large(pretrained=False, in_22k=False, **kwargs):
+    model = ConvNeXt(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs)
+    if pretrained:
+        url = model_urls['convnext_large_22k'] if in_22k else model_urls['convnext_large_1k']
+        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu")
+        model.load_state_dict(checkpoint["model"], strict=False)
+    return model
+
+@register_model
+def convnext_xlarge(pretrained=False, in_22k=False, **kwargs):
+    model = ConvNeXt(depths=[3, 3, 27, 3], dims=[256, 512, 1024, 2048], **kwargs)
+    if pretrained:
+        assert in_22k, "only ImageNet-22K pre-trained ConvNeXt-XL is available; please set in_22k=True"
+        url = model_urls['convnext_xlarge_22k']
+        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu")
+        model.load_state_dict(checkpoint["model"], strict=False)
+    return model
+
+if __name__ == '__main__':
+    model = convnext_small(True, num_classes=10)
+    x = torch.rand(4, 3, 32, 32)
+    print(model(x).shape)
\ No newline at end of file
diff --git a/plots.png b/plots.png
new file mode 100644
index 0000000..1bc325a
Binary files /dev/null and b/plots.png differ
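Not part of the diff above: a minimal inference sketch showing how the fine-tuned model could be reloaded and used on a single CIFAR-10 test image. It assumes main.py is extended with `torch.save(model.state_dict(), 'convnext_cifar10.pth')` after training; the current script does not save weights, and the checkpoint filename is hypothetical.

# illustrative sketch only; 'convnext_cifar10.pth' is an assumed filename produced by
# adding torch.save(model.state_dict(), 'convnext_cifar10.pth') at the end of main.py
import torch
from torch import nn

from build_dataset import get_dataset, CLASS_REPR
from modelv2 import convnext_small

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# build the same architecture as main.py, without downloading ImageNet weights,
# and make sure the head matches the 10 CIFAR-10 classes before loading
model = convnext_small(pretrained=False)
model.head = nn.Linear(768, len(CLASS_REPR))
model.load_state_dict(torch.load('convnext_cifar10.pth', map_location=device))
model.to(device).eval()

# classify one test image with the same (non-augmented) preprocessing as evaluation
_, test_loader = get_dataset(batch_size=1, augment=False)
image, label = next(iter(test_loader))
with torch.no_grad():
    pred = model(image.to(device)).argmax(dim=1).item()
print(f'predicted: {CLASS_REPR[pred]}, actual: {CLASS_REPR[label.item()]}')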