diff --git a/build_dataset.py b/build_dataset.py
new file mode 100644
index 0000000..f965a7b
--- /dev/null
+++ b/build_dataset.py
@@ -0,0 +1,55 @@
+# %%
+# abstracts away the dataset loading and preprocessing
+import torch
+import torchvision
+import torchvision.transforms as transforms
+from torchvision.transforms import AutoAugment, AutoAugmentPolicy
+
+CLASS_REPR = ('plane', 'car', 'bird', 'cat',
+              'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
+
+def collate_fn(data):
+    """pass the collation function to the DataLoader to batch images
+    into a single tensor"""
+    images = [d[0] for d in data]
+    labels = [d[1] for d in data]
+    return torch.stack(images, dim=0), torch.tensor(labels)
+
+def get_dataset(batch_size, augment=True):
+    # some dataset transforms, to introduce some invariances to the model,
+    # such as scale invariance, rotation invariance, etc.
+    # finally, normalize the image for better training dynamics
+    transform = [AutoAugment(AutoAugmentPolicy.CIFAR10)] if augment else []
+    transform = transforms.Compose(
+        transform + [transforms.ToTensor(),
+                     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
+
+    # we don't want augmentations on the val set
+    transform_val = transforms.Compose(
+        [transforms.ToTensor(),
+         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
+
+    trainset = torchvision.datasets.CIFAR10(root='data', train=True,
+                                            download=True, transform=transform)
+    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
+                                              shuffle=True, num_workers=2, collate_fn=collate_fn)
+
+    # do not train on the test dataset, to better gauge generalization error
+    # instead of training error; the test dataset should not include any training
+    # images
+    testset = torchvision.datasets.CIFAR10(root='data', train=False,
+                                           download=True, transform=transform_val)
+    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
+                                             shuffle=False, num_workers=2, collate_fn=collate_fn)
+
+    return trainloader, testloader
+
+
+# some unit testing to see that it all works
+if __name__ == '__main__':
+    from matplotlib import pyplot as plt
+    train_loader, test_loader = get_dataset(4, augment=False)
+    images, labels = next(iter(train_loader))
+    plt.imshow(images[0].permute(1, 2, 0))
+    plt.show()
+    print("class:", CLASS_REPR[labels[0]])
diff --git a/final_accuracy.txt b/final_accuracy.txt
new file mode 100644
index 0000000..745c977
--- /dev/null
+++ b/final_accuracy.txt
@@ -0,0 +1 @@
+Accuracy of the network on the 10000 test images: 87.0 %
\ No newline at end of file
diff --git a/losses.txt b/losses.txt
new file mode 100644
index 0000000..17c69cd
--- /dev/null
+++ b/losses.txt
@@ -0,0 +1,18 @@
+[1, 500] train loss: 1.907
+[1, 500] eval loss: 1.870
+[1, 1000] train loss: 1.123
+[1, 1000] eval loss: 1.019
+[1, 1500] train loss: 0.806
+[1, 1500] eval loss: 0.681
+[2, 500] train loss: 0.652
+[2, 500] eval loss: 0.552
+[2, 1000] train loss: 0.600
+[2, 1000] eval loss: 0.481
+[2, 1500] train loss: 0.571
+[2, 1500] eval loss: 0.452
+[3, 500] train loss: 0.509
+[3, 500] eval loss: 0.426
+[3, 1000] train loss: 0.497
+[3, 1000] eval loss: 0.395
+[3, 1500] train loss: 0.484
+[3, 1500] eval loss: 0.387
diff --git a/main.py b/main.py
index dfe56fe..6f0eff9 100644
--- a/main.py
+++ b/main.py
@@ -1,17 +1,142 @@
-"""
-This is a starter file to get you going. You may also include other files if you feel it's necessary.
+# %%
+from itertools import cycle
-Make sure to follow the code convention described here:
-https://github.com/UWARG/computer-vision-python/blob/main/README.md#naming-and-typing-conventions
+import numpy as np
+import torch
+from torch import nn
+import torch.optim as optim
-Hints:
-* The internet is your friend! Don't be afraid to search for tutorials/intros/etc.
-* We suggest using a convolutional neural network.
-* TensorFlow Keras has the CIFAR-10 dataset as a module, so you don't need to manually download and unpack it.
-"""
+from matplotlib import pyplot as plt
-# Import whatever libraries/modules you need
+# I decided to use ConvNeXt instead, since it is a newer architecture
+# released with pretrained weights
+# although model.py still has my comments, I did not annotate
+# modelv2.py, since the architectural changes are already listed
+# in their paper
+from modelv2 import convnext_small
+from build_dataset import get_dataset, CLASS_REPR
-import numpy as np
+# parameters for training
+epochs = 3
+loss_file = 'losses.txt'
+plot_file = 'plots.png'
+
+batch_size = 32
+lr = 1e-5
+architecture = 'convnext_small'
+num_classes = len(CLASS_REPR)
+
+# I want to fine-tune the model, since this results in higher
+# total accuracy and is more energy efficient to train
+model = convnext_small(pretrained=True, num_classes=10).cuda()
+
+# building the dataset, see build_dataset for more details
+train_loader, test_loader = get_dataset(batch_size, augment=True)
+# make the test_loader an infinite cycle, so StopIteration never occurs
+test_loader_t = cycle(iter(test_loader))
+
+# construct the loss, in this case the cross-entropy (negative log-likelihood)
+# loss used for classification tasks: the negative log of the softmax
+# probability assigned to the correct class. I don't want to implement label
+# smoothing here, since it's overkill
+criterion = nn.CrossEntropyLoss()
+
+# Use the Adam optimizer, which needs fewer hyper-parameters to tune
+optimizer = optim.Adam(model.parameters(), lr=lr)
+
+# log the losses every log_step steps
+log_step = 500
+val_losses = []
+train_losses = []
+loss_msg = []
+# start the training loop
+# in a more complex project, I would usually separate this
+# into its own function or class
+
+total_steps = 0
+for epoch in range(epochs):  # loop over the dataset multiple times
+
+    running_loss = 0.0
+    running_val_loss = 0.0
+    for i, data in enumerate(train_loader, 0):
+        # set the model to train mode, since sometimes
+        # dropout has different behaviors
+        model.train()
+        # get the inputs; data is a list of [inputs, labels]
+        inputs, labels = data
+        # move data to gpu for acceleration
+        inputs = inputs.cuda()
+        labels = labels.cuda()
+
+        # zero the parameter gradients
+        optimizer.zero_grad()
+
+        # forward + backward + optimize
+        outputs = model(inputs)
+        loss = criterion(outputs, labels)
+        loss.backward()
+        optimizer.step()
+
+        # also compute the evaluation loss
+        with torch.no_grad():
+            e_x, e_y = next(test_loader_t)
+            model.eval()
+            eval_loss = criterion(model(e_x.cuda()), e_y.cuda())
+            # transfer back to cpu
+            running_val_loss += float(eval_loss)
+
+        # print statistics
+        running_loss += loss.item()
+        if i % log_step == (log_step - 1):  # print every log_step mini-batches
+            train_msg = f'[{epoch + 1}, {i + 1:5d}] train loss: {running_loss / log_step:.3f} \n'
+            eval_msg = f'[{epoch + 1}, {i + 1:5d}] eval loss: {running_val_loss / log_step:.3f} \n'
+            print(train_msg)
+            print(eval_msg)
+
+            loss_msg.append(train_msg)
+            loss_msg.append(eval_msg)
+            train_losses.append(running_loss / log_step)
+            val_losses.append(running_val_loss / log_step)
+
+            running_loss = 0.0
+            running_val_loss = 0.0
+        total_steps += 1
+
+
+print('Finished Training')
+
+# plot losses over time
+plt.plot(np.array(train_losses), label='train-loss')
+plt.plot(np.array(val_losses), label='val-loss')
+plt.xlabel("steps")
+plt.ylabel('mean-crossentropy-loss')
+plt.title('mean-crossentropy-loss over steps')
+plt.legend()
+plt.savefig(plot_file)
+plt.show()
+
+# %%
+# save losses in a file
+with open(loss_file, 'w') as f:
+    for t_msg in loss_msg:
+        f.write(t_msg)
+
+correct = 0
+total = 0
+# since we're not training, we don't need to calculate the gradients for our outputs
+with torch.no_grad():
+    for data in test_loader:
+        images, labels = data
+        # calculate outputs by running images through the network
+        outputs = model(images.cuda())
+        # the class with the highest energy is what we choose as prediction
+        _, predicted = torch.max(outputs.data, 1)
+        total += labels.size(0)
+        correct += float((predicted == labels.cuda()).sum().item())
+
+
+print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')
-# Your working code here
+# save accuracy in a file
+with open('final_accuracy.txt', 'w') as f:
+    f.write(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')
diff --git a/model.py b/model.py
new file mode 100644
index 0000000..37a088d
--- /dev/null
+++ b/model.py
@@ -0,0 +1,124 @@
+# %%
+# Mostly copy paste from https://github.com/facebookresearch/detr
+
+import torch
+import torch.nn.functional as F
+import torchvision
+from torch import nn
+from torchvision.models._utils import IntermediateLayerGetter
+import torchvision.models.resnet
+
+# We freeze batch norm for better fine-tuning
+class FrozenBatchNorm2d(torch.nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+    Copy-paste from torchvision.misc.ops with added eps before rsqrt,
+    without which any other models than torchvision.models.resnet[18,34,50,101]
+    produce nans.
+    """
+
+    def __init__(self, n):
+        super(FrozenBatchNorm2d, self).__init__()
+        self.register_buffer("weight", torch.ones(n))
+        self.register_buffer("bias", torch.zeros(n))
+        self.register_buffer("running_mean", torch.zeros(n))
+        self.register_buffer("running_var", torch.ones(n))
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        num_batches_tracked_key = prefix + 'num_batches_tracked'
+        if num_batches_tracked_key in state_dict:
+            del state_dict[num_batches_tracked_key]
+
+        super(FrozenBatchNorm2d, self)._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict,
+            missing_keys, unexpected_keys, error_msgs)
+
+    def forward(self, x):
+        # move reshapes to the beginning
+        # to make it fuser-friendly
+        w = self.weight.reshape(1, -1, 1, 1)
+        b = self.bias.reshape(1, -1, 1, 1)
+        rv = self.running_var.reshape(1, -1, 1, 1)
+        rm = self.running_mean.reshape(1, -1, 1, 1)
+        eps = 1e-5
+        scale = w * (rv + eps).rsqrt()
+        bias = b - rm * scale
+        return x * scale + bias
+
+
+# This class serves to restitch the pretrained resnet models
+# and remove the classification head for a custom one
+# this was a copy and paste from an object detection model I have been working on,
+# so extra parameters are not really necessary for classification.
+class BackboneBase(nn.Module):
+    def __init__(self, num_classes: int, backbone: nn.Module, train_backbone: bool, num_channels: int, return_interm_layers: bool):
+        super().__init__()
+        for name, parameter in backbone.named_parameters():
+            if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
+                parameter.requires_grad_(False)
+        # the resnet class of models has 4 stages, with each stage shrinking the
+        # spatial dimension by a factor of 2 (strided convolution).
+        # probably overkill for cifar10 though.
+        if return_interm_layers:
+            return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"}
+        else:
+            return_layers = {'layer4': "0"}
+        # since the layers are ordered, we can hook into it by merely providing the ordering
+        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
+        self.num_channels = num_channels
+        # finally, remove any spatial dimensions
+        self.avg = nn.AdaptiveAvgPool2d((1, 1))
+        # and project activations onto the output classes
+        self.out_proj = nn.Linear(num_channels, num_classes)
+
+    def forward(self, input):
+        xs = self.body(input)
+        act = xs["0"]
+        act = self.avg(act)
+        # now, act.shape = [batch_size, num_channels, 1, 1]
+        act = act.flatten(1, 3)
+        return self.out_proj(act)
+
+
+class Backbone(BackboneBase):
+    """ResNet backbone with frozen BatchNorm."""
+    def __init__(self,
+                 num_classes: int,
+                 name: str,
+                 train_backbone: bool,
+                 return_interm_layers: bool,
+                 dilation: bool):
+        backbone = getattr(torchvision.models, name)(
+            replace_stride_with_dilation=[False, False, dilation],
+            pretrained=True, norm_layer=FrozenBatchNorm2d)
+        num_channels = 512 if name in ('resnet18', 'resnet34') else 2048
+        super().__init__(num_classes, backbone, train_backbone, num_channels, return_interm_layers)
+
+
+class Net(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = torch.flatten(x, 1)  # flatten all dimensions except batch
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+
+if __name__ == '__main__':
+    x = torch.rand(4, 3, 32, 32)
+    model = Backbone(10, 'resnet18', True, False, False)
+    y = model(x)
+    print(y.shape)
diff --git a/modelv2.py b/modelv2.py
new file mode 100644
index 0000000..2677249
--- /dev/null
+++ b/modelv2.py
@@ -0,0 +1,206 @@
+# copied from https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext_isotropic.py
+# %%
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from timm.models.layers import trunc_normal_, DropPath
+from timm.models.registry import register_model
+
+class Block(nn.Module):
+    r""" ConvNeXt Block. There are two equivalent implementations:
+    (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
+    (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
+    We use (2) as we find it slightly faster in PyTorch
+
+    Args:
+        dim (int): Number of input channels.
+        drop_path (float): Stochastic depth rate. Default: 0.0
+        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
+ """ + def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): + super().__init__() + self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv + self.norm = LayerNorm(dim, eps=1e-6) + self.pwconv1 = nn.Linear(dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.pwconv2 = nn.Linear(4 * dim, dim) + self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)), + requires_grad=True) if layer_scale_init_value > 0 else None + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + def forward(self, x): + input = x + x = self.dwconv(x) + x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) + + x = input + self.drop_path(x) + return x + +class ConvNeXt(nn.Module): + r""" ConvNeXt + A PyTorch impl of : `A ConvNet for the 2020s` - + https://arxiv.org/pdf/2201.03545.pdf + Args: + in_chans (int): Number of input image channels. Default: 3 + num_classes (int): Number of classes for classification head. Default: 1000 + depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] + dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] + drop_path_rate (float): Stochastic depth rate. Default: 0. + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1. + """ + def __init__(self, in_chans=3, num_classes=1000, + depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], drop_path_rate=0., + layer_scale_init_value=1e-6, head_init_scale=1., + ): + super().__init__() + + self.downsample_layers = nn.ModuleList() # stem and 3 intermediate downsampling conv layers + stem = nn.Sequential( + nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4), + LayerNorm(dims[0], eps=1e-6, data_format="channels_first") + ) + self.downsample_layers.append(stem) + for i in range(3): + downsample_layer = nn.Sequential( + LayerNorm(dims[i], eps=1e-6, data_format="channels_first"), + nn.Conv2d(dims[i], dims[i+1], kernel_size=2, stride=2), + ) + self.downsample_layers.append(downsample_layer) + + self.stages = nn.ModuleList() # 4 feature resolution stages, each consisting of multiple residual blocks + dp_rates=[x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] + cur = 0 + for i in range(4): + stage = nn.Sequential( + *[Block(dim=dims[i], drop_path=dp_rates[cur + j], + layer_scale_init_value=layer_scale_init_value) for j in range(depths[i])] + ) + self.stages.append(stage) + cur += depths[i] + + self.norm = nn.LayerNorm(dims[-1], eps=1e-6) # final norm layer + self.head = nn.Linear(dims[-1], num_classes) + + self.apply(self._init_weights) + self.head.weight.data.mul_(head_init_scale) + self.head.bias.data.mul_(head_init_scale) + + def _init_weights(self, m): + if isinstance(m, (nn.Conv2d, nn.Linear)): + trunc_normal_(m.weight, std=.02) + nn.init.constant_(m.bias, 0) + + def forward_features(self, x): + for i in range(4): + x = self.downsample_layers[i](x) + x = self.stages[i](x) + return self.norm(x.mean([-2, -1])) # global average pooling, (N, C, H, W) -> (N, C) + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + +class LayerNorm(nn.Module): + r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. 
+    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
+    shape (batch_size, height, width, channels) while channels_first corresponds to inputs
+    with shape (batch_size, channels, height, width).
+    """
+    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(normalized_shape))
+        self.bias = nn.Parameter(torch.zeros(normalized_shape))
+        self.eps = eps
+        self.data_format = data_format
+        if self.data_format not in ["channels_last", "channels_first"]:
+            raise NotImplementedError
+        self.normalized_shape = (normalized_shape, )
+
+    def forward(self, x):
+        if self.data_format == "channels_last":
+            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
+        elif self.data_format == "channels_first":
+            u = x.mean(1, keepdim=True)
+            s = (x - u).pow(2).mean(1, keepdim=True)
+            x = (x - u) / torch.sqrt(s + self.eps)
+            x = self.weight[:, None, None] * x + self.bias[:, None, None]
+            return x
+
+
+model_urls = {
+    "convnext_tiny_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth",
+    "convnext_small_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth",
+    "convnext_base_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth",
+    "convnext_large_1k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth",
+    "convnext_tiny_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_224.pth",
+    "convnext_small_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_224.pth",
+    "convnext_base_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth",
+    "convnext_large_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth",
+    "convnext_xlarge_22k": "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth",
+}
+
+@register_model
+def convnext_tiny(pretrained=False, in_22k=False, num_classes=1000, **kwargs):
+    model = ConvNeXt(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], **kwargs)
+
+    if pretrained:
+        url = model_urls['convnext_tiny_22k'] if in_22k else model_urls['convnext_tiny_1k']
+        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
+        model.load_state_dict(checkpoint["model"], strict=False)
+    if num_classes != 1000:
+        model.head = nn.Linear(768, num_classes)
+    return model
+
+@register_model
+def convnext_small(pretrained=False, in_22k=False, num_classes=1000, **kwargs):
+    model = ConvNeXt(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs)
+    if pretrained:
+        url = model_urls['convnext_small_22k'] if in_22k else model_urls['convnext_small_1k']
+        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu")
+        model.load_state_dict(checkpoint["model"], strict=False)
+    if num_classes != 1000:
+        model.head = nn.Linear(768, num_classes)
+    return model
+
+@register_model
+def convnext_base(pretrained=False, in_22k=False, **kwargs):
+    model = ConvNeXt(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs)
+    if pretrained:
+        url = model_urls['convnext_base_22k'] if in_22k else model_urls['convnext_base_1k']
+        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu")
+        model.load_state_dict(checkpoint["model"], strict=False)
+    return model
+
+@register_model
+def convnext_large(pretrained=False, in_22k=False, **kwargs):
+    model = ConvNeXt(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs)
+    if pretrained:
+        url = model_urls['convnext_large_22k'] if in_22k else model_urls['convnext_large_1k']
+        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu")
+        model.load_state_dict(checkpoint["model"], strict=False)
+    return model
+
+@register_model
+def convnext_xlarge(pretrained=False, in_22k=False, **kwargs):
+    model = ConvNeXt(depths=[3, 3, 27, 3], dims=[256, 512, 1024, 2048], **kwargs)
+    if pretrained:
+        assert in_22k, "only ImageNet-22K pre-trained ConvNeXt-XL is available; please set in_22k=True"
+        url = model_urls['convnext_xlarge_22k']
+        checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu")
+        model.load_state_dict(checkpoint["model"], strict=False)
+    return model
+
+if __name__ == '__main__':
+    model = convnext_small(True, num_classes=10)
+    x = torch.rand(4, 3, 32, 32)
+    print(model(x).shape)
\ No newline at end of file
diff --git a/plots.png b/plots.png
new file mode 100644
index 0000000..1bc325a
Binary files /dev/null and b/plots.png differ
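Not part of the diff above: a minimal inference sketch showing how the fine-tuned model could be reloaded and used on a single CIFAR-10 test image. It assumes main.py is extended with `torch.save(model.state_dict(), 'convnext_cifar10.pth')` after training; the current script does not save weights, and the checkpoint filename is hypothetical.

# illustrative sketch only; 'convnext_cifar10.pth' is an assumed filename produced by
# adding torch.save(model.state_dict(), 'convnext_cifar10.pth') at the end of main.py
import torch
from torch import nn

from build_dataset import get_dataset, CLASS_REPR
from modelv2 import convnext_small

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# build the same architecture as main.py, without downloading ImageNet weights,
# and make sure the head matches the 10 CIFAR-10 classes before loading
model = convnext_small(pretrained=False)
model.head = nn.Linear(768, len(CLASS_REPR))
model.load_state_dict(torch.load('convnext_cifar10.pth', map_location=device))
model.to(device).eval()

# classify one test image with the same (non-augmented) preprocessing as evaluation
_, test_loader = get_dataset(batch_size=1, augment=False)
image, label = next(iter(test_loader))
with torch.no_grad():
    pred = model(image.to(device)).argmax(dim=1).item()
print(f'predicted: {CLASS_REPR[pred]}, actual: {CLASS_REPR[label.item()]}')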