From 409c3858e16bbf9f6f5a1a815019cb66c534ffba Mon Sep 17 00:00:00 2001 From: ENate Date: Sat, 1 Feb 2025 15:58:43 +0100 Subject: [PATCH] Implemented base transformers, structured folders, etc --- .devcontainer/Dockerfile | 11 - .devcontainer/devcontainer.json | 32 -- .devcontainer/noop.txt | 3 - .gitignore | 12 +- .gitpod.Dockerfile | 0 .gitpod.yml | 4 +- .pre-commit-config.yaml | 0 .settings/org.eclipse.core.resources.prefs | 2 + README.md | 3 +- TODO.md | 11 + compose-llms.yaml | 44 ++ devcontainer/Dockerfile | 30 -- devcontainer/devcontainer.json | 98 ---- devcontainer/on-create.sh | 11 - .dockerignore => infra/docker/.dockerignore | 0 infra/docker/airflow/Dockerfile | 0 .../docker/docker-compose.yaml | 0 .../docker/docker-compose.yml | 2 - infra/docker/mlflow/Dockerfile | 0 .../observe}/docker-compose-observe.yml | 0 {etc => infra/observe/etc}/dashboards.yaml | 0 infra/observe/grafana/monitor.json | 0 infra/observe/loki/monitor.json | 0 .../platform/prometheus/prometheus.yml | 0 misc_files/ModifiedDeepNN.py | 37 +- misc_files/NeuralNetworkDiag.py | 0 misc_files/README.md | 0 misc_files/bilinear_examples.py | 0 misc_files/simulator.py | 33 +- quantum/README.md | 0 .../{ => fine_tuned}/reinforcement_trainer.py | 0 .../{ => fine_tuned}/src/__init__.py | 0 .../{ => fine_tuned}/src/models/__init__.py | 0 reinforcement/{ => from_scratch}/README.md | 0 .../{ => from_scratch}/basic_example.ipynb | 0 .../from_scratch/reinforcement_trainer.py | 0 .../src}/__init__.py | 0 .../from_scratch/src/models}/__init__.py | 0 .../from_scratch/src/models/train_model.py | 8 + .../src/visualization}/__init__.py | 0 .../src/visualization/visualize.py | 0 requirements.txt | 6 +- supervised/README.md | 0 supervised/__init__.py | 0 supervised/base-trainer/docs/Makefile | 0 supervised/base-trainer/docs/conf.py | 12 +- supervised/base-trainer/src/__init__.py | 0 supervised/base-trainer/src/data/.gitkeep | 0 supervised/base-trainer/src/data/__init__.py | 0 .../base-trainer/src/data/make_dataset.py | 2 +- supervised/base-trainer/src/features/.gitkeep | 0 .../base-trainer/src/features/__init__.py | 0 .../src/features/build_features.py | 0 .../base-trainer/src/models/__init__.py | 0 .../base-trainer/src/models/predict_model.py | 0 .../base-trainer/src/models/train_model.py | 0 .../base-trainer/src/visualization/.gitkeep | 0 .../src/visualization/__init__.py | 0 .../src/visualization/visualize.py | 0 .../src/transformer.py | 0 .../generative-ai/src/models/train_model.py | 0 supervised/image_processing/README.md | 0 supervised/{generative-ai => nlp}/README.md | 0 .../{generative-ai => nlp}/notebooks/REAME.md | 0 .../src/visualization => nlp/src}/__init__.py | 0 .../nlp/src/models}/__init__.py | 0 .../src/models/generate_data.py | 0 .../nlp}/src/models/train_model.py | 0 .../nlp/src/visualization}/__init__.py | 0 .../src/visualization/visualize.py | 0 supervised/notebooks/README_personal.md | 0 supervised/notebooks/obj_detect.ipynb | 0 supervised/notebooks/py_tf_kafka.ipynb | 0 supervised/notebooks/tf_kafka_example.ipynb | 0 supervised/recommenders/notebooks/REAME.md | 0 supervised/recommenders/src/__init__.py | 0 .../recommenders/src/models/__init__.py | 0 .../recommenders/src/models/train_model.py | 0 .../src/visualization/__init__.py | 0 .../src/visualization/visualize.py | 0 .../predict_model-checkpoint.py | 0 supervised/trainer/predict_model.py | 0 supervised/trainer/train_model.py | 0 target/pylist.json | 1 - transformers/README.md | 32 +- .../decoder-only}/src/__init__.py | 0 .../encoder-decoder}/__init__.py | 0 
.../encoder}/__init__.py | 0 .../decoder/src}/__init__.py | 0 .../from_scratch/decoder/test_transformer.py | 0 .../encoder-decoder/src/__init__.py | 0 .../encoder-decoder/src/decoder.py | 162 +++++++ .../encoder-decoder/src/encoder.py | 115 +++++ .../encoder-decoder}/src/lr_scheduler.py | 9 +- .../encoder-decoder/src/models-tf/__init__.py | 0 .../encoder-decoder}/src/models-tf/encoder.py | 10 +- .../src/models-tf/predict_model.py | 0 .../src/models-tf/train_model.py | 13 +- .../src/models-torch/predict_model.py | 0 .../src/multi_head_attention.py | 216 +++++++++ .../src/positional_encoding.py | 45 ++ .../encoder-decoder/src/test_decoder.py | 153 +++++++ .../src/test_positional_encoding.py | 129 ++++++ .../encoder-decoder/src/test_utils.py | 62 +++ .../encoder-decoder/src/test_vocabulary.py | 52 +++ .../from_scratch/encoder-decoder/src/train.py | 70 +++ .../encoder-decoder/src/transformer.py | 54 +++ .../encoder-decoder}/src/utils.py | 78 +--- .../encoder-decoder/src/vocabulary.py | 88 ++++ .../test_multi_head_attention.py | 88 ++++ .../encoder-decoder/test_train.py | 98 ++++ .../encoder-decoder/test_transformer.py | 89 ++++ .../from_scratch/encoder/src/__init__.py | 0 .../from_scratch/encoder/src/vocabulary.py | 6 + .../from_scratch/encoder/test_transformer.py | 0 transformers/from_scratch/src/sentence.py | 25 -- transformers/src/decoder.py | 335 -------------- transformers/src/models-torch/decoder.py | 1 - transformers/src/models-torch/encoder.py | 0 transformers/src/models-torch/train_model.py | 0 transformers/src/multi_head_attention.py | 418 ------------------ transformers/src/positional_encoding.py | 178 -------- transformers/src/transformer.py | 148 ------- transformers/src/vocabulary.py | 54 --- unsupervised/README.md | 5 +- 125 files changed, 1610 insertions(+), 1485 deletions(-) delete mode 100644 .devcontainer/Dockerfile delete mode 100644 .devcontainer/devcontainer.json delete mode 100644 .devcontainer/noop.txt mode change 100644 => 100755 .gitignore mode change 100644 => 100755 .gitpod.Dockerfile mode change 100644 => 100755 .gitpod.yml mode change 100644 => 100755 .pre-commit-config.yaml create mode 100755 TODO.md create mode 100644 compose-llms.yaml delete mode 100644 devcontainer/Dockerfile delete mode 100644 devcontainer/devcontainer.json delete mode 100755 devcontainer/on-create.sh rename .dockerignore => infra/docker/.dockerignore (100%) mode change 100644 => 100755 mode change 100644 => 100755 infra/docker/airflow/Dockerfile rename docker-compose.yaml => infra/docker/docker-compose.yaml (100%) mode change 100644 => 100755 rename docker-compose.yml => infra/docker/docker-compose.yml (99%) mode change 100644 => 100755 mode change 100644 => 100755 infra/docker/mlflow/Dockerfile rename {observe => infra/observe}/docker-compose-observe.yml (100%) mode change 100644 => 100755 rename {etc => infra/observe/etc}/dashboards.yaml (100%) mode change 100644 => 100755 mode change 100644 => 100755 infra/observe/grafana/monitor.json mode change 100644 => 100755 infra/observe/loki/monitor.json rename {observe => infra/observe}/platform/prometheus/prometheus.yml (100%) mode change 100644 => 100755 mode change 100644 => 100755 misc_files/ModifiedDeepNN.py mode change 100644 => 100755 misc_files/NeuralNetworkDiag.py mode change 100644 => 100755 misc_files/README.md mode change 100644 => 100755 misc_files/bilinear_examples.py mode change 100644 => 100755 misc_files/simulator.py mode change 100644 => 100755 quantum/README.md rename reinforcement/{ => fine_tuned}/reinforcement_trainer.py 
(100%) mode change 100644 => 100755 rename reinforcement/{ => fine_tuned}/src/__init__.py (100%) mode change 100644 => 100755 rename reinforcement/{ => fine_tuned}/src/models/__init__.py (100%) mode change 100644 => 100755 rename reinforcement/{ => from_scratch}/README.md (100%) mode change 100644 => 100755 rename reinforcement/{ => from_scratch}/basic_example.ipynb (100%) mode change 100644 => 100755 rename docker/airflow/Dockerfile => reinforcement/from_scratch/reinforcement_trainer.py (100%) mode change 100644 => 100755 rename reinforcement/{src/visualization => from_scratch/src}/__init__.py (100%) mode change 100644 => 100755 rename {supervised/generative-ai/src => reinforcement/from_scratch/src/models}/__init__.py (100%) mode change 100644 => 100755 create mode 100755 reinforcement/from_scratch/src/models/train_model.py rename {supervised/generative-ai/src/models => reinforcement/from_scratch/src/visualization}/__init__.py (100%) mode change 100644 => 100755 rename reinforcement/{ => from_scratch}/src/visualization/visualize.py (100%) mode change 100644 => 100755 mode change 100644 => 100755 requirements.txt mode change 100644 => 100755 supervised/README.md mode change 100644 => 100755 supervised/__init__.py mode change 100644 => 100755 supervised/base-trainer/docs/Makefile mode change 100644 => 100755 supervised/base-trainer/src/__init__.py mode change 100644 => 100755 supervised/base-trainer/src/data/.gitkeep mode change 100644 => 100755 supervised/base-trainer/src/data/__init__.py mode change 100644 => 100755 supervised/base-trainer/src/data/make_dataset.py mode change 100644 => 100755 supervised/base-trainer/src/features/.gitkeep mode change 100644 => 100755 supervised/base-trainer/src/features/__init__.py mode change 100644 => 100755 supervised/base-trainer/src/features/build_features.py mode change 100644 => 100755 supervised/base-trainer/src/models/__init__.py mode change 100644 => 100755 supervised/base-trainer/src/models/predict_model.py mode change 100644 => 100755 supervised/base-trainer/src/models/train_model.py mode change 100644 => 100755 supervised/base-trainer/src/visualization/.gitkeep mode change 100644 => 100755 supervised/base-trainer/src/visualization/__init__.py mode change 100644 => 100755 supervised/base-trainer/src/visualization/visualize.py rename supervised/{transformers => classical}/src/transformer.py (100%) mode change 100644 => 100755 delete mode 100644 supervised/generative-ai/src/models/train_model.py mode change 100644 => 100755 supervised/image_processing/README.md rename supervised/{generative-ai => nlp}/README.md (100%) mode change 100644 => 100755 rename supervised/{generative-ai => nlp}/notebooks/REAME.md (100%) mode change 100644 => 100755 rename supervised/{generative-ai/src/visualization => nlp/src}/__init__.py (100%) mode change 100644 => 100755 rename {transformers => supervised/nlp/src/models}/__init__.py (100%) mode change 100644 => 100755 rename supervised/{generative-ai => nlp}/src/models/generate_data.py (100%) mode change 100644 => 100755 rename {reinforcement => supervised/nlp}/src/models/train_model.py (100%) mode change 100644 => 100755 rename {transformers/from_scratch => supervised/nlp/src/visualization}/__init__.py (100%) mode change 100644 => 100755 rename supervised/{generative-ai => nlp}/src/visualization/visualize.py (100%) mode change 100644 => 100755 mode change 100644 => 100755 supervised/notebooks/README_personal.md mode change 100644 => 100755 supervised/notebooks/obj_detect.ipynb mode change 100644 => 100755 
supervised/notebooks/py_tf_kafka.ipynb mode change 100644 => 100755 supervised/notebooks/tf_kafka_example.ipynb mode change 100644 => 100755 supervised/recommenders/notebooks/REAME.md mode change 100644 => 100755 supervised/recommenders/src/__init__.py mode change 100644 => 100755 supervised/recommenders/src/models/__init__.py mode change 100644 => 100755 supervised/recommenders/src/models/train_model.py mode change 100644 => 100755 supervised/recommenders/src/visualization/__init__.py mode change 100644 => 100755 supervised/recommenders/src/visualization/visualize.py mode change 100644 => 100755 supervised/trainer/.ipynb_checkpoints/predict_model-checkpoint.py mode change 100644 => 100755 supervised/trainer/predict_model.py mode change 100644 => 100755 supervised/trainer/train_model.py delete mode 100644 target/pylist.json mode change 100644 => 100755 transformers/README.md rename transformers/{from_scratch => finetuned/decoder-only}/src/__init__.py (100%) mode change 100644 => 100755 rename transformers/{src => finetuned/encoder-decoder}/__init__.py (100%) mode change 100644 => 100755 rename transformers/{src/models-tf => finetuned/encoder}/__init__.py (100%) mode change 100644 => 100755 rename transformers/{src/models-torch => from_scratch/decoder/src}/__init__.py (100%) rename docker/airflow/requirements.txt => transformers/from_scratch/decoder/test_transformer.py (100%) rename docker/mlflow/Dockerfile => transformers/from_scratch/encoder-decoder/src/__init__.py (100%) mode change 100644 => 100755 create mode 100755 transformers/from_scratch/encoder-decoder/src/decoder.py create mode 100755 transformers/from_scratch/encoder-decoder/src/encoder.py rename transformers/{ => from_scratch/encoder-decoder}/src/lr_scheduler.py (84%) mode change 100644 => 100755 rename docker/mlflow/requirements.txt => transformers/from_scratch/encoder-decoder/src/models-tf/__init__.py (100%) mode change 100644 => 100755 rename transformers/{ => from_scratch/encoder-decoder}/src/models-tf/encoder.py (96%) mode change 100644 => 100755 rename transformers/{ => from_scratch/encoder-decoder}/src/models-tf/predict_model.py (100%) mode change 100644 => 100755 rename transformers/{ => from_scratch/encoder-decoder}/src/models-tf/train_model.py (96%) mode change 100644 => 100755 rename transformers/{ => from_scratch/encoder-decoder}/src/models-torch/predict_model.py (100%) mode change 100644 => 100755 create mode 100755 transformers/from_scratch/encoder-decoder/src/multi_head_attention.py create mode 100755 transformers/from_scratch/encoder-decoder/src/positional_encoding.py create mode 100755 transformers/from_scratch/encoder-decoder/src/test_decoder.py create mode 100755 transformers/from_scratch/encoder-decoder/src/test_positional_encoding.py create mode 100755 transformers/from_scratch/encoder-decoder/src/test_utils.py create mode 100755 transformers/from_scratch/encoder-decoder/src/test_vocabulary.py create mode 100644 transformers/from_scratch/encoder-decoder/src/train.py create mode 100755 transformers/from_scratch/encoder-decoder/src/transformer.py rename transformers/{ => from_scratch/encoder-decoder}/src/utils.py (51%) mode change 100644 => 100755 create mode 100755 transformers/from_scratch/encoder-decoder/src/vocabulary.py create mode 100755 transformers/from_scratch/encoder-decoder/test_multi_head_attention.py create mode 100644 transformers/from_scratch/encoder-decoder/test_train.py create mode 100755 transformers/from_scratch/encoder-decoder/test_transformer.py rename docker/postgres/Dockerfile => 
transformers/from_scratch/encoder/src/__init__.py (100%) create mode 100644 transformers/from_scratch/encoder/src/vocabulary.py rename docker/postgres/requirements.txt => transformers/from_scratch/encoder/test_transformer.py (100%) delete mode 100644 transformers/from_scratch/src/sentence.py delete mode 100644 transformers/src/decoder.py delete mode 100644 transformers/src/models-torch/decoder.py delete mode 100644 transformers/src/models-torch/encoder.py delete mode 100644 transformers/src/models-torch/train_model.py delete mode 100644 transformers/src/multi_head_attention.py delete mode 100644 transformers/src/positional_encoding.py delete mode 100644 transformers/src/transformer.py delete mode 100644 transformers/src/vocabulary.py mode change 100644 => 100755 unsupervised/README.md diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile deleted file mode 100644 index 351a829..0000000 --- a/.devcontainer/Dockerfile +++ /dev/null @@ -1,11 +0,0 @@ -FROM mcr.microsoft.com/devcontainers/anaconda:0-3 - -# Copy environment.yml (if found) to a temp location so we update the environment. Also -# copy "noop.txt" so the COPY instruction does not fail if no environment.yml exists. -COPY environment.yml* .devcontainer/noop.txt /tmp/conda-tmp/ -RUN if [ -f "/tmp/conda-tmp/environment.yml" ]; then umask 0002 && /opt/conda/bin/conda env update -n base -f /tmp/conda-tmp/environment.yml; fi \ - && rm -rf /tmp/conda-tmp - -# [Optional] Uncomment this section to install additional OS packages. -# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ -# && apt-get -y install --no-install-recommends diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json deleted file mode 100644 index 6554f6d..0000000 --- a/.devcontainer/devcontainer.json +++ /dev/null @@ -1,32 +0,0 @@ -// For format details, see https://aka.ms/devcontainer.json. For config options, see the -// README at: https://github.com/devcontainers/templates/tree/main/src/anaconda -{ - "name": "Anaconda (Python 3)", - "build": { - "context": "..", - "dockerfile": "Dockerfile" - }, - "features": { - "ghcr.io/devcontainers/features/docker-in-docker:2": {}, - "ghcr.io/devcontainers/features/docker-outside-of-docker:1": {}, - "ghcr.io/devcontainers-contrib/features/angular-cli:2": {}, - "ghcr.io/devcontainers-contrib/features/vault-asdf:2": {}, - "ghcr.io/devcontainers-contrib/features/zsh-plugins:0": {}, - "ghcr.io/cirolosapio/devcontainers-features/alpine-ohmyzsh:0": {} - } - - // Features to add to the dev container. More info: https://containers.dev/features. - // "features": {}, - - // Use 'forwardPorts' to make a list of ports inside the container available locally. - // "forwardPorts": [], - - // Use 'postCreateCommand' to run commands after the container is created. - // "postCreateCommand": "python --version", - - // Configure tool-specific properties. - // "customizations": {}, - - // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. - // "remoteUser": "root" -} diff --git a/.devcontainer/noop.txt b/.devcontainer/noop.txt deleted file mode 100644 index dde8dc3..0000000 --- a/.devcontainer/noop.txt +++ /dev/null @@ -1,3 +0,0 @@ -This file copied into the container along with environment.yml* from the parent -folder. This file is included to prevents the Dockerfile COPY instruction from -failing if no environment.yml is found. 
\ No newline at end of file diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 index 40db02b..0b1a201 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,14 @@ .settings .vscode .idea -AI_overview.drawio \ No newline at end of file +AI_overview.drawio +.conda +.settings/ +.idea +.vscode +transformers/from_scratch/src/__pycache__ +transformers/from_scratch/decoder/src/__pycache__ +transformers/from_scratch/encoder/src/__pycache__ +transformers/from_scratch/encoder-decoder/src/__pycache__ +supervised/base-trainer/.ipynb_checkpoints +.project \ No newline at end of file diff --git a/.gitpod.Dockerfile b/.gitpod.Dockerfile old mode 100644 new mode 100755 diff --git a/.gitpod.yml b/.gitpod.yml old mode 100644 new mode 100755 index 8af7cea..45bb565 --- a/.gitpod.yml +++ b/.gitpod.yml @@ -38,8 +38,8 @@ vscode: - bierner.markdown-preview-github-styles - oderwat.indent-rainbow - mongodb.mongodb-vscode - - 2gua.rainbow-brackets - - dzhavat.bracket-pair-toggler + # - 2gua.rainbow-brackets + # - dzhavat.bracket-pair-toggler - IBM.output-colorizer - GitHub.vscode-pull-request-github - cweijan.git-graph-history diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml old mode 100644 new mode 100755 diff --git a/.settings/org.eclipse.core.resources.prefs b/.settings/org.eclipse.core.resources.prefs index 6601bc8..a95d0cb 100644 --- a/.settings/org.eclipse.core.resources.prefs +++ b/.settings/org.eclipse.core.resources.prefs @@ -1,2 +1,4 @@ eclipse.preferences.version=1 +encoding//misc_files/bilinear_examples.py=utf-8 +encoding//supervised/base-trainer/src/data/make_dataset.py=utf-8 encoding/bilinear_examples.py=utf-8 diff --git a/README.md b/README.md index 198e292..c279c03 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,8 @@ This repository contains examples, tutorials, tools and frameworks on how to prepare training environments, train and apply machine learning (ML) models to problems in various settings. The training environments and selected tools will be based on popularity and personal choice with focus on their overall performance. Emphasis will also be laid on examples where performace will affect model training and implementation. Though my aim is not to recommend any particular tools and frameworks but I am hopeful that you may gain from my personal experience in using these tools. Besides, I will also select a number of ML model examples which are mostly suited to my use cases. Specifically, I will mention why using a particular tool may be suitable in a given scenario. Next, I will begin by listing the main tools and discuss the training methods and application of the ML models o interest. -### Preparing the Training Environment +### Preparing the Training Environment + In order to begin training or fine-tuning any model, we must prepare the training environment. This is necessary in order to facilitate training and manage different Python versions. This also provides a virtual representation of the libraries and enables us to effectively manage tools and frameworks. It also helps us to prevent potential issues that may arise with using incompatible tools or frameworks which may affect the settings of your operating system. In python, I will usually download and install the latest stable Python version. 
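As a quick illustration of the environment check described above, here is a minimal sketch that prints the active interpreter and the versions of a few core libraries pinned in requirements.txt; the helper name `report_environment` and the exact package list are illustrative rather than part of this patch.

```python
# Minimal sanity check for a freshly prepared virtual environment.
# The package names are examples drawn from requirements.txt.
import importlib
import sys


def report_environment(packages=("numpy", "pandas", "torch", "tensorflow")):
    """Print the interpreter in use and the installed version of each package."""
    print(f"interpreter: {sys.executable} (Python {sys.version.split()[0]})")
    for name in packages:
        try:
            module = importlib.import_module(name)
            print(f"{name}: {getattr(module, '__version__', 'unknown')}")
        except ImportError:
            print(f"{name}: not installed in this environment")


if __name__ == "__main__":
    report_environment()
```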
diff --git a/TODO.md b/TODO.md new file mode 100755 index 0000000..7dac2b7 --- /dev/null +++ b/TODO.md @@ -0,0 +1,11 @@ +# Highlights + + We cover the following topics: + +- Vectors, Matrices and Tensors +- Vectors = magnitude and direction +- Essential bits of Neural Network Models +- Structure of Neural Network Models +- Essentials of Python Programming +- Preparing the AI Training Tools (TensorFlow, JAX, PyTorch) +- Implementation and Applications of AI Models diff --git a/compose-llms.yaml b/compose-llms.yaml new file mode 100644 index 0000000..315fbb7 --- /dev/null +++ b/compose-llms.yaml @@ -0,0 +1,44 @@ +version: '3' + +services: + tgi: + image: ghcr.io/huggingface/text-generation-inference:latest + container_name: tgi + ports: + - 8080:80 + volumes: + - ${LOCAL_MODEL_CACHE_DIR}:/model_cache + environment: + - HUGGING_FACE_HUB_TOKEN=${LLAMA_TOKEN} + # need this to access GPU + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + command: + - '--huggingface-hub-cache' + - '/model_cache' + - '--model-id' + - '${MODEL_ID}' + - '--max-batch-prefill-tokens' + - '${MAX_PREFILL_TOKENS}' + - '--quantize' + - '${QUANT}' + - '--max-total-tokens' + - '${MAX_TOTAL_TOKENS}' + - '--max-input-length' + - '${MAX_INPUT_LENGTH}' + shm_size: 1gb + ui: + image: localllm-ui:latest + container_name: ui + build: + context: ./chat_ui/ + ports: + - 7000:7000 + + # api: + # image: \ No newline at end of file diff --git a/devcontainer/Dockerfile b/devcontainer/Dockerfile deleted file mode 100644 index 22db779..0000000 --- a/devcontainer/Dockerfile +++ /dev/null @@ -1,30 +0,0 @@ -# See here for image contents: https://github.com/microsoft/vscode-dev-containers/tree/v0.238.1/containers/python-3/.devcontainer/base.Dockerfile - -# [Choice] Python version (use -bullseye variants on local arm64/Apple Silicon): 3, 3.10, 3.9, 3.8, 3.7, 3.6, 3-bullseye, 3.10-bullseye, 3.9-bullseye, 3.8-bullseye, 3.7-bullseye, 3.6-bullseye, 3-buster, 3.10-buster, 3.9-buster, 3.8-buster, 3.7-buster, 3.6-buster -ARG VARIANT="3.10-bullseye" -FROM mcr.microsoft.com/vscode/devcontainers/python:0-${VARIANT} - -# [Choice] Node.js version: none, lts/*, 16, 14, 12, 10 -ARG NODE_VERSION="none" -RUN if [ "${NODE_VERSION}" != "none" ]; then su vscode -c "umask 0002 && . /usr/local/share/nvm/nvm.sh && nvm install ${NODE_VERSION} 2>&1"; fi - -# [Optional] If your pip requirements rarely change, uncomment this section to add them to the image. -COPY requirements.txt /tmp/pip-tmp/ -RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \ - && rm -rf /tmp/pip-tmp - -# [Optional] Uncomment this section to install additional OS packages. -# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ -# && apt-get -y install --no-install-recommends - -# [Optional] Uncomment this line to install global node packages. -# RUN su vscode -c "source /usr/local/share/nvm/nvm.sh && npm install -g " 2>&1 -RUN pip install -q tensorflow-ranking && pip install -q --upgrade tensorflow-datasets -RUN pip install pip install --upgrade tensorflow-hub - -RUN pip install --no-cache-dir matplotlib pandas jupyter jupyterlab -# RUN protoc object_detection/protos/*.proto --python_out=.
- -EXPOSE 8888 - -ENTRYPOINT ["jupyter", "lab","--ip=0.0.0.0","--allow-root"] \ No newline at end of file diff --git a/devcontainer/devcontainer.json b/devcontainer/devcontainer.json deleted file mode 100644 index 8dca8f7..0000000 --- a/devcontainer/devcontainer.json +++ /dev/null @@ -1,98 +0,0 @@ -// For format details, see https://aka.ms/devcontainer.json. For config options, see the README at: -// https://github.com/microsoft/vscode-dev-containers/tree/v0.238.1/containers/python-3 -{ - "name": "Python 3", - "build": { - "dockerfile": "Dockerfile", - "context": "..", - "args": { - // Update 'VARIANT' to pick a Python version: 3, 3.10, 3.9, 3.8, 3.7, 3.6 - // Append -bullseye or -buster to pin to an OS version. - // Use -bullseye variants on local on arm64/Apple Silicon. - "VARIANT": "3.10-bullseye", - // Options - "NODE_VERSION": "lts/*" - } - }, - - // Configure tool-specific properties. - "customizations": { - // Configure properties specific to VS Code. - "vscode": { - // Set *default* container specific settings.json values on container create. - "settings": { - "python.defaultInterpreterPath": "/usr/local/bin/python", - "python.linting.enabled": true, - "python.linting.pylintEnabled": true, - "python.formatting.autopep8Path": "/usr/local/py-utils/bin/autopep8", - "python.formatting.blackPath": "/usr/local/py-utils/bin/black", - "python.formatting.yapfPath": "/usr/local/py-utils/bin/yapf", - "python.linting.banditPath": "/usr/local/py-utils/bin/bandit", - "python.linting.flake8Path": "/usr/local/py-utils/bin/flake8", - "python.linting.mypyPath": "/usr/local/py-utils/bin/mypy", - "python.linting.pycodestylePath": "/usr/local/py-utils/bin/pycodestyle", - "python.linting.pydocstylePath": "/usr/local/py-utils/bin/pydocstyle", - "python.linting.pylintPath": "/usr/local/py-utils/bin/pylint" - }, - - // Add the IDs of extensions you want installed when the container is created. - "extensions": [ - "ms-python.python", - "ms-python.vscode-pylance", - "vscjava.vscode-java-pack", - "formulahendry.vscode-mysql", - "cweijan.vscode-mysql-client2", - "Pivotal.vscode-boot-dev-pack", - "vscjava.vscode-lombok", - "eamodio.gitlens", - "felipecaputo.git-project-manager", - "dbaeumer.vscode-eslint", - "ms-vscode.cpptools-extension-pack", - "evondev.indent-rainbow-palettes", - "donjayamanne.git-extension-pack", - "esbenp.prettier-vscode", - "redhat.vscode-xml", - "rangav.vscode-thunder-client", - "developersoapbox.vscode-springboot-developer-pack", - "ms-azuretools.vscode-dapr", - "bierner.markdown-preview-github-styles", - "oderwat.indent-rainbow", - "mongodb.mongodb-vscode", - "2gua.rainbow-brackets", - "dzhavat.bracket-pair-toggler", - "IBM.output-colorizer", - "GitHub.vscode-pull-request-github", - "cweijan.git-graph-history", - "mhutchie.git-graph", - "ms-python.vscode-pylance", - "ms-python.python", - "pivotal.vscode-boot-dev-pack", - "davidanson.vscode-markdownlint", - "mechatroner.rainbow-csv" - ] - } - }, - - // Use 'forwardPorts' to make a list of ports inside the container available locally. - // "forwardPorts": [], - - // Use 'postCreateCommand' to run commands after the container is created. - // "postCreateCommand": "pip3 install --user -r requirements.txt", - "onCreateCommand": "/bin/bash -c .devcontainer/on-create.sh", - "postCreateCommand": "sudo apt-get install -y python3-lxml", - - // Comment out to connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root. 
- "remoteUser": "vscode", - "features": { - "docker-in-docker": "latest", - "kubectl-helm-minikube": "latest", - "terraform": "latest", - "git-lfs": "latest", - "fish": "latest", - "java": "lts", - "maven": "latest", - "gradle": "latest", - "dotnet": "latest", - "jupyterlab": "latest" - } -} diff --git a/devcontainer/on-create.sh b/devcontainer/on-create.sh deleted file mode 100755 index 0115fb0..0000000 --- a/devcontainer/on-create.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -echo "on-create start" >> ~/status - -# install dapr cli -wget -q https://raw.githubusercontent.com/dapr/cli/master/install/install.sh -O - | /bin/bash - -# initialize dapr -dapr init - -echo "on-create complete" >> ~/status diff --git a/.dockerignore b/infra/docker/.dockerignore old mode 100644 new mode 100755 similarity index 100% rename from .dockerignore rename to infra/docker/.dockerignore diff --git a/infra/docker/airflow/Dockerfile b/infra/docker/airflow/Dockerfile old mode 100644 new mode 100755 diff --git a/docker-compose.yaml b/infra/docker/docker-compose.yaml old mode 100644 new mode 100755 similarity index 100% rename from docker-compose.yaml rename to infra/docker/docker-compose.yaml diff --git a/docker-compose.yml b/infra/docker/docker-compose.yml old mode 100644 new mode 100755 similarity index 99% rename from docker-compose.yml rename to infra/docker/docker-compose.yml index 45c3bcb..2e05916 --- a/docker-compose.yml +++ b/infra/docker/docker-compose.yml @@ -1,5 +1,3 @@ ---- -version: '2' services: zookeeper: image: confluentinc/cp-zookeeper:7.1.0 diff --git a/infra/docker/mlflow/Dockerfile b/infra/docker/mlflow/Dockerfile old mode 100644 new mode 100755 diff --git a/observe/docker-compose-observe.yml b/infra/observe/docker-compose-observe.yml old mode 100644 new mode 100755 similarity index 100% rename from observe/docker-compose-observe.yml rename to infra/observe/docker-compose-observe.yml diff --git a/etc/dashboards.yaml b/infra/observe/etc/dashboards.yaml old mode 100644 new mode 100755 similarity index 100% rename from etc/dashboards.yaml rename to infra/observe/etc/dashboards.yaml diff --git a/infra/observe/grafana/monitor.json b/infra/observe/grafana/monitor.json old mode 100644 new mode 100755 diff --git a/infra/observe/loki/monitor.json b/infra/observe/loki/monitor.json old mode 100644 new mode 100755 diff --git a/observe/platform/prometheus/prometheus.yml b/infra/observe/platform/prometheus/prometheus.yml old mode 100644 new mode 100755 similarity index 100% rename from observe/platform/prometheus/prometheus.yml rename to infra/observe/platform/prometheus/prometheus.yml diff --git a/misc_files/ModifiedDeepNN.py b/misc_files/ModifiedDeepNN.py old mode 100644 new mode 100755 index 65842eb..0e59fba --- a/misc_files/ModifiedDeepNN.py +++ b/misc_files/ModifiedDeepNN.py @@ -6,19 +6,25 @@ #Partly obtained from a discussion on www.stackoverflow.com- and #modified for personal use for my thesis. 
""" -from matplotlib import pyplot from math import cos, sin, atan + +from matplotlib import pyplot import numpy as np # Define the number of neurons in each layer class Neuron: + """Define neuron position.""" def __init__(self, x, y): self.x = x self.y = y def draw(self): + """Draw to be called.""" circle = pyplot.Circle((self.x, self.y), radius=neuron_radius, fill=True) pyplot.gca().add_patch(circle) + + def layer_print(self): + """Prints the individual layers.""" # Define class for each layer with number of connecting weights, number of neurons, @@ -26,11 +32,13 @@ def draw(self): # as well as vertical distances between layers. It also defines # a function to draw the connecting weights, the individual neurons, too. class Layer: - def __init__(self, network, number_of_neurons, weights): + """Defines the Layer object.""" + def __init__(self, network, number_of_neurons, weights, distance): self.previous_layer = self.__get_previous_layer(network) self.y = self.__calculate_layer_y_position() self.neurons = self.__intialise_neurons(number_of_neurons) self.weights = weights + self.horizontal_distance_between_neurons = distance def __intialise_neurons(self, number_of_neurons): neurons = [] @@ -38,12 +46,12 @@ def __intialise_neurons(self, number_of_neurons): for iteration in range(number_of_neurons): neuron = Neuron(x, self.y) neurons.append(neuron) - x += horizontal_distance_between_neurons + x += self.horizontal_distance_between_neurons return neurons def __calculate_left_margin_so_layer_is_centered(self, number_of_neurons): return ( - horizontal_distance_between_neurons + self.horizontal_distance_between_neurons * (number_of_neurons_in_widest_layer - number_of_neurons) / 2 ) @@ -54,9 +62,9 @@ def __calculate_layer_y_position(self): else: return 0 - def __get_previous_layer(self, network): - if len(network.layers) > 0: - return network.layers[-1] + def __get_previous_layer(self, previous_network): + if len(previous_network.layers) > 0: + return previous_network.layers[-1] else: return None @@ -91,14 +99,17 @@ def draw(self): # Appends the layers to each other, add neurons and defines the # drawing area. It also scales the axes and provides labels. class NeuralNetwork: + """Initialize.""" def __init__(self): self.layers = [] def add_layer(self, number_of_neurons, weights=None): + """Define num of neurons and add layer.""" layer = Layer(self, number_of_neurons, weights) self.layers.append(layer) def draw(self): + """Draw network is called.""" for layer in self.layers: layer.draw() pyplot.axis("scaled") @@ -110,10 +121,14 @@ def draw(self): # Execution starts here. Provide the weights as a matrix, or define weight entries from # a file. if __name__ == "__main__": - vertical_distance_between_layers = 6 - horizontal_distance_between_neurons = 2 - neuron_radius = 0.5 - number_of_neurons_in_widest_layer = 7 + # vertical_distance_between_layers = 6 + VERTICAL_DISTANCE_BETWEEN_LAYERS = 6 + # horizontal_distance_between_neurons = 2 + HORIZONTAL_DISTANCE_BETWEEN_NEURONS = 2 + NEURON_RADIUS = 0.5 + # neuron_radius = 0.5 + # number_of_neurons_in_widest_layer = 7 + NUMBER_OF_NEURONS_IN_WIDEST_LAYER = 7 network = NeuralNetwork() # weights to convert from 10 outputs to 4 (decimal digits to their binary representation) # Matrices with rows as no. of weights, and columns as no. 
of neurons diff --git a/misc_files/NeuralNetworkDiag.py b/misc_files/NeuralNetworkDiag.py old mode 100644 new mode 100755 diff --git a/misc_files/README.md b/misc_files/README.md old mode 100644 new mode 100755 diff --git a/misc_files/bilinear_examples.py b/misc_files/bilinear_examples.py old mode 100644 new mode 100755 diff --git a/misc_files/simulator.py b/misc_files/simulator.py old mode 100644 new mode 100755 index 9017391..1d7dba0 --- a/misc_files/simulator.py +++ b/misc_files/simulator.py @@ -5,11 +5,12 @@ Button-2 + Drag : zoom Button-3 + Drag : camera movement """ -from __future__ import division -from vpython import (color, display, curve, sphere, label) # http://www.vpython.org/webdoc/visual/ -from random import random import time - +import random +from __future__ import division +# http://www.vpython.org/webdoc/visual/ +from vpython import (color, window, curve, sphere) +# from vpython import color, curve, sphere, label # gnts ================================================================ WINDOW_TITLE = "Brownian Motion" WINDOW_WIDTH = 640 + 4 + 4 @@ -36,7 +37,7 @@ SLEEP_SECONDS = 0.25 # seconds to delay among each simulation dt SAMPLING_RATE = 0.5 # dts to skip among each sampling -sigma = 10 +SIGMA = 10 # maximum displacement applied when computing the particles movement @@ -51,13 +52,13 @@ # Axis X axisX = curve( - pos=[(-sigma, 0, 0), (sigma, 0, 0)], color=color.white, radius=CUBE_THICKNESS + pos=[(-SIGMA, 0, 0), (SIGMA, 0, 0)], color=color.white, radius=CUBE_THICKNESS ) # Axis Y axisY = curve( - pos=[(0, -4 * sigma, 0), (0, 4 * sigma, 0)], + pos=[(0, -4 * SIGMA, 0), (0, 4 * SIGMA, 0)], color=color.white, radius=CUBE_THICKNESS, ) @@ -67,8 +68,8 @@ particles = [] streams = [] -min = 1.1 * PARTICLES_SIZE -max = CUBE_SIZE - min +MIN = 1.1 * PARTICLES_SIZE +MAX = CUBE_SIZE - MIN # Particles: creation for i in range(PARTICLES_TOTAL): @@ -97,9 +98,9 @@ pos = particles[i].pos # Random walk - dS = sigma * (random.random() - 0.5) + dS = SIGMA * (random.random() - 0.5) - pos[0] = dt * sigma + pos[0] = dt * SIGMA pos[1] = pos[1] + dS pos[2] = 0 @@ -107,11 +108,11 @@ streams[i].append(pos=pos) axisX.append(pos=(pos[0], 0, 0)) - newThickness = CUBE_THICKNESS * (dt * SAMPLING_RATE) + 1 - streams[i].radius = newThickness - axisX.radius = newThickness * 0.5 - axisY.radius = newThickness * 0.5 - particles[i].radius = newThickness * 2.0 + NEW_THICKNESS = CUBE_THICKNESS * (dt * SAMPLING_RATE) + 1 + streams[i].radius = NEW_THICKNESS + axisX.radius = NEW_THICKNESS * 0.5 + axisY.radius = NEW_THICKNESS * 0.5 + particles[i].radius = NEW_THICKNESS * 2.0 time.sleep(SLEEP_SECONDS) diff --git a/quantum/README.md b/quantum/README.md old mode 100644 new mode 100755 diff --git a/reinforcement/reinforcement_trainer.py b/reinforcement/fine_tuned/reinforcement_trainer.py old mode 100644 new mode 100755 similarity index 100% rename from reinforcement/reinforcement_trainer.py rename to reinforcement/fine_tuned/reinforcement_trainer.py diff --git a/reinforcement/src/__init__.py b/reinforcement/fine_tuned/src/__init__.py old mode 100644 new mode 100755 similarity index 100% rename from reinforcement/src/__init__.py rename to reinforcement/fine_tuned/src/__init__.py diff --git a/reinforcement/src/models/__init__.py b/reinforcement/fine_tuned/src/models/__init__.py old mode 100644 new mode 100755 similarity index 100% rename from reinforcement/src/models/__init__.py rename to reinforcement/fine_tuned/src/models/__init__.py diff --git a/reinforcement/README.md b/reinforcement/from_scratch/README.md old mode 
100644 new mode 100755 similarity index 100% rename from reinforcement/README.md rename to reinforcement/from_scratch/README.md diff --git a/reinforcement/basic_example.ipynb b/reinforcement/from_scratch/basic_example.ipynb old mode 100644 new mode 100755 similarity index 100% rename from reinforcement/basic_example.ipynb rename to reinforcement/from_scratch/basic_example.ipynb diff --git a/docker/airflow/Dockerfile b/reinforcement/from_scratch/reinforcement_trainer.py old mode 100644 new mode 100755 similarity index 100% rename from docker/airflow/Dockerfile rename to reinforcement/from_scratch/reinforcement_trainer.py diff --git a/reinforcement/src/visualization/__init__.py b/reinforcement/from_scratch/src/__init__.py old mode 100644 new mode 100755 similarity index 100% rename from reinforcement/src/visualization/__init__.py rename to reinforcement/from_scratch/src/__init__.py diff --git a/supervised/generative-ai/src/__init__.py b/reinforcement/from_scratch/src/models/__init__.py old mode 100644 new mode 100755 similarity index 100% rename from supervised/generative-ai/src/__init__.py rename to reinforcement/from_scratch/src/models/__init__.py diff --git a/reinforcement/from_scratch/src/models/train_model.py b/reinforcement/from_scratch/src/models/train_model.py new file mode 100755 index 0000000..a463eaf --- /dev/null +++ b/reinforcement/from_scratch/src/models/train_model.py @@ -0,0 +1,8 @@ +import tensorflow as tf + + +def print_version(): + """ + Print versions from this method + """ + print(tf.__version__) \ No newline at end of file diff --git a/supervised/generative-ai/src/models/__init__.py b/reinforcement/from_scratch/src/visualization/__init__.py old mode 100644 new mode 100755 similarity index 100% rename from supervised/generative-ai/src/models/__init__.py rename to reinforcement/from_scratch/src/visualization/__init__.py diff --git a/reinforcement/src/visualization/visualize.py b/reinforcement/from_scratch/src/visualization/visualize.py old mode 100644 new mode 100755 similarity index 100% rename from reinforcement/src/visualization/visualize.py rename to reinforcement/from_scratch/src/visualization/visualize.py diff --git a/requirements.txt b/requirements.txt old mode 100644 new mode 100755 index a114986..8116bb3 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ jaxlib==0.3.0 scikit-learn==1.0.2 pandas==1.2.1 pytest==6.2.2 -numpy==1.22 +numpy~=1.26.1 kafka-python==2.0.2 python-dotenv==0.20.0 Pillow @@ -13,4 +13,6 @@ tensorflow-hub tensorflow-datasets tweepy tweety -matplotlib \ No newline at end of file +matplotlib +setuptools~=68.2.2 +torch~=2.2.0 \ No newline at end of file diff --git a/supervised/README.md b/supervised/README.md old mode 100644 new mode 100755 diff --git a/supervised/__init__.py b/supervised/__init__.py old mode 100644 new mode 100755 diff --git a/supervised/base-trainer/docs/Makefile b/supervised/base-trainer/docs/Makefile old mode 100644 new mode 100755 diff --git a/supervised/base-trainer/docs/conf.py b/supervised/base-trainer/docs/conf.py index 6785100..1470961 100644 --- a/supervised/base-trainer/docs/conf.py +++ b/supervised/base-trainer/docs/conf.py @@ -3,12 +3,14 @@ # base-trainer documentation build configuration file, created by # sphinx-quickstart. # -# This file is execfile()d with the current directory set to its containing dir. +# This file is execfile()d with the current directory set to its +# containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. 
# -# All configuration values have a default; values that are commented out +# All configuration values have a default; values that are +# commented out # serve to show the default. import os @@ -66,8 +68,8 @@ # directories to ignore when looking for source files. exclude_patterns = ['_build'] -# The reST default role (used for this markup: `text`) to use for all documents. -# default_role = None +# The reST default role (used for this markup: `text`) +# to use for all documents. # default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. # add_function_parentheses = True @@ -87,7 +89,7 @@ # modindex_common_prefix = [] -# -- Options for HTML output --------------------------------------------------- +# -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. diff --git a/supervised/base-trainer/src/__init__.py b/supervised/base-trainer/src/__init__.py old mode 100644 new mode 100755 diff --git a/supervised/base-trainer/src/data/.gitkeep b/supervised/base-trainer/src/data/.gitkeep old mode 100644 new mode 100755 diff --git a/supervised/base-trainer/src/data/__init__.py b/supervised/base-trainer/src/data/__init__.py old mode 100644 new mode 100755 diff --git a/supervised/base-trainer/src/data/make_dataset.py b/supervised/base-trainer/src/data/make_dataset.py old mode 100644 new mode 100755 index 1a11480..ecb1d6b --- a/supervised/base-trainer/src/data/make_dataset.py +++ b/supervised/base-trainer/src/data/make_dataset.py @@ -25,7 +25,7 @@ def main(input_filepath, output_filepath): # not used in this stub but often useful for finding various files project_dir = Path(__file__).resolve().parents[2] - # find .env automagically by walking up directories until it's found, then + # find .env automatically by walking up directories until it's found, then # load up the .env entries as environment variables load_dotenv(find_dotenv()) INPUT_FILE, OUTPUT_FILE = 2, 3 diff --git a/supervised/base-trainer/src/features/.gitkeep b/supervised/base-trainer/src/features/.gitkeep old mode 100644 new mode 100755 diff --git a/supervised/base-trainer/src/features/__init__.py b/supervised/base-trainer/src/features/__init__.py old mode 100644 new mode 100755 diff --git a/supervised/base-trainer/src/features/build_features.py b/supervised/base-trainer/src/features/build_features.py old mode 100644 new mode 100755 diff --git a/supervised/base-trainer/src/models/__init__.py b/supervised/base-trainer/src/models/__init__.py old mode 100644 new mode 100755 diff --git a/supervised/base-trainer/src/models/predict_model.py b/supervised/base-trainer/src/models/predict_model.py old mode 100644 new mode 100755 diff --git a/supervised/base-trainer/src/models/train_model.py b/supervised/base-trainer/src/models/train_model.py old mode 100644 new mode 100755 diff --git a/supervised/base-trainer/src/visualization/.gitkeep b/supervised/base-trainer/src/visualization/.gitkeep old mode 100644 new mode 100755 diff --git a/supervised/base-trainer/src/visualization/__init__.py b/supervised/base-trainer/src/visualization/__init__.py old mode 100644 new mode 100755 diff --git a/supervised/base-trainer/src/visualization/visualize.py b/supervised/base-trainer/src/visualization/visualize.py old mode 100644 new mode 100755 diff --git a/supervised/transformers/src/transformer.py b/supervised/classical/src/transformer.py old mode 100644 new mode 100755 similarity index 100% rename from 
supervised/transformers/src/transformer.py rename to supervised/classical/src/transformer.py diff --git a/supervised/generative-ai/src/models/train_model.py b/supervised/generative-ai/src/models/train_model.py deleted file mode 100644 index e69de29..0000000 diff --git a/supervised/image_processing/README.md b/supervised/image_processing/README.md old mode 100644 new mode 100755 diff --git a/supervised/generative-ai/README.md b/supervised/nlp/README.md old mode 100644 new mode 100755 similarity index 100% rename from supervised/generative-ai/README.md rename to supervised/nlp/README.md diff --git a/supervised/generative-ai/notebooks/REAME.md b/supervised/nlp/notebooks/REAME.md old mode 100644 new mode 100755 similarity index 100% rename from supervised/generative-ai/notebooks/REAME.md rename to supervised/nlp/notebooks/REAME.md diff --git a/supervised/generative-ai/src/visualization/__init__.py b/supervised/nlp/src/__init__.py old mode 100644 new mode 100755 similarity index 100% rename from supervised/generative-ai/src/visualization/__init__.py rename to supervised/nlp/src/__init__.py diff --git a/transformers/__init__.py b/supervised/nlp/src/models/__init__.py old mode 100644 new mode 100755 similarity index 100% rename from transformers/__init__.py rename to supervised/nlp/src/models/__init__.py diff --git a/supervised/generative-ai/src/models/generate_data.py b/supervised/nlp/src/models/generate_data.py old mode 100644 new mode 100755 similarity index 100% rename from supervised/generative-ai/src/models/generate_data.py rename to supervised/nlp/src/models/generate_data.py diff --git a/reinforcement/src/models/train_model.py b/supervised/nlp/src/models/train_model.py old mode 100644 new mode 100755 similarity index 100% rename from reinforcement/src/models/train_model.py rename to supervised/nlp/src/models/train_model.py diff --git a/transformers/from_scratch/__init__.py b/supervised/nlp/src/visualization/__init__.py old mode 100644 new mode 100755 similarity index 100% rename from transformers/from_scratch/__init__.py rename to supervised/nlp/src/visualization/__init__.py diff --git a/supervised/generative-ai/src/visualization/visualize.py b/supervised/nlp/src/visualization/visualize.py old mode 100644 new mode 100755 similarity index 100% rename from supervised/generative-ai/src/visualization/visualize.py rename to supervised/nlp/src/visualization/visualize.py diff --git a/supervised/notebooks/README_personal.md b/supervised/notebooks/README_personal.md old mode 100644 new mode 100755 diff --git a/supervised/notebooks/obj_detect.ipynb b/supervised/notebooks/obj_detect.ipynb old mode 100644 new mode 100755 diff --git a/supervised/notebooks/py_tf_kafka.ipynb b/supervised/notebooks/py_tf_kafka.ipynb old mode 100644 new mode 100755 diff --git a/supervised/notebooks/tf_kafka_example.ipynb b/supervised/notebooks/tf_kafka_example.ipynb old mode 100644 new mode 100755 diff --git a/supervised/recommenders/notebooks/REAME.md b/supervised/recommenders/notebooks/REAME.md old mode 100644 new mode 100755 diff --git a/supervised/recommenders/src/__init__.py b/supervised/recommenders/src/__init__.py old mode 100644 new mode 100755 diff --git a/supervised/recommenders/src/models/__init__.py b/supervised/recommenders/src/models/__init__.py old mode 100644 new mode 100755 diff --git a/supervised/recommenders/src/models/train_model.py b/supervised/recommenders/src/models/train_model.py old mode 100644 new mode 100755 diff --git a/supervised/recommenders/src/visualization/__init__.py 
b/supervised/recommenders/src/visualization/__init__.py old mode 100644 new mode 100755 diff --git a/supervised/recommenders/src/visualization/visualize.py b/supervised/recommenders/src/visualization/visualize.py old mode 100644 new mode 100755 diff --git a/supervised/trainer/.ipynb_checkpoints/predict_model-checkpoint.py b/supervised/trainer/.ipynb_checkpoints/predict_model-checkpoint.py old mode 100644 new mode 100755 diff --git a/supervised/trainer/predict_model.py b/supervised/trainer/predict_model.py old mode 100644 new mode 100755 diff --git a/supervised/trainer/train_model.py b/supervised/trainer/train_model.py old mode 100644 new mode 100755 diff --git a/target/pylist.json b/target/pylist.json deleted file mode 100644 index 880dd93..0000000 --- a/target/pylist.json +++ /dev/null @@ -1 +0,0 @@ -[{"package": "tensorflow", "version": "2.8.0", "deps": [{"package": "urllib3", "version": "1.26.8"}, {"package": "typing-extensions", "version": "4.1.1"}, {"package": "tf-estimator-nightly", "version": "2.8.0.dev2021122109"}, {"package": "google-auth-oauthlib", "version": "0.4.6"}, {"package": "certifi", "version": "2021.10.8"}, {"package": "tensorflow-io-gcs-filesystem", "version": "0.24.0"}, {"package": "protobuf", "version": "3.19.4"}, {"package": "astunparse", "version": "1.6.3"}, {"package": "requests", "version": "2.27.1"}, {"package": "cachetools", "version": "5.0.0"}, {"package": "markdown", "version": "3.3.6"}, {"package": "tensorboard-data-server", "version": "0.6.1"}, {"package": "charset-normalizer", "version": "2.0.12"}, {"package": "flatbuffers", "version": "2.0"}, {"package": "google-pasta", "version": "0.2.0"}, {"package": "keras", "version": "2.8.0"}, {"package": "six", "version": "1.16.0"}, {"package": "wheel", "version": "0.37.1"}, {"package": "pyasn1", "version": "0.4.8"}, {"package": "libclang", "version": "13.0.0"}, {"package": "requests-oauthlib", "version": "1.3.1"}, {"package": "absl-py", "version": "1.0.0"}, {"package": "numpy", "version": "1.22.0"}, {"package": "werkzeug", "version": "2.0.3"}, {"package": "rsa", "version": "4.8"}, {"package": "importlib-metadata", "version": "4.11.2"}, {"package": "wrapt", "version": "1.13.3"}, {"package": "google-auth", "version": "2.6.2"}, {"package": "tensorboard-plugin-wit", "version": "1.8.1"}, {"package": "setuptools", "version": "60.9.3"}, {"package": "h5py", "version": "3.6.0"}, {"package": "pyasn1-modules", "version": "0.2.8"}, {"package": "zipp", "version": "3.7.0"}, {"package": "opt-einsum", "version": "3.3.0"}, {"package": "oauthlib", "version": "3.2.0"}, {"package": "keras-preprocessing", "version": "1.1.2"}, {"package": "grpcio", "version": "1.44.0"}, {"package": "idna", "version": "3.3"}, {"package": "gast", "version": "0.5.3"}, {"package": "termcolor", "version": "1.1.0"}, {"package": "tensorboard", "version": "2.8.0"}]}, {"package": "tensorflow-probability", "version": "0.16.0", "deps": [{"package": "absl-py", "version": "1.0.0"}, {"package": "dm-tree", "version": "0.1.6"}, {"package": "numpy", "version": "1.22.0"}, {"package": "six", "version": "1.16.0"}, {"package": "gast", "version": "0.5.3"}, {"package": "cloudpickle", "version": "2.0.0"}, {"package": "decorator", "version": "5.1.1"}]}, {"package": "jax", "version": "0.3.0", "deps": [{"package": "typing-extensions", "version": "4.1.1"}, {"package": "opt-einsum", "version": "3.3.0"}, {"package": "absl-py", "version": "1.0.0"}, {"package": "scipy", "version": "1.8.0"}, {"package": "numpy", "version": "1.22.0"}, {"package": "six", "version": "1.16.0"}]}, 
{"package": "jaxlib", "version": "0.3.2", "deps": [{"package": "absl-py", "version": "1.0.0"}, {"package": "flatbuffers", "version": "2.0"}, {"package": "scipy", "version": "1.8.0"}, {"package": "numpy", "version": "1.22.0"}, {"package": "six", "version": "1.16.0"}]}, {"package": "scikit-learn", "version": "0.24.1", "deps": [{"package": "joblib", "version": "1.1.0"}, {"package": "threadpoolctl", "version": "3.1.0"}, {"package": "scipy", "version": "1.8.0"}, {"package": "numpy", "version": "1.22.0"}]}, {"package": "pandas", "version": "1.2.1", "deps": [{"package": "six", "version": "1.16.0"}, {"package": "pytz", "version": "2021.3"}, {"package": "python-dateutil", "version": "2.8.2"}, {"package": "numpy", "version": "1.22.0"}]}, {"package": "pytest", "version": "6.2.2", "deps": [{"package": "toml", "version": "0.10.2"}, {"package": "iniconfig", "version": "1.1.1"}, {"package": "attrs", "version": "21.4.0"}, {"package": "pyparsing", "version": "3.0.7"}, {"package": "py", "version": "1.11.0"}, {"package": "packaging", "version": "21.3"}, {"package": "pluggy", "version": "0.13.1"}]}, {"package": "numpy", "version": "1.22.0", "deps": []}] \ No newline at end of file diff --git a/transformers/README.md b/transformers/README.md old mode 100644 new mode 100755 index 7311fdc..e9ac5f7 --- a/transformers/README.md +++ b/transformers/README.md @@ -1,29 +1,43 @@ -### Introduction +# Introduction ----------------------------------------------- -We discuss and implement transformer architectures and apply them to AI and Machine Learning (ML) problems. -To begin, we briefly mention why transformer models have become so popular in various applications -due to their performance in numerous tasks. The rise of transformers was meant to address -issues associated with Machine Translation, Document and Text Mining, Computer Vision -and other Natural Language Processing tasks. More so than that, it has also been adapted for different problem types. +The contents of this folder consist of three types of transformer based architectures: encoder, decoder and encoder-decoder models. The folder is structured between implementing the aforementioned transformer models in two stages. In the first instance, I basically copied most of the classical encoder-decoder example implemented from scratch. And then implemented both the encoder and decoder transformer based models/architectures to present their implementation in a simplified manner. +In doing so, we highlight why transformer models have become so popular in various applications by identifying their unique performance in numerous tasks. Transformer based models differ from other classical AI models due to the development of Attention Mechanisms (local and global context) and novelty in training (for ease of parallelization). I will also discuss why there points introduced bottlenecks when sequence to sequence models were used in Machine Translation, and other deep learning architectures in Document, Text Mining, Computer Vision and other tasks. +More so than that, it has also been adapted for different problem types. Next, we identify the main contribution of transformer models in solving such defects in these problems. To proceed, we implement an example of a transformer from scratch and identify the steps associated with applying them in deep learning. 
In order to follow this tutorial and codes in this folder, the following steps will be discussed -### Main Items +## Main Items +This section is divided into two main folders, each consisting of three types of transformer architectures. A brief description follows: + +- The fine-tuned folder consists of: +1) encoder +2) decoder +3) encoder-decoder + +- The ```from_scratch``` folder consists of: +1) encoder +2) decoder +3) encoder-decoder + +In each case, we validate the models with data of various types and compositions. + We highlight the main items covered in this folder. First, I will present a summary on how to build a transformer from scratch. Using some of the tutorials and tools that are available, I also use some code examples without a deep dive into the implementations. My goal is to give you more control over your desire to study the transformer architecture. I also feel that by studying using these explanations (mine included :)), you can pick up more ideas from these numerous sources. To begin, we will cover the following topics: + - Discuss Attention Mechanism in Transformers - Building the transformer architecture from Scratch - Discussing the main points to consider when training a transformer model - Pre-training Transformer Models including Tools and Frameworks ### Implementation Steps + - Initial text pre-processing - Embedding preprocessed text - Pass text to encoder @@ -42,7 +56,7 @@ To begin, we will cover the following topics: ### Models -The models implemented in these folders can be categorized into base and fine-tuned models. The former case involves +The models implemented in these folders can be categorized into ```from_scratch``` and fine-tuned models. The former case involves implementing these models from scratch. Implementing models from scratch definitely gives you more control and the opportunity to understand the whole model architecture. Besides giving you a much better control of working with these models, it also makes it easy for you to clearly understand models during fine-tuning. Model fine-tuning gives you even more @@ -50,7 +64,6 @@ opportunity to use ready-made models (somewhat) which are trained with large dat It is also said that fine-tuning models will reduce your carbon footprint (we should really care about this), and given you state-of-the-art models to work with. Please check for more benefits of working with pre-trained models.
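For the attention mechanism topic listed above, the following is a minimal sketch of scaled dot-product attention in PyTorch. It shows only the standard formulation; it is not the code added in transformers/from_scratch/encoder-decoder/src/multi_head_attention.py, and the function name is a placeholder.

```python
# Generic scaled dot-product attention: softmax(QK^T / sqrt(d_k)) V.
import math

import torch


def scaled_dot_product_attention(q, k, v, mask=None):
    """q, k, v: (batch, heads, seq_len, head_dim); mask broadcastable to the score shape."""
    scores = q @ k.transpose(-2, -1) / math.sqrt(q.size(-1))  # (batch, heads, T_q, T_k)
    if mask is not None:
        # Hide padded or future positions before the softmax.
        scores = scores.masked_fill(mask == 0, float("-inf"))
    weights = torch.softmax(scores, dim=-1)
    return weights @ v  # weighted sum of the value vectors


# Toy usage: batch of 2, 4 heads, sequence length 5, head dimension 8.
q = k = v = torch.randn(2, 4, 5, 8)
print(scaled_dot_product_attention(q, k, v).shape)  # torch.Size([2, 4, 5, 8])
```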
- ### In the Beginning We assume familiarity with deep learning neural networks (which are known to be universal approximators) @@ -69,6 +82,7 @@ Language models represent supervised learning models used to train and develop t ### BERT Model + - Bidirectional Encoder Representations from Transformers (BERT) - The arrival of the _Attention is All you Need_ Paper - Based on introduction of optimal training to model diff --git a/transformers/from_scratch/src/__init__.py b/transformers/finetuned/decoder-only/src/__init__.py old mode 100644 new mode 100755 similarity index 100% rename from transformers/from_scratch/src/__init__.py rename to transformers/finetuned/decoder-only/src/__init__.py diff --git a/transformers/src/__init__.py b/transformers/finetuned/encoder-decoder/__init__.py old mode 100644 new mode 100755 similarity index 100% rename from transformers/src/__init__.py rename to transformers/finetuned/encoder-decoder/__init__.py diff --git a/transformers/src/models-tf/__init__.py b/transformers/finetuned/encoder/__init__.py old mode 100644 new mode 100755 similarity index 100% rename from transformers/src/models-tf/__init__.py rename to transformers/finetuned/encoder/__init__.py diff --git a/transformers/src/models-torch/__init__.py b/transformers/from_scratch/decoder/src/__init__.py similarity index 100% rename from transformers/src/models-torch/__init__.py rename to transformers/from_scratch/decoder/src/__init__.py diff --git a/docker/airflow/requirements.txt b/transformers/from_scratch/decoder/test_transformer.py similarity index 100% rename from docker/airflow/requirements.txt rename to transformers/from_scratch/decoder/test_transformer.py diff --git a/docker/mlflow/Dockerfile b/transformers/from_scratch/encoder-decoder/src/__init__.py old mode 100644 new mode 100755 similarity index 100% rename from docker/mlflow/Dockerfile rename to transformers/from_scratch/encoder-decoder/src/__init__.py diff --git a/transformers/from_scratch/encoder-decoder/src/decoder.py b/transformers/from_scratch/encoder-decoder/src/decoder.py new file mode 100755 index 0000000..88fc1a0 --- /dev/null +++ b/transformers/from_scratch/encoder-decoder/src/decoder.py @@ -0,0 +1,162 @@ +"""Implementation of the decoder step.""" + +import math +from typing import Optional + +import torch +from torch import nn +from torch.nn.init import xavier_uniform_ +from .multi_head_attention import MultiHeadAttention +from .positional_encoding import SinusoidalEncoding + + +class TransformerDecoder(nn.Module): + """Defines the transformer decoder model.""" + + def __init__( + self, + embedding: torch.nn.Embedding, + hidden_dim: int, + ff_dim: int, + num_heads: int, + num_layers: int, + vocab_size: int, + dropout_p: float, + tie_output_to_embedding: Optional[bool] = True, + ): + super().__init__() + + self.hidden_dim = hidden_dim + self.embed = embedding + self.positional_encoding = SinusoidalEncoding(hidden_dim) + self.dropout = nn.Dropout(p=0.1) + self.decoder_blocks = nn.ModuleList( + [ + TransformerDecoderBlock(hidden_dim, ff_dim, num_heads, dropout_p) + for _ in range(num_layers) + ] + ) + self.output_layer = nn.Linear(hidden_dim, vocab_size, bias=False) + + # Note: a linear layer multiplies the input with a transpose of the + # weight matrix, so no need to do that here. 
+ if tie_output_to_embedding: + self.output_layer.weight = nn.Parameter(self.embed.weight) + + def _reset_parameters(self): + """Perform xavier weight initialization""" + for p in self.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + + def forward( + self, + input_tokens: torch.IntTensor, + encoder_hidden_states: torch.Tensor, + src_padding_mask: Optional[torch.BoolTensor] = None, + future_mask: Optional[torch.BoolTensor] = None, + ): + """ + Performs one decoder forward pass given encoder hidden states, the + decoder input tokens and attention masks. + N = batch size + S = source sequence length + T = target sequence length + E = embedding dimensionality + V = vocabulary size + + :param input_tokens: Decoder input tokens. Shape: (N, T) + :param encoder_hidden_states: The encoder's final (contextualized) + token embeddings. Shape: (N, S, E) + :param src_padding_mask: An attention mask to ignore pad-tokens in the + source input. Shape (N, S) + :param future_mask: An attention mask to ignore future-tokens in the + target input. Shape (T, T) + :return: Un-normalized logits over the vocabulary for every token in + the batch. Shape (N, T, V) + """ + # (batch_size, sequence_length, hidden_dim) + x = self.embed(input_tokens) * math.sqrt(self.hidden_dim) + x = self.positional_encoding(x) + x = self.dropout(x) + + for decoder_block in self.decoder_blocks: + x = decoder_block(x, encoder_hidden_states, src_padding_mask, future_mask) + + # (batch_size, sequence_length, vocab_size) + logits = self.output_layer(x) + return logits + + +class TransformerDecoderBlock(nn.Module): + """Generate transformer block + + Args: + nn (model): defines neural network model. + """ + + def __init__(self, hidden_dim: int, ff_dim: int, num_heads: int, dropout_p: float): + super().__init__() + + self.cross_mha = MultiHeadAttention(hidden_dim, num_heads) + self.self_mha = MultiHeadAttention(hidden_dim, num_heads) + self.feed_forward = nn.Sequential( + nn.Linear(hidden_dim, ff_dim), + nn.ReLU(), + nn.Linear(ff_dim, hidden_dim), + ) + + self.dropout1 = nn.Dropout(p=dropout_p) + self.dropout2 = nn.Dropout(p=dropout_p) + self.dropout3 = nn.Dropout(p=dropout_p) + + self.layer_norm1 = nn.LayerNorm(hidden_dim) + self.layer_norm2 = nn.LayerNorm(hidden_dim) + self.layer_norm3 = nn.LayerNorm(hidden_dim) + + def forward( + self, + x: torch.Tensor, + encoder_hidden_states: torch.FloatTensor, + src_padding_mask: Optional[torch.BoolTensor] = None, + future_mask: Optional[torch.BoolTensor] = None, + ): + """ + Performs one decoder *block* forward pass given final encoder hidden + states, the previous block's output, and + attention masks. + + N = batch size + S = source sequence length + T = target sequence length + E = embedding dimensionality + V = vocabulary size + + :param x: Previous decoder block's output. Shape: (N, T, E) + :param encoder_hidden_states: The encoder's final (contextualized) + token embeddings. Shape: (N, S, E) + :param src_padding_mask: An attention mask to ignore pad-tokens in the + source input. Shape (N, S) + :param future_mask: An attention mask to ignore future-tokens in the + target input. Shape (T, T) + :return: Updated, contextualized token embeddings. 
Shape (N, T, E) + """ + + # Self attention (with future masking during training) + output = self.dropout1(self.self_mha.forward(x, future_mask=future_mask)) + x = self.layer_norm1(x + output) + + # Cross or encoder-decoder attention + output = self.dropout2( + self.cross_mha.forward( + x, + encoder_hidden_states=encoder_hidden_states, + src_padding_mask=src_padding_mask, + ) + ) + x = self.layer_norm2(x + output) + + # Feed forward layers + output = self.dropout3(self.feed_forward(x)) + x = self.layer_norm3(x + output) + return x diff --git a/transformers/from_scratch/encoder-decoder/src/encoder.py b/transformers/from_scratch/encoder-decoder/src/encoder.py new file mode 100755 index 0000000..0bdb421 --- /dev/null +++ b/transformers/from_scratch/encoder-decoder/src/encoder.py @@ -0,0 +1,115 @@ +"""A torch transformer for learning encoder from scratch""" + +import math +import torch +from torch import nn + +from torch.nn.init import xavier_uniform_ +from .multi_head_attention import MultiHeadAttention +from .positional_encoding import SinusoidalEncoding + + +class TransformerEncoder(nn.Module): + """Defines transformer encoders.""" + + def __init__( + self, + embedding: torch.nn.Embedding, + hidden_dim: int, + ff_dim: int, + num_heads: int, + num_layers: int, + dropout_p: float, + ): + super().__init__() + self.embed = embedding + self.hidden_dim = hidden_dim + self.positional_encoding = SinusoidalEncoding(hidden_dim, max_len=5000) + self.dropout = nn.Dropout(p=dropout_p) + self.encoder_blocks = nn.ModuleList( + [ + EncoderBlock(hidden_dim, ff_dim, num_heads, dropout_p) + for _ in range(num_layers) + ] + ) + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + + def forward( + self, input_ids: torch.Tensor, src_padding_mask: torch.BoolTensor = None + ): + """ + Performs one encoder forward pass given input + token ids and an optional attention mask. + + N = batch size + S = source sequence length + E = embedding dimensionality + + :param input_ids: Tensor containing input token + ids. Shape: (N, S) + :param src_padding_mask: An attention mask to + ignore pad-tokens in the source input. Shape (N, S) + :return: The encoder's final (contextualized) + token embeddings. Shape: (N, S, E) + """ + x = self.embed(input_ids) * math.sqrt(self.hidden_dim) # (N, S, E) + x = self.positional_encoding(x) + x = self.dropout(x) + for encoder_block in self.encoder_blocks: + x = encoder_block.forward(x, src_padding_mask=src_padding_mask) + return x + + +class EncoderBlock(nn.Module): + """Encoder block.""" + + def __init__(self, hidden_dim: int, ff_dim: int, num_heads: int, dropout_p: float): + """Encode block module. + + Args: + hidden_dim (int): _description_ + ff_dim (int): _description_ + num_heads (int): _description_ + dropout_p (float): _description_ + """ + super().__init__() + self.self_mha = MultiHeadAttention(hidden_dim, num_heads) + self.feed_forward = nn.Sequential( + nn.Linear(hidden_dim, ff_dim), + nn.ReLU(), + nn.Linear(ff_dim, hidden_dim), + ) + + self.dropout1 = nn.Dropout(p=dropout_p) + self.dropout2 = nn.Dropout(p=dropout_p) + self.layer_norm1 = nn.LayerNorm(hidden_dim) + self.layer_norm2 = nn.LayerNorm(hidden_dim) + + def forward(self, x: torch.FloatTensor, src_padding_mask: torch.BoolTensor = None): + """ + Performs one encoder *block* forward pass given the previous block's + output and an optional attention mask. 
+ + N = batch size + S = source sequence length + E = embedding dimensionality + + :param x: Tensor containing the output of the previous encoder block. + Shape: (N, S, E) + :param src_padding_mask: An attention mask to ignore pad-tokens + in the source input. Shape (N, S) + :return: Updated intermediate encoder (contextualized) token + embeddings. Shape: (N, S, E) + """ + output = self.dropout1( + self.self_mha.forward(x, src_padding_mask=src_padding_mask) + ) + x = self.layer_norm1(x + output) + + output = self.dropout2(self.feed_forward(x)) + x = self.layer_norm2(x + output) + return x diff --git a/transformers/src/lr_scheduler.py b/transformers/from_scratch/encoder-decoder/src/lr_scheduler.py old mode 100644 new mode 100755 similarity index 84% rename from transformers/src/lr_scheduler.py rename to transformers/from_scratch/encoder-decoder/src/lr_scheduler.py index 625b0ac..73879fa --- a/transformers/src/lr_scheduler.py +++ b/transformers/from_scratch/encoder-decoder/src/lr_scheduler.py @@ -42,13 +42,12 @@ def rate(self, step=None): def get_std_opt(model): + """Return a NoamOpt learning-rate scheduler wrapping a torch.optim.Adam + optimizer configured with the standard Transformer hyperparameters. + """ return NoamOpt( model.encoder.hidden_dim, 2, 4000, - torch.optim.Adam( - model.parameters(), - lr=0, - betas=(0.9, 0.98), - eps=1e-9), + torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9), ) diff --git a/docker/mlflow/requirements.txt b/transformers/from_scratch/encoder-decoder/src/models-tf/__init__.py old mode 100644 new mode 100755 similarity index 100% rename from docker/mlflow/requirements.txt rename to transformers/from_scratch/encoder-decoder/src/models-tf/__init__.py diff --git a/transformers/src/models-tf/encoder.py b/transformers/from_scratch/encoder-decoder/src/models-tf/encoder.py old mode 100644 new mode 100755 similarity index 96% rename from transformers/src/models-tf/encoder.py rename to transformers/from_scratch/encoder-decoder/src/models-tf/encoder.py index 1ab240a..72d98a1 --- a/transformers/src/models-tf/encoder.py +++ b/transformers/from_scratch/encoder-decoder/src/models-tf/encoder.py @@ -6,13 +6,12 @@ from torch import nn # import xavier for initial generation of weights, etc from torch.nn.init import xavier_uniform_ -from supervised.transformers.src.multi_head_attention import MultiHeadAttention - -from supervised.transformers.src.positional_encoding import SinusoidEncoding -from supervised.transformers.src.vocabulary import Vocabulary - +from ..multi_head_attention import MultiHeadAttention +from ..positional_encoding import SinusoidEncoding +from ..vocabulary import Vocabulary class TransformerEncoder(nn.Module): + """Transformer encoder.""" def __init__( self, embedding: torch.nn.Embedding, @@ -68,6 +67,7 @@ def forward( class EncoderBlock(nn.Module): + """Define the encoder block.""" def __init__( self, hidden_dim: int, diff --git a/transformers/src/models-tf/predict_model.py b/transformers/from_scratch/encoder-decoder/src/models-tf/predict_model.py old mode 100644 new mode 100755 similarity index 100% rename from transformers/src/models-tf/predict_model.py rename to transformers/from_scratch/encoder-decoder/src/models-tf/predict_model.py diff --git a/transformers/src/models-tf/train_model.py b/transformers/from_scratch/encoder-decoder/src/models-tf/train_model.py old mode 100644 new mode 100755 similarity index 96% rename from transformers/src/models-tf/train_model.py rename to transformers/from_scratch/encoder-decoder/src/models-tf/train_model.py index
28ae985..70dd144 --- a/transformers/src/models-tf/train_model.py +++ b/transformers/from_scratch/encoder-decoder/src/models-tf/train_model.py @@ -7,10 +7,10 @@ import torch from torch import nn -from lr_scheduler import NoamOpt -from transformer import Transformer -from vocabulary import Vocabulary -from utils import construct_batches +from ..lr_scheduler import NoamOpt +from ..transformer import Transformer +from ..vocabulary import Vocabulary +from ..utils import construct_batches def train( @@ -94,6 +94,11 @@ def train( class TestTransformerTraining(unittest.TestCase): + """A transformer training tester + + Args: + unittest (Test): Testing module + """ seed = 0 torch.manual_seed(seed) random.seed(seed) diff --git a/transformers/src/models-torch/predict_model.py b/transformers/from_scratch/encoder-decoder/src/models-torch/predict_model.py old mode 100644 new mode 100755 similarity index 100% rename from transformers/src/models-torch/predict_model.py rename to transformers/from_scratch/encoder-decoder/src/models-torch/predict_model.py diff --git a/transformers/from_scratch/encoder-decoder/src/multi_head_attention.py b/transformers/from_scratch/encoder-decoder/src/multi_head_attention.py new file mode 100755 index 0000000..b84aab3 --- /dev/null +++ b/transformers/from_scratch/encoder-decoder/src/multi_head_attention.py @@ -0,0 +1,216 @@ +"""Implementation of the Multi-headed attention.""" + +from typing import Optional +import math +import torch +from torch import nn +from torch.nn import functional as F + + +class MultiHeadAttention(nn.Module): + + def __init__(self, hidden_dim: int, num_heads: int): + super().__init__() + assert hidden_dim % num_heads == 0 + self.qkv_dim = hidden_dim // num_heads + self.hidden_dim = hidden_dim + self.num_heads = num_heads + + self.qkv_proj = nn.Linear(hidden_dim, 3 * num_heads * self.qkv_dim, bias=False) + self.o_proj = nn.Linear(num_heads * self.qkv_dim, hidden_dim, bias=False) + self._reset_parameter() + + def _reset_parameter(self): + """ Weight initialization + taken from the UvA DL1 PyTorch Transformer tutorial. + """ + nn.init.xavier_uniform_(self.qkv_proj.weight) + nn.init.xavier_uniform_(self.o_proj.weight) + + def forward( + self, + x: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + src_padding_mask: Optional[torch.BoolTensor] = None, + future_mask: Optional[torch.BoolTensor] = None + ): + """Perform multi-head attention using one projection matrix. + Self attention is performed when encoder_hidden_states + is None, in which case input x represents encoder token embeddings. + Otherwise, cross-attention is performed. + In that case, input x represents the decoder hidden states. + + N = batch size + S = source sequence length + T = target sequence length + E = embedding dimensionality + + Args: + :param x: Either encoder or decoder hidden states. Shape: (N, S or T, E) + :param encoder_hidden_states: Encoder hidden states to perform cross-attention with. Shape: (N, S, E) + :param src_padding_mask: Used for encoder self-attention and cross-attention to handle pad tokens. + Masks all incoming "connections" or "logits" from any token position to any pad token in a sequence. + Shape: (N, S) + :param future_mask: Used for decoder self-attention to avoid any token i attending to a token >i, i.e. "peaking" + Shape: (T, T). + :return: Contextualized token embeddings. Shape depends on attention type. (N, S, E) for encoder self-attention + and decoder cross-attention. (N, T, E) for decoder self-attention. 
+ """ + batch_size, sequence_length, hidden_dim = x.size() + if encoder_hidden_states is None: + q, k, v = self._self_attention_projection(x) + else: + q, k, v = self._cross_attention_projection(encoder_hidden_states, x) + + # Swap dimensions to (batch_size, n_heads, seq_len, qkv_dim). Required for the matrix multiplication below + q = q.permute(0, 2, 1, 3) + k = k.permute(0, 2, 1, 3) + v = v.permute(0, 2, 1, 3) + + # Compute (contextualized) value vector for each "head" + values, attn = self.scaled_dot_product(q, k, v, src_padding_mask, future_mask) + + # Concatenate contextualized value vectors from all heads + values = values.reshape(batch_size, sequence_length, hidden_dim) + + # Linearly transform the concatenation of all heads' value vectors (8*64=512) to the original hidden dim (512) + output = self.o_proj(values) + return output + + def _self_attention_projection(self, x: torch.Tensor): + """Project x and interpret the result as chunks that represent q, k and v vectors for every head. + Input x can be encoder or decoder hidden states, depending on which one calls this MHA module. + + N = batch size + S = source sequence length + T = target sequence length + E = embedding dimensionality + H = number of heads + + :param x (torch.Tensor): Encoder or decoder hidden states. (N, S or T, E) + :return: query, key and value vectors. (N, S or T, H, E/H) + """ + batch_size, sequence_length, _ = x.shape + + qkv = self.qkv_proj(x) + qkv = qkv.reshape(batch_size, sequence_length, self.num_heads, 3 * self.qkv_dim) + q, k, v = qkv.chunk(3, dim=-1) + return q, k, v + + def _cross_attention_projection( + self, encoder_hidden_states: torch.Tensor, + decoder_hidden_states: torch.Tensor + ): + """ + Projects hidden states into query vectors and encoder hidden states into key and value vectors. + The columns of w_proj determine how much independent linear combination of the inputs we obtain - + which we then interpret as heads and qkv vectors. Then we can simply split the weight matrix and + project the decoder hidden states into q separately from projecting the encoder_hidden_states into k and v. + N = batch size + S = source sequence length + T = target sequence length + E = embedding dimensionality + H = number of heads + + + Args: + :param encoder_hidden_states (torch.Tensor): Shape: (N, S, E) + :param decoder_hidden_states (torch.Tensor): Shape: (N, T, E) + :return query vector Shape: (N, T, H, E/H) and key and value vectors both (N, S, H, E/H) + + """ + batch_size, src_sequence_length, hidden_dim = encoder_hidden_states.shape + batch_size, tgt_sequence_length, hidden_dim = decoder_hidden_states.shape + + #split weight matrix + w_q, w_kv = self.qkv_proj.weight.split([hidden_dim, 2 * hidden_dim]) + + # Project encoder_hidden_states into k's, and v's + k, v = ( + F.linear(input=encoder_hidden_states, weight=w_kv) + .reshape(batch_size, src_sequence_length, self.num_heads, 2 * self.qkv_dim) + .chunk(2, dim=-1) + ) + # Project decoder hidden states into q's + + q = ( + F.linear(input=decoder_hidden_states, weight=w_q) + .reshape(batch_size, tgt_sequence_length, self.num_heads, self.qkv_dim) + ) + return q, k, v + + def scaled_dot_product( + self, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + src_padding_mask: Optional[torch.BoolTensor] = None, + future_mask: Optional[torch.BoolTensor] = None + ): + """ + For cross-attention, the sequence length of q and (k,v) may differ as q is + projected from decoder hidden states and kv from encoder hidden states. 
+ + N = batch size + S = source sequence length + T = target sequence length + E = embedding dimensionality + H = number of heads + + :param q: Tensor stacking query vectors for all tokens and all heads. Shape: (N, H, S or T, E/H) + :param k: Tensor stacking key vectors for all tokens and all heads. Shape: (N, H, S or T, E/H) + :param v: Tensor stacking value vectors for all tokens and all heads. Shape: (N, H, S or T, E/H) + :param src_padding_mask: Used for encoder self-attention and cross-attention to handle pad tokens. + Masks all incoming "connections" or "logits" from any token position to any pad token in a sequence. + Shape: (N, S) + :param future_mask: Used for decoder self-attention to avoid any token i attending to a token >i, i.e. "peaking" + Shape: (T, T). + :return: values (N, H, S or T, E/H), attention scores (N, H, S or T, S or T) + """ + # Compute attention logits. Dot product between each query and key vector, through one matrix multiplication. + # Results in un-normalized attention scores for each position's query vector to each position's key vector + # Result is (batch_size, num_heads, seq_length, seq_length) + attn_logits = torch.matmul(q, torch.transpose(k, -2, -1),) + # Scale logits by constant to create less spiky softmax distribution + attn_logits = attn_logits / math.sqrt(q.size()[-1]) + # Apply attention mask (for pad tokens and future-masking in cross-attention) + if src_padding_mask is not None or future_mask is not None: + attn_logits = self.mask_logits(attn_logits, src_padding_mask, future_mask) # type: ignore + + # Transform logits to attention probability distribution (one distribution per non-masked token index) + attention = F.softmax(attn_logits, dim=-1) + # Weighted sum of value vectors for each input token using attention scores -> new contextualized representation + # (batch_size, num_heads, sequence_length, qkv_dim) + values = torch.matmul(attention, v) + return values, attention + + @staticmethod + def mask_logits( + logits: torch.Tensor, + src_padding_mask: Optional[torch.BoolTensor] = None, + future_mask: Optional[torch.BoolTensor] = None + ): + """Reshape masks to fit the shape of the logits and set all indices with "False" to -inf + + N = batch size + S = source sequence length + T = target sequence length + E = embedding dimensionality + H = number of heads + + Args: + :param logits (torch.Tensor): Tensor containing attention logits. Shape: (N, H, S or T, S or T) + :param src_padding_mask (Optional[torch.BoolTensor], optional): Used for encoder self-attention + and cross-attention to handle pad tokens. Masks all incoming "connections" or "logits" from any + token position to any pad token in a sequence. Shape: (N, S)Defaults to None. + :param future_mask (Optional[torch.BoolTensor], optional): Used for decoder self-attention + to avoid any token i attending to a token >i, i.e. "peaking" Shape: (T, T).. Defaults to None. 
+ :return: masked_logits (N, H, S or T, S or T) + """ + if src_padding_mask is not None: + masked_logits = logits.masked_fill( + src_padding_mask[:, None, None, :] == 0, float("-inf") + ) + if future_mask is not None: + masked_logits = logits.masked_fill(future_mask == 0, float("-inf")) + return masked_logits diff --git a/transformers/from_scratch/encoder-decoder/src/positional_encoding.py b/transformers/from_scratch/encoder-decoder/src/positional_encoding.py new file mode 100755 index 0000000..afe9ccd --- /dev/null +++ b/transformers/from_scratch/encoder-decoder/src/positional_encoding.py @@ -0,0 +1,45 @@ +import math +import torch + + +class SinusoidalEncoding(torch.nn.Module): + """ Copied from this link: + https://uvadlc-notebooks.readthedocs.io/en/latest/tutorial_notebooks/tutorial6/Transformers_and_MHAttention.html + """ + + + def __init__(self, hidden_dim, max_len=8): + """ Defines the hidden input dimension and max sentence + length to expect. + + Args: + hidden_dim (_type_): hidden input dimension + max_len (_type_, optional): expected sentence max length. Defaults to 5000. + """ + super().__init__() + # Create matrix of dimension [seqLen, hidden_dim] + pos_embed = torch.zeros(max_len, hidden_dim) + position = torch.arange(0, hidden_dim, dtype=torch.float).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, hidden_dim, 2).float() * (-math.log(10000.0) / hidden_dim) + ) + pos_embed[:, 0::2] = torch.sin(position * div_term) + pos_embed[:, 1::2] = torch.cos(position * div_term) + pos_embed = pos_embed.unsqueeze(0) + # register_buffer => Tensor which is not a parameter, but should be part of the modules state. + # Used for tensors that need to be on the same device as the module. + # persistent=False tells PyTorch to not add the buffer to the state dict (e.g. when we save the model) + self.register_buffer("pos_embed", pos_embed, persistent=False) + + def forward(self, x): + """ + Adds positional embeddings to token embeddings. + N = batch size + L = sequence length + E = embedding dim + + :param x: token embeddings. Shape: (N, L, E) + :return: token_embeddings + positional embeddings. Shape: (N, L, E) + """ + x = x + self.pos_embed[:, : x.size(1)] + return x \ No newline at end of file diff --git a/transformers/from_scratch/encoder-decoder/src/test_decoder.py b/transformers/from_scratch/encoder-decoder/src/test_decoder.py new file mode 100755 index 0000000..e580703 --- /dev/null +++ b/transformers/from_scratch/encoder-decoder/src/test_decoder.py @@ -0,0 +1,153 @@ +"""The test file for decoder.""" + +import random +import unittest +import numpy as np +import torch + +from .utils import construct_future_mask + +from .decoder import TransformerDecoder + + +class TestTransformerDecoder(unittest.TestCase): + """Defines a given test case. + + Args: + unittest(object): represents a given test object. + """ + + def test_one_layer_transformer_decoder_inference(self): + """ + Test two forward passes, simulating greedy decoding + test_one_layer_transformer_decoder_inference + steps. 
+ """ + seed = 0 + torch.manual_seed(seed) + random.seed(seed) + + with torch.no_grad(): + batch_size = 2 + src_seq_len = 10 + hidden_dim = 512 + vocab_size = 2000 + num_layers = 1 + num_heads = 8 + + # Prepare fake encoder hidden states and padding masks + encoder_output = torch.randn((batch_size, src_seq_len, hidden_dim)) + src_padding_mask = torch.BoolTensor( + [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]] + ) + # Initialize the decoder, perform xavier init and set to + # evaluation mode + decoder = TransformerDecoder( + embedding=torch.nn.Embedding(vocab_size, hidden_dim), + hidden_dim=hidden_dim, + ff_dim=2048, + num_heads=num_heads, + num_layers=num_layers, + dropout_p=0.1, + vocab_size=vocab_size, + tie_output_to_embedding=True, + ) + decoder._reset_parameters() + decoder.eval() + # Prepare decider input, mask, perform a decoding step, + # take the argmax over the softmax of the last token + bos_token_id = 1 + decoder_input = torch.IntTensor([[bos_token_id], [bos_token_id]]) + future_mask = None + for i in range(3): + decoder_output = decoder( + decoder_input, + encoder_output, + src_padding_mask=src_padding_mask, + future_mask=future_mask, + ) + predicted_tokens = torch.argmax( + decoder_output[:, -1, :], dim=-1 + ).unsqueeze_(1) + decoder_input = torch.cat((decoder_input, predicted_tokens), dim=-1) + future_mask = construct_future_mask(decoder_input.shape[1]) + self.assertEqual(decoder_output.shape, (batch_size, i + 1, vocab_size)) + # Check: softmax entropy should not be zero + self.assertEqual(torch.any(decoder_output == 1), False) + + """ + With only one decoder layer the predicted tokens will always be the input token ids. This happens + only when the final linear transformation is tied to the (transpose of) the embedding matrix. + This is because the input embedding is barely transformed due to residual connections. This results in + the highest dot product between its final "contextualized" embedding and the original embedding vector + in the pre-softmax weight matrix (i.e. embedding matrix) - because they are still very similar. + This can be avoided by 1) scaling up the memory states - probably because this adds sufficient random + noise through cross-attention to the contextualised embedding to divergence from the input embedding. + 2) increasing the number of layers - again adding more and more "noise" or 3) removing the last + residual connection after the feed forward layers. In practice, however, this is not an issue. Training + will take care of it. 
+ """ + + self.assertEqual(torch.all(decoder_input == bos_token_id), False) + + def test_multi_layer_transformer_decoder_inference(self): + """ + Test two forward passes, simulating two inference decoding steps + """ + seed = 0 + torch.manual_seed(seed) + random.seed(seed) + np.random.seed(seed) + + with torch.no_grad(): + batch_size = 2 + src_seq_len = 10 + hidden_dim = 512 + vocab_size = 2000 + + # Prepare fake encoder hidden states and padding masks + encoder_output = torch.randn((batch_size, src_seq_len, hidden_dim)) + src_padding_mask = torch.BoolTensor( + [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]] + ) + + # Initialize the decoder, perform xavier init and set to evaluation mode + decoder = TransformerDecoder( + embedding=torch.nn.Embedding(vocab_size, hidden_dim), + hidden_dim=hidden_dim, + ff_dim=2048, + num_heads=8, + num_layers=6, + dropout_p=0.1, + vocab_size=vocab_size, + tie_output_to_embedding=False, + ) + decoder._reset_parameters() + decoder.eval() + + # Prepare decoder input, mask, perform a decoding step, take the argmax over the softmax of the last token + bos_token_id = 10 + # and iteratively feed the input+prediction back in. + decoder_input = torch.IntTensor([[bos_token_id], [bos_token_id]]) + future_mask = None + for i in range(3): + decoder_output = decoder( + decoder_input, + encoder_output, + src_padding_mask=src_padding_mask, + future_mask=future_mask, + ) + predicted_tokens = torch.argmax( + decoder_output[:, -1, :], dim=-1 + ).unsqueeze(1) + decoder_input = torch.cat((decoder_input, predicted_tokens), dim=-1) + future_mask = construct_future_mask(decoder_input.shape[1]) + + self.assertEqual(decoder_output.shape, (batch_size, i + 1, vocab_size)) + # softmax entropy should not be 0 + self.assertEqual(torch.any(decoder_output == 1), False) + self.assertEqual(torch.all(decoder_input == bos_token_id), False) + + +if __name__ == "__main__": + unittest.main() diff --git a/transformers/from_scratch/encoder-decoder/src/test_positional_encoding.py b/transformers/from_scratch/encoder-decoder/src/test_positional_encoding.py new file mode 100755 index 0000000..a5c41c1 --- /dev/null +++ b/transformers/from_scratch/encoder-decoder/src/test_positional_encoding.py @@ -0,0 +1,129 @@ +"""A test implementation for the positional_encoding.""" +import torch +import unittest +from positional_encoding import SinusoidalEncoding + +class TestSinusoidalEncoding(unittest.TestCase): + + def test_create_embedding(self): + batch = 1 + dim = 8 + len = 3 + x = torch.zeros(batch, len, dim) + encoding = SinusoidalEncoding(dim).forward(x) + expected = torch.Tensor([ + [ + [ + 0.0000e00, + 1.0000e00, + 0.0000e00, + 1.0000e00, + 0.0000e00, + 1.0000e00, + 0.0000e00, + 1.0000e00, + ], + [ + 8.4147e-01, + 5.4030e-01, + 9.9833e-02, + 9.9500e-01, + 9.9998e-03, + 9.9995e-01, + 1.0000e-03, + 1.0000e00, + ], + [ + 9.0930e-01, + -4.1615e-01, + 1.9867e-01, + 9.8007e-01, + 1.9999e-02, + 9.9980e-01, + 2.0000e-03, + 1.0000e00, + ] + ] + ]) + torch.testing.assert_close(encoding, expected, rtol=10e-5, atol=10e-5) + + def test_create_embedding_multi_batch(self): + batch = 2 + dim = 8 + len = 3 + x = torch.zeros(batch, len, dim) + encoding = SinusoidalEncoding(dim).forward(x) + expected = torch.Tensor( + [ + [ + [ + 0.0000e00, + 1.0000e00, + 0.0000e00, + 1.0000e00, + 0.0000e00, + 1.0000e00, + 0.0000e00, + 1.0000e00, + ], + [ + 8.4147e-01, + 5.4030e-01, + 9.9833e-02, + 9.9500e-01, + 9.9998e-03, + 9.9995e-01, + 1.0000e-03, + 1.0000e00, + ], + [ + 9.0930e-01, + -4.1615e-01, + 1.9867e-01, + 
9.8007e-01, + 1.9999e-02, + 9.9980e-01, + 2.0000e-03, + 1.0000e00, + ], + ], + [ + [ + 0.0000e00, + 1.0000e00, + 0.0000e00, + 1.0000e00, + 0.0000e00, + 1.0000e00, + 0.0000e00, + 1.0000e00, + ], + [ + 8.4147e-01, + 5.4030e-01, + 9.9833e-02, + 9.9500e-01, + 9.9998e-03, + 9.9995e-01, + 1.0000e-03, + 1.0000e00, + ], + [ + 9.0930e-01, + -4.1615e-01, + 1.9867e-01, + 9.8007e-01, + 1.9999e-02, + 9.9980e-01, + 2.0000e-03, + 1.0000e00, + ], + ], + ] + ) + torch.testing.assert_close(encoding, expected, rtol=10e-5, atol=10e-5) + + + +if __name__ == "__main__": + unittest.main() diff --git a/transformers/from_scratch/encoder-decoder/src/test_utils.py b/transformers/from_scratch/encoder-decoder/src/test_utils.py new file mode 100755 index 0000000..32aad98 --- /dev/null +++ b/transformers/from_scratch/encoder-decoder/src/test_utils.py @@ -0,0 +1,62 @@ +"""To test the utils py file.""" + +import torch +import unittest + +from vocabulary import Vocabulary +from utils import construct_batches, construct_future_mask + + +class TestUtils(unittest.TestCase): + def test_construct_future_mask(self): + mask = construct_future_mask(3) + torch.testing.assert_close( + mask, + torch.BoolTensor( + [[True, False, False], [True, True, False], [True, True, True]] + ), + ) + + def test_construct_future_mask_first_decoding_step(self): + mask = construct_future_mask(1) + torch.testing.assert_close(mask, torch.BoolTensor([[True]])) + + def test_construct_batches(self): + corpus = [ + {"en": "This is an english sentence.", "nl": "Dit is een Nederlandse zin."}, + {"en": "The weather is nice today.", "nl": "Het is lekker weer vandaag."}, + { + "en": "Yesterday I drove to a city called Amsterdam in my brand new car.", + "nl": "Ik reed gisteren in mijn gloednieuwe auto naar Amsterdam.", + }, + { + "en": "You can pick up your laptop at noon tomorrow.", + "nl": "Je kunt je laptop morgenmiddag komen ophalen.", + }, + ] + en_sentences, nl_sentences = ( + [d["en"] for d in corpus], + [d["nl"] for d in corpus], + ) + vocab = Vocabulary(en_sentences + nl_sentences) + batches, masks = construct_batches( + corpus, vocab, batch_size=2, src_lang_key="en", tgt_lang_key="nl" + ) + torch.testing.assert_close( + batches["src"], + [ + torch.IntTensor( + [[0, 3, 4, 5, 6, 7, 8, 1], [0, 9, 10, 4, 11, 12, 8, 1]] + ), + torch.IntTensor( + [ + [0, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 8, 1], + [0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 8, 1, 2, 2, 2, 2], + ] + ), + ], + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/transformers/from_scratch/encoder-decoder/src/test_vocabulary.py b/transformers/from_scratch/encoder-decoder/src/test_vocabulary.py new file mode 100755 index 0000000..50aa828 --- /dev/null +++ b/transformers/from_scratch/encoder-decoder/src/test_vocabulary.py @@ -0,0 +1,52 @@ +"""A test implementation for the vocabulary script.""" +import unittest +from vocabulary import Vocabulary + + +class TestVocabulary(unittest.TestCase): + + # Test the tokenize function. + def test_tokenize(self): + input_sentence = "Hey, there I am here!" 
+ tokened_output = Vocabulary([]).tokenize(input_sentence) + print(tokened_output) + self.assertEqual(["BOS", "Hey", ",", "there", "I", "am", "here", "!", "EOS"], tokened_output) + + def test_initalize_vocab(self): + input_sentence = ["May the force be with you."] + vocab = Vocabulary(input_sentence) + expected = {"BOS": 0, "EOS":1, "PAD":2, "May": 3, "the": 4, "force": 5, "be": 6, "with": 7, "you": 8, ".": 9} + self.assertEqual(vocab.token_2_index, expected) + + def test_encode(self): + input_sentence = ["May the force be with you."] + vocab = Vocabulary(input_sentence) + output = vocab.encode(input_sentence[0]) + print(output) + self.assertEqual(output, [0, 3, 4, 5, 6, 7, 8, 9, 1]) + + def test_encode_no_special_tokens(self): + input_sentence = ["May the force be with you."] + vocab = Vocabulary(input_sentence) + output = vocab.encode(input_sentence[0], add_special_tokens=False) + self.assertEqual(output, [3, 4, 5, 6, 7, 8, 9]) + + def test_batch_encode(self): + input_sentences = [ + "Round the rough and rugged road", + "The rugged rascal ruddely ran", + "Two tiny timid toads trying to troad to tarrytown" + ] + vocab = Vocabulary(input_sentences) + output = vocab.batch_encode(input_sentences, add_special_tokens=False) + print(output) + input_vec = [ + [3, 4, 5, 6, 7, 8, 2, 2, 2], + [9, 7, 10, 11, 12, 2, 2, 2, 2], + [13, 14, 15, 16, 17, 18, 19, 18, 20]] + self.assertEqual(output, input_vec) + + +if __name__ == "__main__": + unittest.main() + diff --git a/transformers/from_scratch/encoder-decoder/src/train.py b/transformers/from_scratch/encoder-decoder/src/train.py new file mode 100644 index 0000000..3991a67 --- /dev/null +++ b/transformers/from_scratch/encoder-decoder/src/train.py @@ -0,0 +1,70 @@ +"""The train.py implementation.""" + +import torch +from torch import nn +from typing import List, Dict, Any + + +def train( + transformer: nn.Module, + scheduler: Any, + criterion: Any, + batches: Dict[str, List[torch.Tensor]], + masks: Dict[str, List[torch.Tensor]], + n_epochs: int, +): + """ + Main training loop + + :param transformer: the transformer model + :param scheduler: the learning rate scheduler + :param criterion: the optimization criterion (loss function) + :param batches: aligned src and tgt batches that contain tokens ids + :param masks: source key padding mask and target future mask for each batch + :param n_epochs: the number of epochs to train the model for + :return: the accuracy and loss on the latest batch + """ + transformer.train(True) + num_iters = 0 + + for e in range(n_epochs): + for i, (src_batch, src_mask, tgt_batch, tgt_mask) in enumerate( + zip(batches["src"], masks["src"], batches["tgr"], masks["tgt"]) + ): + encoder_output = transformer.encoder(src_batch, src_padding_mask=src_mask) + # Perform one decoder forward pass to obtain *all* next-token predictions for every + # index i given its + # previous *gold standard* tokens [1,..., i] (i.e. teacher forcing) in parallel/at once. + decoder_output = transformer.decoder( + tgt_batch, + encoder_output, + src_padding_mask=src_mask, + future_mask=tgt_mask, + ) + # Align labels with predictions: the last decoder prediction is meaningless because we have no target token + # for it. 
The BOS token in the target is also not something we want to compute a loss for + decoder_output = decoder_output[:, :-1, :] + tgt_batch = tgt_batch[:, 1:] + # Set pad tokens in the target to -100 so they don't incur a loss + # tgt_batch[tgt_batch == transformer.padding_idx] = -100 + # Compute the average cross-entropy loss over all next-token predictions at each index i given [1, ..., i] + # for the entire batch. Note that the original paper uses label smoothing (I was too lazy). + batch_loss = criterion( + decoder_output.contiguous().permute(0, 2, 1), + tgt_batch.contiguous().long(), + ) + # Rough estimate of per-token accuracy in the current training batch + batch_accuracy = ( + torch.sum(decoder_output.argmax(dim=-1) == tgt_batch) + ) / torch.numel(tgt_batch) + + if num_iters % 100 == 0: + print( + f"epoch: {e}, num_iters: {num_iters}, batch_loss: {batch_loss}, batch_accuracy: {batch_accuracy}" + ) + # update parameters + batch_loss.backward() + scheduler.step() + scheduler.optimizer.zero_grad() + num_iters += 1 + return batch_loss, batch_accuracy diff --git a/transformers/from_scratch/encoder-decoder/src/transformer.py b/transformers/from_scratch/encoder-decoder/src/transformer.py new file mode 100755 index 0000000..1758e90 --- /dev/null +++ b/transformers/from_scratch/encoder-decoder/src/transformer.py @@ -0,0 +1,54 @@ +"""A base transformer model""" + +from typing import Optional +from torch import nn +from torch.nn.init import xavier_uniform_ +from .encoder import TransformerEncoder +from .decoder import TransformerDecoder + + +class Transformer(nn.Module): + """The transformer model.""" + + def __init__( + self, + hidden_dim: int, + ff_dim: int, + num_heads: int, + num_layers: int, + max_decoding_length: int, + vocab_size: int, + padding_idx: int, + bos_idx: int, + dropout_p: float, + tie_output_to_embedding: Optional[bool] = None, + ): + super().__init__() + # Because the encoder embedding, and decoder embedding and + # decoder pre-softmax transformation share embeddings + # weights, initialize one here and pass it on. + self.embed = nn.Embedding(vocab_size, hidden_dim, padding_idx=padding_idx) + self.encoder = TransformerEncoder( + self.embed, hidden_dim, ff_dim, num_heads, num_layers, dropout_p + ) + self.decoder = TransformerDecoder( + self.embed, + hidden_dim, + ff_dim, + num_heads, + num_layers, + vocab_size, + dropout_p, + tie_output_to_embedding, + ) + + self.padding_idx = padding_idx + self.bos_idx = bos_idx + self.max_decoding_length = max_decoding_length + self.hidden_dim = hidden_dim + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + xavier_uniform_(p) diff --git a/transformers/src/utils.py b/transformers/from_scratch/encoder-decoder/src/utils.py old mode 100644 new mode 100755 similarity index 51% rename from transformers/src/utils.py rename to transformers/from_scratch/encoder-decoder/src/utils.py index 96d3385..d40a2e2 --- a/transformers/src/utils.py +++ b/transformers/from_scratch/encoder-decoder/src/utils.py @@ -1,8 +1,5 @@ -import unittest from typing import Dict, List, Tuple, Optional - import torch - from vocabulary import Vocabulary @@ -36,7 +33,7 @@ def construct_batches( sequences, packed in a dictionary. :param vocab: The vocabulary object. :param batch_size: The number of sequences in a batch - + :param src_lang_key: The source language key is a string that the source sequences are keyed under. E.g. 
"en" :param tgt_lang_key: The target language key is @@ -45,22 +42,20 @@ def construct_batches( :return: A tuple containing two dictionaries. The first represents the batches, the second the attention masks. """ - pad_token_id = vocab.token2index[vocab.PAD] + pad_token_id = vocab.token_2_index[vocab.PAD] batches: Dict[str, List] = {"src": [], "tgt": []} masks: Dict[str, List] = {"src": [], "tgt": []} for i in range(0, len(corpus), batch_size): src_batch = torch.IntTensor( vocab.batch_encode( - [pair[src_lang_key] for pair in corpus[ - i: i + batch_size]], + [pair[src_lang_key] for pair in corpus[i : i + batch_size]], add_special_tokens=True, padding=True, ) ) tgt_batch = torch.IntTensor( vocab.batch_encode( - [pair[tgt_lang_key] for pair in corpus[ - i: i + batch_size]], + [pair[tgt_lang_key] for pair in corpus[i : i + batch_size]], add_special_tokens=True, padding=True, ) @@ -80,68 +75,3 @@ def construct_batches( masks["src"].append(src_padding_mask) masks["tgt"].append(future_mask) return batches, masks - - -class TestUtils(unittest.TestCase): - def test_construct_future_mask(self): - mask = construct_future_mask(3) - torch.testing.assert_close( - mask, - torch.BoolTensor( - [[True, False, False], [True, True, False], [True, True, True]] - ), - ) - - def test_construct_future_mask_first_decoding_step(self): - mask = construct_future_mask(1) - torch.testing.assert_close( - mask, torch.BoolTensor([[True]]), - ) - - def test_construct_batches(self): - corpus = [ - { - "en": - "This is an english sentence.", - "nl": "Dit is een Nederlandse zin."}, - { - "en": - "The weather is nice today.", - "nl": "Het is lekker weer vandaag."}, - { - "en": "Yesterday I drove to a city called" - "Amsterdam in my brand new car.", - "nl": "Ik reed gisteren in mijn" - "gloednieuwe auto naar Amsterdam.", - }, - { - "en": "You can pick up your laptop at noon tomorrow.", - "nl": "Je kunt je laptop morgenmiddag komen ophalen.", - }, - ] - en_sentences, nl_sentences = ( - [d["en"] for d in corpus], - [d["nl"] for d in corpus], - ) - vocab = Vocabulary(en_sentences + nl_sentences) - batches, masks = construct_batches( - corpus, vocab, batch_size=2, src_lang_key="en", tgt_lang_key="nl" - ) - torch.testing.assert_close( - batches["src"], - [ - torch.IntTensor( - [[0, 3, 4, 5, 6, 7, 8, 1], [0, 9, 10, 4, 11, 12, 8, 1]] - ), - torch.IntTensor( - [ - [ - 0, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, - 23, 24, 25, 8, 1], - [ - 0, 26, 27, 28, 29, 30, 31, 32, - 33, 34, 8, 1, 2, 2, 2, 2], - ] - ), - ], - ) \ No newline at end of file diff --git a/transformers/from_scratch/encoder-decoder/src/vocabulary.py b/transformers/from_scratch/encoder-decoder/src/vocabulary.py new file mode 100755 index 0000000..8cc4ead --- /dev/null +++ b/transformers/from_scratch/encoder-decoder/src/vocabulary.py @@ -0,0 +1,88 @@ +"""Defines the vabulary or sentence tokenizer.""" +import re +from typing import Optional, List + + +class Vocabulary: + BOS = "BOS" + EOS = "EOS" + PAD = "PAD" + + def __init__(self, list_of_sentences: Optional[List[str]]): + """Initializes the parameters.""" + self.token_2_index = { self.BOS: 0, self.EOS: 1, self.PAD: 2} + self.index_2_token = {v: k for k, v in self.token_2_index.items()} + if not list_of_sentences: + return + for sentence in list_of_sentences: + self.add_tokens(self.tokenize(sentence)) + + def add_tokens(self, tokens: List[str]) -> None: + """Adds tokens to vocab + :param tokens - list of tokens + """ + for token in tokens: + if token not in self.token_2_index: + i = len(self.token_2_index.items()) + 
self.token_2_index[token] = i + self.index_2_token[i] = token + + def tokenize(self, sentence: str, add_special_tokens: bool = True) -> List[str]: + """Adds tokens to sentences by splits and punctuations + + Args: + sentence (str): str of sentences + add_special_tokens (bool, optional): checks whether to add BIOS, etc. + defaults to True. + + Returns: + List[str]: returns a list of tokens + """ + tokens = re.findall(r"\w+|[^\s\w]+", sentence) + if add_special_tokens: + tokens = [self.BOS] + tokens + [self.EOS] + return tokens + + def encode(self, sentence: str, add_special_tokens: bool = True) -> List[int]: + """Converts a string to a list of token indices given the vocabulary. + + Args: + sentence (str): a string representation of a sentence. + add_special_tokens (bool, optional): Whether to add BOS and EOS. + Defaults to True. + + Returns: + List[str]: returns list of token indices. + """ + tokens = self.tokenize(sentence, add_special_tokens) + return [self.token_2_index[token] for token in tokens] + + def batch_encode(self, sentences: List[str], padding = True, + add_special_tokens: bool = False) -> List [List[int]]: + """Convert a list of string sentences to nested list of token indices. + Optionally adds padding & bos+eos tokens + + Args: + sentence (List[str]): A list of sentences to be encoded into a batch + padding (bool, optional): Boolean allows for padding up to the longest + sentence. + add_special_tokens (bool, optional): Boolean that allows for adding a + BOS and EOS token to each sentence in the batch + Defaults to True. + + Returns: + List [List[int]]: nested list of tokenized sequences + """ + token_sequences = [ + self.encode(sentence, add_special_tokens) for sentence in sentences] + if padding: + max_length = max([len(tokens) for tokens in token_sequences]) + token_sequences = [ + s + ((max_length - len(s)) * [self.token_2_index[self.PAD]]) + for s in token_sequences + ] + return token_sequences + + + + diff --git a/transformers/from_scratch/encoder-decoder/test_multi_head_attention.py b/transformers/from_scratch/encoder-decoder/test_multi_head_attention.py new file mode 100755 index 0000000..edca2b8 --- /dev/null +++ b/transformers/from_scratch/encoder-decoder/test_multi_head_attention.py @@ -0,0 +1,88 @@ +"""An implementation to test the multi-headed attention.""" + +import torch +import unittest +from src.multi_head_attention import MultiHeadAttention +from src.utils import construct_future_mask + + +class TestMultiHeadAttention(unittest.TestCase): + def test_scaled_dot_product(self): + mha = MultiHeadAttention(512, 8) + q = torch.randn(4, 8, 10, 512) + k = torch.randn(4, 8, 10, 512) + v = torch.randn(4, 8, 10, 512) + + values, attention_scores = mha.scaled_dot_product(q, k, v) + self.assertEqual(values.shape, (4, 8, 10, 512)) + self.assertEqual(attention_scores.shape, (4, 8, 10, 10)) + + # Each attention distribution should sum up to one + expected = torch.Tensor([1.0]).repeat((4, 8, 10)) + torch.testing.assert_close(torch.sum(attention_scores, dim=-1), expected) + + self.assertEqual(torch.any(torch.isnan(values)), False) + self.assertEqual(True in torch.isnan(attention_scores), False) + + def test_scalar_dot_product(self): + mha = MultiHeadAttention(hidden_dim=512, num_heads=8) + q = torch.randn(2, 8, 10, 512, dtype=torch.float) + k = torch.randn(2, 8, 10, 512, dtype=torch.float) + v = torch.randn(2, 8, 10, 512, dtype=torch.float) + + mask = torch.BoolTensor( + [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]] + ) + _, attention_scores = 
mha.scaled_dot_product(q, k, v, src_padding_mask=mask) + self.assertEqual(torch.any(torch.isnan(attention_scores)), False) + # For the first sequence, we expect the last two (8-10) attention scores + # for every attention distribution + # For every head to be exactly zero due to the mask defined above. + # The rest should be strictly non-zero. + + self.assertEqual(torch.all(attention_scores[0, :, :, 8:] == 0), True) + self.assertEqual(torch.any(attention_scores[0, :, :, :8] == 0), False) + # Check if all attention distribution will sum up to 1! (i.e all values after summing + # should be 1!). + expected = torch.Tensor([1.0]).repeat([2, 8, 10]) + torch.testing.assert_close(torch.sum(attention_scores, dim=-1), expected) + + # For some second sequence in the batch, all attention scores + # should be non zero because the mask is all ones + self.assertEqual(torch.any(attention_scores[1] == 0), False) + + def test_mha_self_attention_forward(self): + mha = MultiHeadAttention(512, 8) + x = torch.randn(4, 10, 512, dtype=torch.float) + output = mha.forward(x) + # Check dimension of attention matrix + self.assertEqual(output.shape, (4, 10, 512)) + self.assertEqual(torch.any(torch.isnan(output)), False) + + def test_cross_attention_projection(self): + mhs = MultiHeadAttention(512, 8) + decoder_hidden_states = torch.randn(4, 2, 512, dtype=torch.float) + encoder_hidden_states = torch.randn(4, 2, 512, dtype=torch.float) + output = mhs.forward( + x=decoder_hidden_states, encoder_hidden_states=encoder_hidden_states + ) + self.assertEqual(output.shape, (4, 2, 512)) + self.assertEqual(torch.any(torch.isnan(output)), False) + + def test_future_masking(self): + batch_size, num_heads, seq_len = 2, 2, 3 # Add 2 , 3 + logits = torch.randn( + batch_size, num_heads, num_heads, seq_len, seq_len, dtype=torch.float + ) + future_mask = construct_future_mask(seq_len) + self.assertEqual(future_mask.shape, (3, 3)) + masked_logits = MultiHeadAttention(512, num_heads=num_heads).mask_logits( + logits, future_mask=future_mask + ) + torch.testing.assert_close( + torch.isinf(masked_logits) == 0, torch.BoolTensor([[]]) + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/transformers/from_scratch/encoder-decoder/test_train.py b/transformers/from_scratch/encoder-decoder/test_train.py new file mode 100644 index 0000000..5c72c11 --- /dev/null +++ b/transformers/from_scratch/encoder-decoder/test_train.py @@ -0,0 +1,98 @@ +import unittest +import random +from random import choices + +import numpy as np +import torch +from torch import nn + +from src.lr_scheduler import NoamOpt +from src.transformer import Transformer +from src.vocabulary import Vocabulary +from src.utils import construct_batches +from src.train import train + + +class TestTransformerTraining(unittest.TestCase): + seed = 0 + torch.manual_seed(seed) + random.seed(seed) + np.random.seed(seed) + + def test_copy_task(self): + """ + Test training by trying to (over)fit a simple copy dataset - bringing the loss to ~zero. (GPU required) + """ + device = ( + torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + ) + if device.type == "cpu": + print("This unit test was not run because it requires a GPU") + return + + # Hyperparameters + synthetic_corpus_size = 600 + batch_size = 60 + n_epochs = 200 + n_tokens_in_batch = 10 + + # Construct vocabulary and create synthetic data by uniform randomly sampling tokens from it + # Note: the original paper uses byte pair encodings, we simply take each word to be a token. 
+ corpus = ["These are the tokens that will end up in our vocabulary"] + vocab = Vocabulary(corpus) + vocab_size = len( + list(vocab.token_2_index.keys()) + ) # 14 tokens including bos, eos and pad + valid_tokens = list(vocab.token_2_index.keys())[3:] + corpus += [ + " ".join(choices(valid_tokens, k=n_tokens_in_batch)) + for _ in range(synthetic_corpus_size) + ] + + # Construct src-tgt aligned input batches (note: the original paper uses dynamic batching based on tokens) + corpus = [{"src": sent, "tgt": sent} for sent in corpus] + batches, masks = construct_batches( + corpus, + vocab, + batch_size=batch_size, + src_lang_key="src", + tgt_lang_key="tgt", + device=device, + ) + + # Initialize transformer + transformer = Transformer( + hidden_dim=512, + ff_dim=2048, + num_heads=8, + num_layers=2, + max_decoding_length=25, + vocab_size=vocab_size, + padding_idx=vocab.token_2_index[vocab.PAD], + bos_idx=vocab.token_2_index[vocab.BOS], + dropout_p=0.1, + tie_output_to_embedding=True, + ).to(device) + + # Initialize learning rate scheduler, optimizer and loss (note: the original paper uses label smoothing) + optimizer = torch.optim.Adam( + transformer.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9 + ) + scheduler = NoamOpt( + transformer.hidden_dim, + factor=1, + warmup=400, + optimizer=optimizer, + ) + criterion = nn.CrossEntropyLoss() + + # Start training and verify ~zero loss and >90% accuracy on the last batch + latest_batch_loss, latest_batch_accuracy = train( + transformer, scheduler, criterion, batches, masks, n_epochs=n_epochs + ) + self.assertEqual(latest_batch_loss.item() <= 0.01, True) + self.assertEqual(latest_batch_accuracy >= 0.99, True) + + +if __name__ == "__main__": + unittest.main() diff --git a/transformers/from_scratch/encoder-decoder/test_transformer.py b/transformers/from_scratch/encoder-decoder/test_transformer.py new file mode 100755 index 0000000..bcabaca --- /dev/null +++ b/transformers/from_scratch/encoder-decoder/test_transformer.py @@ -0,0 +1,89 @@ +"""The transformer test implementation.""" + +import random +import torch +import unittest +import numpy as np + +from .src.utils import construct_future_mask +from .src.transformer import Transformer +from .src.vocabulary import Vocabulary + + +class TestTransformer(unittest.TestCase): + """Test case for the transformer implementation. 
+ Args: + unittest (_type_): represents the transformer test class + """ + + def test_transformer_inference(self): + seed = 0 + torch.manual_seed(seed) + random.seed(seed) + np.random.seed(seed) + + # Create (shared) Vocabulary and special token + # given a dummy corpus + corpus = [ + "Hello my name is Janice and I was born with the name Janice" + "Dit is een Nederlandse zin" + ] + en_vocab = Vocabulary(corpus) + en_vocab_size = len(en_vocab.token_2_index.items()) + with torch.no_grad(): + transformer = Transformer( + hidden_dim=512, + ff_dim=2048, + num_heads=8, + num_layers=6, + max_decoding_length=10, + vocab_size=en_vocab_size, + padding_idx=en_vocab.token_2_index[en_vocab.PAD], + bos_idx=en_vocab.token_2_index[en_vocab.BOS], + dropout_p=0.1, + tie_output_to_embedding=True, + ) + transformer.eval() + # Prepare encoder input, mask and generate output hidden states + encoder_input = torch.IntTensor( + en_vocab.batch_encode(corpus, add_special_tokens=False) + ) + src_padding_mask = encoder_input != transformer.padding_idx + encoder_output = transformer.encoder.forward( + encoder_input, src_padding_mask=src_padding_mask + ) + self.assertEqual(torch.any(torch.isnan(encoder_output)), False) + # Prepare decoder input, mask and start decoding + decoder_input = torch.IntTensor( + [[transformer.bos_idx], [transformer.bos_idx]] + ) + future_mask = construct_future_mask(seq_len=1) + for _ in range(transformer.max_decoding_length): + decoder_output = transformer.decoder( + decoder_input, + encoder_output, + src_padding_mask=src_padding_mask, + future_mask=future_mask, + ) + # Take the argmax over the softmax of the last token + # to obtain the next token prediction + predicted_tokens = torch.argmax( + decoder_output[:, -1, :], dim=-1 + ).unsqueeze(1) + + # Append the prediction to the already + # decoded tokens and construct the new mask + decoder_input = torch.cat((decoder_input, predicted_tokens), dim=-1) + future_mask = construct_future_mask(decoder_input.shape[1]) + + self.assertEqual( + decoder_input.shape, (2, transformer.max_decoding_length + 1) + ) + # See test_one_layer_transformer_decoder_inference + # in decoder.py for more inforemation. With num_layers=1, + # this will be true. 
+ self.assertEqual(torch.all(decoder_input == transformer.bos_idx), False) + + +if __name__ == "__main__": + unittest.main() diff --git a/docker/postgres/Dockerfile b/transformers/from_scratch/encoder/src/__init__.py similarity index 100% rename from docker/postgres/Dockerfile rename to transformers/from_scratch/encoder/src/__init__.py diff --git a/transformers/from_scratch/encoder/src/vocabulary.py b/transformers/from_scratch/encoder/src/vocabulary.py new file mode 100644 index 0000000..c2be88e --- /dev/null +++ b/transformers/from_scratch/encoder/src/vocabulary.py @@ -0,0 +1,6 @@ +import re +from typing import Optional, List + + +class Vocabulary: + """Define tokenizer and other utilities.""" diff --git a/docker/postgres/requirements.txt b/transformers/from_scratch/encoder/test_transformer.py similarity index 100% rename from docker/postgres/requirements.txt rename to transformers/from_scratch/encoder/test_transformer.py diff --git a/transformers/from_scratch/src/sentence.py b/transformers/from_scratch/src/sentence.py deleted file mode 100644 index b1e9fdd..0000000 --- a/transformers/from_scratch/src/sentence.py +++ /dev/null @@ -1,25 +0,0 @@ -import re -from typing import List, Optional - - -class Sentence: - """Generate a list of tokens from a list of input sentences """ - BOS = "BOS" # Beginning of sentence - EOS = "EOS" # End of Sentence - PAD = "PAD" - - def __init__(self, sentence_list: Optional[List[str]] = None): - self.token_2_index = {self.BOS: 0, self.EOS: 1, self.PAD: 2} - self.index_to_token = {v: k for k, v in self.token_2_index.items()} - # Check whether there is a list of input sentences - if not sentence_list: - return # then return to initial state - - # Call token method if there is a list of sentences - # Tokenize and substitute BOS, EAS and PAD - for sentence in sentence_list: - self.add_tokens(self.tokenize(sentence)) - - def add_tokens(self, param): - pass - \ No newline at end of file diff --git a/transformers/src/decoder.py b/transformers/src/decoder.py deleted file mode 100644 index 767431c..0000000 --- a/transformers/src/decoder.py +++ /dev/null @@ -1,335 +0,0 @@ -import math -import unittest -import random -from typing import Optional - -import numpy as np -import torch -from torch import nn -from torch.nn.init import xavier_uniform_ -from .multi_head_attention import MultiHeadAttention -from .positional_encoding import SinusoidEncoding -from .utils import construct_future_mask - - -class TransformerDecoder(nn.Module): - def __init__( - self, - embedding: torch.nn.Embedding, - hidden_dim: int, - ff_dim: int, - num_heads: int, - num_layers: int, - vocab_size: int, - dropout_p: float, - tie_output_to_embedding: Optional[bool] = True, - ): - super().__init__() - - self.hidden_dim = hidden_dim - self.embed = embedding - self.positional_encoding = SinusoidEncoding(hidden_dim) - self.dropout = nn.Dropout(p=0.1) - self.decoder_blocks = nn.ModuleList( - [ - TransformerDecoderBlock( - hidden_dim, ff_dim, - num_heads, dropout_p) - for _ in range(num_layers) - ] - ) - self.output_layer = nn.Linear(hidden_dim, vocab_size, bias=False) - - # Note: a linear layer multiplies the input with a transpose of the - # weight matrix, so no need to do that here. 
- if tie_output_to_embedding: - self.output_layer.weight = nn.Parameter(self.embed.weight) - - def _reset_parameters(self): - """ Perform xavier weight initialization""" - for p in self.parameters(): - if p.dim() > 1: - xavier_uniform_(p) - - def forward( - self, - input_tokens: torch.IntTensor, - encoder_hidden_states: torch.Tensor, - src_padding_mask: Optional[torch.BoolTensor] = None, - future_mask: Optional[torch.BoolTensor] = None, - ): - """ - Performs one decoder forward pass given encoder hidden states, the - decoder input tokens and attention masks. - N = batch size - S = source sequence length - T = target sequence length - E = embedding dimensionality - V = vocabulary size - - :param input_tokens: Decoder input tokens. Shape: (N, T) - :param encoder_hidden_states: The encoder's final (contextualized) - token embeddings. Shape: (N, S, E) - :param src_padding_mask: An attention mask to ignore pad-tokens in the - source input. Shape (N, S) - :param future_mask: An attention mask to ignore future-tokens in the - target input. Shape (T, T) - :return: Unnormalized logits over the vocabulary for every token in - the batch. Shape (N, T, V) - """ - # (batch_size, sequence_length, hidden_dim) - x = self.embed(input_tokens) * math.sqrt(self.hidden_dim) - x = self.positional_encoding(x) - x = self.dropout(x) - - for decoder_block in self.decoder_blocks: - x = decoder_block( - x, - encoder_hidden_states, src_padding_mask, future_mask) - - # (batch_size, sequence_length, vocab_size) - logits = self.output_layer(x) - return logits - - -class TransformerDecoderBlock(nn.Module): - def __init__( - self, - hidden_dim: int, - ff_dim: int, - num_heads: int, - dropout_p: float - ): - super().__init__() - - self.cross_mha = MultiHeadAttention(hidden_dim, num_heads) - self.self_mha = MultiHeadAttention(hidden_dim, num_heads) - self.feed_forward = nn.Sequential( - nn.Linear( - hidden_dim, ff_dim), - nn.ReLU(), - nn.Linear(ff_dim, hidden_dim), - ) - - self.dropout1 = nn.Dropout(p=dropout_p) - self.dropout2 = nn.Dropout(p=dropout_p) - self.dropout3 = nn.Dropout(p=dropout_p) - - self.layer_norm1 = nn.LayerNorm(hidden_dim) - self.layer_norm2 = nn.LayerNorm(hidden_dim) - self.layer_norm3 = nn.LayerNorm(hidden_dim) - - def forward( - self, - x: torch.Tensor, - encoder_hidden_states: torch.FloatTensor, - src_padding_mask: Optional[torch.BoolTensor] = None, - future_mask: Optional[torch.BoolTensor] = None, - ): - """ - Performs one decoder *block* forward pass given final encoder hidden - states, the previous block's output, and - attention masks. - - N = batch size - S = source sequence length - T = target sequence length - E = embedding dimensionality - V = vocabulary size - - :param x: Previous decoder block's output. Shape: (N, T, E) - :param encoder_hidden_states: The encoder's final (contextualized) - token embeddings. Shape: (N, S, E) - :param src_padding_mask: An attention mask to ignore pad-tokens in the - source input. Shape (N, S) - :param future_mask: An attention mask to ignore future-tokens in the - target input. Shape (T, T) - :return: Updated, contextualized token embeddings. 
Shape (N, T, E) - """ - - # Self attention (with future masking during training) - output = self.dropout1( - self.self_mha.forward( - x, future_mask=future_mask)) - x = self.layer_norm1(x + output) - - # Cross or encoder-decoder attention - output = self.dropout2( - self.cross_mha.forward( - x, - encoder_hidden_states=encoder_hidden_states, - src_padding_mask=src_padding_mask, - ) - ) - x = self.layer_norm2(x + output) - - # Feed forward layers - output = self.dropout3(self.feed_forward(x)) - x = self.layer_norm3(x + output) - return x - - -class TestTransformerDecoder(unittest.TestCase): - def test_one_layer_transformer_decoder_inference(self): - """ - Test two forward passes, simulating two greedy decoding inference steps - """ - seed = 0 - torch.manual_seed(seed) - random.seed(seed) - np.random.seed(seed) - - with torch.no_grad(): - batch_size = 2 - src_seq_len = 10 - hidden_dim = 512 - vocab_size = 2000 - num_layers = 1 - num_heads = 8 - - # Prepare fake encoder hidden states and padding masks - encoder_output = torch.randn((batch_size, src_seq_len, hidden_dim)) - src_padding_mask = torch.BoolTensor( - [ - [1, 1, 1, 1, 1, 1, 1, 1, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - ] - ) - - # Initialize the decoder, perform xavier init and set to - # evaluation mode - decoder = TransformerDecoder( - embedding=torch.nn.Embedding(vocab_size, hidden_dim), - hidden_dim=hidden_dim, - ff_dim=2048, - num_heads=num_heads, - num_layers=num_layers, - dropout_p=0.1, - vocab_size=vocab_size, - tie_output_to_embedding=True, - ) - decoder._reset_parameters() - decoder.eval() - - # Prepare decoder input, mask, perform a decoding step, take the - # argmax over the softmax of the last token - bos_token_id = 1 - # and iteratively feed the input+prediction back in. - decoder_input = torch.IntTensor([[bos_token_id], [bos_token_id]]) - future_mask = None - for i in range(3): - decoder_output = decoder( - decoder_input, - encoder_output, - src_padding_mask=src_padding_mask, - future_mask=future_mask, - ) - predicted_tokens = torch.argmax( - decoder_output[:, -1, :], dim=-1 - ).unsqueeze(1) - decoder_input = torch.cat( - (decoder_input, predicted_tokens), - dim=-1) - future_mask = construct_future_mask(decoder_input.shape[1]) - - self.assertEqual( - decoder_output.shape, - (batch_size, i + 1, - vocab_size)) - # softmax entropy should not be 0 - self.assertEqual(torch.any(decoder_output == 1), False) - """ - With only one decoder layer the predicted tokens will always - be the input token ids. This happens - only when the final linear transformation is tied to the - (transpose of) the embedding matrix. - This is because the input embedding is barely transformed - due to residual connections. This results in - the highest dot product between its final "contextualized" - embedding and the original embedding vector - in the pre-softmax weight matrix (i.e. embedding matrix) - - because they are still very similar. - This can be avoided by 1) scaling up the memory states - - probably because this adds sufficient random - noise through cross-attention to the contextualised embedding - to divergence from the input embedding. - 2) increasing the number of layers - again adding more and - more "noise" or 3) removing the last - residual connection after the feed forward layers. In - practice, however, this is not an issue. Training - will take care of it. 
- """ - self.assertEqual( - torch.all(decoder_input == bos_token_id), - True) - - def test_multi_layer_transformer_decoder_inference(self): - """ - Test two forward passes, simulating two inference decoding steps - """ - seed = 0 - torch.manual_seed(seed) - random.seed(seed) - np.random.seed(seed) - - with torch.no_grad(): - batch_size = 2 - src_seq_len = 10 - hidden_dim = 512 - vocab_size = 2000 - - # Prepare fake encoder hidden states and padding masks - encoder_output = torch.randn((batch_size, src_seq_len, hidden_dim)) - src_padding_mask = torch.BoolTensor( - [ - [1, 1, 1, 1, 1, 1, 1, 1, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]] - ) - - # Initialize the decoder, perform xavier init and set to - # evaluation mode - decoder = TransformerDecoder( - embedding=torch.nn.Embedding(vocab_size, hidden_dim), - hidden_dim=hidden_dim, - ff_dim=2048, - num_heads=8, - num_layers=6, - dropout_p=0.1, - vocab_size=vocab_size, - tie_output_to_embedding=False, - ) - decoder._reset_parameters() - decoder.eval() - - # Prepare decoder input, mask, perform a decoding step, take the - # argmax over the softmax of the last token - bos_token_id = 10 - # and iteratively feed the input+prediction back in. - decoder_input = torch.IntTensor([[bos_token_id], [bos_token_id]]) - future_mask = None - for i in range(3): - decoder_output = decoder( - decoder_input, - encoder_output, - src_padding_mask=src_padding_mask, - future_mask=future_mask, - ) - predicted_tokens = torch.argmax( - decoder_output[:, -1, :], dim=-1 - ).unsqueeze(1) - decoder_input = torch.cat( - (decoder_input, predicted_tokens), - dim=-1) - future_mask = construct_future_mask(decoder_input.shape[1]) - - self.assertEqual( - decoder_output.shape, - (batch_size, i + 1, vocab_size)) - # softmax entropy should not be 0 - self.assertEqual(torch.any(decoder_output == 1), False) - self.assertEqual( - torch.all(decoder_input == bos_token_id), False) - - -if __name__ == "__main__": - unittest.main() diff --git a/transformers/src/models-torch/decoder.py b/transformers/src/models-torch/decoder.py deleted file mode 100644 index af0dca2..0000000 --- a/transformers/src/models-torch/decoder.py +++ /dev/null @@ -1 +0,0 @@ -"""A torch decoder for transformers from scratch""" \ No newline at end of file diff --git a/transformers/src/models-torch/encoder.py b/transformers/src/models-torch/encoder.py deleted file mode 100644 index e69de29..0000000 diff --git a/transformers/src/models-torch/train_model.py b/transformers/src/models-torch/train_model.py deleted file mode 100644 index e69de29..0000000 diff --git a/transformers/src/multi_head_attention.py b/transformers/src/multi_head_attention.py deleted file mode 100644 index 768aac6..0000000 --- a/transformers/src/multi_head_attention.py +++ /dev/null @@ -1,418 +0,0 @@ -import math -import unittest -from typing import Optional - -# import torch and related libs -import torch -from torch import nn -from torch.nn import functional as F -from .utils import construct_future_mask - - -class MultiHeadAttention(nn.Module): - def __init__(self, hidden_dim: int, num_heads: int): - super().__init__() - - assert hidden_dim % num_heads == 0 - self.qkv_dim = hidden_dim // num_heads - self.hidden_dim = hidden_dim - self.num_heads = num_heads - - self.qkv_proj = nn.Linear( - hidden_dim, 3 * num_heads * self.qkv_dim, bias=False) - self.o_proj = nn.Linear( - num_heads * self.qkv_dim, hidden_dim, bias=False) - self._reset_parameters() - - def _reset_parameters(self): - """ Weight initialization taken from the UvA DL1 - PyTorch Transformer 
tutorial. """ - nn.init.xavier_uniform_(self.qkv_proj.weight) - nn.init.xavier_uniform_(self.o_proj.weight) - - def forward( - self, - x: torch.Tensor, - encoder_hidden_states: Optional[torch.Tensor] = None, - src_padding_mask: Optional[torch.BoolTensor] = None, - future_mask: Optional[torch.BoolTensor] = None, - ): - """ - Perform multi-head attention using one projection matrix. Self - attention is performed when encoder_hidden_states - is None, in which case input x represents encoder token embeddings. - Otherwise, cross-attention is performed. - In that case, input x represents the decoder hidden states. - - N = batch size - S = source sequence length - T = target sequence length - E = embedding dimensionality - - :param x: Either encoder or decoder hidden states. Shape: - (N, S or T, E) - :param encoder_hidden_states: Encoder hidden states to perform - cross-attention with. Shape: (N, S, E) - :param src_padding_mask: Used for encoder self-attention and - cross-attention to handle pad tokens. - Masks all incoming "connections" or "logits" from any token - position to any pad token in a sequence. - Shape: (N, S) - :param future_mask: Used for decoder self-attention to avoid any - token i attending to a token >i, i.e. "peaking" - Shape: (T, T). - :return: Contextualized token embeddings. Shape depends on - attention type. (N, S, E) for encoder self-attention - and decoder cross-attention. (N, T, E) for decoder self-attention. - """ - - batch_size, sequence_length, hidden_dim = x.size() - - if encoder_hidden_states is None: - q, k, v = self._self_attention_projection(x) - else: - q, k, v = self._cross_attention_projection( - encoder_hidden_states, x) - - # Swap dimensions to (batch_size, n_heads, seq_len, qkv_dim). - # Required for the matrix multiplication below - q = q.permute(0, 2, 1, 3) - k = k.permute(0, 2, 1, 3) - v = v.permute(0, 2, 1, 3) - - # Compute (contextualized) value vector for each "head" - values, attn = self.scaled_dot_product( - q, k, v, src_padding_mask, future_mask) - - # Concatenate contextualized value vectors from all heads - values = values.reshape(batch_size, sequence_length, hidden_dim) - - # Linearly transform the concatenation of all heads' value vectors - # (8*64=512) to the original hidden dim (512) - output = self.o_proj(values) - return output - - def _self_attention_projection(self, x: torch.Tensor): - """ - Project x and interpret the result as chunks that represent q, k and - v vectors for every head. - Input x can be encoder or decoder hidden states, depending on which - one calls this MHA module. - - N = batch size - S = source sequence length - T = target sequence length - E = embedding dimensionality - H = number of heads - - :param x: Encoder or decoder hidden states. (N, S or T, E) - :return: query, key and value vectors. (N, S or T, H, E/H) - """ - batch_size, sequence_length, _ = x.shape - qkv = self.qkv_proj(x) - qkv = qkv.reshape( - batch_size, sequence_length, self.num_heads, 3 * self.qkv_dim) - q, k, v = qkv.chunk(3, dim=-1) - return q, k, v - - def _cross_attention_projection( - self, - encoder_hidden_states: torch.Tensor, - decoder_hidden_states: torch.Tensor, - ): - """ - Projects decoder hidden states into query vectors and encoder - hidden states into key and value vectors. - The columns of W_proj determine how much independent linear - combinations of the input we obtain - which we - then interpret as heads and qkv vectors. 
Thus we can - simply split the weight matrix and project the decoder - hidden states x into q separately from projecting the - encoder_hidden_states into k and v. - - N = batch size - S = source sequence length - T = target sequence length - E = embedding dimensionality - H = number of heads - - :param encoder_hidden_states: Shape: (N, S, E) - :param decoder_hidden_states: Shape: (N, T, E) - :return: query vector: Shape: (N, T, H, E/H) and key and - value vectors both (N, S, H, E/H) - """ - batch_size, src_seq_length, hidden_dim = encoder_hidden_states.shape - batch_size, tgt_seq_length, hidden_dim = decoder_hidden_states.shape - - # Split weight matrix - w_q, w_kv = self.qkv_proj.weight.split([hidden_dim, 2 * hidden_dim]) - - # Project encoder_hidden_states into k's, and v's - k, v = ( - F.linear(input=encoder_hidden_states, weight=w_kv) - .reshape( - batch_size, - src_seq_length, - self.num_heads, - 2 * self.qkv_dim) - .chunk(2, dim=-1) - ) - - # Project decoder hidden states into q's - q = F.linear(input=decoder_hidden_states, weight=w_q).reshape( - batch_size, tgt_seq_length, self.num_heads, self.qkv_dim - ) - - return q, k, v - - def scaled_dot_product( - self, - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - src_padding_mask: Optional[torch.BoolTensor] = None, - future_mask: Optional[torch.BoolTensor] = None, - ): - """ - For cross-attention, the sequence length of q and (k,v) may differ - as q is projected from decoder hidden states - and kv from encoder hidden states. - - N = batch size - S = source sequence length - T = target sequence length - E = embedding dimensionality - H = number of heads - - :param q: Tensor stacking query vectors for all tokens and all heads. - Shape: (N, H, S or T, E/H) - :param k: Tensor stacking key vectors for all tokens and all heads. - Shape: (N, H, S or T, E/H) - :param v: Tensor stacking value vectors for all tokens and all heads. - Shape: (N, H, S or T, E/H) - :param src_padding_mask: Used for encoder self-attention and - cross-attention to handle pad tokens. - Masks all incoming "connections" or "logits" from any token position - to any pad token in a sequence. - Shape: (N, S) - :param future_mask: Used for decoder self-attention to avoid any token - i attending to a token >i, i.e. "peaking" - Shape: (T, T). - :return: values (N, H, S or T, E/H), attention scores - (N, H, S or T, S or T) - """ - - # Compute attention logits. Dot product between each query and key - # vector, through one matrix multiplication. 
- # Results in un-normalized attention scores for each position's query - # vector to each position's key vector - # Result is (batch_size, num_heads, seq_length, seq_length) - attn_logits = torch.matmul(q, torch.transpose(k, -2, -1),) - - # Scale logits by constant to create less spiky softmax distribution - attn_logits = attn_logits / math.sqrt(q.size()[-1]) - - # Apply attention mask (for pad tokens and future-masking in - # cross-attention) - if src_padding_mask is not None or future_mask is not None: - attn_logits = self.mask_logits( - attn_logits, src_padding_mask, future_mask) # type: ignore - - # Transform logits to attention probability distribution - # (one distribution per non-masked token index) - attention = F.softmax(attn_logits, dim=-1) - - # Weighted sum of value vectors for each input token using attention - # scores -> new contextualized representation - # (batch_size, num_heads, sequence_length, qkv_dim) - values = torch.matmul(attention, v) - return values, attention - - @staticmethod - def mask_logits( - logits: torch.Tensor, - src_padding_mask: Optional[torch.BoolTensor] = None, - future_mask: Optional[torch.BoolTensor] = None, - ): - """ - Reshape masks to fit the shape of the logits and set all indices - with "False" to -inf - - N = batch size - S = source sequence length - T = target sequence length - E = embedding dimensionality - H = number of heads - - :param logits: Tensor containing attention logits. - Shape: (N, H, S or T, S or T) - :param src_padding_mask: Used for encoder self-attention and - cross-attention to handle pad tokens. - Masks all incoming "connections" or "logits" from any token position - to any pad token in a sequence. - Shape: (N, S) - :param future_mask: Used for decoder self-attention to avoid any - token i attending to a token >i, i.e. "peaking" - Shape: (T, T). 
- :return: masked_logits (N, H, S or T, S or T) - """ - if src_padding_mask is not None: - masked_logits = logits.masked_fill( - src_padding_mask[:, None, None, :] == 0, float("-inf") - ) - if future_mask is not None: - masked_logits = logits.masked_fill(future_mask == 0, float("-inf")) - return masked_logits - - -class TestMultiHeadAttention(unittest.TestCase): - def test_scaled_dot_product(self): - mha = MultiHeadAttention(512, 8) - q = torch.randn(4, 8, 10, 512) - k = torch.randn(4, 8, 10, 512) - v = torch.randn(4, 8, 10, 512) - - values, attention_scores = mha.scaled_dot_product(q, k, v) - - self.assertEqual(values.shape, (4, 8, 10, 512)) - self.assertEqual(attention_scores.shape, (4, 8, 10, 10)) - - # Each attention distribution should sum up to one - expected = torch.Tensor([1.0]).repeat((4, 8, 10)) - torch.testing.assert_close(torch.sum( - attention_scores, dim=-1), expected) - - self.assertEqual(torch.any(torch.isnan(values)), False) - self.assertEqual(True in torch.isnan(attention_scores), False) - - def test_scaled_dot_product_encoder_self_attention_mask(self): - mha = MultiHeadAttention(hidden_dim=512, num_heads=8) - q = torch.randn(2, 8, 10, 512, dtype=torch.float) - k = torch.randn(2, 8, 10, 512, dtype=torch.float) - v = torch.randn(2, 8, 10, 512, dtype=torch.float) - mask = torch.BoolTensor( - [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]] - ) - - _, attention_scores = mha.scaled_dot_product( - q, k, v, src_padding_mask=mask) - self.assertEqual(torch.any(torch.isnan(attention_scores)), False) - - # For the first sequence we expect the last two (8-10) - # scores for every attention distribution - # for every head to be exactly zero due to the mask we defined above. - # The rest should be strictly non-zero. - self.assertEqual(torch.all(attention_scores[0, :, :, 8:] == 0), True) - self.assertEqual(torch.any(attention_scores[0, :, :, :8] == 0), False) - # Each attention distribution should sum up to one - # (all values after summing should be 1) - expected = torch.Tensor([1.0]).repeat((2, 8, 10)) - torch.testing.assert_close( - torch.sum(attention_scores, dim=-1), expected) - - # For the second sequence in the batch all attention scores should - # be nonzero because the mask is all ones - self.assertEqual(torch.any(attention_scores[1] == 0), False) - - def test_mha_self_attention_forward(self): - mha = MultiHeadAttention(512, 8) - x = torch.randn(4, 10, 512, dtype=torch.float) - output = mha.forward(x) - self.assertEqual(output.shape, (4, 10, 512)) - self.assertEqual(torch.any(torch.isnan(output)), False) - - def test_mha_cross_attention_forward(self): - mha = MultiHeadAttention(512, 8) - decoder_hidden_states = torch.randn(4, 2, 512, dtype=torch.float) - encoder_hidden_states = torch.randn(4, 10, 512, dtype=torch.float) - output = mha.forward( - x=decoder_hidden_states, - encoder_hidden_states=encoder_hidden_states - ) - self.assertEqual(output.shape, (4, 2, 512)) - self.assertEqual(torch.any(torch.isnan(output)), False) - - def test_future_masking(self): - # TODO add 2 heads and batch_size=3 - batch_size, n_heads, seq_len = 2, 2, 3 - logits = torch.randn( - batch_size, n_heads, seq_len, seq_len, dtype=torch.float) - future_mask = construct_future_mask(seq_len) - self.assertEqual(future_mask.shape, (3, 3)) - - masked_logits = MultiHeadAttention(512, num_heads=n_heads).mask_logits( - logits, future_mask=future_mask - ) - torch.testing.assert_close( - torch.isinf(masked_logits) == 0, - torch.BoolTensor( - [ - [ - [ - [True, False, False], - [True, True, False], - 
[True, True, True], - ], - [ - [True, False, False], - [True, True, False], - [True, True, True], - ], - ], - [ - [ - [True, False, False], - [True, True, False], - [True, True, True], - ], - [ - [True, False, False], - [True, True, False], - [True, True, True], - ], - ], - ] - ), - ) - - def test_src_padding_masking(self): - batch_size, n_heads, seq_len = 2, 2, 3 - logits = torch.randn( - batch_size, n_heads, seq_len, seq_len, dtype=torch.float) - src_padding_mask = torch.BoolTensor( - [[True, True, True], [True, False, False]]) - self.assertEqual(src_padding_mask.shape, (2, 3)) - masked_logits = MultiHeadAttention(512, num_heads=n_heads).mask_logits( - logits, src_padding_mask=src_padding_mask - ) - torch.testing.assert_close( - torch.isinf(masked_logits) == 0, - torch.BoolTensor( - [ - [ - [ - [True, True, True], - [True, True, True], [True, True, True],], - [ - [True, True, True], - [True, True, True], [True, True, True],], - ], - [ - [ - [True, False, False], - [True, False, False], - [True, False, False], - ], - [ - [True, False, False], - [True, False, False], - [True, False, False], - ], - ], - ] - ), - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/transformers/src/positional_encoding.py b/transformers/src/positional_encoding.py deleted file mode 100644 index c88b60d..0000000 --- a/transformers/src/positional_encoding.py +++ /dev/null @@ -1,178 +0,0 @@ -import math -import unittest - -import torch - - -class SinusoidEncoding(torch.nn.Module): - """ - Mostly copied from - https://uvadlc-notebooks.readthedocs.io/en/latest/tutorial_notebooks/tutorial6/Transformers_and_MHAttention.html - """ - - def __init__(self, hidden_dim, max_len=5000): - """ - Inputs - d_model - Hidden dimensionality of the input. - max_len - Maximum length of a sequence to expect. - """ - super().__init__() - - # Create matrix of [SeqLen, HiddenDim] representing the positional - # encoding for max_len inputs - pos_embed = torch.zeros(max_len, hidden_dim) - position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) - div_term = torch.exp( - torch.arange( - 0, - hidden_dim, 2).float() * (-math.log(10000.0) / hidden_dim) - ) - pos_embed[:, 0::2] = torch.sin(position * div_term) - pos_embed[:, 1::2] = torch.cos(position * div_term) - pos_embed = pos_embed.unsqueeze(0) - - # register_buffer => Tensor which is not a parameter, but should - # be part of the modules state. - # Used for tensors that need to be on the same device as the module. - # persistent=False tells PyTorch to not add the buffer to the state - # dict (e.g. when we save the model) - self.register_buffer("pos_embed", pos_embed, persistent=False) - - def forward(self, x): - """ - Adds positional embeddings to token embeddings. - N = batch size - L = sequence length - E = embedding dim - - :param x: token embeddings. Shape: (N, L, E) - :return: token_embeddings + positional embeddings. 
Shape: (N, L, E) - """ - x = x + self.pos_embed[:, : x.size(1)] - return x - - -class TestSinusoidEncoding(unittest.TestCase): - def test_create_embedding(self): - batch = 1 - dim = 8 - len = 3 - x = torch.zeros(batch, len, dim) - encoding = SinusoidEncoding(dim).forward(x) - expected = torch.Tensor( - [ - [ - [ - 0.0000e00, - 1.0000e00, - 0.0000e00, - 1.0000e00, - 0.0000e00, - 1.0000e00, - 0.0000e00, - 1.0000e00, - ], - [ - 8.4147e-01, - 5.4030e-01, - 9.9833e-02, - 9.9500e-01, - 9.9998e-03, - 9.9995e-01, - 1.0000e-03, - 1.0000e00, - ], - [ - 9.0930e-01, - -4.1615e-01, - 1.9867e-01, - 9.8007e-01, - 1.9999e-02, - 9.9980e-01, - 2.0000e-03, - 1.0000e00, - ], - ] - ] - ) - torch.testing.assert_close(encoding, expected, rtol=10e-5, atol=10e-5) - - def test_create_embedding_multi_batch(self): - batch = 2 - dim = 8 - len = 3 - x = torch.zeros(batch, len, dim) - encoding = SinusoidEncoding(dim).forward(x) - expected = torch.Tensor( - [ - [ - [ - 0.0000e00, - 1.0000e00, - 0.0000e00, - 1.0000e00, - 0.0000e00, - 1.0000e00, - 0.0000e00, - 1.0000e00, - ], - [ - 8.4147e-01, - 5.4030e-01, - 9.9833e-02, - 9.9500e-01, - 9.9998e-03, - 9.9995e-01, - 1.0000e-03, - 1.0000e00, - ], - [ - 9.0930e-01, - -4.1615e-01, - 1.9867e-01, - 9.8007e-01, - 1.9999e-02, - 9.9980e-01, - 2.0000e-03, - 1.0000e00, - ], - ], - [ - [ - 0.0000e00, - 1.0000e00, - 0.0000e00, - 1.0000e00, - 0.0000e00, - 1.0000e00, - 0.0000e00, - 1.0000e00, - ], - [ - 8.4147e-01, - 5.4030e-01, - 9.9833e-02, - 9.9500e-01, - 9.9998e-03, - 9.9995e-01, - 1.0000e-03, - 1.0000e00, - ], - [ - 9.0930e-01, - -4.1615e-01, - 1.9867e-01, - 9.8007e-01, - 1.9999e-02, - 9.9980e-01, - 2.0000e-03, - 1.0000e00, - ], - ], - ] - ) - torch.testing.assert_close(encoding, expected, rtol=10e-5, atol=10e-5) - - -if __name__ == "__main__": - unittest.main() diff --git a/transformers/src/transformer.py b/transformers/src/transformer.py deleted file mode 100644 index 19a9cca..0000000 --- a/transformers/src/transformer.py +++ /dev/null @@ -1,148 +0,0 @@ -"""A base transformer model""" -import random -import unittest -from typing import Optional - -import numpy as np -import torch -from torch import nn -from torch.nn.init import xavier_uniform_ - -from .vocabulary import Vocabulary -from encoder import TransformerEncoder -from .decoder import TransformerDecoder -from utils import construct_future_mask - - -class Transformer(nn.Module): - def __init__( - self, - hidden_dim: int, - ff_dim: int, - num_heads: int, - num_layers: int, - max_decoding_length: int, - vocab_size: int, - padding_idx: int, - bos_idx: int, - dropout_p: float, - tie_output_to_embedding: Optional[bool] = None, - ): - super().__init__() - # Because the encoder embedding, and decoder embedding and - # decoder pre-softmax transformeation share embeddings - # weights, initialize one here and pass it on. 
- self.embed = nn.Embedding( - vocab_size, hidden_dim, - padding_idx=padding_idx) - self.encoder = TransformerEncoder( - self.embed, hidden_dim, ff_dim, num_heads, num_layers, dropout_p - ) - self.decoder = TransformerDecoder( - self.embed, - hidden_dim, - ff_dim, - num_heads, - num_layers, - vocab_size, - dropout_p, - tie_output_to_embedding, - ) - - self.padding_idx = padding_idx - self.bos_idx = bos_idx - self.max_decoding_length = max_decoding_length - self.hidden_dim = hidden_dim - self._reset_parameters() - - def _reset_parameters(self): - for p in self.parameters(): - if p.dim() > 1: - xavier_uniform_(p) - - -class TestTransformer(unittest.TestCase): - """Test case for transformer - - Args: - unittest (_type_): _description_ - """ - def test_transformer_inference(self): - """A test transformer inference - """ - seed = 0 - torch.manual_seed(seed) - random.seed(seed) - np.random.seed(seed) - - # Create (shared) vocabulary and special token indices given a - # dummy corpus - corpus = [ - "Hello my name is Joris and I was born with the name Joris.", - "Dit is een Nederlandse zin.", - ] - en_vocab = Vocabulary(corpus) - en_vocab_size = len(en_vocab.token2index.items()) - with torch.no_grad(): - transformer = Transformer( - hidden_dim=512, - ff_dim=2048, - num_heads=8, - num_layers=6, - max_decoding_length=10, - vocab_size=en_vocab_size, - padding_idx=en_vocab.token2index[en_vocab.PAD], - bos_idx=en_vocab.token2index[en_vocab.BOS], - dropout_p=0.1, - tie_output_to_embedding=True, - ) - transformer.eval() - - # Prepare encoder input, mask and generate output hidden states - encoder_input = torch.IntTensor( - en_vocab.batch_encode(corpus, add_special_tokens=False) - ) - src_padding_mask = encoder_input != transformer.padding_idx - encoder_output = transformer.encoder.forward( - encoder_input, src_padding_mask=src_padding_mask - ) - self.assertEqual(torch.any(torch.isnan(encoder_output)), False) - - # Prepare decoder input and mask and start decoding - decoder_input = torch.IntTensor( - [[transformer.bos_idx], [transformer.bos_idx]] - ) - future_mask = construct_future_mask(seq_len=1) - for i in range(transformer.max_decoding_length): - decoder_output = transformer.decoder( - decoder_input, - encoder_output, - src_padding_mask=src_padding_mask, - future_mask=future_mask, - ) - # Take the argmax over the softmax of the last token - # to obtain the next-token prediction - predicted_tokens = torch.argmax( - decoder_output[:, -1, :], dim=-1 - ).unsqueeze(1) - - # Append the prediction to the already - # decoded tokens and construct the new mask - decoder_input = torch.cat( - (decoder_input, predicted_tokens), - dim=-1) - future_mask = construct_future_mask(decoder_input.shape[1]) - - self.assertEqual( - decoder_input.shape, - (2, transformer.max_decoding_length + 1) - ) - # see test_one_layer_transformer_decoder_inference - # in decoder.py for more information. with num_layers=1 this - # will be true. 
-            self.assertEqual(
-                torch.all(decoder_input == transformer.bos_idx), False)
-
-
-if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
diff --git a/transformers/src/vocabulary.py b/transformers/src/vocabulary.py
deleted file mode 100644
index 829eba2..0000000
--- a/transformers/src/vocabulary.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import re
-# check why we need this lin
-from typing import List, Optional
-
-
-class Vocabulary:
-    """Generates a list of tokens from input string """
-    BOS = "BOS"
-    EOS = "EOS"
-    PAD = "PAD"
-
-    def __init__(self, list_of_sentences: Optional[List[str]]) -> None:
-        self.token2index = {self.BOS: 0, self.EOS: 1, self.PAD: 2}
-        self.index2token = {v: k for k, v in self.token2index.items()}
-        # check whether there is a list of sentences?
-        if not list_of_sentences:
-            # then return
-            return
-        # But if there is a list of sentences, call add token method,
-        # tokenizes and substitute the values for BOS, EAS, PADS?
-        for sentence in list_of_sentences:
-            self.add_tokens(self.tokenize(sentence))
-
-    def add_tokens(self, tokens: List[str]) -> None:
-        """
-        Adds token to the vocabulary
-        :param tokens - represents a tokenized sentence
-        :return None
-        """
-        for token in tokens:
-            if token not in self.token2index:
-                i = len(self.token2index.items())
-                self.token2index[token] = i
-                self.index2token[i] = token
-
-    def tokenize(
-            self,
-            sentence: str,
-            add_boolean_tokens: bool = True) -> List[str]:
-        """
-        Splits all tokens and punctuations. Adds BOS and EOS optionally.
-        :param: sentence
-        :param: add_boolean_tokens
-        :return list of long tokens
-        """
-        tokens = re.findall(r"\w+|[^\s\w]+", sentence)
-        if add_boolean_tokens:
-            tokens = [self.BOS] + tokens + [self.EOS]
-        return tokens
-
-    def encode(
-            self, sentence: str,
-            add_special_tokens: bool = True) -> List[str]:
-        pass
diff --git a/unsupervised/README.md b/unsupervised/README.md
old mode 100644
new mode 100755
index ea91260..c0de805
--- a/unsupervised/README.md
+++ b/unsupervised/README.md
@@ -1,8 +1,9 @@
-### Contents
+# Contents
 --------------------------
+In this folder, I present unsupervised approaches, including LLMs built on encoder models. The aim is to work through examples suited to problems with no labeled outputs, categorizing the models by their inputs and the learning methods they use.
 
-### Algorithms
+## Algorithms
 * Identify different data collection techniques
 * Data types, preparation and analysis.
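
Note on the vocabulary interface: the new `transformers/from_scratch/encoder/src/vocabulary.py` introduced above is still a stub, and the encoder-decoder `vocabulary.py` that `test_train.py` and `test_transformer.py` import is added by this patch but not reproduced in this excerpt. As a reading aid, the sketch below shows one plausible shape for the `Vocabulary` interface those tests rely on (`BOS`/`EOS`/`PAD`, `token_2_index`, `encode`, `batch_encode`), modelled on the deleted `transformers/src/vocabulary.py`; the right-padding behaviour, default arguments and exact signatures are assumptions, not code taken from the patch.

```python
# Sketch only: an assumed implementation of the Vocabulary interface used by
# the tests in this patch, modelled on the deleted transformers/src/vocabulary.py.
# The padding strategy and signatures are assumptions, not part of the patch.
import re
from typing import List, Optional


class Vocabulary:
    """Builds a token <-> index mapping from a corpus and encodes sentences."""

    BOS = "BOS"  # beginning-of-sequence token
    EOS = "EOS"  # end-of-sequence token
    PAD = "PAD"  # padding token

    def __init__(self, list_of_sentences: Optional[List[str]] = None) -> None:
        self.token_2_index = {self.BOS: 0, self.EOS: 1, self.PAD: 2}
        self.index_2_token = {v: k for k, v in self.token_2_index.items()}
        for sentence in list_of_sentences or []:
            self.add_tokens(self.tokenize(sentence))

    def add_tokens(self, tokens: List[str]) -> None:
        # Assign the next free index to every previously unseen token.
        for token in tokens:
            if token not in self.token_2_index:
                i = len(self.token_2_index)
                self.token_2_index[token] = i
                self.index_2_token[i] = token

    def tokenize(self, sentence: str, add_special_tokens: bool = True) -> List[str]:
        # Split into words and punctuation; optionally wrap with BOS/EOS.
        tokens = re.findall(r"\w+|[^\s\w]+", sentence)
        if add_special_tokens:
            tokens = [self.BOS] + tokens + [self.EOS]
        return tokens

    def encode(self, sentence: str, add_special_tokens: bool = True) -> List[int]:
        # Map a sentence to token ids; assumes all tokens are in the vocabulary.
        return [
            self.token_2_index[token]
            for token in self.tokenize(sentence, add_special_tokens)
        ]

    def batch_encode(
        self,
        sentences: List[str],
        padding: bool = True,
        add_special_tokens: bool = True,
    ) -> List[List[int]]:
        # Encode each sentence and right-pad with PAD ids to a common length,
        # so the result can be wrapped in a single torch.IntTensor.
        encoded = [self.encode(s, add_special_tokens) for s in sentences]
        if padding:
            max_len = max(len(ids) for ids in encoded)
            pad_id = self.token_2_index[self.PAD]
            encoded = [ids + [pad_id] * (max_len - len(ids)) for ids in encoded]
        return encoded
```

Under these assumptions, `Vocabulary(corpus).batch_encode(corpus, add_special_tokens=False)` returns equal-length id sequences that can be wrapped directly in `torch.IntTensor`, which is how the inference test above builds its encoder input.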