braunagn
diff --git a/‎config.py
+118-7 b/‎config.py
+118-7
diff --git a/‎dataset.py
+5-37 b/‎dataset.py
+5-37
@@ -7,7 +7,13 @@
 ####################
 
 DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+C = 512  # width that is split across heads (see HEAD_SIZE); presumably the model/embedding dim — confirm
 T = 30  # max context length; informed via quick analysis
+N_LAYERS = 6  # presumably the number of stacked layers in the model — confirm
+NUM_HEADS = 8  # presumably the number of attention heads — confirm
+HEAD_SIZE = 64  # C // NUM_HEADS = 512 // 8; hard-coded, so update by hand if C or NUM_HEADS changes
+
+DROPOUT = 0.1  # presumably the dropout probability — confirm
 BATCH_SIZE = 8
 BATCH_SIZE_VAL = 50
 
@@ -24,11 +30,116 @@
 }
 VOCAB_SIZE = 30000
 IGNORE = [
-    # chars that appear very infrequently (1-5 times) in the dataset.  Ignoring these sentence
+    # chars that appear very infrequently (~1-5 times) in the dataset.  Ignoring these sentences
     # all together given negligible impact on training and project focus is educational
-    '°', '²', '½', 'Á', 'Ç', 'É', '×', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
-    'ð', 'ñ', 'ó', 'ô', 'ö', 'ú', 'û', 'ü', 'ā', 'ă', 'Ĉ', 'ĉ', 'Č', 'ĝ', 'ĥ', 'ī', 'ı', 'ĵ', 'ł', 'ō', 'ŝ', 'ş', 'š',
-    'ŭ', 'ș',  'ə', 'ʻ', 'π', 'ḥ', 'ṛ',  '/', '…', '√', '🌡', '😷', '🤒', '🤧', '🤮', '🦠',  '🧼', '«', '»', 'Í',
-    'Ö', 'ć', 'ń', 'ŏ', 'ū', 'М', 'Ч', 'а', 'з', 'и', 'к', 'л', 'о', 'р', 'с', 'т', 'ы', 'э', 'ׁ', '‐', '–', '—', '‘',
-    '’', '“', '”', '₂', '€', '→', 'あ', '@', '^', '+', '"', '&', '_', '{', '}', '(', ')', '[', ']', '#', "...",
-  ]
+    "°",
+    "²",
+    "½",
+    "Á",
+    "Ç",
+    "É",
+    "×",
+    "ß",
+    "à",
+    "á",
+    "â",
+    "ã",
+    "ä",
+    "å",
+    "ç",
+    "è",
+    "é",
+    "ê",
+    "ë",
+    "ì",
+    "í",
+    "î",
+    "ï",
+    "ð",
+    "ñ",
+    "ó",
+    "ô",
+    "ö",
+    "ú",
+    "û",
+    "ü",
+    "ā",
+    "ă",
+    "Ĉ",
+    "ĉ",
+    "Č",
+    "ĝ",
+    "ĥ",
+    "ī",
+    "ı",
+    "ĵ",
+    "ł",
+    "ō",
+    "ŝ",
+    "ş",
+    "š",
+    "ŭ",
+    "ș",
+    "ə",
+    "ʻ",
+    "π",
+    "ḥ",
+    "ṛ",
+    "/",
+    "…",
+    "√",
+    "🌡",
+    "😷",
+    "🤒",
+    "🤧",
+    "🤮",
+    "🦠",
+    "🧼",
+    "«",
+    "»",
+    "Í",
+    "Ö",
+    "ć",
+    "ń",
+    "ŏ",
+    "ū",
+    "М",
+    "Ч",
+    "а",
+    "з",
+    "и",
+    "к",
+    "л",
+    "о",
+    "р",
+    "с",
+    "т",
+    "ы",
+    "э",
+    "ׁ",
+    "‐",
+    "–",
+    "—",
+    "‘",
+    "’",
+    "“",
+    "”",
+    "₂",
+    "€",
+    "→",
+    "あ",
+    "@",
+    "^",
+    "+",
+    '"',
+    "&",
+    "_",
+    "{",
+    "}",
+    "(",
+    ")",
+    "[",
+    "]",
+    "#",
+    "...",
+]
@@ -7,49 +7,17 @@
 class LanguageDataset(Dataset):
     def __init__(self, X1, X2, y, pad_token_id=None):
         super(LanguageDataset).__init__()
-        self.X1 = torch.tensor(X1, dtype=torch.int32, device=config.DEVICE)
-        self.X2 = torch.tensor(X2, dtype=torch.int32, device=config.DEVICE)
-        self.y = torch.tensor(y, dtype=torch.float32, device=config.DEVICE)  # float to compare w/model output
+        self.X1 = torch.tensor(X1, device=config.DEVICE)  # NL sequences (dtype inferred from X1)
+        self.X2 = torch.tensor(X2, device=config.DEVICE)  # EN sequences (dtype inferred from X2)
+        self.y = torch.tensor(y, dtype=torch.float32, device=config.DEVICE)  # EN shifted (EN+1 targets), stored as float32
         self.pad_token_id = pad_token_id
 
     def __getitem__(self, index):
+        # each sample returns: NL seq (X1), EN seq (X2), EN+1 seq (y) and, only when pad_token_id is set, a boolean pad mask for the NL seq (x1pad) with a leading axis added via [None,:]
         if self.pad_token_id is not None:
             x1pad = self.X1[index]==self.pad_token_id
             return self.X1[index], self.X2[index], self.y[index], x1pad[None,:]
         return self.X1[index], self.X2[index], self.y[index]
 
     def __len__(self):
-        return len(self.X1)
-
-
-X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(
-    X1,
-    X2,
-    y,
-    test_size=0.15,
-    shuffle=False, # already shuffled and grouped
-)
-train_data = LanguageDataset(X1_train, X2_train, y_train, pad_token_id=pad_token_id)
-test_data = LanguageDataset(X1_test, X2_test, y_test, pad_token_id=pad_token_id)
-
-training_dl = DataLoader(
-    train_data,
-    batch_size=config.BATCH_SIZE,
-    shuffle=False,  # keep sequences of the same length together
-    drop_last=False,
-)
-
-# for loss performance over train/test datasets (vs. batch being trained on)
-train_dl = DataLoader(
-    train_data,
-    batch_size=config.BATCH_SIZE_VAL,
-    shuffle=True,   # sample across the dataset, regardless of sequence len
-    drop_last=False,
-)
-
-test_dl = DataLoader(
-    test_data,
-    batch_size=config.BATCH_SIZE_VAL,
-    shuffle=True,
-    drop_last=False,
-)
+        return len(self.X1)