
Commit
[add] updates
rileydrizzy committed Dec 12, 2023
1 parent 5383afe commit 0b56537
Showing 4 changed files with 66 additions and 35 deletions.
54 changes: 39 additions & 15 deletions linguify_yb/development/dev.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 18,
"metadata": {},
"outputs": [
{
@@ -38,33 +38,57 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"torch.is_distri"
"sample_sentence_token = [60,51,39,40,50,0,40,50,0,32,0,51,36,50,51,0,49,52,45,61]\n",
"# Padding the token\n",
"sample_sentence_token = sample_sentence_token + ([59] * (64 - len(sample_sentence_token)))\n",
"sample_sentence_token = torch.tensor(sample_sentence_token)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([1, 128, 345]) torch.Size([1, 64]) 1\n",
"torch.Size([2, 128, 345]) torch.Size([2, 64]) 2\n",
"torch.Size([4, 128, 345]) torch.Size([4, 64]) 4\n",
"torch.Size([8, 128, 345]) torch.Size([8, 64]) 8\n"
]
"data": {
"text/plain": [
"64"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(sample_sentence_token)"
]
},
+{
+"cell_type": "code",
+"execution_count": 21,
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"tensor([60, 51, 39, 40, 50, 0, 40, 50, 0, 32, 0, 51, 36, 50, 51, 0, 49, 52,\n",
+" 45, 61, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,\n",
+" 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,\n",
+" 59, 59, 59, 59, 59, 59, 59, 59, 59, 59])"
+]
+},
+"execution_count": 21,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
"source": [
"for b in ans:\n",
" x, y , bs = b\n",
" print(x.shape, y.shape,bs)"
"sample_sentence_token"
]
},
{
…
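The new notebook cells tokenize a sentence and right-pad it to a fixed length before wrapping it in a tensor. A minimal sketch of that step, assuming the pad id 59 and maximum length 64 used in the cells above (the helper name pad_tokens is hypothetical):

import torch

MAX_LEN = 64  # fixed sequence length used in the notebook
PAD_ID = 59   # pad token id used in the notebook

def pad_tokens(tokens, max_len=MAX_LEN, pad_id=PAD_ID):
    # Right-pad to max_len; sequences longer than max_len are not handled here.
    padded = tokens + [pad_id] * (max_len - len(tokens))
    return torch.tensor(padded, dtype=torch.long)

# Token ids for "this is a test run", taken from the cell above
sample = [60, 51, 39, 40, 50, 0, 40, 50, 0, 32, 0, 51, 36, 50, 51, 0, 49, 52, 45, 61]
assert pad_tokens(sample).shape == (MAX_LEN,)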
6 changes: 3 additions & 3 deletions linguify_yb/src/dataset/dataset_loader.py
@@ -42,11 +42,11 @@ def __init__(
    def _indexesfromsentence(self, sentence):
        return [self.word2index[word] for word in sentence]

-    def tensorfromsentence(self, sentence):
+    def sentence_to_tensor(self, sentence):
        indexes = self._indexesfromsentence(sentence)
-        return torch.tensor(indexes, dtype=torch.long)  # .view(1, -1)
+        return torch.tensor(indexes, dtype=torch.long)

-    def indexes_to_sentence(self, indexes_list):
+    def index_to_sentence(self, indexes_list):
        if torch.is_tensor(indexes_list):
            indexes_list = indexes_list.tolist()
        words = [self.index2word[idx] for idx in indexes_list]
…
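The renamed methods give the tokenizer a matched encode/decode pair: sentence_to_tensor maps a sentence to a LongTensor of indices, and index_to_sentence maps indices back to words. A minimal stand-in sketch, assuming a character-level vocabulary (the repository's actual word2index/index2word mapping differs):

import torch

class TokenHashTable:
    def __init__(self, vocab=" abcdefghijklmnopqrstuvwxyz"):
        # Stand-in vocabulary; the real table is built elsewhere in the repo.
        self.word2index = {ch: i for i, ch in enumerate(vocab)}
        self.index2word = {i: ch for ch, i in self.word2index.items()}

    def _indexesfromsentence(self, sentence):
        return [self.word2index[word] for word in sentence]

    def sentence_to_tensor(self, sentence):
        indexes = self._indexesfromsentence(sentence)
        return torch.tensor(indexes, dtype=torch.long)

    def index_to_sentence(self, indexes_list):
        if torch.is_tensor(indexes_list):
            indexes_list = indexes_list.tolist()
        return [self.index2word[idx] for idx in indexes_list]

table = TokenHashTable()
encoded = table.sentence_to_tensor("this is a test run")
assert "".join(table.index_to_sentence(encoded)) == "this is a test run"

The round trip is what the rename signals: the two method names now mirror each other.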
37 changes: 24 additions & 13 deletions linguify_yb/src/tests/test_data_ingestion.py
@@ -6,6 +6,7 @@
from torch.utils.data import DataLoader
from src.dataset.frames_config import FRAME_LEN
from src.dataset.preprocess import clean_frames_process
+from src.dataset.dataset_loader import TokenHashTable

# TODO test for frames in right shapes, in tensor, frames are normalize
# TODO test for frames dont contain NAN
@@ -18,17 +19,27 @@
[torch.randn(num_frames, 345) for num_frames in [10, 108, 128, 156, 750, 420]],
)
def test_frames_preprocess(frames):
-    clean_frames = clean_frames_process(frames)
+    """Preprocessed frames should be cleaned to the expected fixed shape."""
+    frames = clean_frames_process(frames)
    expected_output_shape = (128, 345)
-    assert expected_output_shape == clean_frames.shape
-
-
-@pytest
-def test_TokenHashTable(tokentable):
-    token_table =
-    sample_sentences = ""
-    sample_sentences_len = len(sample_sentences)
-    sample_sentences_token = [64,]
-    tokenize_result = token_table
-    assert sample_sentences_len == len(tokenize_result)
-    assert sample_sentences_token == tokenize_result
+    assert (
+        expected_output_shape == frames.shape
+    ), f"frames shape should be {expected_output_shape}"
+
+
+def test_token_hash_table():
+    token_table = TokenHashTable()
+    sample_sentence = "this is a test run"
+    sample_sentence_len = len(sample_sentence)
+    sample_sentence_token = [60, 51, 39, 40, 50, 0, 40, 50, 0, 32, 0, 51, 36, 50, 51, 0, 49, 52, 45, 61]
+    # Pad the token sequence to the fixed length of 64
+    sample_sentence_token = sample_sentence_token + (
+        [59] * (64 - len(sample_sentence_token))
+    )
+    sample_sentence_token = torch.tensor(sample_sentence_token)
+    tokenize_result = token_table.sentence_to_tensor(sample_sentence)
+    assert sample_sentence_len == len(tokenize_result)
+    # `==` on tensors is element-wise; torch.equal yields a single bool
+    assert torch.equal(sample_sentence_token, tokenize_result)
+
+    # Assert that tokenize_result is a PyTorch tensor
+    assert torch.is_tensor(tokenize_result), "is not PyTorch tensor"
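One caveat the new test touches on: comparing two tensors with == returns an element-wise boolean tensor, which raises an ambiguity error inside a bare assert. A small sketch of the safer patterns (check_tokens is an illustrative name, not part of the repo):

import torch

def check_tokens(expected, actual):
    # torch.equal returns a single bool: same shape and same values.
    assert torch.equal(expected, actual), "token tensors differ"
    # torch.testing.assert_close raises with a readable report on mismatch;
    # for integer tensors it requires exact equality.
    torch.testing.assert_close(actual, expected)

check_tokens(torch.tensor([1, 2, 3]), torch.tensor([1, 2, 3]))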
4 changes: 0 additions & 4 deletions linguify_yb/src/trainer.py
@@ -135,7 +135,3 @@ def load_checkpoint(model, optimizer, filename):
losses = checkpoint["loss"]
val_losses = checkpoint["val_loss"]
return model


def distributed_stra_gpu():
pass
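For context, load_checkpoint above reads "loss" and "val_loss" back from the saved dict. A minimal sketch of the matching save side, assuming those key names from the diff; "model_state" and "optimizer_state" are assumed, not confirmed by the source:

import torch

def save_checkpoint(model, optimizer, losses, val_losses, filename):
    # Mirrors the keys load_checkpoint reads; state-dict key names are assumed.
    torch.save(
        {
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
            "loss": losses,
            "val_loss": val_losses,
        },
        filename,
    )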
