
Commit
[add] updates
rileydrizzy committed Dec 12, 2023
1 parent 5383afe commit 0b56537
Showing 4 changed files with 66 additions and 35 deletions.
54 changes: 39 additions & 15 deletions linguify_yb/development/dev.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 18,
"metadata": {},
"outputs": [
{
@@ -38,33 +38,57 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"torch.is_distri"
"sample_sentence_token = [60,51,39,40,50,0,40,50,0,32,0,51,36,50,51,0,49,52,45,61]\n",
"# Padding the token\n",
"sample_sentence_token = sample_sentence_token + ([59] * (64 - len(sample_sentence_token)))\n",
"sample_sentence_token = torch.tensor(sample_sentence_token)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([1, 128, 345]) torch.Size([1, 64]) 1\n",
"torch.Size([2, 128, 345]) torch.Size([2, 64]) 2\n",
"torch.Size([4, 128, 345]) torch.Size([4, 64]) 4\n",
"torch.Size([8, 128, 345]) torch.Size([8, 64]) 8\n"
]
"data": {
"text/plain": [
"64"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(sample_sentence_token)"
]
},
+{
+"cell_type": "code",
+"execution_count": 21,
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"tensor([60, 51, 39, 40, 50, 0, 40, 50, 0, 32, 0, 51, 36, 50, 51, 0, 49, 52,\n",
+" 45, 61, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,\n",
+" 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,\n",
+" 59, 59, 59, 59, 59, 59, 59, 59, 59, 59])"
+]
+},
+"execution_count": 21,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
"source": [
"for b in ans:\n",
" x, y , bs = b\n",
" print(x.shape, y.shape,bs)"
"sample_sentence_token"
]
},
{
…
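The new notebook cells tokenize a sentence and right-pad it to a fixed length before wrapping it in a tensor. A minimal sketch of that step, assuming the pad id 59 and maximum length 64 used in the cells above (the helper name pad_tokens is hypothetical):

import torch

MAX_LEN = 64  # fixed sequence length used in the notebook
PAD_ID = 59   # pad token id used in the notebook

def pad_tokens(tokens, max_len=MAX_LEN, pad_id=PAD_ID):
    # Right-pad to max_len; sequences longer than max_len are not handled here.
    padded = tokens + [pad_id] * (max_len - len(tokens))
    return torch.tensor(padded, dtype=torch.long)

# Token ids for "this is a test run", taken from the cell above
sample = [60, 51, 39, 40, 50, 0, 40, 50, 0, 32, 0, 51, 36, 50, 51, 0, 49, 52, 45, 61]
assert pad_tokens(sample).shape == (MAX_LEN,)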
6 changes: 3 additions & 3 deletions linguify_yb/src/dataset/dataset_loader.py
@@ -42,11 +42,11 @@ def __init__(
    def _indexesfromsentence(self, sentence):
        return [self.word2index[word] for word in sentence]

-    def tensorfromsentence(self, sentence):
+    def sentence_to_tensor(self, sentence):
        indexes = self._indexesfromsentence(sentence)
-        return torch.tensor(indexes, dtype=torch.long)  # .view(1, -1)
+        return torch.tensor(indexes, dtype=torch.long)

-    def indexes_to_sentence(self, indexes_list):
+    def index_to_sentence(self, indexes_list):
        if torch.is_tensor(indexes_list):
            indexes_list = indexes_list.tolist()
        words = [self.index2word[idx] for idx in indexes_list]
…
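The renamed methods give the tokenizer a matched encode/decode pair: sentence_to_tensor maps a sentence to a LongTensor of indices, and index_to_sentence maps indices back to words. A minimal stand-in sketch, assuming a character-level vocabulary (the repository's actual word2index/index2word mapping differs):

import torch

class TokenHashTable:
    def __init__(self, vocab=" abcdefghijklmnopqrstuvwxyz"):
        # Stand-in vocabulary; the real table is built elsewhere in the repo.
        self.word2index = {ch: i for i, ch in enumerate(vocab)}
        self.index2word = {i: ch for ch, i in self.word2index.items()}

    def _indexesfromsentence(self, sentence):
        return [self.word2index[word] for word in sentence]

    def sentence_to_tensor(self, sentence):
        indexes = self._indexesfromsentence(sentence)
        return torch.tensor(indexes, dtype=torch.long)

    def index_to_sentence(self, indexes_list):
        if torch.is_tensor(indexes_list):
            indexes_list = indexes_list.tolist()
        return [self.index2word[idx] for idx in indexes_list]

table = TokenHashTable()
encoded = table.sentence_to_tensor("this is a test run")
assert "".join(table.index_to_sentence(encoded)) == "this is a test run"

The round trip is what the rename signals: the two method names now mirror each other.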
37 changes: 24 additions & 13 deletions linguify_yb/src/tests/test_data_ingestion.py
@@ -6,6 +6,7 @@
from torch.utils.data import DataLoader
from src.dataset.frames_config import FRAME_LEN
from src.dataset.preprocess import clean_frames_process
+from src.dataset.dataset_loader import TokenHashTable

# TODO test for frames in right shapes, in tensor, frames are normalize
# TODO test for frames dont contain NAN
@@ -18,17 +19,27 @@
[torch.randn(num_frames, 345) for num_frames in [10, 108, 128, 156, 750, 420]],
)
def test_frames_preprocess(frames):
-    clean_frames = clean_frames_process(frames)
+    """Preprocessed frames should be cleaned to the expected fixed shape."""
+    frames = clean_frames_process(frames)
    expected_output_shape = (128, 345)
-    assert expected_output_shape == clean_frames.shape
-
-
-@pytest
-def test_TokenHashTable(tokentable):
-    token_table =
-    sample_sentences = ""
-    sample_sentences_len = len(sample_sentences)
-    sample_sentences_token = [64,]
-    tokenize_result = token_table
-    assert sample_sentences_len == len(tokenize_result)
-    assert sample_sentences_token == tokenize_result
+    assert (
+        expected_output_shape == frames.shape
+    ), f"frames shape should be {expected_output_shape}"
+
+
+def test_token_hash_table():
+    token_table = TokenHashTable()
+    sample_sentence = "this is a test run"
+    sample_sentence_len = len(sample_sentence)
+    sample_sentence_token = [60, 51, 39, 40, 50, 0, 40, 50, 0, 32, 0, 51, 36, 50, 51, 0, 49, 52, 45, 61]
+    # Pad the token sequence to the fixed length of 64
+    sample_sentence_token = sample_sentence_token + (
+        [59] * (64 - len(sample_sentence_token))
+    )
+    sample_sentence_token = torch.tensor(sample_sentence_token)
+    tokenize_result = token_table.sentence_to_tensor(sample_sentence)
+    assert sample_sentence_len == len(tokenize_result)
+    # `==` on tensors is element-wise; torch.equal yields a single bool
+    assert torch.equal(sample_sentence_token, tokenize_result)
+
+    # Assert that tokenize_result is a PyTorch tensor
+    assert torch.is_tensor(tokenize_result), "is not PyTorch tensor"
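One caveat the new test touches on: comparing two tensors with == returns an element-wise boolean tensor, which raises an ambiguity error inside a bare assert. A small sketch of the safer patterns (check_tokens is an illustrative name, not part of the repo):

import torch

def check_tokens(expected, actual):
    # torch.equal returns a single bool: same shape and same values.
    assert torch.equal(expected, actual), "token tensors differ"
    # torch.testing.assert_close raises with a readable report on mismatch;
    # for integer tensors it requires exact equality.
    torch.testing.assert_close(actual, expected)

check_tokens(torch.tensor([1, 2, 3]), torch.tensor([1, 2, 3]))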
4 changes: 0 additions & 4 deletions linguify_yb/src/trainer.py
@@ -135,7 +135,3 @@ def load_checkpoint(model, optimizer, filename):
losses = checkpoint["loss"]
val_losses = checkpoint["val_loss"]
return model


def distributed_stra_gpu():
pass
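For context, load_checkpoint above reads "loss" and "val_loss" back from the saved dict. A minimal sketch of the matching save side, assuming those key names from the diff; "model_state" and "optimizer_state" are assumed, not confirmed by the source:

import torch

def save_checkpoint(model, optimizer, losses, val_losses, filename):
    # Mirrors the keys load_checkpoint reads; state-dict key names are assumed.
    torch.save(
        {
            "model_state": model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
            "loss": losses,
            "val_loss": val_losses,
        },
        filename,
    )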
