Commit

Merge pull request #7 from rileydrizzy/main
rileydrizzy authored Nov 23, 2023
2 parents 5a3866b + 8335ebb commit 0299456
Showing 9 changed files with 365 additions and 144 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -95,6 +95,7 @@ data/*/*
 models/*
 !.gitkeep
 !dataset_paths.json
+!dev_samples.json
 
 # linguify_yb directory
 linguify_yb/data/*/*
@@ -112,7 +113,7 @@ yb2audio/data/*/*
 #yb2audio/development
 
 # Development Enviroment
-#dev
+dev.py
 #development
 dev_env.txt

2 changes: 1 addition & 1 deletion README.md
@@ -52,7 +52,7 @@ $ make setup
 $ source $(poetry env info --path)/bin/activate
 ```
 
-## Planned Task
+## Project Roadmap
 
 Here's a glimpse of the exciting features we plan to implement in the coming weeks:
 
Binary file added images/sign lang.gif
4 changes: 3 additions & 1 deletion linguify_yb/README.md
@@ -4,8 +4,10 @@
 [![Python](https://img.shields.io/badge/python-3.6-blue.svg?style=flat-square)](https://www.python.org/)
 [![PyTorch](https://img.shields.io/badge/PyTorch-1.7.0-orange)](https://pytorch.org/)
 
-![image/gif]()
+![image/gif](https://github.com/rileydrizzy/Cohort8-Ransom-Kuti-Ladipo/blob/main/images/sign%20lang.gif)
 
+## Project description
+
 ***Overview:*** \
 
 # Project Roadmap
Empty file added linguify_yb/__init__.py
76 changes: 76 additions & 0 deletions linguify_yb/data/dataset_paths.json
@@ -0,0 +1,76 @@
{
"train_files": [
"/kaggle/input/asl-fingerspelling/train_landmarks/1552432300.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/5414471.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/568753759.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1643479812.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/2026717426.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1365772051.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/882979387.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1920330615.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1099408314.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/871280215.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1320204318.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/566963657.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/433948159.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/296317215.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1906357076.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/2118949241.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/495378749.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/388576474.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/152029243.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/169560558.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1134756332.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/234418913.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1098899348.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1647220008.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/149822653.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/474255203.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/546816846.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/2072296290.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/933868835.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1865557033.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/128822441.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1021040628.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1358493307.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1405046009.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1997878546.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/649779897.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1726141437.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1133664520.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/2072876091.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/638508439.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1662742697.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/2036580525.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1255240050.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/349393104.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/654436541.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/450474571.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1905462118.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1497621680.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1562234637.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/105143404.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/614661748.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/175396851.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/527708222.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1557244878.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/522550314.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1785039512.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1019715464.parquet"
],
"valid_files": [
"/kaggle/input/asl-fingerspelling/train_landmarks/532011803.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1341528257.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/939623093.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1448136004.parquet"
],
"test_files": [
"/kaggle/input/asl-fingerspelling/train_landmarks/425182931.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1969985709.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1967755728.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1365275733.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1880177496.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/1664666588.parquet",
"/kaggle/input/asl-fingerspelling/train_landmarks/683666742.parquet"
]
}
10 changes: 9 additions & 1 deletion linguify_yb/data/dev_samples.json
@@ -1 +1,9 @@
{"train_files": ["data/asl-fingerspelling/train_landmarks/1019715464.parquet", "data/asl-fingerspelling/train_landmarks/1021040628.parquet"], "valid_files": ["data/asl-fingerspelling/train_landmarks/105143404.parquet"]}
{
"train_files": [
"data/asl-fingerspelling/train_landmarks/1019715464.parquet",
"data/asl-fingerspelling/train_landmarks/1021040628.parquet"
],
"valid_files": [
"data/asl-fingerspelling/train_landmarks/105143404.parquet"
]
}
239 changes: 146 additions & 93 deletions linguify_yb/src/models/baseline_transfomer.py
@@ -3,124 +3,177 @@
 
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 
 
 class TokenEmbedding(nn.Module):
-    def __init__(self, number_vocab=60, max_len=100, embedding_dim=64):
-        super().__init__()
-        self.postional_embedding_layers = nn.Embedding(number_vocab, embedding_dim)
-        self.embedding_layers = nn.Embedding(max_len, embedding_dim)
+    def __init__(self, num_vocab=1000, maxlen=100, num_hid=64):
+        super(TokenEmbedding, self).__init__()
+        self.emb = nn.Embedding(num_vocab, num_hid)
+        self.pos_emb = nn.Embedding(maxlen, num_hid)
 
-    def forward(self, input_x):
-        max_len = input_x.size()[-1]
-        input_x = self.embedding_layers(input_x)
-        # Generate positions using torch.arange
-        positions = torch.arange(0, max_len)
-        positions = self.postional_embedding_layers(positions)
-        return input_x + positions
+    def forward(self, x):
+        maxlen = x.size(-1)
+        x = self.emb(x)
+        positions = torch.arange(0, maxlen).to(x.device)
+        positions = self.pos_emb(positions)
+        return x + positions
 
 
 class LandmarkEmbedding(nn.Module):
-    def __init__(self, input_dim=None, number_hidden=64, max_len=100):
-        super().__init__()
-        self.conv1 = nn.Conv1d(
-            in_channels=input_dim,
-            out_channels=number_hidden,
-            kernel_size=11,
-            padding="same",
-            stride=1,
-        )
-        self.conv2 = nn.Conv1d(
-            in_channels=number_hidden,
-            out_channels=number_hidden,
-            kernel_size=11,
-            padding="same",
-            stride=1,
-        )
-        self.conv3 = nn.Conv1d(
-            in_channels=number_hidden,
-            out_channels=number_hidden,
-            kernel_size=11,
-            padding="same",
-            stride=1,
-        )
-        self.postions_embedding_layers = nn.Embedding(max_len, number_hidden)
-        self.seq_nn = nn.Sequential(
-            self.conv1, nn.ReLU(), self.conv2, nn.ReLU(), self.conv3, nn.ReLU()
-        )
-
-    def forward(self, input_x):
-        outputs = self.seq_nn(input_x)
-        return outputs
+    def __init__(self, num_hid=64, maxlen=100):
+        super(LandmarkEmbedding, self).__init__()
+        self.conv1 = nn.Conv1d(num_hid, 11, stride=2, padding="same")
+        self.conv2 = nn.Conv1d(num_hid, 11, stride=2, padding="same")
+        self.conv3 = nn.Conv1d(num_hid, 11, stride=2, padding="same")
+        self.pos_emb = nn.Embedding(maxlen, num_hid)
+
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = F.relu(self.conv2(x))
+        return F.relu(self.conv3(x))
+
+
+class TransformerEncoder(nn.Module):
+    def __init__(self, embed_dim, num_heads, feed_forward_dim, rate=0.1):
+        super(TransformerEncoder, self).__init__()
+        self.att = nn.MultiheadAttention(embed_dim, num_heads)
+        self.ffn = nn.Sequential(
+            nn.Linear(embed_dim, feed_forward_dim),
+            nn.ReLU(),
+            nn.Linear(feed_forward_dim, embed_dim),
+        )
+
+        self.layernorm1 = nn.LayerNorm(embed_dim, eps=1e-6)
+        self.layernorm2 = nn.LayerNorm(embed_dim, eps=1e-6)
+        self.dropout1 = nn.Dropout(rate)
+        self.dropout2 = nn.Dropout(rate)
+
+    def forward(self, inputs, training):
+        attn_out, _ = self.att(inputs, inputs, inputs)
+        attn_out = self.dropout1(attn_out)
+        out1 = self.layernorm1(inputs + attn_out)
+
+        ffn_out = self.ffn(out1)
+        ffn_out = self.dropout2(ffn_out)
+        return self.layernorm2(out1 + ffn_out)
+
+
+class TransformerDecoder(nn.Module):
+    def __init__(self, embed_dim, num_heads, feed_forward_dim, dropout_rate=0.1):
+        super(TransformerDecoder, self).__init__()
+        self.layernorm1 = nn.LayerNorm(embed_dim, eps=1e-6)
+        self.layernorm2 = nn.LayerNorm(embed_dim, eps=1e-6)
+        self.layernorm3 = nn.LayerNorm(embed_dim, eps=1e-6)
+        self.self_att = nn.MultiheadAttention(embed_dim, num_heads)
+        self.enc_att = nn.MultiheadAttention(embed_dim, num_heads)
+        self.self_dropout = nn.Dropout(0.5)
+        self.enc_dropout = nn.Dropout(0.1)
+        self.ffn_dropout = nn.Dropout(0.1)
+        self.ffn = nn.Sequential(
+            nn.Linear(embed_dim, feed_forward_dim),
+            nn.ReLU(),
+            nn.Linear(feed_forward_dim, embed_dim),
+        )
+
+    def causal_attention_mask(self, batch_size, n_dest, n_src, dtype):
+        i = torch.arange(n_dest)[:, None]
+        j = torch.arange(n_src)
+        m = i >= j - n_src + n_dest
+        mask = m.to(dtype)
+        mask = mask.view(1, n_dest, n_src)
+        mult = torch.cat(
+            [batch_size[..., None], torch.tensor([1, 1], dtype=torch.int32)], 0
+        )
+        return mask.repeat(mult)
+
+    def forward(self, enc_out, target, training):
+        input_shape = target.size()
+        batch_size = input_shape[0]
+        seq_len = input_shape[1]
+        causal_mask = self.causal_attention_mask(
+            batch_size, seq_len, seq_len, torch.bool
+        )
+
+        target_att = self.self_att(target, target, target, attn_mask=causal_mask)
+        target_norm = self.layernorm1(target + self.self_dropout(target_att))
+
+        enc_out = self.enc_att(target_norm, enc_out, enc_out)
+        enc_out_norm = self.layernorm2(enc_out + self.enc_dropout(enc_out))
+
+        ffn_out = self.ffn(enc_out_norm)
+        ffn_out_norm = self.layernorm3(enc_out_norm + self.ffn_dropout(ffn_out))
+
+        return ffn_out_norm
 
 
 class Transformer(nn.Module):
     def __init__(
         self,
-        input_dim,
-        output_dim,
+        num_hid=64,
+        num_head=2,
+        num_feed_forward=128,
         source_maxlen=100,
         target_maxlen=100,
-        no_multi_heads=6,
+        num_layers_enc=4,
+        num_layers_dec=1,
+        num_classes=60,
     ):
-        super().__init__()
-        num_encoder_layers = num_decoder_layers = 6
-        encoder_forward_dim = 100
-        # Define encoder and decoder layers
-        self.encoder_layer = nn.TransformerEncoderLayer(
-            d_model=input_dim,
-            nhead=no_multi_heads,
-            dim_feedforward=encoder_forward_dim,
-            activation="relu",
-        )
-        self.decoder_layer = nn.TransformerDecoderLayer(
-            d_model=input_dim,
-            nhead=no_multi_heads,
-            dim_feedforward=output_dim,
-            activation="relu",
-        )
-
-        # Define encoder and decoder
-        self.transformer_encoder = nn.TransformerEncoder(
-            self.encoder_layer, num_layers=num_encoder_layers
-        )
-        self.transformer_decoder = nn.TransformerDecoder(
-            self.decoder_layer, num_layers=num_decoder_layers
-        )
-
-        # Input and output linear layers
-        self.input_linear = LandmarkEmbedding(
-            input_dim=input_dim, max_len=source_maxlen
-        )
-        self.target_linear = TokenEmbedding(max_len=target_maxlen)
-        self.num_classes = 60
-        self.output_linear = nn.Linear(output_dim, self.num_classes)
-
-    def forward(self, input_x, input_y):
-        # Apply EMbedding
-        input_x = self.input_linear(input_x)
-
-        # Transformer encoding
-        memory = self.transformer_encoder(input_x)
-
-        # Apply linear layer to the target
-        input_y = self.target_linear(input_y)
-
-        # Transformer decoding
-        output = self.transformer_decoder(input_y, memory)
-
-        # Apply linear layer to the output
-        output = self.output_linear(output)
-
-        return output
-
-    # TODO code generate for inference
-    def generate(
-        self,
-    ):
-        pass
+        super(Transformer, self).__init__()
+        self.num_layers_enc = num_layers_enc
+        self.num_layers_dec = num_layers_dec
+        self.target_maxlen = target_maxlen
+        self.num_classes = num_classes
+
+        self.enc_input = LandmarkEmbedding(num_hid=num_hid, maxlen=source_maxlen)
+        self.dec_input = TokenEmbedding(
+            num_vocab=num_classes, maxlen=target_maxlen, num_hid=num_hid
+        )
+
+        self.encoder = nn.Sequential(
+            self.enc_input,
+            *[
+                TransformerEncoder(num_hid, num_head, num_feed_forward)
+                for _ in range(num_layers_enc)
+            ],
+        )
+
+        for i in range(num_layers_dec):
+            self.add_module(
+                f"dec_layer_{i}",
+                TransformerDecoder(num_hid, num_head, num_feed_forward),
+            )
+
+        self.classifier = nn.Linear(num_hid, num_classes)
+
+    def decode(self, enc_out, target, training):
+        y = self.dec_input(target)
+        for i in range(self.num_layers_dec):
+            y = getattr(self, f"dec_layer_{i}")(enc_out, y, training)
+        return y
+
+    def forward(self, inputs, training):
+        source, target = inputs
+        x = self.encoder(source)
+        y = self.decode(x, target, training)
+        return self.classifier(y)
+
+    def generate(self, source, target_start_token_idx):
+        bs = source.size(0)
+        enc = self.encoder(source)
+        dec_input = (
+            torch.ones((bs, 1), dtype=torch.long).to(source.device)
+            * target_start_token_idx
+        )
+
+        dec_logits = []
+        for i in range(self.target_maxlen - 1):
+            dec_out = self.decode(enc, dec_input, training=False)
+            logits = self.classifier(dec_out)
+            logits = torch.argmax(logits, dim=-1, keepdim=True)
+            last_logit = logits[:, -1]
+            dec_logits.append(last_logit)
+            dec_input = torch.cat([dec_input, last_logit], dim=-1)
+        return dec_input
 
 
 def build_model():
     pass
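A note on the new `LandmarkEmbedding`: PyTorch's `nn.Conv1d` signature is `Conv1d(in_channels, out_channels, kernel_size, ...)`, so `nn.Conv1d(num_hid, 11, stride=2, padding="same")` passes 11 as `out_channels` and omits the required `kernel_size`; PyTorch also rejects `padding="same"` when `stride > 1`. A corrected sketch, assuming the port intends Keras-style layers `Conv1D(num_hid, 11, strides=2, padding="same")` and that `input_dim` (a hypothetical parameter, not in the commit) is the number of landmark feature channels:

```python
import torch.nn as nn
import torch.nn.functional as F

class LandmarkEmbeddingSketch(nn.Module):
    """Hedged sketch only; not the committed implementation."""

    def __init__(self, input_dim, num_hid=64):
        super().__init__()
        # kernel_size=11; padding=5 reproduces "same" output length for stride=2
        self.conv1 = nn.Conv1d(input_dim, num_hid, kernel_size=11, stride=2, padding=5)
        self.conv2 = nn.Conv1d(num_hid, num_hid, kernel_size=11, stride=2, padding=5)
        self.conv3 = nn.Conv1d(num_hid, num_hid, kernel_size=11, stride=2, padding=5)

    def forward(self, x):
        # x: (batch, channels, time), the layout nn.Conv1d expects
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        return F.relu(self.conv3(x))
```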
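Two further caveats on the committed decoder, illustrated in the sketch below. `nn.MultiheadAttention` returns an `(attn_output, attn_weights)` tuple, but `TransformerDecoder.forward` feeds the whole tuple into the residual add; and the `training` flag threaded through `forward` cannot be supplied by `nn.Sequential`, since PyTorch modules toggle behavior with `train()`/`eval()` instead. A minimal sketch of the self-attention step, assuming `batch_first` inputs and PyTorch's mask convention (`True` marks blocked positions):

```python
import torch
import torch.nn as nn

embed_dim, num_heads, seq_len = 64, 2, 20
self_att = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
layernorm = nn.LayerNorm(embed_dim, eps=1e-6)
dropout = nn.Dropout(0.5)

target = torch.randn(8, seq_len, embed_dim)  # (batch, seq_len, embed_dim)
# Boolean causal mask: True above the diagonal blocks attention to future steps.
causal_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)

attn_out, _ = self_att(target, target, target, attn_mask=causal_mask)  # unpack the tuple
target_norm = layernorm(target + dropout(attn_out))
```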
