Skip to content

PyTorch: basic dataset wrapper #1126

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Sep 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ jobs:
pip install --user -r requirements.txt | cat
pip install --user --upgrade tensorflow==${{ matrix.tf-version }} | cat
pip install --user theano==0.9 | cat
pip install --user torch==1.12 | cat

- name: Test Python/Numpy/TF/Theano versions.
run: |
Expand Down
35 changes: 35 additions & 0 deletions returnn/torch/dataset_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""
Code to create PyTorch datasets that can be used with the PyTorch DataLoader.
"""

from torch.utils.data import IterableDataset


# noinspection PyAbstractClass
class DatasetWrapper(IterableDataset):
    """
    Exposes a RETURNN dataset through the PyTorch IterableDataset interface,
    so that it can be handed to a PyTorch DataLoader.
    """

    def __init__(self, returnn_dataset):
        """
        :param returnn.datasets.basic.Dataset returnn_dataset: the RETURNN dataset to iterate over
        """
        self._dataset = returnn_dataset

    def __iter__(self):
        """
        Walks over the wrapped dataset one sequence at a time, starting at sequence index 0
        and stopping as soon as the dataset reports that the index is out of range.

        :return: generator yielding, per sequence, a dict data_key -> data
        :rtype: Iterable[dict[str, numpy.ndarray]]
        """
        dataset = self._dataset
        # The key list is queried once per iteration pass, not once per sequence.
        keys = dataset.get_data_keys()

        idx = 0
        while dataset.is_less_than_num_seqs(idx):
            # Request exactly the current sequence (half-open range) before reading it.
            dataset.load_seqs(idx, idx + 1)

            sample = {}
            for key in keys:
                sample[key] = dataset.get_data(idx, key)
            yield sample

            idx += 1
26 changes: 25 additions & 1 deletion returnn/torch/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@
Main engine for PyTorch
"""

from torch.utils.data import DataLoader

from returnn.log import log
from returnn.engine.base import EngineBase
from returnn.datasets.basic import init_dataset
from returnn.torch.dataset_wrapper import DatasetWrapper


class Engine(EngineBase):
Expand Down Expand Up @@ -42,4 +46,24 @@ def train(self):
"""
Main training loop.
"""
pass
# Epoch range for this run: the first epoch to train and the last one
# (inclusive, see the `<=` in the loop condition below), both derived from the config.
start_epoch, _ = self.get_train_start_epoch_batch(self.config)
final_epoch = self.config_get_final_epoch(self.config)

print("Starting training at epoch {}.".format(start_epoch), file=log.v3)

# One pass of the loop per epoch; self.epoch holds the epoch currently being processed.
self.epoch = start_epoch
while self.epoch <= final_epoch:
print("Starting " + self.get_epoch_str() + "...", file=log.v4)

# Let the training dataset set up its sequence order for the current epoch.
self.train_dataset.init_seq_order(epoch=self.epoch)

# A fresh wrapper and loader are created each epoch around the same RETURNN dataset.
train_data = DatasetWrapper(self.train_dataset)

data_loader = DataLoader(train_data, batch_size=1) # TODO: implement batching

# Placeholder loop: each item is only checked for truthiness, no model step is run yet.
for data in data_loader:
assert data # TODO: only iterates through dataset so far

self.epoch += 1

# NOTE(review): if the increment above is inside the while loop (presumably, otherwise
# the loop could not terminate), self.epoch is final_epoch + 1 here whenever at least
# one epoch was run, so this message reports one past the last trained epoch - confirm
# whether that is intended.
print("Finished training at epoch {}.".format(self.epoch), file=log.v3)
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
<item index="12" class="java.lang.String" itemvalue="six" />
<item index="13" class="java.lang.String" itemvalue="mpi4py" />
<item index="14" class="java.lang.String" itemvalue="sentencepiece" />
<item index="15" class="java.lang.String" itemvalue="torch" />
</list>
</value>
</option>
Expand Down