
Commit c798bc4

Refactor: expand datasets (#120)
* bash download data
* update CI
* sample
* overwrite
* kaggle
* unzip
* readme
* dry

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 3e78c81 commit c798bc4

12 files changed: +88 -50 lines

.actions/assistant.py (+35 -7)

@@ -5,7 +5,7 @@
 from datetime import datetime
 from shutil import copyfile
 from textwrap import wrap
-from typing import Any, Dict, Optional, Sequence, Tuple
+from typing import Any, Dict, List, Optional, Sequence, Tuple
 from warnings import warn

 import fire
@@ -131,6 +131,7 @@ def get_running_torch_version():
 class AssistantCLI:

     DEVICE_ACCELERATOR = os.environ.get("ACCELERATOR", "cpu").lower()
+    DATASET_FOLDER = os.environ.get("PATH_DATASETS", "_datasets").lower()
     DRY_RUN = bool(int(os.environ.get("DRY_RUN", 0)))
     _META_REQUIRED_FIELDS = ("title", "author", "license", "description")
     _SKIP_DIRS = (
@@ -154,6 +155,9 @@ class AssistantCLI:
         "flash_tutorials": "Kaggle",
     }
     _BASH_SCRIPT_BASE = ("#!/bin/bash", "set -e", "")
+    _EXT_ARCHIVE_ZIP = (".zip",)
+    _EXT_ARCHIVE_TAR = (".tar", ".gz")
+    _EXT_ARCHIVE = _EXT_ARCHIVE_ZIP + _EXT_ARCHIVE_TAR

     @staticmethod
     def _find_meta(folder: str) -> str:
@@ -231,10 +235,34 @@ def _parse_requirements(folder: str) -> Tuple[str, str]:

         return " ".join(req), " ".join(pip_args)

+    @staticmethod
+    def _bash_download_data(folder: str) -> List[str]:
+        cmd = ["HERE=$PWD", f"cd {AssistantCLI.DATASET_FOLDER}"]
+        meta = AssistantCLI._load_meta(folder)
+        datasets = meta.get("datasets", {})
+        data_kaggle = datasets.get("kaggle", [])
+        cmd += [f"python -m kaggle competitions download -c {name}" for name in data_kaggle]
+        files = [f"{name}.zip" for name in data_kaggle]
+        data_web = datasets.get("web", [])
+        cmd += [f"wget {web} --progress=bar:force:noscroll --tries=3" for web in data_web]
+        files += [os.path.basename(web) for web in data_web]
+        for fn in files:
+            name, ext = os.path.splitext(fn)
+            if ext not in AssistantCLI._EXT_ARCHIVE:
+                continue
+            if ext in AssistantCLI._EXT_ARCHIVE_ZIP:
+                cmd += [f"mkdir -p {name}", f"unzip -o {fn} -d {name}"]
+            else:
+                cmd += [f"tar -zxvf {fn} --overwrite"]
+            cmd += [f"rm {fn}"]
+        cmd += ["ls -l", "cd $HERE"]
+        return cmd
+
     @staticmethod
     def bash_render(folder: str) -> str:
-        print(f"Rendering: {folder}\n")
-        cmd = list(AssistantCLI._BASH_SCRIPT_BASE)
+        cmd = list(AssistantCLI._BASH_SCRIPT_BASE) + [f"# Rendering: {folder}"]
+        if not AssistantCLI.DRY_RUN:
+            cmd += AssistantCLI._bash_download_data(folder)
         ipynb_file, meta_file, thumb_file = AssistantCLI._valid_folder(folder, ext=".ipynb")
         pub_ipynb = os.path.join(DIR_NOTEBOOKS, f"{folder}.ipynb")
         pub_dir = os.path.dirname(pub_ipynb)
@@ -248,7 +276,7 @@ def bash_render(folder: str) -> str:
             # dry run does not execute the notebooks just takes them as they are
             cmd.append(f"cp {ipynb_file} {pub_ipynb}")
         else:
-            print(f"available: {AssistantCLI.DEVICE_ACCELERATOR}\n")
+            cmd.append(f"# available: {AssistantCLI.DEVICE_ACCELERATOR}\n")
             if AssistantCLI._valid_accelerator(folder):
                 cmd.append(f"python -m papermill.cli {ipynb_file} {pub_ipynb} --kernel python")
             else:
@@ -267,8 +295,8 @@ def bash_render(folder: str) -> str:

     @staticmethod
     def bash_test(folder: str) -> str:
-        print(f"Testing: {folder}\n")
-        cmd = list(AssistantCLI._BASH_SCRIPT_BASE)
+        cmd = list(AssistantCLI._BASH_SCRIPT_BASE) + [f"# Testing: {folder}"]
+        cmd += AssistantCLI._bash_download_data(folder)
         ipynb_file, _, _ = AssistantCLI._valid_folder(folder, ext=".ipynb")

         # prepare isolated environment with inheriting the global packages
@@ -281,7 +309,7 @@ def bash_test(folder: str) -> str:
         pip_req, pip_args = AssistantCLI._parse_requirements(folder)
         cmd += [f"pip install {pip_req} {pip_args}", "pip list"]

-        print(f"available: {AssistantCLI.DEVICE_ACCELERATOR}\n")
+        cmd.append(f"# available: {AssistantCLI.DEVICE_ACCELERATOR}")
         if AssistantCLI._valid_accelerator(folder):
            cmd.append(f"python -m pytest {ipynb_file} -v --nbval")
         else:
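For orientation, this is roughly the download preamble that `bash_render` (via `_bash_download_data`) would emit for the `sample-template` meta below, which lists the Kaggle `titanic` competition plus the CIFAR-10 web archive, assuming the default `_datasets` folder; it is a hand-written sketch, not captured CI output:

```bash
#!/bin/bash
set -e
# Rendering: sample-template
HERE=$PWD
cd _datasets
# Kaggle competitions are fetched via the CLI and arrive as <name>.zip
python -m kaggle competitions download -c titanic
# plain web sources are fetched with wget
wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz --progress=bar:force:noscroll --tries=3
# zip archives are unpacked into a folder named after the file, then removed
mkdir -p titanic
unzip -o titanic.zip -d titanic
rm titanic.zip
# tar archives are unpacked in place, then removed
tar -zxvf cifar-10-python.tar.gz --overwrite
rm cifar-10-python.tar.gz
ls -l
cd $HERE
```

Generating the download steps per example (instead of the deleted shared `data-download.sh`) means each notebook only fetches the datasets its own `.meta.yml` declares.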

.actions/data-download.sh (-14)

This file was deleted.

.azure-pipelines/ipynb-publish.yml (+5 -4)

@@ -57,6 +57,7 @@ jobs:
   - bash: |
       pip --version
       pip install --requirement requirements.txt
+      pip install --requirement requirements/data.txt
       pip list
     displayName: 'Install dependencies'

@@ -84,10 +85,6 @@ jobs:
       echo "##vso[task.setvariable variable=folders;isOutput=true]$COUNT"
     name: dropped

-  - script: |
-      bash .actions/data-download.sh $(PATH_DATASETS)
-    displayName: 'Pull datasets'
-
   - bash: |
       while IFS= read -r line; do
         python .actions/assistant.py augment-script $line
@@ -96,6 +93,7 @@ jobs:
     displayName: 'Generate notebook'

   - bash: |
+      mkdir $(PATH_DATASETS)
       # render the actual notebooks
       while IFS= read -r line; do
         python .actions/assistant.py bash-render $line > .actions/_ipynb-render.sh
@@ -105,6 +103,9 @@
       git status
       git commit -m "publish [GPU]: $(COMMIT_HASH)"
     condition: and(succeeded(), gt(variables['changed.folders'], 0))
+    env:
+      KAGGLE_USERNAME: $(KAGGLE_USERNAME)
+      KAGGLE_KEY: $(KAGGLE_KEY)
     displayName: 'Render notebook'

   - bash: |

.azure-pipelines/ipynb-tests.yml (+5 -11)

@@ -37,6 +37,7 @@ jobs:
   - bash: |
       pip --version
       pip install --requirement requirements.txt
+      pip install --requirement requirements/data.txt
       pip list
     displayName: 'Install dependencies'

@@ -45,6 +46,7 @@ jobs:
     displayName: 'Sanity check'

   - bash: |
+      mkdir $(PATH_DATASETS)
       head=$(git rev-parse origin/main)
       printf "Head: $head\n"
       git diff --name-only $head --output=target-diff.txt
@@ -59,17 +61,6 @@ jobs:
       echo "##vso[task.setvariable variable=folders;isOutput=true]$COUNT"
     name: changed

-  - task: Cache@2
-    inputs:
-      key: data | .actions/data-download.sh
-      restoreKeys: data
-      path: $(PATH_DATASETS)
-      cacheHitVar: DATA_RESTORED
-
-  - script: bash .actions/data-download.sh $(PATH_DATASETS)
-    condition: ne(variables.DATA_RESTORED, 'true')
-    displayName: 'Pull datasets'
-
   - bash: |
       while IFS= read -r line; do
         python .actions/assistant.py augment-script $line
@@ -84,4 +75,7 @@ jobs:
         bash .actions/_ipynb-test.sh
       done <<< $(cat changed-folders.txt)
     condition: and(succeeded(), gt(variables['changed.folders'], 0))
+    env:
+      KAGGLE_USERNAME: $(KAGGLE_USERNAME)
+      KAGGLE_KEY: $(KAGGLE_KEY)
     displayName: 'PyTest notebook'

.github/workflows/ci_docs.yml (+2)

@@ -8,6 +8,7 @@ jobs:
     runs-on: ubuntu-20.04
     env:
       PUB_BRANCH: publication
+      PATH_DATASETS: ${{ github.workspace }}/.datasets
     steps:
       - name: Checkout 🛎️
         uses: actions/checkout@v2
@@ -38,6 +39,7 @@ jobs:

       - name: Process folders
         run: |
+          mkdir ${PATH_DATASETS}
           head=$(git rev-parse origin/"${{ github.base_ref }}")
           git diff --name-only $head --output=master-diff.txt
           python .actions/assistant.py group-folders master-diff.txt

.github/workflows/ci_testing.yml (+5 -12)

@@ -38,23 +38,13 @@ jobs:
         run: |
           pip --version
           pip install --requirement requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html
+          pip install --requirement requirements/data.txt
           pip list
         shell: bash

-      - name: Cache Data
-        id: cache-data
-        uses: actions/cache@v2
-        with:
-          path: $PATH_DATASETS
-          key: ${{ runner.os }}-datasets-${{ hashFiles('.actions/data-download.sh') }}
-          restore-keys: ${{ runner.os }}-datasets-
-
-      - name: Download Data
-        if: steps.cache-data.outputs.cache-hit != 'true'
-        run: bash .actions/data-download.sh $PATH_DATASETS
-
       - name: Process folders
         run: |
+          mkdir ${PATH_DATASETS}
           head=$(git rev-parse origin/${{ github.base_ref }})
           git diff --name-only $head --output=target-diff.txt
           python .actions/assistant.py group-folders target-diff.txt
@@ -76,6 +66,9 @@

       - name: PyTest notebook
         if: success() && steps.changed.outputs.dirs != 0
+        env:
+          KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }}
+          KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }}
         run: |
           while IFS= read -r line; do
             python .actions/assistant.py bash-test $line > .actions/_ipynb-test.sh

.github/workflows/docs-deploy.yml (+1)

@@ -30,6 +30,7 @@ jobs:

       - name: Install dependencies
         run: |
+          mkdir ${PATH_DATASETS}
           # install Texlive, see https://linuxconfig.org/how-to-install-latex-on-ubuntu-20-04-focal-fossa-linux
           sudo apt-get update
           sudo apt-get install -y cmake pandoc

README.md (+26)

@@ -46,6 +46,32 @@ The addition has to formed as new folder
 - CPU
 ```

+### Using datasets
+
+It is quite common to use a public or competition dataset for your example.
+We facilitate this by defining the data sources in the metafile.
+There are two basic options: download a file from the web or pull a Kaggle dataset:
+
+```yaml
+datasets:
+  web:
+    - https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
+  kaggle:
+    - titanic
+```
+
+In both cases, the downloaded archive (a Kaggle dataset arrives as a zip file) is extracted into the default dataset folder, under a sub-folder named after the downloaded file.
+To get the path to this dataset folder, use the environment variable `PATH_DATASETS`, so in your script use:
+
+```py
+import os
+
+data_path = os.environ.get("PATH_DATASETS", "_datasets")
+path_titanic = os.path.join(data_path, "titanic")
+```
+
+**Warning:** some Kaggle datasets can be quite large, and since they are downloaded and then extracted, the runner needs roughly double the free space. For this reason, the CPU runner is limited to datasets of about 3GB.
+
 ### Suggestions

 - For inserting images into text cells use MarkDown formatting, so we can insert inline images to the notebooks directly and drop eventual dependency on internet connection -> generated notebooks could be better shared offline
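The Kaggle route needs credentials; the CI jobs in this commit pass them as the `KAGGLE_USERNAME` and `KAGGLE_KEY` environment variables from pipeline secrets. A minimal local sketch, assuming you have a Kaggle account and the pinned `kaggle` package from `requirements/data.txt` installed:

```bash
# hypothetical local setup; in CI these values come from repository secrets
export KAGGLE_USERNAME="your-kaggle-username"
export KAGGLE_KEY="your-kaggle-api-token"
# the same invocation the assistant emits for a `kaggle:` entry in .meta.yml
python -m kaggle competitions download -c titanic
```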

requirements/data.txt (+2)

@@ -0,0 +1,2 @@
+# fixed version to be able to call it as `python -m kaggle`
+https://github.com/Borda/kaggle-api/archive/refs/heads/setup/python-m.zip
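As a quick sanity check that the pinned fork really makes the module-style invocation used by `assistant.py` available, one can install the data requirements the same way the CI jobs do; the `--help` call below is an assumed smoke test, not part of the commit:

```bash
pip install --requirement requirements/data.txt
# verify the CLI is callable as a module, which the comment above says the stock release is not
python -m kaggle --help
```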

requirements/default.txt (+1)

@@ -1,3 +1,4 @@
+ipython[notebook]
 pytorch-lightning>=1.3
 torchmetrics>=0.3
 torch>=1.6, <1.9

requirements/devel.txt (-1)

@@ -1,5 +1,4 @@
 virtualenv
-ipython[notebook]
 jupytext # converting
 pytest>=6.0
 nbval # testing

sample-template/.meta.yml (+6 -1)

@@ -11,4 +11,9 @@ requirements:
 accelerator:
   - CPU
   - GPU
-  - TPU
+datasets:
+  web:
+    # links starting with http are downloaded
+    - https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
+  kaggle:
+    - titanic
