
Commit c798bc4

Refactor: expand datasets (#120)
* bash download data
* update CI
* sample
* overwrite
* kaggle
* unzip
* readme
* dry

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 3e78c81 commit c798bc4

12 files changed: +88 -50 lines

.actions/assistant.py (+35 -7)

@@ -5,7 +5,7 @@
 from datetime import datetime
 from shutil import copyfile
 from textwrap import wrap
-from typing import Any, Dict, Optional, Sequence, Tuple
+from typing import Any, Dict, List, Optional, Sequence, Tuple
 from warnings import warn

 import fire
@@ -131,6 +131,7 @@ def get_running_torch_version():
 class AssistantCLI:

     DEVICE_ACCELERATOR = os.environ.get("ACCELERATOR", "cpu").lower()
+    DATASET_FOLDER = os.environ.get("PATH_DATASETS", "_datasets").lower()
     DRY_RUN = bool(int(os.environ.get("DRY_RUN", 0)))
     _META_REQUIRED_FIELDS = ("title", "author", "license", "description")
     _SKIP_DIRS = (
@@ -154,6 +155,9 @@ class AssistantCLI:
         "flash_tutorials": "Kaggle",
     }
     _BASH_SCRIPT_BASE = ("#!/bin/bash", "set -e", "")
+    _EXT_ARCHIVE_ZIP = (".zip",)
+    _EXT_ARCHIVE_TAR = (".tar", ".gz")
+    _EXT_ARCHIVE = _EXT_ARCHIVE_ZIP + _EXT_ARCHIVE_TAR

     @staticmethod
     def _find_meta(folder: str) -> str:
@@ -231,10 +235,34 @@ def _parse_requirements(folder: str) -> Tuple[str, str]:

         return " ".join(req), " ".join(pip_args)

+    @staticmethod
+    def _bash_download_data(folder: str) -> List[str]:
+        cmd = ["HERE=$PWD", f"cd {AssistantCLI.DATASET_FOLDER}"]
+        meta = AssistantCLI._load_meta(folder)
+        datasets = meta.get("datasets", {})
+        data_kaggle = datasets.get("kaggle", [])
+        cmd += [f"python -m kaggle competitions download -c {name}" for name in data_kaggle]
+        files = [f"{name}.zip" for name in data_kaggle]
+        data_web = datasets.get("web", [])
+        cmd += [f"wget {web} --progress=bar:force:noscroll --tries=3" for web in data_web]
+        files += [os.path.basename(web) for web in data_web]
+        for fn in files:
+            name, ext = os.path.splitext(fn)
+            if ext not in AssistantCLI._EXT_ARCHIVE:
+                continue
+            if ext in AssistantCLI._EXT_ARCHIVE_ZIP:
+                cmd += [f"mkdir -p {name}", f"unzip -o {fn} -d {name}"]
+            else:
+                cmd += [f"tar -zxvf {fn} --overwrite"]
+            cmd += [f"rm {fn}"]
+        cmd += ["ls -l", "cd $HERE"]
+        return cmd
+
     @staticmethod
     def bash_render(folder: str) -> str:
-        print(f"Rendering: {folder}\n")
-        cmd = list(AssistantCLI._BASH_SCRIPT_BASE)
+        cmd = list(AssistantCLI._BASH_SCRIPT_BASE) + [f"# Rendering: {folder}"]
+        if not AssistantCLI.DRY_RUN:
+            cmd += AssistantCLI._bash_download_data(folder)
         ipynb_file, meta_file, thumb_file = AssistantCLI._valid_folder(folder, ext=".ipynb")
         pub_ipynb = os.path.join(DIR_NOTEBOOKS, f"{folder}.ipynb")
         pub_dir = os.path.dirname(pub_ipynb)
@@ -248,7 +276,7 @@ def bash_render(folder: str) -> str:
             # dry run does not execute the notebooks just takes them as they are
             cmd.append(f"cp {ipynb_file} {pub_ipynb}")
         else:
-            print(f"available: {AssistantCLI.DEVICE_ACCELERATOR}\n")
+            cmd.append(f"# available: {AssistantCLI.DEVICE_ACCELERATOR}\n")
             if AssistantCLI._valid_accelerator(folder):
                 cmd.append(f"python -m papermill.cli {ipynb_file} {pub_ipynb} --kernel python")
             else:
@@ -267,8 +295,8 @@ def bash_render(folder: str) -> str:

     @staticmethod
     def bash_test(folder: str) -> str:
-        print(f"Testing: {folder}\n")
-        cmd = list(AssistantCLI._BASH_SCRIPT_BASE)
+        cmd = list(AssistantCLI._BASH_SCRIPT_BASE) + [f"# Testing: {folder}"]
+        cmd += AssistantCLI._bash_download_data(folder)
         ipynb_file, _, _ = AssistantCLI._valid_folder(folder, ext=".ipynb")

         # prepare isolated environment with inheriting the global packages
@@ -281,7 +309,7 @@ def bash_test(folder: str) -> str:
         pip_req, pip_args = AssistantCLI._parse_requirements(folder)
         cmd += [f"pip install {pip_req} {pip_args}", "pip list"]

-        print(f"available: {AssistantCLI.DEVICE_ACCELERATOR}\n")
+        cmd.append(f"# available: {AssistantCLI.DEVICE_ACCELERATOR}")
         if AssistantCLI._valid_accelerator(folder):
            cmd.append(f"python -m pytest {ipynb_file} -v --nbval")
         else:
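For orientation, this is roughly the download preamble that `bash_render` (via `_bash_download_data`) would emit for the `sample-template` meta below, which lists the Kaggle `titanic` competition plus the CIFAR-10 web archive, assuming the default `_datasets` folder; it is a hand-written sketch, not captured CI output:

```bash
#!/bin/bash
set -e
# Rendering: sample-template
HERE=$PWD
cd _datasets
# Kaggle competitions are fetched via the CLI and arrive as <name>.zip
python -m kaggle competitions download -c titanic
# plain web sources are fetched with wget
wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz --progress=bar:force:noscroll --tries=3
# zip archives are unpacked into a folder named after the file, then removed
mkdir -p titanic
unzip -o titanic.zip -d titanic
rm titanic.zip
# tar archives are unpacked in place, then removed
tar -zxvf cifar-10-python.tar.gz --overwrite
rm cifar-10-python.tar.gz
ls -l
cd $HERE
```

Generating the download steps per example (instead of the deleted shared `data-download.sh`) means each notebook only fetches the datasets its own `.meta.yml` declares.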

.actions/data-download.sh (-14)

This file was deleted.

.azure-pipelines/ipynb-publish.yml (+5 -4)

@@ -57,6 +57,7 @@ jobs:
   - bash: |
       pip --version
       pip install --requirement requirements.txt
+      pip install --requirement requirements/data.txt
       pip list
     displayName: 'Install dependencies'

@@ -84,10 +85,6 @@ jobs:
       echo "##vso[task.setvariable variable=folders;isOutput=true]$COUNT"
     name: dropped

-  - script: |
-      bash .actions/data-download.sh $(PATH_DATASETS)
-    displayName: 'Pull datasets'
-
   - bash: |
       while IFS= read -r line; do
         python .actions/assistant.py augment-script $line
@@ -96,6 +93,7 @@ jobs:
     displayName: 'Generate notebook'

   - bash: |
+      mkdir $(PATH_DATASETS)
       # render the actual notebooks
       while IFS= read -r line; do
         python .actions/assistant.py bash-render $line > .actions/_ipynb-render.sh
@@ -105,6 +103,9 @@
       git status
       git commit -m "publish [GPU]: $(COMMIT_HASH)"
     condition: and(succeeded(), gt(variables['changed.folders'], 0))
+    env:
+      KAGGLE_USERNAME: $(KAGGLE_USERNAME)
+      KAGGLE_KEY: $(KAGGLE_KEY)
     displayName: 'Render notebook'

   - bash: |

.azure-pipelines/ipynb-tests.yml (+5 -11)

@@ -37,6 +37,7 @@ jobs:
   - bash: |
       pip --version
       pip install --requirement requirements.txt
+      pip install --requirement requirements/data.txt
       pip list
     displayName: 'Install dependencies'

@@ -45,6 +46,7 @@ jobs:
     displayName: 'Sanity check'

   - bash: |
+      mkdir $(PATH_DATASETS)
       head=$(git rev-parse origin/main)
       printf "Head: $head\n"
       git diff --name-only $head --output=target-diff.txt
@@ -59,17 +61,6 @@ jobs:
       echo "##vso[task.setvariable variable=folders;isOutput=true]$COUNT"
     name: changed

-  - task: Cache@2
-    inputs:
-      key: data | .actions/data-download.sh
-      restoreKeys: data
-      path: $(PATH_DATASETS)
-      cacheHitVar: DATA_RESTORED
-
-  - script: bash .actions/data-download.sh $(PATH_DATASETS)
-    condition: ne(variables.DATA_RESTORED, 'true')
-    displayName: 'Pull datasets'
-
   - bash: |
       while IFS= read -r line; do
         python .actions/assistant.py augment-script $line
@@ -84,4 +75,7 @@ jobs:
         bash .actions/_ipynb-test.sh
       done <<< $(cat changed-folders.txt)
     condition: and(succeeded(), gt(variables['changed.folders'], 0))
+    env:
+      KAGGLE_USERNAME: $(KAGGLE_USERNAME)
+      KAGGLE_KEY: $(KAGGLE_KEY)
     displayName: 'PyTest notebook'

.github/workflows/ci_docs.yml (+2)

@@ -8,6 +8,7 @@ jobs:
     runs-on: ubuntu-20.04
     env:
       PUB_BRANCH: publication
+      PATH_DATASETS: ${{ github.workspace }}/.datasets
     steps:
       - name: Checkout 🛎️
         uses: actions/checkout@v2
@@ -38,6 +39,7 @@ jobs:

       - name: Process folders
         run: |
+          mkdir ${PATH_DATASETS}
           head=$(git rev-parse origin/"${{ github.base_ref }}")
           git diff --name-only $head --output=master-diff.txt
           python .actions/assistant.py group-folders master-diff.txt

.github/workflows/ci_testing.yml (+5 -12)

@@ -38,23 +38,13 @@ jobs:
         run: |
           pip --version
           pip install --requirement requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html
+          pip install --requirement requirements/data.txt
           pip list
         shell: bash

-      - name: Cache Data
-        id: cache-data
-        uses: actions/cache@v2
-        with:
-          path: $PATH_DATASETS
-          key: ${{ runner.os }}-datasets-${{ hashFiles('.actions/data-download.sh') }}
-          restore-keys: ${{ runner.os }}-datasets-
-
-      - name: Download Data
-        if: steps.cache-data.outputs.cache-hit != 'true'
-        run: bash .actions/data-download.sh $PATH_DATASETS
-
       - name: Process folders
         run: |
+          mkdir ${PATH_DATASETS}
           head=$(git rev-parse origin/${{ github.base_ref }})
           git diff --name-only $head --output=target-diff.txt
           python .actions/assistant.py group-folders target-diff.txt
@@ -76,6 +66,9 @@

       - name: PyTest notebook
         if: success() && steps.changed.outputs.dirs != 0
+        env:
+          KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }}
+          KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }}
         run: |
           while IFS= read -r line; do
             python .actions/assistant.py bash-test $line > .actions/_ipynb-test.sh

.github/workflows/docs-deploy.yml (+1)

@@ -30,6 +30,7 @@ jobs:

       - name: Install dependencies
         run: |
+          mkdir ${PATH_DATASETS}
           # install Texlive, see https://linuxconfig.org/how-to-install-latex-on-ubuntu-20-04-focal-fossa-linux
           sudo apt-get update
           sudo apt-get install -y cmake pandoc

README.md (+26)

@@ -46,6 +46,32 @@ The addition has to formed as new folder
 - CPU
 ```

+### Using datasets
+
+It is quite common to use a public or competition dataset for your example.
+We facilitate this by defining the data sources in the metafile.
+There are two basic options: download a file from the web or pull a Kaggle dataset:
+
+```yaml
+datasets:
+  web:
+    - https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
+  kaggle:
+    - titanic
+```
+
+In both cases, the downloaded archive (a Kaggle dataset arrives as a zip file) is extracted into the default dataset folder, under a sub-folder named after the downloaded file.
+To get the path to this dataset folder, use the environment variable `PATH_DATASETS`, so in your script use:
+
+```py
+import os
+
+data_path = os.environ.get("PATH_DATASETS", "_datasets")
+path_titanic = os.path.join(data_path, "titanic")
+```
+
+**Warning:** some Kaggle datasets can be quite large, and since they are downloaded and then extracted, the runner needs roughly double the free space. For this reason, the CPU runner is limited to datasets of about 3GB.
+
 ### Suggestions

 - For inserting images into text cells use MarkDown formatting, so we can insert inline images to the notebooks directly and drop eventual dependency on internet connection -> generated notebooks could be better shared offline
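The Kaggle route needs credentials; the CI jobs in this commit pass them as the `KAGGLE_USERNAME` and `KAGGLE_KEY` environment variables from pipeline secrets. A minimal local sketch, assuming you have a Kaggle account and the pinned `kaggle` package from `requirements/data.txt` installed:

```bash
# hypothetical local setup; in CI these values come from repository secrets
export KAGGLE_USERNAME="your-kaggle-username"
export KAGGLE_KEY="your-kaggle-api-token"
# the same invocation the assistant emits for a `kaggle:` entry in .meta.yml
python -m kaggle competitions download -c titanic
```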

requirements/data.txt (+2)

@@ -0,0 +1,2 @@
+# fixed version to be able to call it as `python -m kaggle`
+https://github.com/Borda/kaggle-api/archive/refs/heads/setup/python-m.zip
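As a quick sanity check that the pinned fork really makes the module-style invocation used by `assistant.py` available, one can install the data requirements the same way the CI jobs do; the `--help` call below is an assumed smoke test, not part of the commit:

```bash
pip install --requirement requirements/data.txt
# verify the CLI is callable as a module, which the comment above says the stock release is not
python -m kaggle --help
```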

requirements/default.txt (+1)

@@ -1,3 +1,4 @@
+ipython[notebook]
 pytorch-lightning>=1.3
 torchmetrics>=0.3
 torch>=1.6, <1.9

requirements/devel.txt (-1)

@@ -1,5 +1,4 @@
 virtualenv
-ipython[notebook]
 jupytext # converting
 pytest>=6.0
 nbval # testing

sample-template/.meta.yml (+6 -1)

@@ -11,4 +11,9 @@ requirements:
 accelerator:
   - CPU
   - GPU
-  - TPU
+datasets:
+  web:
+    # links starting with http are downloaded
+    - https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
+  kaggle:
+    - titanic
