diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 602f49e..02737aa 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -11,4 +11,6 @@ RUN if [ -f "/tmp/conda-tmp/environment.yml" ]; then umask 0002 && /opt/conda/bi RUN apt-get update && apt-get -y install --no-install-recommends \ build-essential cmake libtool-bin git less wget \ - libhdf5-dev libhdf5-mpi-dev hdf5-tools libyajl-dev + libhdf5-dev libhdf5-mpi-dev hdf5-tools libyajl-dev python3-pip +RUN conda install -n base h5py matplotlib h5netcdf xarray +RUN pip3 install h5pyd s3fs diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index e3af3ee..28283ff 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,10 +1,16 @@ { "name": "HDF5 Tutorial", - "build": { - "context": "..", - "dockerfile": "Dockerfile" - }, + "dockerComposeFile": "docker-compose.yaml", "postCreateCommand": "conda init", + "service": "app", + "workspaceFolder": "/workspace", + "forwardPorts": [5101], + "portsAttributes": { + "5101": {"label": "HSDS port", "onAutoForward": "silent"} + }, + "features": { + "ghcr.io/devcontainers/features/docker-outside-of-docker": {} + }, "customizations": { "vscode": { "extensions": [ @@ -23,4 +29,4 @@ } } } -} +} \ No newline at end of file diff --git a/.devcontainer/docker-compose.yaml b/.devcontainer/docker-compose.yaml new file mode 100644 index 0000000..7646845 --- /dev/null +++ b/.devcontainer/docker-compose.yaml @@ -0,0 +1,120 @@ +version: "3" +services: + app: + build: + context: .. + dockerfile: .devcontainer/Dockerfile + environment: + - HS_ENDPOINT=http://localhost:5101 + - HS_USERNAME=vscode + - HS_PASSWORD=vscode + volumes: + - ..:/workspace:cached + + # Overrides default command so things don't shut down after the process ends. + command: sleep infinity + + # Runs app on the same network as the SN container, allows "forwardPorts" in devcontainer.json function. 
+    network_mode: service:sn
+  head:
+    image: hdfgroup/hsds:master
+    restart: on-failure
+    mem_limit: 512m
+    environment:
+      - TARGET_SN_COUNT=1
+      - TARGET_DN_COUNT=4
+      - NODE_TYPE=head_node
+      - ROOT_DIR=/data
+      - BUCKET_NAME=hsdstest
+    ports:
+      - 5100
+    volumes:
+      - ../hsds/data:/data
+      - ../hsds/config/:/config/
+  dn1:
+    image: hdfgroup/hsds:master
+    restart: on-failure
+    mem_limit: 1g
+    environment:
+      - NODE_TYPE=dn
+      - ROOT_DIR=/data
+      - BUCKET_NAME=hsdstest
+    ports:
+      - 6101
+    depends_on:
+      - head
+    volumes:
+      - ../hsds/data:/data
+      - ../hsds/config/:/config/
+    links:
+      - head
+  dn2:
+    image: hdfgroup/hsds:master
+    restart: on-failure
+    mem_limit: 1g
+    environment:
+      - NODE_TYPE=dn
+      - ROOT_DIR=/data
+      - BUCKET_NAME=hsdstest
+    ports:
+      - 6102
+    depends_on:
+      - head
+    volumes:
+      - ../hsds/data:/data
+      - ../hsds/config/:/config/
+    links:
+      - head
+  dn3:
+    image: hdfgroup/hsds:master
+    restart: on-failure
+    mem_limit: 1g
+    environment:
+      - NODE_TYPE=dn
+      - ROOT_DIR=/data
+      - BUCKET_NAME=hsdstest
+    ports:
+      - 6103
+    depends_on:
+      - head
+    volumes:
+      - ../hsds/data:/data
+      - ../hsds/config/:/config/
+    links:
+      - head
+  dn4:
+    image: hdfgroup/hsds:master
+    restart: on-failure
+    mem_limit: 1g
+    environment:
+      - NODE_TYPE=dn
+      - ROOT_DIR=/data
+      - BUCKET_NAME=hsdstest
+    ports:
+      - 6104
+    depends_on:
+      - head
+    volumes:
+      - ../hsds/data:/data
+      - ../hsds/config/:/config/
+    links:
+      - head
+  sn:
+    image: hdfgroup/hsds:master
+    restart: on-failure
+    mem_limit: 1g
+    environment:
+      - SN_PORT=5101
+      - NODE_TYPE=sn
+      - ROOT_DIR=/data
+      - BUCKET_NAME=hsdstest
+      - HSDS_ENDPOINT=http://localhost:5101
+    ports:
+      - 5101:5101
+    depends_on:
+      - head
+    volumes:
+      - ../hsds/data:/data
+      - ../hsds/config/:/config/
+    links:
+      - head
diff --git a/05-HSDS.ipynb b/05-HSDS.ipynb
index 145fa9c..7f90800 100644
--- a/05-HSDS.ipynb
+++ b/05-HSDS.ipynb
@@ -10,45 +10,35 @@
    "\n",
    "## Installation\n",
    "\n",
-    "For this simplistic setup, the installation is easy:\n",
+    "This codespace is already configured to run HSDS, so no special setup is needed.\n",
+    "Refer to the files under .devcontainer in this project to see how it is done.\n",
    "\n",
-    "1. Create a directory for the HSDS data files (`~/hsds_data`).\n",
-    "2. Use the user name `vscode` and password `vscode` to authenticate to HSDS.\n",
-    "3. Launch the service.\n",
-    "\n",
-    "For more sophisticated setups (e.g., Kubernetes), please refer to the [HSDS documentation](https://github.com/HDFGroup/hsds/tree/master/docs)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%%bash\n",
-    "export HS_ENDPOINT=http://localhost:5101\n",
-    "export HS_USERNAME=$USER\n",
-    "export HS_PASSWORD=$USER\n",
-    "mkdir ~/hsds_data\n",
-    "hsds --root_dir ~/hsds_data --hs_username $USER --hs_password $USER >~/hs.log 2>&1 &\n",
-    "sleep 5"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We create a configuration file for HSDS: `~/.hscfg`:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%%bash\n",
-    "hsconfigure <<< $'http://localhost:5101\\nvscode\\nvscode\\n\\nY\\n'"
+    "Here, HSDS is running in the cloud as part of GitHub Codespaces. You can also install it\n",
+    "on your Mac, Windows, or Linux desktop.\n",
+    "Please refer to the [HSDS documentation](https://github.com/HDFGroup/hsds/tree/master/docs)\n",
+    "for detailed instructions."
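+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a quick connectivity check (an illustrative sketch; h5pyd reads the `HS_ENDPOINT`,\n",
+    "`HS_USERNAME`, and `HS_PASSWORD` settings that docker-compose.yaml places in the\n",
+    "environment), we can list the top-level HSDS folder from Python:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import h5pyd\n",
+    "\n",
+    "# connection info is taken from the HS_* environment variables\n",
+    "folder = h5pyd.Folder(\"/home/\")  # trailing slash is required for folders\n",
+    "print(list(folder))"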
   ]
  },
  {
@@ -72,7 +62,8 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Create the top-level domain and a user \"directory\" for the user `vscode`:"
+    "When you first run the codespace, HSDS will have just the hdf5://home/ and\n",
+    "hdf5://home/vscode/ folders."
   ]
  },
  {
@@ -82,8 +73,7 @@
   "outputs": [],
   "source": [
    "%%bash\n",
-    "hstouch /home/ && hstouch /home/$USER/\n",
-    "hsinfo"
+    "hsls hdf5://home/"
   ]
  },
  {
@@ -140,7 +130,7 @@
   "outputs": [],
   "source": [
    "%%bash\n",
-    "curl -s -u vscode:vscode http://localhost:5101/?domain=/home/vscode/foo.h5 | jq"
+    "curl -s http://localhost:5101/?domain=/home/vscode/foo.h5 | jq"
   ]
  },
  {
@@ -150,7 +140,7 @@
   "outputs": [],
   "source": [
    "%%bash\n",
-    "find ~/hsds_data"
+    "find /workspace/hsds/data/"
   ]
  },
  {
@@ -407,7 +397,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.0"
+   "version": "3.9.18"
  }
 },
 "nbformat": 4,
diff --git a/hsds/config/override.yml b/hsds/config/override.yml
new file mode 100644
index 0000000..bd5ab55
--- /dev/null
+++ b/hsds/config/override.yml
@@ -0,0 +1,11 @@
+log_level: ERROR  # or DEBUG or INFO or WARNING
+bucket_name: hsdstest  # default bucket name
+server_name: HSDS for Github codespaces  # this is displayed by the hsinfo command
+chunk_mem_cache_size: 256m  # chunk cache memory per DN container
+root_dir: /workspace/hsds/data  # this is where HSDS data files will be stored
+default_public: True  # new domains will be readable by all users by default
+aws_region: us-west-2  # the majority of public data files are in us-west-2 (original default was us-east-1)
+aws_s3_gateway: http://s3.us-west-2.amazonaws.com/  # s3 gateway for us-west-2 (original default was null)
+aws_s3_no_sign_request: True  # set to False if setting aws_access_key_id and aws_secret_access_key here
+head_sleep_time: 60  # max sleep time between health checks for head node
+node_sleep_time: 60  # max sleep time between health checks for SN/DN nodes
\ No newline at end of file
diff --git a/hsds/config/passwd.txt b/hsds/config/passwd.txt
new file mode 100644
index 0000000..63a9849
--- /dev/null
+++ b/hsds/config/passwd.txt
@@ -0,0 +1,3 @@
+# username/passwords for HSDS...
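+# format: username:password, one entry per line; add more users here as needed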
+vscode:vscode
diff --git a/hsds/data/hsdstest/db/.gitignore b/hsds/data/hsdstest/db/.gitignore
new file mode 100644
index 0000000..e69de29
diff --git a/hsds/data/hsdstest/home/.domain.json b/hsds/data/hsdstest/home/.domain.json
new file mode 100644
index 0000000..d6f9199
--- /dev/null
+++ b/hsds/data/hsdstest/home/.domain.json
@@ -0,0 +1 @@
+{"owner": "admin", "acls": {"admin": {"create": true, "read": true, "update": true, "delete": true, "readACL": true, "updateACL": true}, "default": {"create": false, "read": true, "update": false, "delete": false, "readACL": false, "updateACL": false}}, "created": 1708897646.0599918, "lastModified": 1708897646.0599918}
\ No newline at end of file
diff --git a/hsds/data/hsdstest/home/vscode/.domain.json b/hsds/data/hsdstest/home/vscode/.domain.json
new file mode 100644
index 0000000..8a27da6
--- /dev/null
+++ b/hsds/data/hsdstest/home/vscode/.domain.json
@@ -0,0 +1 @@
+{"owner": "vscode", "acls": {"vscode": {"create": true, "read": true, "update": true, "delete": true, "readACL": true, "updateACL": true}, "default": {"create": false, "read": true, "update": false, "delete": false, "readACL": false, "updateACL": false}}, "created": 1711992550.3733413, "lastModified": 1711992550.3733413}
\ No newline at end of file
diff --git a/hsds/data/hsdstest/home/vscode/.gitignore b/hsds/data/hsdstest/home/vscode/.gitignore
new file mode 100644
index 0000000..e69de29
diff --git a/hsds/data/hsdstest/readme.txt b/hsds/data/hsdstest/readme.txt
new file mode 100644
index 0000000..d43d778
--- /dev/null
+++ b/hsds/data/hsdstest/readme.txt
@@ -0,0 +1 @@
+This is the default bucket for HSDS
diff --git a/hsds/data/readme.txt b/hsds/data/readme.txt
new file mode 100644
index 0000000..3d18e5d
--- /dev/null
+++ b/hsds/data/readme.txt
@@ -0,0 +1 @@
+This is the root directory for HSDS data
diff --git a/python/01-Intro.ipynb b/python/01-Intro.ipynb
index 6550dce..b59d0d5 100644
--- a/python/01-Intro.ipynb
+++ b/python/01-Intro.ipynb
@@ -24,13 +24,13 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "USE_H5PY = True # set to False to use HSDS instead\n",
+    "USE_H5PY = False # set to True to use h5py/hdf5lib instead\n",
    "if USE_H5PY:\n",
    "    import h5py\n",
-    "    WORK_DIR=\".\" # this directory\n",
+    "    WORK_DIR = \".\"\n",
    "else:\n",
    "    import h5pyd as h5py\n",
-    "    WORK_DIR=\"hdf5://home/test_user1/\"\n",
+    "    WORK_DIR = \"hdf5://home/vscode/\"\n",
    "import os.path as op"
   ]
  },
@@ -187,7 +187,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.12.1"
+   "version": "3.9.15"
  }
 },
 "nbformat": 4,
diff --git a/python/02-Command Line Tools.ipynb b/python/02-Command Line Tools.ipynb
index b777c65..b939d6d 100644
--- a/python/02-Command Line Tools.ipynb
+++ b/python/02-Command Line Tools.ipynb
@@ -19,56 +19,65 @@
    " "
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Using HDF5 Library Tools\n",
+    "------------------------\n",
+    "\n",
+    "There are several command line tools that are included with the HDF5 library.\n",
+    "The most commonly used ones are: \n",
+    "\n",
+    "* `h5ls` - list contents of an HDF5 file\n",
+    "* `h5dump` - dump out the contents of an HDF5 file\n",
+    "* `h5diff` - compare two HDF5 files\n",
+    "* `h5stat` - get detailed statistics on an HDF5 file\n",
+    "\n",
+    "We'll explore each of these below..."
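+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "All of these tools ship with the HDF5 library installed in this container (via the\n",
+    "hdf5-tools package in the Dockerfile). As a quick sanity check (an illustrative sketch),\n",
+    "we can ask one of them for its version before going further:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# print the HDF5 library release the tools were built against\n",
+    "! h5dump --version"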
+ ] + }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[0] Downloading 'https://s3.amazonaws.com/hdfgroup/data/hdf5test/tall.h5' ...\n", - "Saving 'tall.h5'\n", - "HTTP response 200 OK [https://s3.amazonaws.com/hdfgroup/data/hdf5test/tall.h5]\n", - "\u001b[m\u001b[m\u001b[m\u001b[m" - ] - } - ], + "outputs": [], "source": [ - "# get a sample HDF5 file\n", + "# To start with, let's grab an HDF5 file to work with...\n", + "# The exclamation sign indicates to jupyter to execute the following cell in the shell\n", + "# Alternatively you use the codespace terminal tool and run wget there.\n", "! wget https://s3.amazonaws.com/hdfgroup/data/hdf5test/tall.h5" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/ Group\n", - "/g1 Group\n", - "/g1/g1.1 Group\n", - "/g1/g1.1/dset1.1.1 Dataset {10, 10}\n", - "/g1/g1.1/dset1.1.2 Dataset {20}\n", - "/g1/g1.2 Group\n", - "/g1/g1.2/extlink External Link {somefile//somepath}\n", - "/g1/g1.2/g1.2.1 Group\n", - "/g1/g1.2/g1.2.1/slink Soft Link {somevalue}\n", - "/g2 Group\n", - "/g2/dset2.1 Dataset {10}\n", - "/g2/dset2.2 Dataset {3, 5}\n" - ] - } - ], + "outputs": [], "source": [ "# Display objects with a file. Use -r for recursive mode\n", "! h5ls -r tall.h5" @@ -76,111 +66,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "HDF5 \"tall.h5\" {\n", - "GROUP \"/\" {\n", - " ATTRIBUTE \"attr1\" {\n", - " DATATYPE H5T_STD_I8BE\n", - " DATASPACE SIMPLE { ( 10 ) / ( 10 ) }\n", - " DATA {\n", - " \"abcdefghi\\000\"\n", - " }\n", - " }\n", - " ATTRIBUTE \"attr2\" {\n", - " DATATYPE H5T_STD_I32BE\n", - " DATASPACE SIMPLE { ( 2, 2 ) / ( 2, 2 ) }\n", - " DATA {\n", - " (0,0): 0, 1,\n", - " (1,0): 2, 3\n", - " }\n", - " }\n", - " GROUP \"g1\" {\n", - " GROUP \"g1.1\" {\n", - " DATASET \"dset1.1.1\" {\n", - " DATATYPE H5T_STD_I32BE\n", - " DATASPACE SIMPLE { ( 10, 10 ) / ( 10, 10 ) }\n", - " DATA {\n", - " (0,0): 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " (1,0): 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,\n", - " (2,0): 0, 2, 4, 6, 8, 10, 12, 14, 16, 18,\n", - " (3,0): 0, 3, 6, 9, 12, 15, 18, 21, 24, 27,\n", - " (4,0): 0, 4, 8, 12, 16, 20, 24, 28, 32, 36,\n", - " (5,0): 0, 5, 10, 15, 20, 25, 30, 35, 40, 45,\n", - " (6,0): 0, 6, 12, 18, 24, 30, 36, 42, 48, 54,\n", - " (7,0): 0, 7, 14, 21, 28, 35, 42, 49, 56, 63,\n", - " (8,0): 0, 8, 16, 24, 32, 40, 48, 56, 64, 72,\n", - " (9,0): 0, 9, 18, 27, 36, 45, 54, 63, 72, 81\n", - " }\n", - " ATTRIBUTE \"attr1\" {\n", - " DATATYPE H5T_STD_I8BE\n", - " DATASPACE SIMPLE { ( 27 ) / ( 27 ) }\n", - " DATA {\n", - " \"1st attribute of dset1.1.1\\000\"\n", - " }\n", - " }\n", - " ATTRIBUTE \"attr2\" {\n", - " DATATYPE H5T_STD_I8BE\n", - " DATASPACE SIMPLE { ( 27 ) / ( 27 ) }\n", - " DATA {\n", - " \"2nd attribute of dset1.1.1\\000\"\n", - " }\n", - " }\n", - " }\n", - " DATASET \"dset1.1.2\" {\n", - " DATATYPE H5T_STD_I32BE\n", - " DATASPACE SIMPLE { ( 20 ) / ( 20 ) }\n", - " DATA {\n", - " (0): 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,\n", - " (17): 17, 18, 19\n", - " }\n", - " }\n", - " }\n", - " GROUP \"g1.2\" {\n", - " EXTERNAL_LINK \"extlink\" 
{\n", - " TARGETFILE \"somefile\"\n", - " TARGETPATH \"somepath\"\n", - " }\n", - " GROUP \"g1.2.1\" {\n", - " SOFTLINK \"slink\" {\n", - " LINKTARGET \"somevalue\"\n", - " }\n", - " }\n", - " }\n", - " }\n", - " GROUP \"g2\" {\n", - " DATASET \"dset2.1\" {\n", - " DATATYPE H5T_IEEE_F32BE\n", - " DATASPACE SIMPLE { ( 10 ) / ( 10 ) }\n", - " DATA {\n", - " (0): 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9\n", - " }\n", - " }\n", - " DATASET \"dset2.2\" {\n", - " DATATYPE H5T_IEEE_F32BE\n", - " DATASPACE SIMPLE { ( 3, 5 ) / ( 3, 5 ) }\n", - " DATA {\n", - " (0,0): 0, 0.1, 0.2, 0.3, 0.4,\n", - " (1,0): 0, 0.2, 0.4, 0.6, 0.8,\n", - " (2,0): 0, 0.3, 0.6, 0.9, 1.2\n", - " }\n", - " }\n", - " }\n", - "}\n", - "}\n" - ] - } - ], + "outputs": [], "source": [ "# h5dump will show the dataset contents by default\n", "! h5dump -r tall.h5" @@ -188,146 +81,165 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# h5dump will display not just the objects in the file, but (by default) print\n", + "# the dataset data as well\n", + "! h5dump -r tall.h5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# h5stat will show many detailed statitistics about the file\n", + "! h5stat tall.h5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using h5pyd Tools\n", + "------------------------\n", + "\n", + "The h5pyd Python package provides a Python interface for accessing HSDS. \n", + "It's based on the h5py API, so most programs should be easily converted from using\n", + "h5py to h5pyd. \n", + "The h5pyd package also include a set of command line tools for working with HSDS content.\n", + "There are analogs to the library tools (`hsls` rather than `h5ls`) plus some additional tools\n", + "that serve as standins for the common Linux command line tools (e.g. `hsrm` rather than `rm`).\n", + "There are also tools for uploading an HDF5 file to an HSDS domain (`hsload`) and \n", + "downloading an HSDS domain to an HDF5 file (hopefully not the worse for wear).\n", + "The tools linclude: hsconfigure, hsload, hsget, hsls, hstouch, hsrm, hsacl, hsdiff,\n", + "and hsstat: \n", + "\n", + "* `hsconfigure` - setup a connection to an HSDS server\n", + "* `hsload` - copy an HDF5 file to an HSDS domain\n", + "* `hsget` - copy an HSDS domain to an HDF5 file\n", + "* `hsls` - list the contents of an HSDS domain (or HSDS folders)\n", + "* `hstouch` - create a new domain or folder\n", + "* `hsrm` - remove a domain or folder\n", + "* `hsacl` - view or edit HSDS folder or domain ACLs (permission settings)\n", + "* `hsdiff` - compare an HDF5 file with an HSDS domain\n", + "* `hsstat` - get detailed statistics on an HSDS domain\n", + "\n", + "Running any of these with `--help` will provide usage info.\n", + "\n", + "In addition we'll try out some of these below..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "server name: hsds on devone\n", - "server state: READY\n", - "endpoint: http://hsds.hdf.test:5101\n", - "username: test_user1 \n", - "password: ****\n", - "home: /home/test_user1/\n", - "server version: 0.8.1\n", - "node count: 1\n", - "up: 59 min 48 sec\n", - "h5pyd version: 0.15.1\n" - ] - } - ], + "outputs": [], "source": [ - "# On the HSDS side, hsinfo will show the current server status\n", + "# A dedicated instance of HSDS should be running as part of this \n", + "# codespace.\n", + "# You can verify this by using the hsinfo command. \n", + "# It will show the current server status\n", "! hsinfo" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# you can use hsload to copy a local file to HSDS\n", - "! hsload tall.h5 hdf5://home/test_user1/" + "# When you first create the codespace, there are no domains loaded in HSDS,\n", + "# but you can use hsload any HDF5 to HSDS.\n", + "# Let's try it with the file we downloaded earlier.\n", + "! hsload tall.h5 hdf5://home/vscode/" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/ Group\n", - "/g1 Group\n", - "/g1/g1.1 Group\n", - "/g1/g1.1/dset1.1.1 Dataset {10, 10}\n", - "/g1/g1.1/dset1.1.2 Dataset {20}\n", - "/g1/g1.2 Group\n", - "/g1/g1.2/g1.2.1 Group\n", - "/g1/g1.2/g1.2.1/slink SoftLink {somevalue}\n", - "/g1/g1.2/extlink ExternalLink {somepath//somefile}\n", - "/g2 Group\n", - "/g2/dset2.1 Dataset {10}\n", - "/g2/dset2.2 Dataset {3, 5}\n" - ] - } - ], + "outputs": [], "source": [ "# hsls works like h5ls but with content managed by the server\n", - "! hsls -r hdf5://home/test_user1/tall.h5" + "! 
hsls -r hdf5://home/vscode/tall.h5" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "test_user1 folder 2023-03-10 17:20:29 hdf5://home/test_user1/\n", - "test_user1 domain 2023-08-09 16:59:50 hdf5://home/test_user1/01.h5\n", - "test_user1 domain 2023-08-09 16:39:16 hdf5://home/test_user1/03.h5\n", - "test_user1 domain 2023-08-09 18:39:25 hdf5://home/test_user1/04.h5\n", - "test_user1 domain 2023-08-10 17:21:23 hdf5://home/test_user1/05.h5\n", - "test_user1 domain 2023-08-10 17:37:10 hdf5://home/test_user1/06.h5\n", - "test_user1 domain 2023-04-11 16:47:20 hdf5://home/test_user1/ascii_attr.h5\n", - "test_user1 domain 2023-04-10 12:43:53 hdf5://home/test_user1/d_nullref.h5\n", - "test_user1 domain 2023-03-22 21:37:59 hdf5://home/test_user1/d_objref.h5\n", - "test_user1 folder 2023-03-31 13:05:11 hdf5://home/test_user1/equinor\n", - "test_user1 domain 2023-04-10 14:55:33 hdf5://home/test_user1/fletcher32.h5\n", - "test_user1 domain 2023-04-26 20:21:05 hdf5://home/test_user1/g1.h5\n", - "test_user1 folder 2023-03-20 22:04:21 hdf5://home/test_user1/h5pyd_test\n", - "test_user1 folder 2023-03-11 20:42:12 hdf5://home/test_user1/hsds_test\n", - "test_user1 folder 2023-06-21 12:39:00 hdf5://home/test_user1/icesat2\n", - "test_user1 domain 2023-04-10 14:55:52 hdf5://home/test_user1/scale_offset.h5\n", - "test_user1 domain 2023-03-30 19:57:15 hdf5://home/test_user1/scaleoffset.h5\n", - "test_user1 domain 2023-04-10 15:32:54 hdf5://home/test_user1/shuffle_compress.h5\n", - "test_user1 domain 2023-03-23 17:27:25 hdf5://home/test_user1/simpleattr.h5\n", - "test_user1 domain 2023-04-28 16:33:47 hdf5://home/test_user1/snp500_link.h5\n", - "test_user1 folder 2023-04-05 22:21:07 hdf5://home/test_user1/stream\n", - "test_user1 domain 2023-08-10 17:54:20 hdf5://home/test_user1/tall.h5\n", - "test_user1 domain 2023-03-11 20:39:53 hdf5://home/test_user1/tall2.h5\n", - "test_user1 domain 2023-03-13 13:10:12 hdf5://home/test_user1/tall3.h5\n", - "test_user1 domain 2023-04-10 16:24:25 hdf5://home/test_user1/tall_compress.h5\n", - "test_user1 folder 2023-03-10 17:21:28 hdf5://home/test_user1/test\n", - "test_user1 folder 2023-07-16 22:25:44 hdf5://home/test_user1/tmp\n", - "test_user1 domain 2023-05-09 18:41:51 hdf5://home/test_user1/wordmap.h5\n", - "28 items\n" - ] - } - ], + "outputs": [], "source": [ - "# hsls can also be used to display contents of an HSDS folder\n", + "# hsls can also be used to display contents of an HSDS folder.\n", + "# HSDS folders are similar in concept to directories. They allow you\n", + "# to organize collections of domains and sub-folders\n", "# Note: trailing slash is required\n", - "! hsls hdf5://home/test_user1/" + "\n", + "! hsls hdf5://home/vscode/" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "domain: hdf5://home/test_user1/tall.h5\n", - " owner: test_user1\n", - " id: g-d761f590-972a9c10-5fe9-7d181f-a21498\n", - " last modified: 2023-08-10 17:54:20\n", - " total_size: 0\n", - " allocated_bytes: 0\n", - " num objects: 0\n", - " num chunks: 0\n" - ] - } - ], + "outputs": [], + "source": [ + "# hsstat can be used to see statistics of the domain\n", + "! 
hsstat hdf5://home/vscode/tall.h5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# and hsget allows you to create an HDF5 file from an HSDS domain\n",
    "! hsget hdf5://home/vscode/tall.h5 tall2.h5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# compare this to the original. No output indicates that the two are equivalent\n",
    "! h5diff tall.h5 tall2.h5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "HDF5 File Linking\n",
    "-----------------\n",
    "\n",
    "If you would like to load an HDF5 file that lives in the cloud (in S3 or Azure Blob Storage), you can *link* to it rather\n",
    "than copying all the data into the limited storage included with your codespace.\n",
    "With linking, only the HDF5 file metadata (typically a small fraction of the overall file size) is copied to your\n",
    "local HSDS store. The HDF5 \"chunks\" (where the dataset data is stored) are accessed on demand from the cloud provider.\n",
    "Since your codespace is also in the cloud, this should be quite fast compared with accessing the data directly from your\n",
    "laptop."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the --link option to link to an existing file.\n",
    "! hsload --link s3://hdf5.sample/data/hdf5test/snp500.h5 hdf5://home/vscode/snp500.h5"
   ]
  },
@@ -458,7 +387,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.9.15"
+   "version": "3.9.18"
  }
 },
 "nbformat": 4,
diff --git a/python/03-Datasets.ipynb b/python/03-Datasets.ipynb
index b88bfd1..9bab2c1 100644
--- a/python/03-Datasets.ipynb
+++ b/python/03-Datasets.ipynb
@@ -27,13 +27,13 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "USE_H5PY = True # set to False to use HSDS instead\n",
+    "USE_H5PY = False # set to True to use h5py/hdf5lib instead\n",
    "if USE_H5PY:\n",
    "    import h5py\n",
    "    WORK_DIR=\".\" # this directory\n",
    "else:\n",
    "    import h5pyd as h5py\n",
-    "    WORK_DIR=\"hdf5://home/test_user1/\"\n",
+    "    WORK_DIR=\"hdf5://home/vscode/\"\n",
    "import os.path as op"
   ]
  },
@@ -271,7 +271,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.12.1"
+   "version": "3.9.18"
  }
 },
 "nbformat": 4,
diff --git a/python/04-Compression.ipynb b/python/04-Compression.ipynb
index ce01e39..e3a9c1b 100644
--- a/python/04-Compression.ipynb
+++ b/python/04-Compression.ipynb
@@ -25,13 +25,13 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "USE_H5PY = True # set to False to use HSDS instead\n",
+    "USE_H5PY = False # set to True to use h5py/hdf5lib instead\n",
    "if USE_H5PY:\n",
    "    import h5py\n",
    "    WORK_DIR=\".\" # this directory\n",
    "else:\n",
    "    import h5pyd as h5py\n",
-    "    WORK_DIR=\"hdf5://home/test_user1/\"\n",
+    "    WORK_DIR=\"hdf5://home/vscode/\"\n",
    "import os.path as op\n",
    "import random"
   ]
  },
@@ -252,7 +252,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.12.1"
+   "version": "3.9.18"
  }
 },
 "nbformat": 4,
diff --git a/python/05-GroupsAndLinks.ipynb b/python/05-GroupsAndLinks.ipynb
index 18a8a76..0da8f22 100644
--- a/python/05-GroupsAndLinks.ipynb
+++ b/python/05-GroupsAndLinks.ipynb
@@ -25,13 
+25,13 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "USE_H5PY = True # set to False to use HSDS instead\n",
+    "USE_H5PY = False # set to True to use h5py/hdf5lib instead\n",
    "if USE_H5PY:\n",
    "    import h5py\n",
    "    WORK_DIR=\".\" # this directory\n",
    "else:\n",
    "    import h5pyd as h5py\n",
-    "    WORK_DIR=\"hdf5://home/test_user1/\"\n",
+    "    WORK_DIR=\"hdf5://home/vscode/\"\n",
    "import os.path as op"
   ]
  },
@@ -375,7 +375,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.12.1"
+   "version": "3.9.18"
  }
 },
 "nbformat": 4,
diff --git a/python/06-Attributes.ipynb b/python/06-Attributes.ipynb
index 1e4309e..88d6d4e 100644
--- a/python/06-Attributes.ipynb
+++ b/python/06-Attributes.ipynb
@@ -25,13 +25,13 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "USE_H5PY = True # set to False to use HSDS instead\n",
+    "USE_H5PY = False # set to True to use h5py/hdf5lib instead\n",
    "if USE_H5PY:\n",
    "    import h5py\n",
    "    WORK_DIR=\".\" # this directory\n",
    "else:\n",
    "    import h5pyd as h5py\n",
-    "    WORK_DIR=\"hdf5://home/test_user1/\"\n",
+    "    WORK_DIR=\"hdf5://home/vscode/\"\n",
    "import os.path as op"
   ]
  },
@@ -174,7 +174,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.12.1"
+   "version": "3.9.18"
  }
 },
 "nbformat": 4,
diff --git a/python/07-Types.ipynb b/python/07-Types.ipynb
index d420f32..923feb9 100644
--- a/python/07-Types.ipynb
+++ b/python/07-Types.ipynb
@@ -25,13 +25,13 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "USE_H5PY = True # set to False to use HSDS instead\n",
+    "USE_H5PY = False # set to True to use h5py/hdf5lib instead\n",
    "if USE_H5PY:\n",
    "    import h5py\n",
    "    WORK_DIR=\".\" # this directory\n",
    "else:\n",
    "    import h5pyd as h5py\n",
-    "    WORK_DIR=\"hdf5://home/test_user1/\"\n",
+    "    WORK_DIR=\"hdf5://home/vscode/\"\n",
    "import os.path as op\n",
    "import numpy as np"
   ]
  },
@@ -184,7 +184,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.12.1"
+   "version": "3.9.18"
  }
 },
 "nbformat": 4,
diff --git a/python/nrel_nsrdb_example.ipynb b/python/nrel_nsrdb_example.ipynb
new file mode 100644
index 0000000..9fa75ed
--- /dev/null
+++ b/python/nrel_nsrdb_example.ipynb
@@ -0,0 +1,237 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## NREL NSRDB Example\n",
+    "\n",
+    "This notebook illustrates accessing the NREL NSRDB (National Solar Radiation Database) using both h5pyd with HSDS and h5py with the HDF5 library"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%matplotlib inline\n",
+    "USE_H5PY = False # set to True to use h5py/hdf5lib instead\n",
+    "if USE_H5PY:\n",
+    "    import h5py\n",
+    "    import s3fs\n",
+    "else:\n",
+    "    import h5pyd as h5py\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# In the shell, use the --bucket option to list files from NREL's S3 bucket \n",
+    "# run with \"-r\" option to see all domains\n",
+    "! hsls --bucket s3://nrel-pds-hsds /nrel/nsrdb/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "# Open the nsrdb file. 
Use the bucket param to get the data from NREL's S3 bucket\n",
+    "if USE_H5PY:\n",
+    "    s3 = s3fs.S3FileSystem()\n",
+    "    f = h5py.File(s3.open(\"s3://nrel-pds-nsrdb/conus/nsrdb_conus_pv_2022.h5\", \"rb\"), \"r\")\n",
+    "else:\n",
+    "    f = h5py.File(\"/nrel/nsrdb/conus/nsrdb_conus_2022.h5\", bucket=\"s3://nrel-pds-hsds\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# attributes can be used to provide descriptions of the content\n",
+    "%time f.attrs['version'] "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "list(f) # datasets under root group"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dset = f[\"air_temperature\"]\n",
+    "dset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dset.id.id"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dset.shape # two-dimensional time x station_index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dset.chunks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "np.prod(dset.chunks) * dset.dtype.itemsize # number of bytes per chunk"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "(dset.shape[0] // dset.chunks[0]) * (dset.shape[1] // dset.chunks[1]) # approximate number of chunks in the dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# read one year of measurements for a given station_index\n",
+    "%time tseries = dset[:, 1234567]\n",
+    "tseries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get min, max, and mean values\n",
+    "tseries.min(), tseries.max(), tseries.mean()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# plot the data\n",
+    "x = range(len(tseries))\n",
+    "plt.plot(x, tseries)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# This dataset is actually linked from an HDF5 file in a different bucket\n",
+    "if USE_H5PY:\n",
+    "    # this property doesn't exist for h5py\n",
+    "    layout = None\n",
+    "else:\n",
+    "    layout = dset.id.layout\n",
+    "layout"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The HSDS domain actually maps to several different HDF5 files\n",
+    "# compile a list of all the files\n",
+    "hdf5_files = set()\n",
+    "if not USE_H5PY:\n",
+    "    for k in f:\n",
+    "        dset = f[k]\n",
+    "        layout = dset.id.layout\n",
+    "        if \"file_uri\" in layout:\n",
+    "            hdf5_files.add(layout[\"file_uri\"])\n",
+    "hdf5_files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "py39",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/nrel_wtk_example.ipynb b/python/nrel_wtk_example.ipynb new file mode 100644 index 0000000..3c5bef1 --- /dev/null +++ b/python/nrel_wtk_example.ipynb @@ -0,0 +1,334 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## NREL WIND Toolkit Example" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook demonstrates basic usage of the National Renewable Energy Laboratory (NREL) Wind Integration National Dataset (WIND) Toolkit data. More complete examples can be found here: https://github.com/NREL/hsds-examples. \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import h5pyd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.image as mpimg" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "h5pyd.version.version # should be >= 0.4.2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! hsinfo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# In the shell, use the --bucket option to list files from NREL's S3 bucket \n", + "! hsls --bucket s3://nrel-pds-hsds /nrel/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# Open the wind data \"file\". Use the bucket param to get data from NREL's S3 bucket\n", + "%time f = h5pyd.File(\"/nrel/wtk-us.h5\", 'r', bucket=\"s3://nrel-pds-hsds\") " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "%time f.attrs['history'] # attributes can be used to provide desriptions of the content" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "list(f) # list the datasets in the file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get the windspeed at 80 meters\n", + "dset = f['windspeed_80m']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "dset.id.id # if this is an int, then you are using h5py!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "dset.shape # shape is three-dimensional time x lat x lon" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "dset.dtype # type is four byte floats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "dset.chunks # chunks describe how the dataset data is stored" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "dset.shape[0] * dset.shape[1] * dset.shape[2] * 4 # ~1 TB per dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# read one slice of the data\n", + "%time data = dset[522,::,::]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "plt.imshow(data, origin=\"lower\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# Get all the values for a given geographic point\n", + "# this may take up to a minute\n", + "%time tseries = dset[:, 290, 201]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "len(tseries) # 7 years * 365 days * 24 hours" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tseries.min(), tseries.max(), tseries.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "x = range(len(tseries))\n", + "plt.plot(x, tseries)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# get just one month of values\n", + "start = 25000 # anything between 0 and 61367 will work\n", + "end = start + 30*24\n", + "%time tseries = dset[start:end, 1292, 601]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tseries.min(), tseries.max(), tseries.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x = range(len(tseries))\n", + "plt.plot(x, tseries)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 
"3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}