diff --git a/demo.ipynb b/demo.ipynb new file mode 100644 index 0000000..2c433b7 --- /dev/null +++ b/demo.ipynb @@ -0,0 +1,147 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a1b2c3d4-0001-0001-0001-000000000001", + "metadata": {}, + "source": "# MMseqs2 CTS Demo\n\nRuns MMseqs2 `easy-cluster` on CDM genome files via the CDM Task Service.\n\n- **Image:** `ghcr.io/kbaseincubator/cdm_mmseqs2:0.1.0`\n- **Mode:** `easy-cluster` (all-vs-all clustering; no reference DB needed)\n- **Cluster:** `kbase`\n- **Output:** `cts/io/jplfaria/output/mmseqs2/test/v1`\n\nSee the [cdm_mmseqs2 repo](https://github.com/kbaseincubator/cdm_mmseqs2) for details." + }, + { + "cell_type": "markdown", + "id": "a1b2c3d4-0001-0001-0001-000000000002", + "metadata": {}, + "source": "## 1. Set up clients" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1b2c3d4-0001-0001-0001-000000000003", + "metadata": { + "trusted": true + }, + "outputs": [], + "source": "# NOTE(review): get_task_service_client / get_minio_client are not imported in this notebook;\n# presumably the notebook environment predefines them - confirm before running elsewhere.\ntscli = get_task_service_client()\nmincli = get_minio_client()\nprint(tscli.whoami())" + }, + { + "cell_type": "markdown", + "id": "a1b2c3d4-0001-0001-0001-000000000004", + "metadata": {}, + "source": "## 2. List input files\n\nUsing the shared test genome files already in MinIO with CRC64NVME checksums." + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1b2c3d4-0001-0001-0001-000000000005", + "metadata": { + "trusted": true + }, + "outputs": [], + "source": "# Build bucket-qualified paths (cts/<object key>) for every object under the shared test prefix.\nobjs = list(mincli.list_objects(\"cts\", prefix=\"io/gavin/test_files\", recursive=True))\ninput_files = [f\"cts/{o.object_name}\" for o in objs]\nif not input_files:\n    # Stop here rather than going on to submit a job with nothing to cluster.\n    raise RuntimeError(\"No input files found under cts/io/gavin/test_files\")\nprint(f\"{len(input_files)} input files:\")\nfor path in input_files:\n    print(path)" + }, + { + "cell_type": "markdown", + "id": "a1b2c3d4-0001-0001-0001-000000000006", + "metadata": {}, + "source": "## 3. 
Submit MMseqs2 easy-cluster job" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1b2c3d4-0001-0001-0001-000000000007", + "metadata": { + "trusted": true + }, + "outputs": [], + "source": "IMAGE = \"ghcr.io/kbaseincubator/cdm_mmseqs2:0.1.0@sha256:24afa107c0dac1a6f093cc081ec19a577c4d0b156dfa8ee66bb2d3c41b97c082\"\nOUTPUT_DIR = \"cts/io/jplfaria/output/mmseqs2/test/v1\"\nOUT_MOUNT = \"/out\"  # passed as output_mount_point and reused in the mmseqs output paths\nCPUS = 4  # requested CPUs; also passed to mmseqs as --threads so the two stay in sync\n\njob = tscli.submit_job(\n    IMAGE,\n    input_files,\n    OUTPUT_DIR,\n    cluster=\"kbase\",\n    declobber=True,\n    output_mount_point=OUT_MOUNT,\n    args=[\n        \"easy-cluster\",\n        # NOTE(review): presumably a placeholder that CTS expands to the input file paths - confirm.\n        tscli.insert_files(),\n        f\"{OUT_MOUNT}/cluster_results\",  # easy-cluster result prefix\n        f\"{OUT_MOUNT}/tmp\",  # easy-cluster temporary directory\n        \"--threads\", str(CPUS),\n    ],\n    num_containers=1,\n    cpus=CPUS,\n    memory=\"16GB\",\n    runtime=\"PT30M\",\n)\nprint(\"Job ID:\", job.id)" + }, + { + "cell_type": "markdown", + "id": "a1b2c3d4-0001-0001-0001-000000000008", + "metadata": {}, + "source": "## 4. Monitor job status" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1b2c3d4-0001-0001-0001-000000000009", + "metadata": { + "trusted": true + }, + "outputs": [], + "source": "# Re-attach if notebook restarted:\n# job = tscli.get_job_by_id(\"PASTE_JOB_ID_HERE\")\n\nimport json\nimport threading\n\n\ndef _report_completion(watched_job):\n    \"\"\"Wait for the job, then pretty-print what wait_for_completion() returns as JSON.\"\"\"\n    print(json.dumps(watched_job.wait_for_completion(), indent=4))\n\n\n# Wait in a daemon thread so this cell returns immediately and later cells can still run.\nthread = threading.Thread(target=_report_completion, args=(job,), daemon=True)\nthread.start()" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1b2c3d4-0001-0001-0001-000000000010", + "metadata": { + "trusted": true + }, + "outputs": [], + "source": "print(job.get_job_status())" + }, + { + "cell_type": "markdown", + "id": "a1b2c3d4-0001-0001-0001-000000000011", + "metadata": {}, + "source": "## 5. 
Inspect output files" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1b2c3d4-0001-0001-0001-000000000012", + "metadata": { + "trusted": true + }, + "outputs": [], + "source": "# One line per entry in the job's outputs list: checksum first, then file path.\nfor out in job.get_job()[\"outputs\"]:\n    print(out[\"crc64nvme\"], out[\"file\"])" + }, + { + "cell_type": "markdown", + "id": "a1b2c3d4-0001-0001-0001-000000000013", + "metadata": {}, + "source": "## 6. View cluster results\n\n`cluster_results_cluster.tsv` has two columns, `representative_id` and `member_id`. The file is read without a header row, and the columns are loaded below as `representative` and `member`." + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1b2c3d4-0001-0001-0001-000000000014", + "metadata": { + "trusted": true + }, + "outputs": [], + "source": "import io\n\nimport pandas as pd\n\n# Object key of the clustering TSV inside the cts bucket.\n# NOTE(review): the 0 path segment is hard-coded; presumably a per-container subdirectory -\n# confirm against the output listing printed in the previous section.\nCLUSTER_TSV_KEY = \"io/jplfaria/output/mmseqs2/test/v1/0/cluster_results_cluster.tsv\"\n\ncluster_obj = mincli.get_object(\"cts\", CLUSTER_TSV_KEY)\ntry:\n    df = pd.read_csv(\n        io.BytesIO(cluster_obj.read()),\n        sep=\"\\t\",\n        header=None,\n        names=[\"representative\", \"member\"],\n    )\nfinally:\n    # Always hand the HTTP connection back, even if the download or the parse fails.\n    cluster_obj.close()\n    cluster_obj.release_conn()\n\nprint(f\"{len(df)} cluster memberships, {df['representative'].nunique()} clusters\")\ndf.head(20)" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b684aa0-bf82-4d6b-b423-18206e32aaf4", + "metadata": { + "trusted": true + }, + "outputs": [], + "source": "" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file