diff --git a/.gitignore b/.gitignore index 0c04a67..5357979 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +notebooks/web_summary/ .DS_Store __pycache__ .ipynb_checkpoints diff --git a/notebooks/Run_CellRangerArc.ipynb b/notebooks/Run_CellRangerArc.ipynb index 83ba3fc..c0b2881 100644 --- a/notebooks/Run_CellRangerArc.ipynb +++ b/notebooks/Run_CellRangerArc.ipynb @@ -2,7 +2,112 @@ "cells": [ { "cell_type": "code", - "execution_count": 137, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "# OLD" + ] + }, + { + "cell_type": "code", + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -18,13 +123,134 @@ }, { "cell_type": "code", - "execution_count": 138, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from utils.utils import *" ] }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "# Functions" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "### DB query" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Common query col: id, request_id, Sample\n", + "def get_sample_name(query, query_col, creds):\n", + " \n", + " user = creds['user']\n", + " password = creds['password']\n", + " \n", + " try:\n", + " table_sample_data = \"peer_lab_db.sample_data\"\n", + " query = f\"\"\"\n", + " SELECT {table_sample_data}.Sample\n", + " FROM {table_sample_data}\n", + " WHERE {table_sample_data}.{query_col}=\"{query}\"\n", + " \"\"\"\n", + " sample_names = []\n", + " results = execute_query(query, user, password)\n", + " for result in results:\n", + " sample_names.append(result[0])\n", + " return sample_names\n", + " except Error as e:\n", + " print(f\"Error: {e}\")\n", + " \n", + " \n", + "def get_aws_path(query, query_col, creds):\n", + " \n", + " user = creds['user']\n", + " password = creds['password']\n", + " \n", + " try:\n", + " table_sample_data = \"peer_lab_db.sample_data\"\n", + " query = f\"\"\"\n", + " SELECT {table_sample_data}.AWS_storage\n", + " FROM {table_sample_data}\n", + " WHERE {table_sample_data}.{query_col}=\"{query}\"\n", + " \"\"\"\n", + " sample_paths = []\n", + " results = execute_query(query, user, password)\n", + " for result in 
results:\n", + " sample_paths.append(result[0])\n", + " return sample_paths\n", + " except Error as e:\n", + " print(f\"Error: {e}\")\n", + " \n", + " \n", + "def get_sample_id(query, query_col, creds):\n", + " \n", + " user = creds['user']\n", + " password = creds['password']\n", + " \n", + " try:\n", + " table_sample_data = \"peer_lab_db.sample_data\"\n", + " query = f\"\"\"\n", + " SELECT {table_sample_data}.id\n", + " FROM {table_sample_data}\n", + " WHERE {table_sample_data}.{query_col}=\"{query}\"\n", + " \"\"\"\n", + " sample_ids = []\n", + " results = execute_query(query, user, password)\n", + " for result in results:\n", + " sample_ids.append(result[0])\n", + " return sample_ids\n", + " except Error as e:\n", + " print(f\"Error: {e}\")\n", + " \n", + "def format_sample_aws(querys, query_col, creds):\n", + " sample_names = []\n", + " sample_paths = []\n", + " sample_ids = []\n", + " \n", + " for query in querys:\n", + " sample_names += get_sample_name(query, query_col, creds)\n", + " sample_paths += get_aws_path(query, query_col, creds)\n", + " sample_ids += get_sample_id(query, query_col, creds)\n", + " \n", + " sample_paths = [s.strip('/') for s in sample_paths] # remove trailing slash if exists\n", + " \n", + " samples = pd.DataFrame(\n", + " dict(S3_Path=sample_paths, Sample_ID=sample_ids),\n", + " index=sample_names,\n", + " dtype=str,\n", + " )\n", + " return samples" + ] + }, { "cell_type": "markdown", "metadata": { @@ -45,7 +271,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -54,7 +280,7 @@ "\n", "prefix = \"CellRangerArc\" # Workflow to run; also .wdl filename prefix\n", "pipeline_type = prefix # field in *.labels.json\n", - "output_dirname = \"cr-arc-results\"\n", + "output_dirname = \"mito_cr-arc-results\"\n", "\n", "# If need to add comment, put here\n", "comment = \"\"" @@ -62,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 12, 
"metadata": {}, "outputs": [], "source": [ @@ -79,7 +305,7 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -90,23 +316,181 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": null, "metadata": {}, "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: AWS_ACCESS_KEY_ID=ASIAYPY66CWMIRDUREVH\n", + "env: AWS_SECRET_ACCESS_KEY=N85006UCgijp8CGqzDy0MqLiz/hI/oGV8uSdmpTc\n", + "env: AWS_SESSION_TOKEN=IQoJb3JpZ2luX2VjEPv//////////wEaCXVzLWVhc3QtMSJHMEUCIQCqvxhWToi0KBwHNijvb9qersxDpPG+R9BY+2giQIgkWAIgSJHSQm8qENlVPnLIbJRoLh/P+0JY8vFmy89VKMIYmZYq+AEIxP//////////ARAEGgw1ODM2NDM1Njc1MTIiDGCfleiZCLh2Drd32irMAckv+DjF7GsRWDiNgxyfABRKUSrzNgSHdpkOKXZ3SE+1Ly0lob1mgtBH7eXLG73OO/SgG5CoxD74kTDoE/0Ofcr8tTuvwE64e5g6Jj4YR5ZR3YW7bWbn+C5OKr/CQu3oepGM6bBVDA43DsNXtDPmQhfpYj8LxT4uMocOUGyEDKZKyNh8T5+1ttt7BUv9zHz63zJcpsIMOB0+l+P2NV7xPWuPabuj3IOFpnCgzZxm4OSQytUTV1pLdIpsksl9hX2A0i2jCQohESGkbLb6RDDMvvKgBjqYAesAHvWnvogi9HG1gGztGKFkxIwhnqBc9Jcn+/lQDFRz6yQTOIsVaujBvMFSjoa8mFvp+eNCHWoXlbJrsAUgQjTuCnO5ZWjMotI+tJe/VzE5R3GynOGv4f7zNGfossJ0rqKi1Xku7KG61NF18BrhRNQk4QEcf8Uq6oaSjfnNm2GMwu2GfClpeb8C/UTFkTJXSQRIDddxn227\n", + "2021-10-07 15:31:32 agc-583643567512-us-east-1\n", + "2021-10-07 15:28:07 cdktoolkit-stagingbucket-d49u1xfb0sc4\n", + "2019-10-09 13:04:06 cf-templates-umiwbnq3566w-us-east-1\n", + "2019-10-10 12:46:54 dp-daily-reports\n", + "2017-09-26 10:15:15 dp-lab-data\n", + "2019-05-23 12:34:58 dp-lab-data-public\n", + "2018-03-01 13:29:07 dp-lab-glacier\n", + "2020-11-06 23:02:27 dp-lab-gwf-core\n", + "2021-07-15 18:14:01 dp-lab-gwf-core2\n", + "2017-09-26 10:17:18 dp-lab-home\n", + "2019-08-14 16:42:43 dp-lab-test\n", + "2019-04-25 12:35:35 elasticbeanstalk-us-east-1-583643567512\n", + "2019-12-13 15:53:57 scri-computational\n", + "2017-09-26 10:23:50 seqc-public\n" 
+ ] + } + ], "source": [ - "# Samples on which to run CellRangerATAC\n", - "# Note: Assumes data is transferred to AWS S3 (this should be an s3 location)\n", - "# Note: Assumes directory name is name of sample\n", - "common_dir = \"s3://dp-lab-data/collaborators/VanDenBrink/AgingHematopoiesis\"\n", - "samples = [\n", - " \"HA-1536_Young_mice_multiome\",\n", - " \"HA-1536_Old_mice_multiome\"\n", - "]\n", - "sample_paths = [\n", - " f\"{common_dir}/{sample}\" for sample in samples\n", - "]" + "%env AWS_ACCESS_KEY_ID=ASIAYPY66CWMIRDUREVH\n", + "%env AWS_SECRET_ACCESS_KEY=N85006UCgijp8CGqzDy0MqLiz/hI/oGV8uSdmpTc\n", + "%env AWS_SESSION_TOKEN=IQoJb3JpZ2luX2VjEPv//////////wEaCXVzLWVhc3QtMSJHMEUCIQCqvxhWToi0KBwHNijvb9qersxDpPG+R9BY+2giQIgkWAIgSJHSQm8qENlVPnLIbJRoLh/P+0JY8vFmy89VKMIYmZYq+AEIxP//////////ARAEGgw1ODM2NDM1Njc1MTIiDGCfleiZCLh2Drd32irMAckv+DjF7GsRWDiNgxyfABRKUSrzNgSHdpkOKXZ3SE+1Ly0lob1mgtBH7eXLG73OO/SgG5CoxD74kTDoE/0Ofcr8tTuvwE64e5g6Jj4YR5ZR3YW7bWbn+C5OKr/CQu3oepGM6bBVDA43DsNXtDPmQhfpYj8LxT4uMocOUGyEDKZKyNh8T5+1ttt7BUv9zHz63zJcpsIMOB0+l+P2NV7xPWuPabuj3IOFpnCgzZxm4OSQytUTV1pLdIpsksl9hX2A0i2jCQohESGkbLb6RDDMvvKgBjqYAesAHvWnvogi9HG1gGztGKFkxIwhnqBc9Jcn+/lQDFRz6yQTOIsVaujBvMFSjoa8mFvp+eNCHWoXlbJrsAUgQjTuCnO5ZWjMotI+tJe/VzE5R3GynOGv4f7zNGfossJ0rqKi1Xku7KG61NF18BrhRNQk4QEcf8Uq6oaSjfnNm2GMwu2GfClpeb8C/UTFkTJXSQRIDddxn227\n", + "!aws s3 ls" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sample Information" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "execute_query() takes 2 positional arguments but 3 were given", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Input \u001b[0;32mIn 
[15]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Common query col: id, request_id, Sample\u001b[39;00m\n\u001b[1;32m 3\u001b[0m request_ids \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAE-2166\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[0;32m----> 5\u001b[0m samples \u001b[38;5;241m=\u001b[39m \u001b[43mformat_sample_aws\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest_ids\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrequest_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 6\u001b[0m samples\n", + "Input \u001b[0;32mIn [10]\u001b[0m, in \u001b[0;36mformat_sample_aws\u001b[0;34m(querys, query_col, creds)\u001b[0m\n\u001b[1;32m 67\u001b[0m sample_ids \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 69\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m query \u001b[38;5;129;01min\u001b[39;00m querys:\n\u001b[0;32m---> 70\u001b[0m sample_names \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[43mget_sample_name\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mquery_col\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 71\u001b[0m sample_paths \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m get_aws_path(query, query_col, creds)\n\u001b[1;32m 72\u001b[0m sample_ids \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m get_sample_id(query, query_col, creds)\n", + "Input \u001b[0;32mIn [10]\u001b[0m, in \u001b[0;36mget_sample_name\u001b[0;34m(query, query_col, creds)\u001b[0m\n\u001b[1;32m 9\u001b[0m query \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;124mSELECT 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtable_sample_data\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.Sample\u001b[39m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;124mFROM \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtable_sample_data\u001b[38;5;132;01m}\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124mWHERE \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtable_sample_data\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mquery_col\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mquery\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124m\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[1;32m 14\u001b[0m sample_names \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m---> 15\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[43mexecute_query\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43muser\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpassword\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m result \u001b[38;5;129;01min\u001b[39;00m results:\n\u001b[1;32m 17\u001b[0m sample_names\u001b[38;5;241m.\u001b[39mappend(result[\u001b[38;5;241m0\u001b[39m])\n", + "\u001b[0;31mTypeError\u001b[0m: execute_query() takes 2 positional arguments but 3 were given" + ] + } + ], + "source": [ + "# Common query col: id, request_id, Sample\n", + "\n", + "request_ids = ['AE-2166']\n", + "\n", + "samples = format_sample_aws(request_ids, 'request_id', creds)\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
S3_PathSample_ID
JR-1603_Mm_veh_CSF_multiomes3://dp-lab-data/collaborators/aboir...3708
JR-1603_LM_CSF_multiomes3://dp-lab-data/collaborators/aboir...3709
JR-1603_LPS_CSF_multiomes3://dp-lab-data/collaborators/aboir...3710
JR-1603_LM_blood_multiomes3://dp-lab-data/collaborators/aboir...3711
JR-1603_LPS_blood_multiomes3://dp-lab-data/collaborators/aboir...3712
JR-1603_veh_blood_multiomes3://dp-lab-data/collaborators/aboir...3713
\n", + "
" + ], + "text/plain": [ + " S3_Path Sample_ID\n", + "JR-1603_Mm_veh_CSF_multiome s3://dp-lab-data/collaborators/aboir... 3708\n", + "JR-1603_LM_CSF_multiome s3://dp-lab-data/collaborators/aboir... 3709\n", + "JR-1603_LPS_CSF_multiome s3://dp-lab-data/collaborators/aboir... 3710\n", + "JR-1603_LM_blood_multiome s3://dp-lab-data/collaborators/aboir... 3711\n", + "JR-1603_LPS_blood_multiome s3://dp-lab-data/collaborators/aboir... 3712\n", + "JR-1603_veh_blood_multiome s3://dp-lab-data/collaborators/aboir... 3713" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples = samples.iloc[:6, ]\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": { @@ -118,33 +502,162 @@ }, { "cell_type": "code", - "execution_count": 130, + "execution_count": 15, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "['HA-1536_Young_mice_multiome', 'HA-1536_Old_mice_multiome']\n" + "/var/folders/9v/1lnyrkxd0yq8l2pgl002wh1s7c6fqv/T/ipykernel_4449/3639555678.py:4: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " samples[\"FASTQs_GEX\"] = samples[\"S3_Path\"].apply(lambda x: get_fastqs(x, folder=\"FASTQ\"))\n", + "/var/folders/9v/1lnyrkxd0yq8l2pgl002wh1s7c6fqv/T/ipykernel_4449/3639555678.py:5: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: 
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " samples[\"FASTQs_ATAC\"] = samples[\"S3_Path\"].apply(lambda x: get_fastqs(x + \"_ATAC\", folder=\"FASTQ\"))\n", + "/var/folders/9v/1lnyrkxd0yq8l2pgl002wh1s7c6fqv/T/ipykernel_4449/3639555678.py:8: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " samples[\"Reference\"] = samples[\"Sample_ID\"].apply(lambda x: get_cr_reference(x, prefix, creds[\"user\"], creds[\"password\"]))\n" ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
S3_PathSample_IDFASTQs_GEXFASTQs_ATACReference
JR-1603_Mm_veh_CSF_multiomes3://dp-lab-data/collaborators/aboir...3708{'All': ['s3://dp-lab-data/collabora...{'All': ['s3://dp-lab-data/collabora...https://cf.10xgenomics.com/supp/cell...
JR-1603_LM_CSF_multiomes3://dp-lab-data/collaborators/aboir...3709{'All': ['s3://dp-lab-data/collabora...{'All': ['s3://dp-lab-data/collabora...https://cf.10xgenomics.com/supp/cell...
JR-1603_LPS_CSF_multiomes3://dp-lab-data/collaborators/aboir...3710{'All': ['s3://dp-lab-data/collabora...{'All': ['s3://dp-lab-data/collabora...https://cf.10xgenomics.com/supp/cell...
JR-1603_LM_blood_multiomes3://dp-lab-data/collaborators/aboir...3711{'All': ['s3://dp-lab-data/collabora...{'All': ['s3://dp-lab-data/collabora...https://cf.10xgenomics.com/supp/cell...
JR-1603_LPS_blood_multiomes3://dp-lab-data/collaborators/aboir...3712{'All': ['s3://dp-lab-data/collabora...{'All': ['s3://dp-lab-data/collabora...https://cf.10xgenomics.com/supp/cell...
JR-1603_veh_blood_multiomes3://dp-lab-data/collaborators/aboir...3713{'All': ['s3://dp-lab-data/collabora...{'All': ['s3://dp-lab-data/collabora...https://cf.10xgenomics.com/supp/cell...
\n", + "
" + ], + "text/plain": [ + " S3_Path \\\n", + "JR-1603_Mm_veh_CSF_multiome s3://dp-lab-data/collaborators/aboir... \n", + "JR-1603_LM_CSF_multiome s3://dp-lab-data/collaborators/aboir... \n", + "JR-1603_LPS_CSF_multiome s3://dp-lab-data/collaborators/aboir... \n", + "JR-1603_LM_blood_multiome s3://dp-lab-data/collaborators/aboir... \n", + "JR-1603_LPS_blood_multiome s3://dp-lab-data/collaborators/aboir... \n", + "JR-1603_veh_blood_multiome s3://dp-lab-data/collaborators/aboir... \n", + "\n", + " Sample_ID \\\n", + "JR-1603_Mm_veh_CSF_multiome 3708 \n", + "JR-1603_LM_CSF_multiome 3709 \n", + "JR-1603_LPS_CSF_multiome 3710 \n", + "JR-1603_LM_blood_multiome 3711 \n", + "JR-1603_LPS_blood_multiome 3712 \n", + "JR-1603_veh_blood_multiome 3713 \n", + "\n", + " FASTQs_GEX \\\n", + "JR-1603_Mm_veh_CSF_multiome {'All': ['s3://dp-lab-data/collabora... \n", + "JR-1603_LM_CSF_multiome {'All': ['s3://dp-lab-data/collabora... \n", + "JR-1603_LPS_CSF_multiome {'All': ['s3://dp-lab-data/collabora... \n", + "JR-1603_LM_blood_multiome {'All': ['s3://dp-lab-data/collabora... \n", + "JR-1603_LPS_blood_multiome {'All': ['s3://dp-lab-data/collabora... \n", + "JR-1603_veh_blood_multiome {'All': ['s3://dp-lab-data/collabora... \n", + "\n", + " FASTQs_ATAC \\\n", + "JR-1603_Mm_veh_CSF_multiome {'All': ['s3://dp-lab-data/collabora... \n", + "JR-1603_LM_CSF_multiome {'All': ['s3://dp-lab-data/collabora... \n", + "JR-1603_LPS_CSF_multiome {'All': ['s3://dp-lab-data/collabora... \n", + "JR-1603_LM_blood_multiome {'All': ['s3://dp-lab-data/collabora... \n", + "JR-1603_LPS_blood_multiome {'All': ['s3://dp-lab-data/collabora... \n", + "JR-1603_veh_blood_multiome {'All': ['s3://dp-lab-data/collabora... \n", + "\n", + " Reference \n", + "JR-1603_Mm_veh_CSF_multiome https://cf.10xgenomics.com/supp/cell... \n", + "JR-1603_LM_CSF_multiome https://cf.10xgenomics.com/supp/cell... \n", + "JR-1603_LPS_CSF_multiome https://cf.10xgenomics.com/supp/cell... 
\n", + "JR-1603_LM_blood_multiome https://cf.10xgenomics.com/supp/cell... \n", + "JR-1603_LPS_blood_multiome https://cf.10xgenomics.com/supp/cell... \n", + "JR-1603_veh_blood_multiome https://cf.10xgenomics.com/supp/cell... " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "# Get information for all samples\n", - "sample_paths = [s.strip('/') for s in sample_paths] # remove trailing slash if exists\n", - "sample_names = [os.path.basename(s) for s in sample_paths]\n", - "#sample_names = [s.replace(\"Redo_\", \"\") for s in sample_names]\n", - "print(sample_names)\n", - "samples = pd.DataFrame(\n", - " sample_paths,\n", - " index=sample_names,\n", - " columns=[\"S3_Path\"],\n", - " dtype=str,\n", - ")\n", - "samples[\"Sample_ID\"] = pd.Series(samples.index).apply(\n", - " lambda x: get_sample_id(x, creds['user'], creds['password'])\n", - ").values\n", - "\n", "# Get FASTQ paths from S3\n", "# Note: Uses same FASTQ file ids for all samples\n", "#fastq_file_ids = fastq_map[prefix]\n", @@ -152,12 +665,13 @@ "samples[\"FASTQs_ATAC\"] = samples[\"S3_Path\"].apply(lambda x: get_fastqs(x + \"_ATAC\", folder=\"FASTQ\"))\n", "\n", "# Get reference genome location\n", - "samples[\"Reference\"] = samples[\"Sample_ID\"].apply(lambda x: get_cr_reference(x, prefix, creds[\"user\"], creds[\"password\"]))" + "samples[\"Reference\"] = samples[\"Sample_ID\"].apply(lambda x: get_cr_reference(x, prefix, creds[\"user\"], creds[\"password\"]))\n", + "samples" ] }, { "cell_type": "code", - "execution_count": 141, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -196,7 +710,14 @@ }, { "cell_type": "code", - "execution_count": 142, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -231,22 +752,62 @@ " \n", " \n", " \n", - " HA-1536_Young_mice_multiome\n", - " HA-1536_Young_mice_multiome\n", - " 
3573_HA-1536_Young_mice_multiome_IGO...\n", - " [s3://dp-lab-data/collaborators/VanD...\n", - " 3575_HA-1536_Young_mice_multiome_ATA...\n", - " [s3://dp-lab-data/collaborators/VanD...\n", + " JR-1603_Mm_veh_CSF_multiome\n", + " JR-1603_Mm_veh_CSF_multiome\n", + " 3708_JR-1603_Mm_veh_CSF_multiome_IGO...\n", + " [s3://dp-lab-data/collaborators/aboi...\n", + " 3714_JR-1603_Mm_veh_CSF_multiome_ATA...\n", + " [s3://dp-lab-data/collaborators/aboi...\n", " https://cf.10xgenomics.com/supp/cell...\n", " quay.io/hisplan\n", " \n", " \n", - " HA-1536_Old_mice_multiome\n", - " HA-1536_Old_mice_multiome\n", - " 3574_HA-1536_Old_mice_multiome_IGO_1...\n", - " [s3://dp-lab-data/collaborators/VanD...\n", - " 3576_HA-1536_Old_mice_multiome_ATAC_...\n", - " [s3://dp-lab-data/collaborators/VanD...\n", + " JR-1603_LM_CSF_multiome\n", + " JR-1603_LM_CSF_multiome\n", + " 3709_JR-1603_LM_CSF_multiome_IGO_124...\n", + " [s3://dp-lab-data/collaborators/aboi...\n", + " 3715_JR-1603_LM_CSF_multiome_ATAC_IG...\n", + " [s3://dp-lab-data/collaborators/aboi...\n", + " https://cf.10xgenomics.com/supp/cell...\n", + " quay.io/hisplan\n", + " \n", + " \n", + " JR-1603_LPS_CSF_multiome\n", + " JR-1603_LPS_CSF_multiome\n", + " 3710_JR-1603_LPS_CSF_multiome_IGO_12...\n", + " [s3://dp-lab-data/collaborators/aboi...\n", + " 3716_JR-1603_LPS_CSF_multiome_ATAC_I...\n", + " [s3://dp-lab-data/collaborators/aboi...\n", + " https://cf.10xgenomics.com/supp/cell...\n", + " quay.io/hisplan\n", + " \n", + " \n", + " JR-1603_LM_blood_multiome\n", + " JR-1603_LM_blood_multiome\n", + " 3711_JR-1603_LM_blood_multiome_IGO_1...\n", + " [s3://dp-lab-data/collaborators/aboi...\n", + " 3718_JR-1603_LM_blood_multiome_ATAC_...\n", + " [s3://dp-lab-data/collaborators/aboi...\n", + " https://cf.10xgenomics.com/supp/cell...\n", + " quay.io/hisplan\n", + " \n", + " \n", + " JR-1603_LPS_blood_multiome\n", + " JR-1603_LPS_blood_multiome\n", + " 3712_JR-1603_LPS_blood_multiome_IGO_...\n", + " 
[s3://dp-lab-data/collaborators/aboi...\n", + " 3719_JR-1603_LPS_blood_multiome_ATAC...\n", + " [s3://dp-lab-data/collaborators/aboi...\n", + " https://cf.10xgenomics.com/supp/cell...\n", + " quay.io/hisplan\n", + " \n", + " \n", + " JR-1603_veh_blood_multiome\n", + " JR-1603_veh_blood_multiome\n", + " 3713_JR-1603_veh_blood_multiome_IGO_...\n", + " [s3://dp-lab-data/collaborators/aboi...\n", + " 3717_JR-1603_veh_blood_multiome_ATAC...\n", + " [s3://dp-lab-data/collaborators/aboi...\n", " https://cf.10xgenomics.com/supp/cell...\n", " quay.io/hisplan\n", " \n", @@ -256,35 +817,63 @@ ], "text/plain": [ " CellRangerArc.runID \\\n", - "HA-1536_Young_mice_multiome HA-1536_Young_mice_multiome \n", - "HA-1536_Old_mice_multiome HA-1536_Old_mice_multiome \n", + "JR-1603_Mm_veh_CSF_multiome JR-1603_Mm_veh_CSF_multiome \n", + "JR-1603_LM_CSF_multiome JR-1603_LM_CSF_multiome \n", + "JR-1603_LPS_CSF_multiome JR-1603_LPS_CSF_multiome \n", + "JR-1603_LM_blood_multiome JR-1603_LM_blood_multiome \n", + "JR-1603_LPS_blood_multiome JR-1603_LPS_blood_multiome \n", + "JR-1603_veh_blood_multiome JR-1603_veh_blood_multiome \n", "\n", " CellRangerArc.gexFastqName \\\n", - "HA-1536_Young_mice_multiome 3573_HA-1536_Young_mice_multiome_IGO... \n", - "HA-1536_Old_mice_multiome 3574_HA-1536_Old_mice_multiome_IGO_1... \n", + "JR-1603_Mm_veh_CSF_multiome 3708_JR-1603_Mm_veh_CSF_multiome_IGO... \n", + "JR-1603_LM_CSF_multiome 3709_JR-1603_LM_CSF_multiome_IGO_124... \n", + "JR-1603_LPS_CSF_multiome 3710_JR-1603_LPS_CSF_multiome_IGO_12... \n", + "JR-1603_LM_blood_multiome 3711_JR-1603_LM_blood_multiome_IGO_1... \n", + "JR-1603_LPS_blood_multiome 3712_JR-1603_LPS_blood_multiome_IGO_... \n", + "JR-1603_veh_blood_multiome 3713_JR-1603_veh_blood_multiome_IGO_... \n", "\n", " CellRangerArc.gexFastqFiles \\\n", - "HA-1536_Young_mice_multiome [s3://dp-lab-data/collaborators/VanD... \n", - "HA-1536_Old_mice_multiome [s3://dp-lab-data/collaborators/VanD... 
\n", + "JR-1603_Mm_veh_CSF_multiome [s3://dp-lab-data/collaborators/aboi... \n", + "JR-1603_LM_CSF_multiome [s3://dp-lab-data/collaborators/aboi... \n", + "JR-1603_LPS_CSF_multiome [s3://dp-lab-data/collaborators/aboi... \n", + "JR-1603_LM_blood_multiome [s3://dp-lab-data/collaborators/aboi... \n", + "JR-1603_LPS_blood_multiome [s3://dp-lab-data/collaborators/aboi... \n", + "JR-1603_veh_blood_multiome [s3://dp-lab-data/collaborators/aboi... \n", "\n", " CellRangerArc.atacFastqName \\\n", - "HA-1536_Young_mice_multiome 3575_HA-1536_Young_mice_multiome_ATA... \n", - "HA-1536_Old_mice_multiome 3576_HA-1536_Old_mice_multiome_ATAC_... \n", + "JR-1603_Mm_veh_CSF_multiome 3714_JR-1603_Mm_veh_CSF_multiome_ATA... \n", + "JR-1603_LM_CSF_multiome 3715_JR-1603_LM_CSF_multiome_ATAC_IG... \n", + "JR-1603_LPS_CSF_multiome 3716_JR-1603_LPS_CSF_multiome_ATAC_I... \n", + "JR-1603_LM_blood_multiome 3718_JR-1603_LM_blood_multiome_ATAC_... \n", + "JR-1603_LPS_blood_multiome 3719_JR-1603_LPS_blood_multiome_ATAC... \n", + "JR-1603_veh_blood_multiome 3717_JR-1603_veh_blood_multiome_ATAC... \n", "\n", " CellRangerArc.atacFastqFiles \\\n", - "HA-1536_Young_mice_multiome [s3://dp-lab-data/collaborators/VanD... \n", - "HA-1536_Old_mice_multiome [s3://dp-lab-data/collaborators/VanD... \n", + "JR-1603_Mm_veh_CSF_multiome [s3://dp-lab-data/collaborators/aboi... \n", + "JR-1603_LM_CSF_multiome [s3://dp-lab-data/collaborators/aboi... \n", + "JR-1603_LPS_CSF_multiome [s3://dp-lab-data/collaborators/aboi... \n", + "JR-1603_LM_blood_multiome [s3://dp-lab-data/collaborators/aboi... \n", + "JR-1603_LPS_blood_multiome [s3://dp-lab-data/collaborators/aboi... \n", + "JR-1603_veh_blood_multiome [s3://dp-lab-data/collaborators/aboi... \n", "\n", " CellRangerArc.reference \\\n", - "HA-1536_Young_mice_multiome https://cf.10xgenomics.com/supp/cell... \n", - "HA-1536_Old_mice_multiome https://cf.10xgenomics.com/supp/cell... \n", + "JR-1603_Mm_veh_CSF_multiome https://cf.10xgenomics.com/supp/cell... 
\n", + "JR-1603_LM_CSF_multiome https://cf.10xgenomics.com/supp/cell... \n", + "JR-1603_LPS_CSF_multiome https://cf.10xgenomics.com/supp/cell... \n", + "JR-1603_LM_blood_multiome https://cf.10xgenomics.com/supp/cell... \n", + "JR-1603_LPS_blood_multiome https://cf.10xgenomics.com/supp/cell... \n", + "JR-1603_veh_blood_multiome https://cf.10xgenomics.com/supp/cell... \n", "\n", " CellRangerArc.dockerRegistry \n", - "HA-1536_Young_mice_multiome quay.io/hisplan \n", - "HA-1536_Old_mice_multiome quay.io/hisplan " + "JR-1603_Mm_veh_CSF_multiome quay.io/hisplan \n", + "JR-1603_LM_CSF_multiome quay.io/hisplan \n", + "JR-1603_LPS_CSF_multiome quay.io/hisplan \n", + "JR-1603_LM_blood_multiome quay.io/hisplan \n", + "JR-1603_LPS_blood_multiome quay.io/hisplan \n", + "JR-1603_veh_blood_multiome quay.io/hisplan " ] }, - "execution_count": 142, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -295,7 +884,40 @@ }, { "cell_type": "code", - "execution_count": 143, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['https://cf.10xgenomics.com/supp/cell-arc/refdata-cellranger-arc-mm10-2020-A-2.0.0.tar.gz',\n", + " 'https://cf.10xgenomics.com/supp/cell-arc/refdata-cellranger-arc-mm10-2020-A-2.0.0.tar.gz',\n", + " 'https://cf.10xgenomics.com/supp/cell-arc/refdata-cellranger-arc-mm10-2020-A-2.0.0.tar.gz',\n", + " 'https://cf.10xgenomics.com/supp/cell-arc/refdata-cellranger-arc-mm10-2020-A-2.0.0.tar.gz',\n", + " 'https://cf.10xgenomics.com/supp/cell-arc/refdata-cellranger-arc-mm10-2020-A-2.0.0.tar.gz',\n", + " 'https://cf.10xgenomics.com/supp/cell-arc/refdata-cellranger-arc-mm10-2020-A-2.0.0.tar.gz'],\n", + " dtype=object)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs[f'{prefix}.reference'].values" + ] + }, + { + "cell_type": 
"code", + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -330,48 +952,104 @@ " \n", " \n", " \n", - " HA-1536_Young_mice_multiome\n", + " JR-1603_Mm_veh_CSF_multiome\n", + " CellRangerARC\n", + " mouse CSF multiome\n", + " JR-1603_Mm_veh_CSF_multiome\n", + " sohailn\n", + " s3://dp-lab-data/collaborators/aboir...\n", + " -\n", + " sohailn\n", + " \n", + " \n", + " JR-1603_LM_CSF_multiome\n", " CellRangerARC\n", - " Aging hematopoiesis\n", - " HA-1536_Young_mice_multiome\n", - " moormana\n", - " s3://dp-lab-data/collaborators/VanDe...\n", + " mouse CSF multiome\n", + " JR-1603_LM_CSF_multiome\n", + " sohailn\n", + " s3://dp-lab-data/collaborators/aboir...\n", " -\n", - " moormana\n", + " sohailn\n", " \n", " \n", - " HA-1536_Old_mice_multiome\n", + " JR-1603_LPS_CSF_multiome\n", " CellRangerARC\n", - " Aging hematopoiesis\n", - " HA-1536_Old_mice_multiome\n", - " moormana\n", - " s3://dp-lab-data/collaborators/VanDe...\n", + " mouse CSF multiome\n", + " JR-1603_LPS_CSF_multiome\n", + " sohailn\n", + " s3://dp-lab-data/collaborators/aboir...\n", " -\n", - " moormana\n", + " sohailn\n", + " \n", + " \n", + " JR-1603_LM_blood_multiome\n", + " CellRangerARC\n", + " mouse CSF multiome\n", + " JR-1603_LM_blood_multiome\n", + " sohailn\n", + " s3://dp-lab-data/collaborators/aboir...\n", + " -\n", + " sohailn\n", + " \n", + " \n", + " JR-1603_LPS_blood_multiome\n", + " CellRangerARC\n", + " mouse CSF multiome\n", + " JR-1603_LPS_blood_multiome\n", + " sohailn\n", + " s3://dp-lab-data/collaborators/aboir...\n", + " -\n", + " sohailn\n", + " \n", + " \n", + " JR-1603_veh_blood_multiome\n", + " CellRangerARC\n", + " mouse CSF multiome\n", + " JR-1603_veh_blood_multiome\n", + " sohailn\n", + " s3://dp-lab-data/collaborators/aboir...\n", + " -\n", + " sohailn\n", " \n", " \n", "\n", "" ], "text/plain": [ - " pipelineType project \\\n", - "HA-1536_Young_mice_multiome CellRangerARC Aging hematopoiesis \n", - "HA-1536_Old_mice_multiome CellRangerARC Aging hematopoiesis 
\n", + " pipelineType project \\\n", + "JR-1603_Mm_veh_CSF_multiome CellRangerARC mouse CSF multiome \n", + "JR-1603_LM_CSF_multiome CellRangerARC mouse CSF multiome \n", + "JR-1603_LPS_CSF_multiome CellRangerARC mouse CSF multiome \n", + "JR-1603_LM_blood_multiome CellRangerARC mouse CSF multiome \n", + "JR-1603_LPS_blood_multiome CellRangerARC mouse CSF multiome \n", + "JR-1603_veh_blood_multiome CellRangerARC mouse CSF multiome \n", "\n", - " sample owner \\\n", - "HA-1536_Young_mice_multiome HA-1536_Young_mice_multiome moormana \n", - "HA-1536_Old_mice_multiome HA-1536_Old_mice_multiome moormana \n", + " sample owner \\\n", + "JR-1603_Mm_veh_CSF_multiome JR-1603_Mm_veh_CSF_multiome sohailn \n", + "JR-1603_LM_CSF_multiome JR-1603_LM_CSF_multiome sohailn \n", + "JR-1603_LPS_CSF_multiome JR-1603_LPS_CSF_multiome sohailn \n", + "JR-1603_LM_blood_multiome JR-1603_LM_blood_multiome sohailn \n", + "JR-1603_LPS_blood_multiome JR-1603_LPS_blood_multiome sohailn \n", + "JR-1603_veh_blood_multiome JR-1603_veh_blood_multiome sohailn \n", "\n", " destination transfer \\\n", - "HA-1536_Young_mice_multiome s3://dp-lab-data/collaborators/VanDe... - \n", - "HA-1536_Old_mice_multiome s3://dp-lab-data/collaborators/VanDe... - \n", + "JR-1603_Mm_veh_CSF_multiome s3://dp-lab-data/collaborators/aboir... - \n", + "JR-1603_LM_CSF_multiome s3://dp-lab-data/collaborators/aboir... - \n", + "JR-1603_LPS_CSF_multiome s3://dp-lab-data/collaborators/aboir... - \n", + "JR-1603_LM_blood_multiome s3://dp-lab-data/collaborators/aboir... - \n", + "JR-1603_LPS_blood_multiome s3://dp-lab-data/collaborators/aboir... - \n", + "JR-1603_veh_blood_multiome s3://dp-lab-data/collaborators/aboir... 
- \n", "\n", - " comment \n", - "HA-1536_Young_mice_multiome moormana \n", - "HA-1536_Old_mice_multiome moormana " + " comment \n", + "JR-1603_Mm_veh_CSF_multiome sohailn \n", + "JR-1603_LM_CSF_multiome sohailn \n", + "JR-1603_LPS_CSF_multiome sohailn \n", + "JR-1603_LM_blood_multiome sohailn \n", + "JR-1603_LPS_blood_multiome sohailn \n", + "JR-1603_veh_blood_multiome sohailn " ] }, - "execution_count": 143, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -382,7 +1060,7 @@ }, { "cell_type": "code", - "execution_count": 139, + "execution_count": 32, "metadata": { "tags": [] }, @@ -390,12 +1068,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "b8a2a9c63f794d19a5ec5dfb07189768", + "model_id": "607945276fa64aadb7b02efee705038f", "version_major": 2, "version_minor": 0 }, "text/plain": [ - " 0%| | 0/2 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreference
Sample
AE-2116_mDA_neurons_DOGMA-seq_multiomes3://dp-lab-data/collaborators/strud...4531humanmultiomeDOGMAseq sample plusGRCh38
\n", + "" + ], + "text/plain": [ + " AWS_storage \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome s3://dp-lab-data/collaborators/strud... \n", + "\n", + " id species sc_tech \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome 4531 human multiome \n", + "\n", + " project_id reference \n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome DOGMAseq sample plus GRCh38 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can supply a list of IDs or request IDs or even AWS paths\n", + "\n", + "request_ids = ['AE-2116']\n", + "samples = sample_scridb_info(request_ids, 'request_id', creds)\n", + "samples = samples.iloc[0:1]\n", + "\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "IMPORTANT NOTE \n", + "\n", + "Check the name of the folder you are running. Typically the folder that is stored in the database is just the GEX. So if another library is generated (multiome ATAC, VDJ, hashtag, etc) then it needs to be manually changed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AE-2116_mDA_neurons_DOGMA-seq_multiome\n", + " PRE AE-2116_mDA_neurons_DOGMA-seq_multiome/\n", + " PRE AE-2116_mDA_neurons_DOGMA-seq_multiome_HTO/\n", + " PRE AE-2116_mDA_neurons_DOGMA-seq_multiome_mATAC/\n", + "\n" + ] + } + ], + "source": [ + "# Check the name of the folder you are running\n", + "# Especially if there are multiple libraries (i.e ATAC, TCR_VDJ, etc.)\n", + "\n", + "for sample, row in samples.iterrows():\n", + " print(sample)\n", + " path = os.path.split(row['AWS_storage'])[0] + '/'\n", + " os.system(f'aws s3 ls {path} | grep {sample}')\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreferenceFASTQ_gexFASTQ_atac
Sample
AE-2116_mDA_neurons_DOGMA-seq_multiomes3://dp-lab-data/collaborators/strud...4531humanmultiomeDOGMAseq sample plusGRCh38{'I1': ['s3://dp-lab-data/collaborat...{'I1': ['s3://dp-lab-data/collaborat...
\n", + "
" + ], + "text/plain": [ + " AWS_storage \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome s3://dp-lab-data/collaborators/strud... \n", + "\n", + " id species sc_tech \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome 4531 human multiome \n", + "\n", + " project_id reference \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome DOGMAseq sample plus GRCh38 \n", + "\n", + " FASTQ_gex \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome {'I1': ['s3://dp-lab-data/collaborat... \n", + "\n", + " FASTQ_atac \n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome {'I1': ['s3://dp-lab-data/collaborat... " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples[\"FASTQ_gex\"] = samples[\"AWS_storage\"].apply(lambda x: get_fastqs(x, fastq_map['CellRangerGex'], \"FASTQ\"))\n", + "samples[\"FASTQ_atac\"] = (samples[\"AWS_storage\"] + '_mATAC').apply(lambda x: get_fastqs(x, fastq_map['CellRangerATAC'], \"FASTQ\"))\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "IMPORTANT NOTE\n", + "\n", + "Make sure that your files are not archived. The following command will print any FASTQ file that is archived. Unarchive the files and then come back to processing the sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AE-2116_mDA_neurons_DOGMA-seq_multiome\n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome\n" + ] + } + ], + "source": [ + "for sample, row in samples.iterrows():\n", + " print(sample)\n", + " fastqs = np.ravel(list(row['FASTQ_gex'].values()))\n", + " dirnames = set([os.path.dirname(x) for x in fastqs])\n", + " \n", + " for dirname in dirnames:\n", + " file = dirname.replace('s3://', '')\n", + " \n", + " bucket = file.split('/')[0]\n", + " pre = file.replace(f'{bucket}/', '')\n", + " \n", + " !aws s3api list-objects-v2 --bucket $bucket --prefix $pre --query \"Contents[?StorageClass!='STANDARD'].Key\" --output text \n", + " \n", + "for sample, row in samples.iterrows():\n", + " print(sample)\n", + " fastqs = np.ravel(list(row['FASTQ_atac'].values()))\n", + " dirnames = set([os.path.dirname(x) for x in fastqs])\n", + " \n", + " for dirname in dirnames:\n", + " file = dirname.replace('s3://', '')\n", + " \n", + " bucket = file.split('/')[0]\n", + " pre = file.replace(f'{bucket}/', '')\n", + " \n", + " !aws s3api list-objects-v2 --bucket $bucket --prefix $pre --query \"Contents[?StorageClass!='STANDARD'].Key\" --output text " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "IMPORTANT NOTE \n", + "\n", + "For CellRanger you need to supply an HTTPS path. So if you are using a custom genome stored on AWS, you must make the reference public ! Be sure to manually change the \"reference\" argument if it has not been updated correctly!!!!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['https://dp-lab-data.s3.amazonaws.com/collaborators/sfeira/YfDogmaSeqMtdna/references/cr-arc-GRCh38-atac-with-mito-2020.tar.gz']" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples['reference'] = 'https://dp-lab-data.s3.amazonaws.com/collaborators/sfeira/YfDogmaSeqMtdna/references/cr-arc-GRCh38-atac-with-mito-2020.tar.gz'\n", + "samples['reference'].tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generate inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CellRangerArc.runIDCellRangerArc.gexFastqNameCellRangerArc.gexFastqFilesCellRangerArc.atacFastqNameCellRangerArc.atacFastqFilesCellRangerArc.referenceCellRangerArc.dockerRegistry
Sample
AE-2116_mDA_neurons_DOGMA-seq_multiomeAE-2116_mDA_neurons_DOGMA-seq_multiome4531_AE-2116_mDA_neurons_DOGMA-seq_m...[s3://dp-lab-data/collaborators/stru...4532_AE-2116_mDA_neurons_DOGMA-seq_m...[s3://dp-lab-data/collaborators/stru...https://dp-lab-data.s3.amazonaws.com...quay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " CellRangerArc.runID \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome AE-2116_mDA_neurons_DOGMA-seq_multiome \n", + "\n", + " CellRangerArc.gexFastqName \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome 4531_AE-2116_mDA_neurons_DOGMA-seq_m... \n", + "\n", + " CellRangerArc.gexFastqFiles \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome [s3://dp-lab-data/collaborators/stru... \n", + "\n", + " CellRangerArc.atacFastqName \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome 4532_AE-2116_mDA_neurons_DOGMA-seq_m... \n", + "\n", + " CellRangerArc.atacFastqFiles \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome [s3://dp-lab-data/collaborators/stru... \n", + "\n", + " CellRangerArc.reference \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome https://dp-lab-data.s3.amazonaws.com... \n", + "\n", + " CellRangerArc.dockerRegistry \n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome quay.io/hisplan " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Standard inputs, modify fields as need be\n", + "\n", + "# Load minimum inputs and labels fields from templates\n", + "with open(f\"{config_dir}/template.inputs.json\") as f:\n", + " std_inputs_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields,)\n", + "\n", + "# Annotate inputs\n", + "inputs[f\"{prefix}.runID\"] = inputs.index\n", + "inputs[f\"{prefix}.gexFastqFiles\"] = samples[\"FASTQ_gex\"].apply(lambda x: np.ravel(list(x.values())))\n", + "inputs[f\"{prefix}.gexFastqName\"] = inputs[f\"{prefix}.gexFastqFiles\"].apply(lambda x: get_fastqs_name(x))\n", + "\n", + "inputs[f\"{prefix}.atacFastqFiles\"] = samples[\"FASTQ_atac\"].apply(lambda x: np.ravel(list(x.values())))\n", + "inputs[f\"{prefix}.atacFastqName\"] = 
inputs[f\"{prefix}.atacFastqFiles\"].apply(lambda x: get_fastqs_name(x))\n", + "\n", + "inputs[f\"{prefix}.reference\"] = samples[\"reference\"] \n", + "inputs[f\"{prefix}.dockerRegistry\"] = common_docker_registry\n", + "\n", + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generate labels" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
Sample
AE-2116_mDA_neurons_DOGMA-seq_multiomeCellRangerArcDOGMAseq sample plusAE-2116_mDA_neurons_DOGMA-seq_multiomesohailns3://dp-lab-data/collaborators/strud...-sohailn
\n", + "
" + ], + "text/plain": [ + " pipelineType project \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome CellRangerArc DOGMAseq sample plus \n", + "\n", + " sample \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome AE-2116_mDA_neurons_DOGMA-seq_multiome \n", + "\n", + " owner \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome sohailn \n", + "\n", + " destination \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome s3://dp-lab-data/collaborators/strud... \n", + "\n", + " transfer comment \n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome - sohailn " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Standard labels, modify fields as need be\n", + "\n", + "with open(f\"{config_dir}/template.labels.json\") as f:\n", + " std_labels_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "labels = pd.DataFrame(index=samples.index, columns=std_labels_fields,)\n", + "\n", + "labels[\"pipelineType\"] = pipeline_type\n", + "labels[\"project\"] = samples['project_id']\n", + "labels[\"sample\"] = labels.index\n", + "labels[\"owner\"] = creds[\"user\"]\n", + "labels[\"destination\"] = samples['AWS_storage'] + \"/\" + output_dirname\n", + "labels[\"transfer\"] = \"-\"\n", + "labels[\"comment\"] = creds[\"user\"]\n", + "\n", + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['s3://dp-lab-data/collaborators/struder/DogmaseqSamplePlus/AE-2116_mDA_neurons_DOGMA-seq_multiome/mito_cr-arc-results']" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labels['destination'] = labels['destination'].str.replace(output_dirname, 'mito_cr-arc-results')\n", + "labels['destination'].tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, 
+ "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run samples" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Look over the samples before submitting one last time" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CellRangerArc.runIDCellRangerArc.gexFastqNameCellRangerArc.gexFastqFilesCellRangerArc.atacFastqNameCellRangerArc.atacFastqFilesCellRangerArc.referenceCellRangerArc.dockerRegistry
Sample
AE-2116_mDA_neurons_DOGMA-seq_multiomeAE-2116_mDA_neurons_DOGMA-seq_multiome4531_AE-2116_mDA_neurons_DOGMA-seq_m...[s3://dp-lab-data/collaborators/stru...4532_AE-2116_mDA_neurons_DOGMA-seq_m...[s3://dp-lab-data/collaborators/stru...https://dp-lab-data.s3.amazonaws.com...quay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " CellRangerArc.runID \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome AE-2116_mDA_neurons_DOGMA-seq_multiome \n", + "\n", + " CellRangerArc.gexFastqName \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome 4531_AE-2116_mDA_neurons_DOGMA-seq_m... \n", + "\n", + " CellRangerArc.gexFastqFiles \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome [s3://dp-lab-data/collaborators/stru... \n", + "\n", + " CellRangerArc.atacFastqName \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome 4532_AE-2116_mDA_neurons_DOGMA-seq_m... \n", + "\n", + " CellRangerArc.atacFastqFiles \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome [s3://dp-lab-data/collaborators/stru... \n", + "\n", + " CellRangerArc.reference \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome https://dp-lab-data.s3.amazonaws.com... \n", + "\n", + " CellRangerArc.dockerRegistry \n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome quay.io/hisplan " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
Sample
AE-2116_mDA_neurons_DOGMA-seq_multiomeCellRangerArcDOGMAseq sample plusAE-2116_mDA_neurons_DOGMA-seq_multiomesohailns3://dp-lab-data/collaborators/strud...-sohailn
\n", + "
" + ], + "text/plain": [ + " pipelineType project \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome CellRangerArc DOGMAseq sample plus \n", + "\n", + " sample \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome AE-2116_mDA_neurons_DOGMA-seq_multiome \n", + "\n", + " owner \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome sohailn \n", + "\n", + " destination \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome s3://dp-lab-data/collaborators/strud... \n", + "\n", + " transfer comment \n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome - sohailn " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "dab6f042bbfd4a19b7c562305e480540", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreference
Sample
RB-2041_WildType_DOGMAseqs3://dp-lab-data/collaborators/sfeir...4440human10X_scATACscATAC-seq pilotGRCh38-1.1.0
RB-2041_mRB54_1003_DOGMAseqs3://dp-lab-data/collaborators/sfeir...4441human10X_scATACscATAC-seq pilotGRCh38-1.1.0
\n", + "" + ], + "text/plain": [ + " AWS_storage id \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq s3://dp-lab-data/collaborators/sfeir... 4440 \n", + "RB-2041_mRB54_1003_DOGMAseq s3://dp-lab-data/collaborators/sfeir... 4441 \n", + "\n", + " species sc_tech project_id \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq human 10X_scATAC scATAC-seq pilot \n", + "RB-2041_mRB54_1003_DOGMAseq human 10X_scATAC scATAC-seq pilot \n", + "\n", + " reference \n", + "Sample \n", + "RB-2041_WildType_DOGMAseq GRCh38-1.1.0 \n", + "RB-2041_mRB54_1003_DOGMAseq GRCh38-1.1.0 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Get species from database for given sample\n", - "from mysql.connector import connect, Error\n", + "# You can supply a list of IDs or request IDs or even AWS paths\n", "\n", - "def get_sample_id(sample_name, user, password):\n", - " try:\n", - " table_sample_data = \"peer_lab_db.sample_data\"\n", - " query = f\"\"\"\n", - " SELECT {table_sample_data}.id\n", - " FROM {table_sample_data}\n", - " WHERE {table_sample_data}.Sample=\"{sample_name}\"\n", - " \"\"\"\n", - " result = execute_query(query, user, password)[0][0]\n", - " return result\n", - " except Error as e:\n", - " print(f\"Error: {e}\")" + "# request_ids = ['PM-1779']\n", + "# samples = sample_scridb_info(request_ids, 'request_id', creds)\n", + "\n", + "# sample_ids = [3970]\n", + "# samples = sample_scridb_info(sample_ids, 'id', creds)\n", + "\n", + "aws_storage = ['s3://dp-lab-data/collaborators/sfeira/ScatacSeqPilot/RB-2041_mRB54_1003_DOGMAseq/',\n", + " 's3://dp-lab-data/collaborators/sfeira/ScatacSeqPilot/RB-2041_WildType_DOGMAseq/']\n", + "samples = sample_scridb_info(aws_storage, 'AWS_storage', creds)\n", + "\n", + "\n", + "samples" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + 
"metadata": {}, "source": [ - "# Get species from database for given sample\n", - "from mysql.connector import connect, Error\n", + "IMPORTANT NOTE \n", "\n", - "def get_project_id(sample_id, user, password):\n", - " try:\n", - " table_sample_data = \"peer_lab_db.sample_data\"\n", - " table_project_data = \"peer_lab_db.project_data\"\n", - " query = f\"\"\"\n", - " SELECT {table_project_data}.projectName\n", - " FROM {table_project_data}\n", - " LEFT JOIN {table_sample_data}\n", - " ON {table_project_data}.id = {table_sample_data}.projectData_id\n", - " WHERE {table_sample_data}.id = {sample_id}\n", - " \"\"\"\n", - " result = execute_query(query, user, password)[0][0]\n", - " return result\n", - " except Error as e:\n", - " print(f\"Error: {e}\")" + "Check the name of the folder you are running. Typically the folder that is stored in the database is just the GEX. So if another library is generated (multiome ATAC, VDJ, hashtag, etc) then it needs to be manually changed." ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RB-2041_WildType_DOGMAseq\n", + " PRE RB-2041_WildType_DOGMAseq/\n", + "\n", + "RB-2041_mRB54_1003_DOGMAseq\n", + " PRE RB-2041_mRB54_1003_DOGMAseq/\n", + "\n" + ] + } + ], "source": [ - "def get_SEQC_version(loc):\n", - " try:\n", - " cmd = f\"aws s3 cp {loc}/seqc-results/seqc_log.txt -\"\n", - " out = subprocess.run(shlex.split(cmd), universal_newlines=True, capture_output=True).__dict__[\"stdout\"]\n", - " version = re.match(r\".*SEQC=v(\\d+\\.\\d+\\.\\d+).*\", out)[1]\n", - " return version\n", - " except:\n", - " return \"N/A\"" + "# Check the name of the folder you are running\n", + "# Especially if there are multiple libraries (i.e ATAC, TCR_VDJ, etc.)\n", + "\n", + "for sample, row in samples.iterrows():\n", + " print(sample)\n", + " path = os.path.split(row['AWS_storage'])[0] + '/'\n", + " 
os.system(f'aws s3 ls {path} | grep {sample}')\n", + " print()" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreferenceFASTQs
Sample
RB-2041_WildType_DOGMAseqs3://dp-lab-data/collaborators/sfeir...4440human10X_scATACscATAC-seq pilotGRCh38-1.1.0{'I1': ['s3://dp-lab-data/collaborat...
RB-2041_mRB54_1003_DOGMAseqs3://dp-lab-data/collaborators/sfeir...4441human10X_scATACscATAC-seq pilotGRCh38-1.1.0{'I1': ['s3://dp-lab-data/collaborat...
\n", + "
" + ], + "text/plain": [ + " AWS_storage id \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq s3://dp-lab-data/collaborators/sfeir... 4440 \n", + "RB-2041_mRB54_1003_DOGMAseq s3://dp-lab-data/collaborators/sfeir... 4441 \n", + "\n", + " species sc_tech project_id \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq human 10X_scATAC scATAC-seq pilot \n", + "RB-2041_mRB54_1003_DOGMAseq human 10X_scATAC scATAC-seq pilot \n", + "\n", + " reference \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq GRCh38-1.1.0 \n", + "RB-2041_mRB54_1003_DOGMAseq GRCh38-1.1.0 \n", + "\n", + " FASTQs \n", + "Sample \n", + "RB-2041_WildType_DOGMAseq {'I1': ['s3://dp-lab-data/collaborat... \n", + "RB-2041_mRB54_1003_DOGMAseq {'I1': ['s3://dp-lab-data/collaborat... " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "def get_file_prefix(loc):\n", - " try:\n", - " cmd = f\"aws s3 ls {loc}/seqc-results/\"\n", - " out = subprocess.run(shlex.split(cmd), universal_newlines=True, capture_output=True).__dict__[\"stdout\"]\n", - " \n", - " # Note: I'm expecting the aligned bam file to be in loc\n", - " bam_pattern = re.compile(r\"(.*)_Aligned\\.out\\.bam$\")\n", - " filename = list(filter(bam_pattern.match, out.split()))[0]\n", - " file_prefix = re.match(bam_pattern, filename)[1]\n", - " return file_prefix\n", - " except:\n", - " raise ValueError(f\"BAM file not found in {loc}\")\n", - " return \"\"" + "samples[\"FASTQs\"] = samples[\"AWS_storage\"].apply(lambda x: get_fastqs(x, fastq_map[prefix], \"FASTQ\"))\n", + "samples" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": null, "metadata": {}, "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, "source": [ - "# FASTQ reads/indices required for each workflow\n", - "# Shoudl replace with JSON file\n", - "cr_reference_map = {\n", - " 'CellRangerArc':\n", - " {\n", - " 'Human': 
\"https://cf.10xgenomics.com/supp/cell-arc/refdata-cellranger-arc-GRCh38-2020-A.tar.gz\",\n", - " 'Mouse': \"https://cf.10xgenomics.com/supp/cell-arc/refdata-cellranger-arc-mm10-2020-A-2.0.0.tar.gz\",\n", - " },\n", - " 'CellRangerATAC':\n", - " {\n", - " 'Human': \"https://cf.10xgenomics.com/supp/cell-atac/refdata-cellranger-arc-GRCh38-2020-A-2.0.0.tar.gz\",\n", - " 'Mouse': \"https://cf.10xgenomics.com/supp/cell-atac/refdata-cellranger-arc-mm10-2020-A-2.0.0.tar.gz\",\n", - " }\n", - "}\n", + "IMPORTANT NOTE\n", "\n", - "def get_cr_reference(sample_id, prefix, user, password):\n", - " # Get species from database to decide reference\n", - " species = get_species(sample_id, user, password)\n", - " \n", - " # Map to reference locations\n", - " try:\n", - " return cr_reference_map[prefix][species]\n", - " except:\n", - " raise ValueError(f\"Unknown Species: {species}\")" + "Make sure that your files are not archived. The following command will print any FASTQ file that is archived. Unarchive the files and then come back to processing the sample." 
] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RB-2041_WildType_DOGMAseq\n", + "RB-2041_mRB54_1003_DOGMAseq\n" + ] + } + ], "source": [ - "def get_bc_whitelist(sample_id):\n", - " # Get version from database to decide whitelist\n", - " sc_tech = get_sc_tech(sample_id, creds[\"user\"], creds[\"password\"])\n", + "for sample, row in samples.iterrows():\n", + " print(sample)\n", + " fastqs = np.ravel(list(row['FASTQs'].values()))\n", + " dirnames = set([os.path.dirname(x) for x in fastqs])\n", " \n", - " # Map to reference locations\n", - " if \"V3\" in sc_tech:\n", - " return \"s3://seqc-public/barcodes/ten_x_v3/flat/3M-february-2018.txt\"\n", - " elif \"V2\" in sc_tech:\n", - " return \"s3://seqc-public/barcodes/ten_x_v2/flat/737K-august-2016.txt\"\n", - " else:\n", - " raise ValueError(f\"Unknown Technology: {sc_tech}\")" + " for dirname in dirnames:\n", + " file = dirname.replace('s3://', '')\n", + " \n", + " bucket = file.split('/')[0]\n", + " pre = file.replace(f'{bucket}/', '')\n", + " \n", + " !aws s3api list-objects-v2 --bucket $bucket --prefix $pre --query \"Contents[?StorageClass!='STANDARD'].Key\" --output text " ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], - "source": [ - "def run(\n", - " workflow_path: str,\n", - " execp: str,\n", - " secrets: str,\n", - " inputs: str,\n", - " labels: str,\n", - " options: str,\n", - "):\n", - " # change working directory to the pipeline package\n", - " oldwd = os.getcwd()\n", - " os.chdir(workflow_path)\n", - " \n", - " # execute the pipeline command\n", - " cmd = f\"{workflow_path}/{execp} -k {secrets} -i {inputs} -l {labels} -o {options}\"\n", - " var = subprocess.run(shlex.split(cmd), universal_newlines=True, capture_output=True)\n", - " out = var.__dict__\n", - " \n", - " 
# change working directory back\n", - " os.chdir(oldwd)\n", - " \n", - " return out" - ] + "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Process Samples" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "## Setup" + "IMPORTANT NOTE \n", + "\n", + "For CellRanger you need to supply an HTTPS path. So if you are using a custom genome stored on AWS, you must make the reference public ! Be sure to manually change the \"reference\" argument if it has not been updated correctly!!!!" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Location of docker files\n", - "common_docker_registry = \"quay.io/hisplan\"\n", - "\n", - "prefix = \"CellRangerATAC\" # Workflow to run; also .wdl filename prefix\n", - "pipeline_type = prefix # field in *.labels.json\n", - "output_dirname = \"cr-atac-results\"\n", + "samples = update_ref(samples, prefix)\n", "\n", - "# If need to add comment, put here\n", - "comment = \"\"" + "if not samples['reference'].isna().any():\n", + " samples[\"reference\"].apply(lambda x: {\n", + " \"name\": re.match(r'.*refdata-cellranger-arc-(.*).tar.gz', x)[1],\n", + " \"location\": x,\n", + " }) \n", + "samples" ] }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Sample\n", + "RB-2041_WildType_DOGMAseq {'name': 'GRCh38_atac_mito_mask_refe...\n", + "RB-2041_mRB54_1003_DOGMAseq {'name': 'GRCh38_atac_mito_mask_refe...\n", + "Name: reference, dtype: object" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Locations of workflow-related directories and files\n", - "path_to_cromwell_secrets = f\"{Path.home()}/.cromwell/cromwell-secrets.json\" # CHANGE THIS\n", - "workflow_dir = f\"{Path.home()}/scing/bin/cellranger-atac-2.0.0\" # CHANGE THIS\n", - 
"path_to_exec = f\"{workflow_dir}/submit.sh\" # CHANGE THIS FOR SHARP\n", - "config_dir = f\"{workflow_dir}/configs\"\n", - "path_to_options = f\"{workflow_dir}/{prefix}.options.aws.json\"\n", - "\n", - "# Other file locations\n", - "db_credentials_path = f\"{Path.home()}/.config.json\" # CHANGE THIS" + "samples['reference'] = [\n", + " {\n", + " 'name' : 'GRCh38_atac_mito_mask_reference',\n", + " 'location' : 'https://dp-lab-data.s3.amazonaws.com/collaborators/sfeira/ScatacSeqPilot/GRCh38_atac_mito_mask_reference/GRCh38_atac_mito_mask_reference.tar.gz'\n", + " }] * len(samples)\n", + "samples['reference']" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['https://dp-lab-data.s3.amazonaws.com/collaborators/sfeira/ScatacSeqPilot/GRCh38_atac_mito_mask_reference/GRCh38_atac_mito_mask_reference.tar.gz',\n", + " 'https://dp-lab-data.s3.amazonaws.com/collaborators/sfeira/ScatacSeqPilot/GRCh38_atac_mito_mask_reference/GRCh38_atac_mito_mask_reference.tar.gz']" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Set credentials based on SCRIdb CLI config file\n", - "with open(db_credentials_path) as f:\n", - " creds = json.load(f)" + "samples['reference'].str['location'].tolist()" ] }, { "cell_type": "code", - "execution_count": 176, + "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Samples on which to run CellRangerATAC\n", - "# Note: Assumes data is transferred to AWS S3 (this should be an s3 location)\n", - "# Note: Assumes directory name is name of sample\n", - "common_dir = \"s3://dp-lab-data/collaborators/tabarv/IdhWtVersusMutantImmunome\"\n", - "samples = [\n", - " #\"KY-1178_20210308_ATAC\",\n", - " #\"KY-1178_20210218_ATAC\",\n", - " #\"KY-1178_20201117_ATAC\",\n", - " #\"KY-1178_20201119_ATAC\",\n", - " \"KY-1178_20210330_ATAC\",\n", - " 
#\"KY-1178_20210603_ATAC\",\n", - " #\"KY-1178_20210405_ATAC\",\n", - " #\"KY-1223_20210428_ATAC\",\n", - " #\"KY-1223_20210416_ATAC\",\n", - " #\"KY-1223_20210628_ATAC\",\n", - " #\"KY-1223_20210413_ATAC\",\n", - " #'20201012_Redo_ATAC',\n", - " #'20201013_Redo_ATAC',\n", - " #'20201116_Redo_ATAC',\n", - " #'20201117-PM_Redo_ATAC',\n", - "]\n", - "sample_paths = [\n", - " f\"{common_dir}/{sample}\" for sample in samples\n", - "]" - ] + "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Execution" + "# Generate inputs" ] }, { "cell_type": "code", - "execution_count": 177, + "execution_count": 14, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "['KY-1178_20210330_ATAC']\n" - ] + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CellRangerATAC.sampleNameCellRangerATAC.fastqNamesCellRangerATAC.fastqFilesCellRangerATAC.referenceGenomeCellRangerATAC.dockerRegistry
Sample
RB-2041_WildType_DOGMAseqRB-2041_WildType_DOGMAseq4440_RB-2041_WildType_DOGMAseq_IGO_1...[s3://dp-lab-data/collaborators/sfei...{'name': 'GRCh38_atac_mito_mask_refe...quay.io/hisplan
RB-2041_mRB54_1003_DOGMAseqRB-2041_mRB54_1003_DOGMAseq4441_RB-2041_mRB54_1003_DOGMAseq_IGO...[s3://dp-lab-data/collaborators/sfei...{'name': 'GRCh38_atac_mito_mask_refe...quay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " CellRangerATAC.sampleName \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq RB-2041_WildType_DOGMAseq \n", + "RB-2041_mRB54_1003_DOGMAseq RB-2041_mRB54_1003_DOGMAseq \n", + "\n", + " CellRangerATAC.fastqNames \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq 4440_RB-2041_WildType_DOGMAseq_IGO_1... \n", + "RB-2041_mRB54_1003_DOGMAseq 4441_RB-2041_mRB54_1003_DOGMAseq_IGO... \n", + "\n", + " CellRangerATAC.fastqFiles \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq [s3://dp-lab-data/collaborators/sfei... \n", + "RB-2041_mRB54_1003_DOGMAseq [s3://dp-lab-data/collaborators/sfei... \n", + "\n", + " CellRangerATAC.referenceGenome \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq {'name': 'GRCh38_atac_mito_mask_refe... \n", + "RB-2041_mRB54_1003_DOGMAseq {'name': 'GRCh38_atac_mito_mask_refe... \n", + "\n", + " CellRangerATAC.dockerRegistry \n", + "Sample \n", + "RB-2041_WildType_DOGMAseq quay.io/hisplan \n", + "RB-2041_mRB54_1003_DOGMAseq quay.io/hisplan " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "# Get information for all samples\n", - "sample_paths = [s.strip('/') for s in sample_paths] # remove trailing slash if exists\n", - "sample_names = [os.path.basename(s) for s in sample_paths]\n", - "#sample_names = [s.replace(\"Redo_\", \"\") for s in sample_names]\n", - "print(sample_names)\n", - "samples = pd.DataFrame(\n", - " sample_paths,\n", - " index=sample_names,\n", - " columns=[\"S3_Path\"],\n", - " dtype=str,\n", - ")\n", - "samples[\"Sample_ID\"] = pd.Series(samples.index).apply(\n", - " lambda x: get_sample_id(x, creds['user'], creds['password'])\n", - ").values\n", + "# Standard inputs, modify fields as need be\n", "\n", - "# Get FASTQ paths from S3\n", - "# Note: Uses same FASTQ file ids for all samples\n", - "fastq_file_ids = fastq_map[prefix]\n", - "samples[\"FASTQs\"] = samples[\"S3_Path\"].apply(lambda x: get_fastqs(x, fastq_file_ids))\n", + "# Load minimum inputs 
and labels fields from templates\n", + "with open(f\"{config_dir}/template.inputs.json\") as f:\n", + " std_inputs_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields,)\n", + "\n", + "# Annotate inputs\n", + "inputs[f\"{prefix}.sampleName\"] = samples.index\n", + "inputs[f\"{prefix}.fastqFiles\"] = samples[\"FASTQs\"].apply(lambda x: np.ravel(list(x.values())))\n", + "inputs[f\"{prefix}.fastqNames\"] = inputs[f\"{prefix}.fastqFiles\"].apply(lambda x: get_fastqs_name(x))\n", + "inputs[f\"{prefix}.referenceGenome\"] = samples[\"reference\"] \n", + "inputs[f\"{prefix}.dockerRegistry\"] = common_docker_registry\n", "\n", - "# Get reference genome location\n", - "samples[\"Reference\"] = samples[\"Sample_ID\"].apply(lambda x: get_cr_reference(x, prefix, creds[\"user\"], creds[\"password\"]))" + "inputs" ] }, { "cell_type": "code", - "execution_count": 178, + "execution_count": null, "metadata": {}, "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, "source": [ - "# Load minimum inputs and labels fields from templates\n", - "with open(f\"{config_dir}/template.inputs.json\") as f:\n", - " std_inputs_fields = list(json.load(f).keys())\n", - " \n", + "# Generate labels" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
Sample
RB-2041_WildType_DOGMAseqCellRangerATACscATAC-seq pilotRB-2041_WildType_DOGMAseqsohailns3://dp-lab-data/collaborators/sfeir...-sohailn
RB-2041_mRB54_1003_DOGMAseqCellRangerATACscATAC-seq pilotRB-2041_mRB54_1003_DOGMAseqsohailns3://dp-lab-data/collaborators/sfeir...-sohailn
\n", + "
" + ], + "text/plain": [ + " pipelineType project \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq CellRangerATAC scATAC-seq pilot \n", + "RB-2041_mRB54_1003_DOGMAseq CellRangerATAC scATAC-seq pilot \n", + "\n", + " sample owner \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq RB-2041_WildType_DOGMAseq sohailn \n", + "RB-2041_mRB54_1003_DOGMAseq RB-2041_mRB54_1003_DOGMAseq sohailn \n", + "\n", + " destination transfer \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq s3://dp-lab-data/collaborators/sfeir... - \n", + "RB-2041_mRB54_1003_DOGMAseq s3://dp-lab-data/collaborators/sfeir... - \n", + "\n", + " comment \n", + "Sample \n", + "RB-2041_WildType_DOGMAseq sohailn \n", + "RB-2041_mRB54_1003_DOGMAseq sohailn " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Standard labels, modify fields as need be\n", + "\n", "with open(f\"{config_dir}/template.labels.json\") as f:\n", " std_labels_fields = list(json.load(f).keys())\n", " \n", "# Annotate all samples with workflow inputs and labels\n", - "inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields,)\n", "labels = pd.DataFrame(index=samples.index, columns=std_labels_fields,)\n", "\n", - "# Annotate inputs\n", - "inputs[f\"{prefix}.sampleName\"] = inputs.index # may need to change\n", - "inputs[f\"{prefix}.fastqFiles\"] = samples[\"FASTQs\"].apply(lambda x: np.ravel(list(x.values())))\n", - "inputs[f\"{prefix}.fastqNames\"] = inputs[f\"{prefix}.fastqFiles\"].apply(lambda x: get_fastqs_name(x))\n", - "inputs[f\"{prefix}.referenceGenome\"] = samples[\"Reference\"].apply(lambda x: {\n", - " \"name\": re.match(r'.*refdata-cellranger-arc-(.*).tar.gz', x)[1],\n", - " \"location\": x,\n", - "}) \n", - "inputs[f\"{prefix}.dockerRegistry\"] = common_docker_registry\n", - "\n", - "# Annotate labels\n", "labels[\"pipelineType\"] = pipeline_type\n", - "labels[\"project\"] = samples[\"Sample_ID\"].apply(lambda x: get_project_id(x, creds[\"user\"], 
creds[\"password\"]))\n", + "labels[\"project\"] = samples['project_id']\n", "labels[\"sample\"] = labels.index\n", "labels[\"owner\"] = creds[\"user\"]\n", - "labels[\"destination\"] = samples['S3_Path'] + \"/\" + output_dirname\n", + "labels[\"destination\"] = samples['AWS_storage'] + \"/\" + output_dirname\n", "labels[\"transfer\"] = \"-\"\n", "labels[\"comment\"] = creds[\"user\"]\n", "\n", - "assert (std_inputs_fields == list(inputs.columns)) & (inputs.notna().values.all())\n", - "assert (std_labels_fields == list(labels.columns)) & (labels.notna().values.all())" + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "labels['destination'] = labels['destination'].str.replace(output_dirname, 'cr-atac-mito-results')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run samples" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Look over the samples before submitting one last time" ] }, { "cell_type": "code", - "execution_count": 179, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -592,14 +863,30 @@ " CellRangerATAC.referenceGenome\n", " CellRangerATAC.dockerRegistry\n", " \n", + " \n", + " Sample\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " KY-1178_20210330_ATAC\n", - " KY-1178_20210330_ATAC\n", - " 3029_KY-1178_20210330_ATAC_IGO_12437...\n", - " [s3://dp-lab-data/collaborators/taba...\n", - " {'name': 'GRCh38-2020-A-2.0.0', 'loc...\n", + " RB-2041_WildType_DOGMAseq\n", + " RB-2041_WildType_DOGMAseq\n", + " 4440_RB-2041_WildType_DOGMAseq_IGO_1...\n", + " [s3://dp-lab-data/collaborators/sfei...\n", + " {'name': 'GRCh38_atac_mito_mask_refe...\n", + " quay.io/hisplan\n", + " \n", + " \n", + " RB-2041_mRB54_1003_DOGMAseq\n", + " RB-2041_mRB54_1003_DOGMAseq\n", + " 
4441_RB-2041_mRB54_1003_DOGMAseq_IGO...\n", + " [s3://dp-lab-data/collaborators/sfei...\n", + " {'name': 'GRCh38_atac_mito_mask_refe...\n", " quay.io/hisplan\n", " \n", " \n", @@ -607,23 +894,33 @@ "" ], "text/plain": [ - " CellRangerATAC.sampleName \\\n", - "KY-1178_20210330_ATAC KY-1178_20210330_ATAC \n", + " CellRangerATAC.sampleName \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq RB-2041_WildType_DOGMAseq \n", + "RB-2041_mRB54_1003_DOGMAseq RB-2041_mRB54_1003_DOGMAseq \n", "\n", - " CellRangerATAC.fastqNames \\\n", - "KY-1178_20210330_ATAC 3029_KY-1178_20210330_ATAC_IGO_12437... \n", + " CellRangerATAC.fastqNames \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq 4440_RB-2041_WildType_DOGMAseq_IGO_1... \n", + "RB-2041_mRB54_1003_DOGMAseq 4441_RB-2041_mRB54_1003_DOGMAseq_IGO... \n", "\n", - " CellRangerATAC.fastqFiles \\\n", - "KY-1178_20210330_ATAC [s3://dp-lab-data/collaborators/taba... \n", + " CellRangerATAC.fastqFiles \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq [s3://dp-lab-data/collaborators/sfei... \n", + "RB-2041_mRB54_1003_DOGMAseq [s3://dp-lab-data/collaborators/sfei... \n", "\n", - " CellRangerATAC.referenceGenome \\\n", - "KY-1178_20210330_ATAC {'name': 'GRCh38-2020-A-2.0.0', 'loc... \n", + " CellRangerATAC.referenceGenome \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq {'name': 'GRCh38_atac_mito_mask_refe... \n", + "RB-2041_mRB54_1003_DOGMAseq {'name': 'GRCh38_atac_mito_mask_refe... 
\n", "\n", - " CellRangerATAC.dockerRegistry \n", - "KY-1178_20210330_ATAC quay.io/hisplan " + " CellRangerATAC.dockerRegistry \n", + "Sample \n", + "RB-2041_WildType_DOGMAseq quay.io/hisplan \n", + "RB-2041_mRB54_1003_DOGMAseq quay.io/hisplan " ] }, - "execution_count": 179, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -634,7 +931,7 @@ }, { "cell_type": "code", - "execution_count": 180, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -666,37 +963,65 @@ " transfer\n", " comment\n", " \n", + " \n", + " Sample\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " KY-1178_20210330_ATAC\n", + " RB-2041_WildType_DOGMAseq\n", " CellRangerATAC\n", - " IDH wt versus mutant Immunome\n", - " KY-1178_20210330_ATAC\n", - " moormana\n", - " s3://dp-lab-data/collaborators/tabar...\n", + " scATAC-seq pilot\n", + " RB-2041_WildType_DOGMAseq\n", + " sohailn\n", + " s3://dp-lab-data/collaborators/sfeir...\n", " -\n", - " moormana\n", + " sohailn\n", + " \n", + " \n", + " RB-2041_mRB54_1003_DOGMAseq\n", + " CellRangerATAC\n", + " scATAC-seq pilot\n", + " RB-2041_mRB54_1003_DOGMAseq\n", + " sohailn\n", + " s3://dp-lab-data/collaborators/sfeir...\n", + " -\n", + " sohailn\n", " \n", " \n", "\n", "" ], "text/plain": [ - " pipelineType project \\\n", - "KY-1178_20210330_ATAC CellRangerATAC IDH wt versus mutant Immunome \n", + " pipelineType project \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq CellRangerATAC scATAC-seq pilot \n", + "RB-2041_mRB54_1003_DOGMAseq CellRangerATAC scATAC-seq pilot \n", "\n", - " sample owner \\\n", - "KY-1178_20210330_ATAC KY-1178_20210330_ATAC moormana \n", + " sample owner \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq RB-2041_WildType_DOGMAseq sohailn \n", + "RB-2041_mRB54_1003_DOGMAseq RB-2041_mRB54_1003_DOGMAseq sohailn \n", "\n", - " destination transfer \\\n", - "KY-1178_20210330_ATAC s3://dp-lab-data/collaborators/tabar... 
- \n", + " destination transfer \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq s3://dp-lab-data/collaborators/sfeir... - \n", + "RB-2041_mRB54_1003_DOGMAseq s3://dp-lab-data/collaborators/sfeir... - \n", "\n", - " comment \n", - "KY-1178_20210330_ATAC moormana " + " comment \n", + "Sample \n", + "RB-2041_WildType_DOGMAseq sohailn \n", + "RB-2041_mRB54_1003_DOGMAseq sohailn " ] }, - "execution_count": 180, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -707,20 +1032,25 @@ }, { "cell_type": "code", - "execution_count": 181, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "bf400f8f2caf424fa398e48d8e832bbb", + "model_id": "53694d4b522c4bf19dc886bfc779d946", "version_major": 2, "version_minor": 0 }, "text/plain": [ - " 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreference
Sample
IgG_As3://dp-lab-data/collaborators/lowe/...2638mouse10X_V3.19ps3://seqc-public/genomes/mm38_long_p...
IgG_Cs3://dp-lab-data/collaborators/lowe/...2639mouse10X_V3.19ps3://seqc-public/genomes/mm38_long_p...
IFNAR_As3://dp-lab-data/collaborators/lowe/...2640mouse10X_V3.19ps3://seqc-public/genomes/mm38_long_p...
\n", + "" + ], + "text/plain": [ + " AWS_storage id species sc_tech \\\n", + "Sample \n", + "IgG_A s3://dp-lab-data/collaborators/lowe/... 2638 mouse 10X_V3.1 \n", + "IgG_C s3://dp-lab-data/collaborators/lowe/... 2639 mouse 10X_V3.1 \n", + "IFNAR_A s3://dp-lab-data/collaborators/lowe/... 2640 mouse 10X_V3.1 \n", + "\n", + " project_id reference \n", + "Sample \n", + "IgG_A 9p s3://seqc-public/genomes/mm38_long_p... \n", + "IgG_C 9p s3://seqc-public/genomes/mm38_long_p... \n", + "IFNAR_A 9p s3://seqc-public/genomes/mm38_long_p... " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "## Setup" + "# You can supply a list of IDs or request IDs\n", + "\n", + "# request_ids = ['ARN-1167']\n", + "# samples = sample_scridb_info(request_ids, 'request_id', creds)\n", + "\n", + "sample_ids = list(range(2638, 2641))\n", + "samples = sample_scridb_info(sample_ids, 'id', creds)\n", + "\n", + "samples" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, + "id": "fd4c6943-69e1-4986-ae3b-2ec58e8f0d58", "metadata": {}, "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "422c1151-9e53-4ff3-a178-7a37d69a586d", + "metadata": {}, "source": [ - "# Location of docker files\n", - "common_docker_registry = \"quay.io/hisplan\"\n", - "\n", - "prefix = \"CellRangerGex\" # Workflow to run; also .wdl filename prefix\n", - "pipeline_type = prefix # field in *.labels.json\n", - "output_dirname = \"cr-gex-results\"\n", + "IMPORTANT NOTE \n", "\n", - "# If need to add comment, put here\n", - "comment = \"\"" + "Check the name of the folder you are running. Typically the folder that is stored in the database is just the GEX. So if another library is generated (multiome ATAC, VDJ, hashtag, etc) then it needs to be manually changed." 
] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 9, + "id": "502590c9-15ac-40e1-8d78-1aba15cb02cd", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "IgG_A\n", + " PRE BM-1754_IgG_A_IFNpooled/\n", + " PRE IgG_A/\n", + " PRE IgG_A_HTO/\n", + "\n", + "IgG_C\n", + " PRE BM-1286_IgG_C_IFN_enriched/\n", + " PRE BM-1754_IgG_C_IFNpooled/\n", + " PRE IgG_C/\n", + " PRE IgG_C_HTO/\n", + "\n", + "IFNAR_A\n", + " PRE BM-1754_IFNAR_A_IFNpooled/\n", + " PRE IFNAR_A/\n", + " PRE IFNAR_A_HTO/\n", + "\n" + ] + } + ], "source": [ - "# Locations of workflow-related directories and files\n", - "path_to_cromwell_secrets = f\"{Path.home()}/.cromwell/cromwell-secrets.json\" # CHANGE THIS\n", - "workflow_dir = f\"{Path.home()}/scing/bin/cellranger-gex-6.1.2\" # CHANGE THIS\n", - "path_to_exec = f\"{workflow_dir}/submit.sh\" # CHANGE THIS FOR SHARP\n", - "config_dir = f\"{workflow_dir}/configs\"\n", - "path_to_options = f\"{workflow_dir}/{prefix}.options.aws.json\"\n", + "# Check the name of the folder you are running\n", + "# Especially if there are multiple libraries (i.e ATAC, TCR_VDJ, etc.)\n", "\n", - "# Other file locations\n", - "db_credentials_path = f\"{Path.home()}/.config.json\" # CHANGE THIS" + "for sample, row in samples.iterrows():\n", + " print(sample)\n", + " path = os.path.split(row['AWS_storage'])[0] + '/'\n", + " os.system(f'aws s3 ls {path} | grep {sample}')\n", + " print()" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, + "id": "175055cd-ebda-4485-a2bf-0f869cb4180b", "metadata": {}, "outputs": [], - "source": [ - "# Set credentials based on SCRIdb CLI config file\n", - "with open(db_credentials_path) as f:\n", - " creds = json.load(f)" - ] + "source": [] }, { "cell_type": "code", - "execution_count": 129, + "execution_count": 10, + "id": "e85154ba-47ad-4f3a-b2c8-9c81f2d33e23", "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "IgG_A\n", + "IgG_C\n", + "IFNAR_A\n" + ] + }, { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreferenceFASTQs
Sample
IgG_As3://dp-lab-data/collaborators/lowe/...2638mouse10X_V3.19ps3://seqc-public/genomes/mm38_long_p...{'R1': ['s3://dp-lab-data/collaborat...
IgG_Cs3://dp-lab-data/collaborators/lowe/...2639mouse10X_V3.19ps3://seqc-public/genomes/mm38_long_p...{'R1': ['s3://dp-lab-data/collaborat...
IFNAR_As3://dp-lab-data/collaborators/lowe/...2640mouse10X_V3.19ps3://seqc-public/genomes/mm38_long_p...{'R1': ['s3://dp-lab-data/collaborat...
\n", + "
" + ], + "text/plain": [ + " AWS_storage id species sc_tech \\\n", + "Sample \n", + "IgG_A s3://dp-lab-data/collaborators/lowe/... 2638 mouse 10X_V3.1 \n", + "IgG_C s3://dp-lab-data/collaborators/lowe/... 2639 mouse 10X_V3.1 \n", + "IFNAR_A s3://dp-lab-data/collaborators/lowe/... 2640 mouse 10X_V3.1 \n", + "\n", + " project_id reference \\\n", + "Sample \n", + "IgG_A 9p s3://seqc-public/genomes/mm38_long_p... \n", + "IgG_C 9p s3://seqc-public/genomes/mm38_long_p... \n", + "IFNAR_A 9p s3://seqc-public/genomes/mm38_long_p... \n", + "\n", + " FASTQs \n", + "Sample \n", + "IgG_A {'R1': ['s3://dp-lab-data/collaborat... \n", + "IgG_C {'R1': ['s3://dp-lab-data/collaborat... \n", + "IFNAR_A {'R1': ['s3://dp-lab-data/collaborat... " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# If FASTQ files are in one folder\n", + "# fastq_map = ['R1','R2']\n", + "# samples[\"FASTQs\"] = samples[\"AWS_storage\"].apply(lambda x: get_fastqs(x, fastq_map, \"FASTQ\"))\n", + "\n", + "# samples[\"FASTQs\"] = samples[\"AWS_storage\"].apply(lambda x: get_fastqs(x, fastq_map[prefix], \"FASTQ\"))\n", + "\n", + "# If the sample has SEQC split FASTQ files into different folders\n", + "fastqs = []\n", + "for sample, row in samples.iterrows():\n", + " print(sample)\n", + " r1 = get_fastqs(row['AWS_storage'], ['R1'], \"barcode\")\n", + " r2 = get_fastqs(row['AWS_storage'], ['R2'], \"genomic\")\n", + " \n", + " fastqs.append({**r1, **r2})\n", + " \n", + "samples['FASTQs'] = fastqs\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f09b80a6-bed0-4eb0-80c6-45af8a88e35f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "5cd3153e-5291-4d9b-b094-6e0704e594a7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreferenceFASTQsFASTQ_list
Sample
IgG_As3://dp-lab-data/collaborators/lowe/...2638mouse10X_V3.19ps3://seqc-public/genomes/mm38_long_p...{'R1': ['s3://dp-lab-data/collaborat...[s3://dp-lab-data/collaborators/lowe...
IgG_Cs3://dp-lab-data/collaborators/lowe/...2639mouse10X_V3.19ps3://seqc-public/genomes/mm38_long_p...{'R1': ['s3://dp-lab-data/collaborat...[s3://dp-lab-data/collaborators/lowe...
IFNAR_As3://dp-lab-data/collaborators/lowe/...2640mouse10X_V3.19ps3://seqc-public/genomes/mm38_long_p...{'R1': ['s3://dp-lab-data/collaborat...[s3://dp-lab-data/collaborators/lowe...
\n", + "
" + ], + "text/plain": [ + " AWS_storage id species sc_tech \\\n", + "Sample \n", + "IgG_A s3://dp-lab-data/collaborators/lowe/... 2638 mouse 10X_V3.1 \n", + "IgG_C s3://dp-lab-data/collaborators/lowe/... 2639 mouse 10X_V3.1 \n", + "IFNAR_A s3://dp-lab-data/collaborators/lowe/... 2640 mouse 10X_V3.1 \n", + "\n", + " project_id reference \\\n", + "Sample \n", + "IgG_A 9p s3://seqc-public/genomes/mm38_long_p... \n", + "IgG_C 9p s3://seqc-public/genomes/mm38_long_p... \n", + "IFNAR_A 9p s3://seqc-public/genomes/mm38_long_p... \n", + "\n", + " FASTQs \\\n", + "Sample \n", + "IgG_A {'R1': ['s3://dp-lab-data/collaborat... \n", + "IgG_C {'R1': ['s3://dp-lab-data/collaborat... \n", + "IFNAR_A {'R1': ['s3://dp-lab-data/collaborat... \n", + "\n", + " FASTQ_list \n", + "Sample \n", + "IgG_A [s3://dp-lab-data/collaborators/lowe... \n", + "IgG_C [s3://dp-lab-data/collaborators/lowe... \n", + "IFNAR_A [s3://dp-lab-data/collaborators/lowe... " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples['FASTQ_list'] = [[]] * len(samples)\n", + "\n", + "fastqs = []\n", + "\n", + "for sample, row in samples.iterrows():\n", + " fastq_list = np.ravel(list(row['FASTQs'].values()))\n", + " \n", + " fastq_samp = []\n", + " for fastq in fastq_list:\n", + " filename = fastq.split('/')[-1]\n", + " if filename.startswith('26'):\n", + " fastq_samp.append(fastq)\n", + " fastqs.append(fastq_samp)\n", + "\n", + "samples['FASTQ_list'] = fastqs\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8f7b1bf-a605-4748-8858-978c2392c271", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "5a520309-4ea8-444e-ab81-bd3586cbb02b", + "metadata": {}, + "source": [ + "IMPORTANT NOTE\n", + "\n", + "Make sure that your files are not archived. The following command will print any FASTQ file that is archived. Unarchive the files and then come back to processing the sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e8d952ba-cdad-4b93-80d2-7b0dba6f6a89", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "IgG_A\n", + "collaborators/lowe/9P/IgG_A/genomic/2638_IgG_A_IGO_12104_32_S20_L001_R2_001.fastq.gz\tcollaborators/lowe/9P/IgG_A/genomic/2638_IgG_A_IGO_12104_32_S20_L002_R2_001.fastq.gz\tcollaborators/lowe/9P/IgG_A/genomic/2638_IgG_A_IGO_12104_32_S20_L003_R2_001.fastq.gz\tcollaborators/lowe/9P/IgG_A/genomic/2638_IgG_A_IGO_12104_32_S20_L004_R2_001.fastq.gz\tcollaborators/lowe/9P/IgG_A/genomic/2917_IgG_A_IGO_12317_37_S35_L001_R2_001.fastq.gz\tcollaborators/lowe/9P/IgG_A/genomic/2917_IgG_A_IGO_12317_37_S35_L002_R2_001.fastq.gz\tcollaborators/lowe/9P/IgG_A/genomic/2917_IgG_A_IGO_12317_37_S35_L003_R2_001.fastq.gz\tcollaborators/lowe/9P/IgG_A/genomic/2917_IgG_A_IGO_12317_37_S35_L004_R2_001.fastq.gz\n", + "collaborators/lowe/9P/IgG_A/barcode/2638_IgG_A_IGO_12104_32_S20_L001_R1_001.fastq.gz\tcollaborators/lowe/9P/IgG_A/barcode/2638_IgG_A_IGO_12104_32_S20_L002_R1_001.fastq.gz\tcollaborators/lowe/9P/IgG_A/barcode/2638_IgG_A_IGO_12104_32_S20_L003_R1_001.fastq.gz\tcollaborators/lowe/9P/IgG_A/barcode/2638_IgG_A_IGO_12104_32_S20_L004_R1_001.fastq.gz\tcollaborators/lowe/9P/IgG_A/barcode/2917_IgG_A_IGO_12317_37_S35_L001_R1_001.fastq.gz\tcollaborators/lowe/9P/IgG_A/barcode/2917_IgG_A_IGO_12317_37_S35_L002_R1_001.fastq.gz\tcollaborators/lowe/9P/IgG_A/barcode/2917_IgG_A_IGO_12317_37_S35_L003_R1_001.fastq.gz\tcollaborators/lowe/9P/IgG_A/barcode/2917_IgG_A_IGO_12317_37_S35_L004_R1_001.fastq.gz\n", + "IgG_C\n", + 
"collaborators/lowe/9P/IgG_C/barcode/2639_IgG_C_IGO_12104_33_S19_L001_R1_001.fastq.gz\tcollaborators/lowe/9P/IgG_C/barcode/2639_IgG_C_IGO_12104_33_S19_L002_R1_001.fastq.gz\tcollaborators/lowe/9P/IgG_C/barcode/2639_IgG_C_IGO_12104_33_S19_L003_R1_001.fastq.gz\tcollaborators/lowe/9P/IgG_C/barcode/2639_IgG_C_IGO_12104_33_S19_L004_R1_001.fastq.gz\tcollaborators/lowe/9P/IgG_C/barcode/2918_IgG_C_IGO_12317_38_S36_L001_R1_001.fastq.gz\tcollaborators/lowe/9P/IgG_C/barcode/2918_IgG_C_IGO_12317_38_S36_L002_R1_001.fastq.gz\tcollaborators/lowe/9P/IgG_C/barcode/2918_IgG_C_IGO_12317_38_S36_L003_R1_001.fastq.gz\tcollaborators/lowe/9P/IgG_C/barcode/2918_IgG_C_IGO_12317_38_S36_L004_R1_001.fastq.gz\n", + "collaborators/lowe/9P/IgG_C/genomic/2639_IgG_C_IGO_12104_33_S19_L001_R2_001.fastq.gz\tcollaborators/lowe/9P/IgG_C/genomic/2639_IgG_C_IGO_12104_33_S19_L002_R2_001.fastq.gz\tcollaborators/lowe/9P/IgG_C/genomic/2639_IgG_C_IGO_12104_33_S19_L003_R2_001.fastq.gz\tcollaborators/lowe/9P/IgG_C/genomic/2639_IgG_C_IGO_12104_33_S19_L004_R2_001.fastq.gz\tcollaborators/lowe/9P/IgG_C/genomic/2918_IgG_C_IGO_12317_38_S36_L001_R2_001.fastq.gz\tcollaborators/lowe/9P/IgG_C/genomic/2918_IgG_C_IGO_12317_38_S36_L002_R2_001.fastq.gz\tcollaborators/lowe/9P/IgG_C/genomic/2918_IgG_C_IGO_12317_38_S36_L003_R2_001.fastq.gz\tcollaborators/lowe/9P/IgG_C/genomic/2918_IgG_C_IGO_12317_38_S36_L004_R2_001.fastq.gz\n", + "IFNAR_A\n", + "collaborators/lowe/9P/IFNAR_A/barcode/2640_IFNAR_A_IGO_12104_34_S21_L001_R1_001.fastq.gz\tcollaborators/lowe/9P/IFNAR_A/barcode/2640_IFNAR_A_IGO_12104_34_S21_L002_R1_001.fastq.gz\tcollaborators/lowe/9P/IFNAR_A/barcode/2640_IFNAR_A_IGO_12104_34_S21_L003_R1_001.fastq.gz\tcollaborators/lowe/9P/IFNAR_A/barcode/2640_IFNAR_A_IGO_12104_34_S21_L004_R1_001.fastq.gz\n", + 
"collaborators/lowe/9P/IFNAR_A/genomic/2640_IFNAR_A_IGO_12104_34_S21_L001_R2_001.fastq.gz\tcollaborators/lowe/9P/IFNAR_A/genomic/2640_IFNAR_A_IGO_12104_34_S21_L002_R2_001.fastq.gz\tcollaborators/lowe/9P/IFNAR_A/genomic/2640_IFNAR_A_IGO_12104_34_S21_L003_R2_001.fastq.gz\tcollaborators/lowe/9P/IFNAR_A/genomic/2640_IFNAR_A_IGO_12104_34_S21_L004_R2_001.fastq.gz\n" + ] + } + ], + "source": [ + "for sample, row in samples.iterrows():\n", + " print(sample)\n", + " fastqs = np.ravel(list(row['FASTQs'].values()))\n", + " dirnames = set([os.path.dirname(x) for x in fastqs])\n", + " \n", + " for dirname in dirnames:\n", + " file = dirname.replace('s3://', '')\n", + " \n", + " bucket = file.split('/')[0]\n", + " pre = file.replace(f'{bucket}/', '')\n", + " \n", + " !aws s3api list-objects-v2 --bucket $bucket --prefix $pre --query \"Contents[?StorageClass!='STANDARD'].Key\" --output text " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a59060ca-b11f-4e74-9edf-5ce3e863a064", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "45b907a9-53cc-4cf7-a678-5d2f4798e457", + "metadata": {}, + "source": [ + "IMPORTANT NOTE \n", + "\n", + "For CellRanger you need to supply an HTTPS path. So if you are using a custom genome stored on AWS, you must make the reference public !" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "16d0ca53-edc7-4e1b-9c9c-0d812a1ae1c7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreferenceFASTQsFASTQ_list
Sample
IgG_As3://dp-lab-data/collaborators/lowe/...2638mouse10X_V3.19phttps://cf.10xgenomics.com/supp/cell...{'R1': ['s3://dp-lab-data/collaborat...[s3://dp-lab-data/collaborators/lowe...
IgG_Cs3://dp-lab-data/collaborators/lowe/...2639mouse10X_V3.19phttps://cf.10xgenomics.com/supp/cell...{'R1': ['s3://dp-lab-data/collaborat...[s3://dp-lab-data/collaborators/lowe...
IFNAR_As3://dp-lab-data/collaborators/lowe/...2640mouse10X_V3.19phttps://cf.10xgenomics.com/supp/cell...{'R1': ['s3://dp-lab-data/collaborat...[s3://dp-lab-data/collaborators/lowe...
\n", + "
" + ], + "text/plain": [ + " AWS_storage id species sc_tech \\\n", + "Sample \n", + "IgG_A s3://dp-lab-data/collaborators/lowe/... 2638 mouse 10X_V3.1 \n", + "IgG_C s3://dp-lab-data/collaborators/lowe/... 2639 mouse 10X_V3.1 \n", + "IFNAR_A s3://dp-lab-data/collaborators/lowe/... 2640 mouse 10X_V3.1 \n", + "\n", + " project_id reference \\\n", + "Sample \n", + "IgG_A 9p https://cf.10xgenomics.com/supp/cell... \n", + "IgG_C 9p https://cf.10xgenomics.com/supp/cell... \n", + "IFNAR_A 9p https://cf.10xgenomics.com/supp/cell... \n", + "\n", + " FASTQs \\\n", + "Sample \n", + "IgG_A {'R1': ['s3://dp-lab-data/collaborat... \n", + "IgG_C {'R1': ['s3://dp-lab-data/collaborat... \n", + "IFNAR_A {'R1': ['s3://dp-lab-data/collaborat... \n", + "\n", + " FASTQ_list \n", + "Sample \n", + "IgG_A [s3://dp-lab-data/collaborators/lowe... \n", + "IgG_C [s3://dp-lab-data/collaborators/lowe... \n", + "IFNAR_A [s3://dp-lab-data/collaborators/lowe... " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples = update_ref(samples, prefix)\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e5dee5bc-3e06-4e62-a780-1a5b4aeff1f7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-mm10-2020-A.tar.gz',\n", + " 'https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-mm10-2020-A.tar.gz',\n", + " 'https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-mm10-2020-A.tar.gz']" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# samples['reference'] = 'https://dp-lab-data.s3.amazonaws.com/collaborators/aboire/LeptomeningealMetHeterogeneity/transgene_reference/refdata-cellranger/Leptomeningeal_metastasis_heterogeneity-GRCm38-Ensembl-87-transgenes.tar.gz'\n", + "list(samples['reference'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"c17d2515-3139-4f9f-ac81-af33948c8d9d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "3fde96b2-f3d4-4c8c-b1c8-8cf5d67e4f73", + "metadata": {}, + "source": [ + "# Generate inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "493bb219-4f15-4028-bc80-79b13a24b6f4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CellRangerGex.sampleNameCellRangerGex.fastqNameCellRangerGex.inputFastqCellRangerGex.referenceUrlCellRangerGex.includeIntronsCellRangerGex.expectCellsCellRangerGex.memoryCellRangerGex.dockerRegistry
Sample
IgG_AIgG_A2638_IgG_A_IGO_12104_32[s3://dp-lab-data/collaborators/lowe...https://cf.10xgenomics.com/supp/cell...False5000256quay.io/hisplan
IgG_CIgG_C2639_IgG_C_IGO_12104_33[s3://dp-lab-data/collaborators/lowe...https://cf.10xgenomics.com/supp/cell...False5000256quay.io/hisplan
IFNAR_AIFNAR_A2640_IFNAR_A_IGO_12104_34[s3://dp-lab-data/collaborators/lowe...https://cf.10xgenomics.com/supp/cell...False5000256quay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " CellRangerGex.sampleName CellRangerGex.fastqName \\\n", + "Sample \n", + "IgG_A IgG_A 2638_IgG_A_IGO_12104_32 \n", + "IgG_C IgG_C 2639_IgG_C_IGO_12104_33 \n", + "IFNAR_A IFNAR_A 2640_IFNAR_A_IGO_12104_34 \n", + "\n", + " CellRangerGex.inputFastq \\\n", + "Sample \n", + "IgG_A [s3://dp-lab-data/collaborators/lowe... \n", + "IgG_C [s3://dp-lab-data/collaborators/lowe... \n", + "IFNAR_A [s3://dp-lab-data/collaborators/lowe... \n", + "\n", + " CellRangerGex.referenceUrl \\\n", + "Sample \n", + "IgG_A https://cf.10xgenomics.com/supp/cell... \n", + "IgG_C https://cf.10xgenomics.com/supp/cell... \n", + "IFNAR_A https://cf.10xgenomics.com/supp/cell... \n", + "\n", + " CellRangerGex.includeIntrons CellRangerGex.expectCells \\\n", + "Sample \n", + "IgG_A False 5000 \n", + "IgG_C False 5000 \n", + "IFNAR_A False 5000 \n", + "\n", + " CellRangerGex.memory CellRangerGex.dockerRegistry \n", + "Sample \n", + "IgG_A 256 quay.io/hisplan \n", + "IgG_C 256 quay.io/hisplan \n", + "IFNAR_A 256 quay.io/hisplan " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Standard inputs, modify fields as need be\n", + "\n", + "# Load minimum inputs and labels fields from templates\n", + "with open(f\"{config_dir}/template.inputs.json\") as f:\n", + " std_inputs_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields,)\n", + "\n", + "# Annotate inputs\n", + "inputs[f\"{prefix}.sampleName\"] = inputs.index # may need to change\n", + "inputs[f\"{prefix}.inputFastq\"] = samples['FASTQ_list']\n", + "# inputs[f\"{prefix}.inputFastq\"] = samples[\"FASTQs\"].apply(lambda x: np.ravel(list(x.values())))\n", + "inputs[f\"{prefix}.fastqName\"] = inputs[f\"{prefix}.inputFastq\"].apply(lambda x: get_fastqs_name(x))\n", + "inputs[f\"{prefix}.referenceUrl\"] = samples[\"reference\"] \n", + 
"inputs[f\"{prefix}.includeIntrons\"] = False\n", + "inputs[f\"{prefix}.expectCells\"] = 5000\n", + "inputs[f\"{prefix}.memory\"] = 256\n", + "inputs[f\"{prefix}.dockerRegistry\"] = common_docker_registry\n", + "\n", + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5597cb90-97fd-434e-91b8-d45139e7f9b4", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "627d7281-9c1c-4644-8925-7d709cc6ae82", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "2c8fc170-74c6-4cc8-b9b0-a756701d5643", + "metadata": {}, + "source": [ + "IMPORTANT NOTE \n", + "\n", + "Check to see if these samples are single nuclei. If they are nuclei, then you must set includeIntrons to be TRUE" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "c30e1518-da11-4069-a501-32f9e04ef389", + "metadata": {}, + "outputs": [], + "source": [ + "# Modification for specific samples\n", + "inputs[f'{prefix}.includeIntrons'] = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b9c2618-f4a5-45ce-aea5-cdeb671f88c5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "a8f190ac-c493-4bec-b64d-2b35d1e2d104", + "metadata": {}, + "source": [ + "# Generate labels" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "03d5abbb-6bf0-4636-8bb7-edd2463439c7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
Sample
IgG_ACellRangerGex9pIgG_Asohailns3://dp-lab-data/collaborators/lowe/...-sohailn
IgG_CCellRangerGex9pIgG_Csohailns3://dp-lab-data/collaborators/lowe/...-sohailn
IFNAR_ACellRangerGex9pIFNAR_Asohailns3://dp-lab-data/collaborators/lowe/...-sohailn
\n", + "
" + ], "text/plain": [ - "['SU-1358_C10_T2_on_treatment',\n", - " 'SU-1377_C11_screening_pancreas',\n", - " 'SU-1400_B06_screening_pancreas',\n", - " 'SU-1410_B07_liver_screening',\n", - " 'SU-1419_C12_liver_screening']" + " pipelineType project sample owner \\\n", + "Sample \n", + "IgG_A CellRangerGex 9p IgG_A sohailn \n", + "IgG_C CellRangerGex 9p IgG_C sohailn \n", + "IFNAR_A CellRangerGex 9p IFNAR_A sohailn \n", + "\n", + " destination transfer comment \n", + "Sample \n", + "IgG_A s3://dp-lab-data/collaborators/lowe/... - sohailn \n", + "IgG_C s3://dp-lab-data/collaborators/lowe/... - sohailn \n", + "IFNAR_A s3://dp-lab-data/collaborators/lowe/... - sohailn " ] }, - "execution_count": 129, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "l = !aws s3 ls $common_dir/\n", - "[s.strip()[4:-1] for s in l]" + "# Standard labels, modify fields as need be\n", + "\n", + "with open(f\"{config_dir}/template.labels.json\") as f:\n", + " std_labels_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "labels = pd.DataFrame(index=samples.index, columns=std_labels_fields,)\n", + "\n", + "labels[\"pipelineType\"] = pipeline_type\n", + "labels[\"project\"] = samples['project_id']\n", + "labels[\"sample\"] = labels.index\n", + "labels[\"owner\"] = creds[\"user\"]\n", + "labels[\"destination\"] = samples['AWS_storage'] + \"/\" + output_dirname\n", + "labels[\"transfer\"] = \"-\"\n", + "labels[\"comment\"] = creds[\"user\"]\n", + "\n", + "labels" ] }, { "cell_type": "code", - "execution_count": 142, + "execution_count": null, + "id": "ee1ac339-1817-49d2-9e84-047087539e9e", "metadata": {}, "outputs": [], - "source": [ - "# Samples on which to run CellRangerATAC\n", - "# Note: Assumes data is transferred to AWS S3 (this should be an s3 location)\n", - "# Note: Assumes directory name is name of sample\n", - "common_dir = \"s3://dp-lab-data/sc-seq/Project_12437_S\"\n", - "samples = 
[\n", - " 'SU-1358_C10_T2_on_treatment',\n", - " #'SU-1377_C11_screening_pancreas',\n", - " #'SU-1400_B06_screening_pancreas',\n", - " #'SU-1410_B07_liver_screening',\n", - " #'SU-1419_C12_liver_screening'\n", - "]\n", - "sample_paths = [\n", - " f\"{common_dir}/{sample}\" for sample in samples\n", - "]" - ] + "source": [] }, { - "cell_type": "code", - "execution_count": 112, + "cell_type": "markdown", + "id": "7a60e244-5f8b-4437-9274-ee7ce7e8135f", "metadata": {}, - "outputs": [], "source": [ - "# Set path to transgene reference S3\n", - "# Note: This is an exceptional case\n", - "# path_to_reference = f\"{common_dir}/transgene_reference/refdata-cellranger/3PS19_SNSEQ-GRCm38-Ensembl-87-transgenes.tar.gz\"\n", - "path_to_reference = \"https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-mm10-2020-A.tar.gz\"" + "# Run samples" ] }, { "cell_type": "markdown", + "id": "60e68adf-8cdd-4421-97ba-202103e5e014", "metadata": {}, "source": [ - "## Execution" + "Look over the samples before submitting one last time" ] }, { "cell_type": "code", - "execution_count": 143, + "execution_count": 18, + "id": "ee601aed-3a19-4317-a918-4120db7ee7bf", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "['SU-1358_C10_T2_on_treatment']\n" - ] + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CellRangerGex.sampleNameCellRangerGex.fastqNameCellRangerGex.inputFastqCellRangerGex.referenceUrlCellRangerGex.includeIntronsCellRangerGex.expectCellsCellRangerGex.memoryCellRangerGex.dockerRegistry
Sample
IgG_AIgG_A2638_IgG_A_IGO_12104_32[s3://dp-lab-data/collaborators/lowe...https://cf.10xgenomics.com/supp/cell...False5000256quay.io/hisplan
IgG_CIgG_C2639_IgG_C_IGO_12104_33[s3://dp-lab-data/collaborators/lowe...https://cf.10xgenomics.com/supp/cell...False5000256quay.io/hisplan
IFNAR_AIFNAR_A2640_IFNAR_A_IGO_12104_34[s3://dp-lab-data/collaborators/lowe...https://cf.10xgenomics.com/supp/cell...False5000256quay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " CellRangerGex.sampleName CellRangerGex.fastqName \\\n", + "Sample \n", + "IgG_A IgG_A 2638_IgG_A_IGO_12104_32 \n", + "IgG_C IgG_C 2639_IgG_C_IGO_12104_33 \n", + "IFNAR_A IFNAR_A 2640_IFNAR_A_IGO_12104_34 \n", + "\n", + " CellRangerGex.inputFastq \\\n", + "Sample \n", + "IgG_A [s3://dp-lab-data/collaborators/lowe... \n", + "IgG_C [s3://dp-lab-data/collaborators/lowe... \n", + "IFNAR_A [s3://dp-lab-data/collaborators/lowe... \n", + "\n", + " CellRangerGex.referenceUrl \\\n", + "Sample \n", + "IgG_A https://cf.10xgenomics.com/supp/cell... \n", + "IgG_C https://cf.10xgenomics.com/supp/cell... \n", + "IFNAR_A https://cf.10xgenomics.com/supp/cell... \n", + "\n", + " CellRangerGex.includeIntrons CellRangerGex.expectCells \\\n", + "Sample \n", + "IgG_A False 5000 \n", + "IgG_C False 5000 \n", + "IFNAR_A False 5000 \n", + "\n", + " CellRangerGex.memory CellRangerGex.dockerRegistry \n", + "Sample \n", + "IgG_A 256 quay.io/hisplan \n", + "IgG_C 256 quay.io/hisplan \n", + "IFNAR_A 256 quay.io/hisplan " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "# Get information for all samples\n", - "sample_paths = [s.strip('/') for s in sample_paths] # remove trailing slash if exists\n", - "sample_names = [os.path.basename(s) for s in sample_paths]\n", - "\n", - "print(sample_names)\n", - "samples = pd.DataFrame(\n", - " sample_paths,\n", - " index=sample_names,\n", - " columns=[\"S3_Path\"],\n", - " dtype=str,\n", - ")\n", - "samples[\"Sample_ID\"] = pd.Series(samples.index).apply(\n", - " lambda x: get_sample_id(x, creds['user'], creds['password'])\n", - ").values\n", - "\n", - "# Get FASTQ paths from S3\n", - "# Note: Uses same FASTQ file ids for all samples\n", - "fastq_file_ids = fastq_map[prefix]\n", - "samples[\"FASTQs\"] = samples[\"S3_Path\"].apply(lambda x: get_fastqs(x, fastq_file_ids, \"FASTQ\"))\n", - "\n", - "# Get reference genome location\n", - "samples[\"Reference\"] = 
samples[\"Sample_ID\"].apply(lambda x: get_cr_reference(x, prefix, creds[\"user\"], creds[\"password\"]))\n", - "#samples[\"Reference\"] = path_to_reference" - ] - }, - { - "cell_type": "code", - "execution_count": 145, - "metadata": {}, - "outputs": [], - "source": [ - "# Load minimum inputs and labels fields from templates\n", - "with open(f\"{config_dir}/template.inputs.json\") as f:\n", - " std_inputs_fields = list(json.load(f).keys())\n", - " \n", - "with open(f\"{config_dir}/template.labels.json\") as f:\n", - " std_labels_fields = list(json.load(f).keys())\n", - " \n", - "# Annotate all samples with workflow inputs and labels\n", - "inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields,)\n", - "labels = pd.DataFrame(index=samples.index, columns=std_labels_fields,)\n", - "\n", - "# Annotate inputs\n", - "inputs[f\"{prefix}.sampleName\"] = inputs.index # may need to change\n", - "inputs[f\"{prefix}.inputFastq\"] = samples[\"FASTQs\"].apply(lambda x: np.ravel(list(x.values())))\n", - "inputs[f\"{prefix}.fastqName\"] = inputs[f\"{prefix}.inputFastq\"].apply(lambda x: get_fastqs_name(x))\n", - "inputs[f\"{prefix}.referenceUrl\"] = samples[\"Reference\"] \n", - "inputs[f\"{prefix}.includeIntrons\"] = False\n", - "inputs[f\"{prefix}.expectCells\"] = 5000\n", - "inputs[f\"{prefix}.memory\"] = 256\n", - "inputs[f\"{prefix}.dockerRegistry\"] = common_docker_registry\n", - "\n", - "# Annotate labels\n", - "labels[\"pipelineType\"] = pipeline_type\n", - "labels[\"project\"] = samples[\"Sample_ID\"].apply(lambda x: get_project_id(x, creds[\"user\"], creds[\"password\"]))\n", - "labels[\"sample\"] = labels.index\n", - "labels[\"owner\"] = creds[\"user\"]\n", - "labels[\"destination\"] = samples['S3_Path'] + \"/\" + output_dirname\n", - "labels[\"transfer\"] = \"-\"\n", - "labels[\"comment\"] = creds[\"user\"]\n", - "\n", - "assert (std_inputs_fields == list(inputs.columns)) & (inputs.notna().values.all())\n", - "assert (std_labels_fields == 
list(labels.columns)) & (labels.notna().values.all())" + "inputs" ] }, { "cell_type": "code", - "execution_count": 146, + "execution_count": 19, + "id": "22230ce9-a027-474f-90f5-c8d12c1096eb", "metadata": {}, "outputs": [ { @@ -630,17 +1369,50 @@ " CellRangerGex.memory\n", " CellRangerGex.dockerRegistry\n", " \n", + " \n", + " Sample\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " SU-1358_C10_T2_on_treatment\n", - " SU-1358_C10_T2_on_treatment\n", - " 3447_SU-1358_C10_T2_on_treatment_IGO...\n", - " [s3://dp-lab-data/sc-seq/Project_124...\n", + " IgG_A\n", + " IgG_A\n", + " 2638_IgG_A_IGO_12104_32\n", + " [s3://dp-lab-data/collaborators/lowe...\n", " https://cf.10xgenomics.com/supp/cell...\n", " False\n", " 5000\n", - " 256\n", + " 384\n", + " quay.io/hisplan\n", + " \n", + " \n", + " IgG_C\n", + " IgG_C\n", + " 2639_IgG_C_IGO_12104_33\n", + " [s3://dp-lab-data/collaborators/lowe...\n", + " https://cf.10xgenomics.com/supp/cell...\n", + " False\n", + " 5000\n", + " 384\n", + " quay.io/hisplan\n", + " \n", + " \n", + " IFNAR_A\n", + " IFNAR_A\n", + " 2640_IFNAR_A_IGO_12104_34\n", + " [s3://dp-lab-data/collaborators/lowe...\n", + " https://cf.10xgenomics.com/supp/cell...\n", + " False\n", + " 5000\n", + " 384\n", " quay.io/hisplan\n", " \n", " \n", @@ -648,40 +1420,52 @@ "" ], "text/plain": [ - " CellRangerGex.sampleName \\\n", - "SU-1358_C10_T2_on_treatment SU-1358_C10_T2_on_treatment \n", - "\n", - " CellRangerGex.fastqName \\\n", - "SU-1358_C10_T2_on_treatment 3447_SU-1358_C10_T2_on_treatment_IGO... \n", + " CellRangerGex.sampleName CellRangerGex.fastqName \\\n", + "Sample \n", + "IgG_A IgG_A 2638_IgG_A_IGO_12104_32 \n", + "IgG_C IgG_C 2639_IgG_C_IGO_12104_33 \n", + "IFNAR_A IFNAR_A 2640_IFNAR_A_IGO_12104_34 \n", "\n", - " CellRangerGex.inputFastq \\\n", - "SU-1358_C10_T2_on_treatment [s3://dp-lab-data/sc-seq/Project_124... 
\n", + " CellRangerGex.inputFastq \\\n", + "Sample \n", + "IgG_A [s3://dp-lab-data/collaborators/lowe... \n", + "IgG_C [s3://dp-lab-data/collaborators/lowe... \n", + "IFNAR_A [s3://dp-lab-data/collaborators/lowe... \n", "\n", - " CellRangerGex.referenceUrl \\\n", - "SU-1358_C10_T2_on_treatment https://cf.10xgenomics.com/supp/cell... \n", + " CellRangerGex.referenceUrl \\\n", + "Sample \n", + "IgG_A https://cf.10xgenomics.com/supp/cell... \n", + "IgG_C https://cf.10xgenomics.com/supp/cell... \n", + "IFNAR_A https://cf.10xgenomics.com/supp/cell... \n", "\n", - " CellRangerGex.includeIntrons \\\n", - "SU-1358_C10_T2_on_treatment False \n", + " CellRangerGex.includeIntrons CellRangerGex.expectCells \\\n", + "Sample \n", + "IgG_A False 5000 \n", + "IgG_C False 5000 \n", + "IFNAR_A False 5000 \n", "\n", - " CellRangerGex.expectCells CellRangerGex.memory \\\n", - "SU-1358_C10_T2_on_treatment 5000 256 \n", - "\n", - " CellRangerGex.dockerRegistry \n", - "SU-1358_C10_T2_on_treatment quay.io/hisplan " + " CellRangerGex.memory CellRangerGex.dockerRegistry \n", + "Sample \n", + "IgG_A 384 quay.io/hisplan \n", + "IgG_C 384 quay.io/hisplan \n", + "IFNAR_A 384 quay.io/hisplan " ] }, - "execution_count": 146, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "inputs[f'{prefix}.memory'] = inputs[f'{prefix}.memory'] * 1.5\n", + "inputs[f'{prefix}.memory'] = inputs[f'{prefix}.memory'].astype(int)\n", "inputs" ] }, { "cell_type": "code", - "execution_count": 149, + "execution_count": 20, + "id": "91318853-0886-4c2b-8b87-76fa8e5ab30d", "metadata": {}, "outputs": [ { @@ -713,37 +1497,67 @@ " transfer\n", " comment\n", " \n", + " \n", + " Sample\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " SU-1358_C10_T2_on_treatment\n", + " IgG_A\n", " CellRangerGex\n", - " POLAR\n", - " SU-1358_C10_T2_on_treatment\n", - " moormana\n", - " s3://dp-lab-data/sc-seq/Project_1243...\n", + " 9p\n", + " IgG_A\n", 
+ " sohailn\n", + " s3://dp-lab-data/collaborators/lowe/...\n", " -\n", - " moormana\n", + " sohailn\n", + " \n", + " \n", + " IgG_C\n", + " CellRangerGex\n", + " 9p\n", + " IgG_C\n", + " sohailn\n", + " s3://dp-lab-data/collaborators/lowe/...\n", + " -\n", + " sohailn\n", + " \n", + " \n", + " IFNAR_A\n", + " CellRangerGex\n", + " 9p\n", + " IFNAR_A\n", + " sohailn\n", + " s3://dp-lab-data/collaborators/lowe/...\n", + " -\n", + " sohailn\n", " \n", " \n", "\n", "" ], "text/plain": [ - " pipelineType project \\\n", - "SU-1358_C10_T2_on_treatment CellRangerGex POLAR \n", + " pipelineType project sample owner \\\n", + "Sample \n", + "IgG_A CellRangerGex 9p IgG_A sohailn \n", + "IgG_C CellRangerGex 9p IgG_C sohailn \n", + "IFNAR_A CellRangerGex 9p IFNAR_A sohailn \n", "\n", - " sample owner \\\n", - "SU-1358_C10_T2_on_treatment SU-1358_C10_T2_on_treatment moormana \n", - "\n", - " destination transfer \\\n", - "SU-1358_C10_T2_on_treatment s3://dp-lab-data/sc-seq/Project_1243... - \n", - "\n", - " comment \n", - "SU-1358_C10_T2_on_treatment moormana " + " destination transfer comment \n", + "Sample \n", + "IgG_A s3://dp-lab-data/collaborators/lowe/... - sohailn \n", + "IgG_C s3://dp-lab-data/collaborators/lowe/... - sohailn \n", + "IFNAR_A s3://dp-lab-data/collaborators/lowe/... 
- sohailn " ] }, - "execution_count": 149, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -754,20 +1568,27 @@ }, { "cell_type": "code", - "execution_count": 150, - "metadata": { - "tags": [] - }, + "execution_count": null, + "id": "acd1c04b-d582-481b-b8d4-bd9a9a8f97c4", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "54ef3d9c-2361-4b8e-bcfe-bc9c3878876b", + "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c261084afa9f437daabaecb8054fea46", + "model_id": "62455c03a5c04c3abc1aee15d02ce014", "version_major": 2, "version_minor": 0 }, "text/plain": [ - " 0%| | 0/1 [00:00IMPORTANT NOTE \n", + "\n", + "You won't be able to see these outputs until the sample processing finishes succesfully :)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e0a6894-0ce4-4041-be2f-49999c972503", + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import IFrame\n", + "\n", + "for sample, row in labels.iterrows():\n", + " destination = row['destination']\n", + " html = f'{destination}/web_summary.html'\n", + " file_out = f'web_summary/{sample}.web_summary.html'\n", + " \n", + " !aws s3 cp $html $file_out\n", + " display(IFrame(src=file_out, width=980, height=600))" + ] } ], "metadata": { @@ -861,5 +1718,5 @@ } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/notebooks/Run_CellRangerVdj.ipynb b/notebooks/Run_CellRangerVdj.ipynb new file mode 100644 index 0000000..f4fbdaf --- /dev/null +++ b/notebooks/Run_CellRangerVdj.ipynb @@ -0,0 +1,1096 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "fbbdcd09-b369-493d-9402-ca18a45eee5d", + "metadata": {}, + "outputs": [], + "source": [ + "import re, subprocess, boto3, json, shlex, mysql, os, urllib, logging\n", + "import pandas as pd\n", + "import numpy as np\n", + "from s3path import S3Path\n", + "from 
pathlib import Path\n", + "from tqdm.notebook import tqdm\n", + "from packaging import version\n", + "\n", + "import glob\n", + "import os\n", + "\n", + "pd.set_option(\"display.max_colwidth\", 40)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ca66d7bc-a113-454a-b4d4-7e2a56203dc4", + "metadata": {}, + "outputs": [], + "source": [ + "from utils.utils import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92d253e4-bb0c-410f-9608-0bb729af6649", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "2de5f2a6-dba8-486d-bb07-92dbeff10150", + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, + "source": [ + "# AWS setup" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1023bdd2-6db6-4274-bd1b-13f092fd11d9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: AWS_ACCESS_KEY_ID=ASIAYPY66CWMGLBAIGO6\n", + "env: AWS_SECRET_ACCESS_KEY=i6fm87l+UNBLyxlwU4AXD2HEGTXYutOlj4lfB9bk\n", + "env: AWS_SESSION_TOKEN=IQoJb3JpZ2luX2VjEK///////////wEaCXVzLWVhc3QtMSJHMEUCIQCAs8AyD3w8/luR3fB4wyOA0fC5vzw2nyzspQ7/+Pgf/AIgBhFvsspOJmLbACqO8/42UuRe7nu6vdi7DaJ5k/UFDxsq7wEISBADGgw1ODM2NDM1Njc1MTIiDP2ESPVfC1OK7pmq+SrMAQRwXr+RvcapTJP/skUqP4og+mqro7RYgBPQ8OQKWReZ0JLWYk2+/npsDqlMM5j7Zre4aQPkttvkyacSoDBo4B/SufvOD5eotZyXsKR5zEjrgn9HTQGxlylIakbUK/7pJSe4pJxAya79ZEC/1PrAUYnZR2ZGGgYgRF0oIwzbLblSoAmBYeNbbkbluJC3qbicERnH/BHlO4oHXPMesuS+QncRWn8rlRxGRySgTaW+gcH2tBwDBhB+RURyCtXC3AyJmQ7tp4MerTHhTToIaTCIqe2YBjqYAbdygCv6Qz2i+YqCafTb2npOoT7PPA6ooeWnXXzZXPc8POEASCBAq6bOIWvGWcKxwVwgode8mb8/R9oBnXQh0wdqm+ufpGuUAaVE8Y8rgNGXiKnCq/QywzTtjpgOTvJkYw8WPUWE6J8KhN10WqJtEdvif9E15Zqj7H69BL+Zy2wayYbu35OTPBuK3q4QlAVNk5fS+Uvkwxyz\n", + "2021-10-07 15:31:32 agc-583643567512-us-east-1\n", + "2021-10-07 15:28:07 cdktoolkit-stagingbucket-d49u1xfb0sc4\n", + "2019-10-09 13:04:06 cf-templates-umiwbnq3566w-us-east-1\n", + "2019-10-10 12:46:54 
dp-daily-reports\n", + "2017-09-26 10:15:15 dp-lab-data\n", + "2019-05-23 12:34:58 dp-lab-data-public\n", + "2018-03-01 13:29:07 dp-lab-glacier\n", + "2020-11-06 23:02:27 dp-lab-gwf-core\n", + "2021-07-15 18:14:01 dp-lab-gwf-core2\n", + "2017-09-26 10:17:18 dp-lab-home\n", + "2019-08-14 16:42:43 dp-lab-test\n", + "2019-04-25 12:35:35 elasticbeanstalk-us-east-1-583643567512\n", + "2019-12-13 15:53:57 scri-computational\n", + "2017-09-26 10:23:50 seqc-public\n" + ] + } + ], + "source": [ + "# Load aws\n", + "\n", + "%env AWS_ACCESS_KEY_ID=ASIAYPY66CWMGLBAIGO6\n", + "%env AWS_SECRET_ACCESS_KEY=i6fm87l+UNBLyxlwU4AXD2HEGTXYutOlj4lfB9bk\n", + "%env AWS_SESSION_TOKEN=IQoJb3JpZ2luX2VjEK///////////wEaCXVzLWVhc3QtMSJHMEUCIQCAs8AyD3w8/luR3fB4wyOA0fC5vzw2nyzspQ7/+Pgf/AIgBhFvsspOJmLbACqO8/42UuRe7nu6vdi7DaJ5k/UFDxsq7wEISBADGgw1ODM2NDM1Njc1MTIiDP2ESPVfC1OK7pmq+SrMAQRwXr+RvcapTJP/skUqP4og+mqro7RYgBPQ8OQKWReZ0JLWYk2+/npsDqlMM5j7Zre4aQPkttvkyacSoDBo4B/SufvOD5eotZyXsKR5zEjrgn9HTQGxlylIakbUK/7pJSe4pJxAya79ZEC/1PrAUYnZR2ZGGgYgRF0oIwzbLblSoAmBYeNbbkbluJC3qbicERnH/BHlO4oHXPMesuS+QncRWn8rlRxGRySgTaW+gcH2tBwDBhB+RURyCtXC3AyJmQ7tp4MerTHhTToIaTCIqe2YBjqYAbdygCv6Qz2i+YqCafTb2npOoT7PPA6ooeWnXXzZXPc8POEASCBAq6bOIWvGWcKxwVwgode8mb8/R9oBnXQh0wdqm+ufpGuUAaVE8Y8rgNGXiKnCq/QywzTtjpgOTvJkYw8WPUWE6J8KhN10WqJtEdvif9E15Zqj7H69BL+Zy2wayYbu35OTPBuK3q4QlAVNk5fS+Uvkwxyz\n", + "!aws s3 ls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec0b1510-6658-48b8-a9ec-76e52d414357", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "f3151feb-24bb-4ef2-89db-c6e2ef1ea6f1", + "metadata": { + "tags": [] + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "344f296d-dcda-4c92-9a18-57d587e6827e", + "metadata": {}, + "outputs": [], + "source": [ + "# Which pipeline are you running\n", + "\n", + "prefix = \"CellRangerVdj\" # Workflow to run; also .wdl filename prefix\n", + "output_dirname = 
\"cr-vdj-results\"\n", + "\n", + "workflow_dir = glob.glob(f\"{Path.home()}/scing/bin/cellranger-vdj-*\")[0]\n", + "path_to_exec = f\"{workflow_dir}/submit.sh\" # CHANGE THIS FOR SHARP\n", + "\n", + "# Locations of workflow-related directories and files\n", + "path_to_cromwell_secrets = f\"{Path.home()}/.cromwell/cromwell-secrets.json\"\n", + "db_credentials_path = f\"{Path.home()}/.config.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9db6a119-0ad5-4965-b988-9c85de2591b6", + "metadata": {}, + "outputs": [], + "source": [ + "# Location of docker files\n", + "common_docker_registry = \"quay.io/hisplan\"\n", + "pipeline_type = prefix # field in *.labels.json\n", + "comment = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "66f7b9a1-9d9d-4afa-9dd9-91ac30b6cefb", + "metadata": {}, + "outputs": [], + "source": [ + "# Workflow file paths\n", + "config_dir = f\"{workflow_dir}/configs\"\n", + "path_to_options = f\"{workflow_dir}/{prefix}.options.aws.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a067d6c1-49da-4901-ba59-bd21c5a69871", + "metadata": {}, + "outputs": [], + "source": [ + "# Set credentials based on SCRIdb CLI config file\n", + "with open(db_credentials_path) as f:\n", + " creds = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5778d94-6cb3-474c-b4ad-e051ccd66602", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "343fb1a2-db6a-4ada-a12f-a62493d55834", + "metadata": {}, + "source": [ + "# Sample information" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "3bad3986-1d7e-4194-928b-51b4e2debe35", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " SELECT Sample, AWS_storage, id\n", + " FROM peer_lab_db.sample_data \n", + " WHERE peer_lab_db.sample_data.id = \"4006\"\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreference
Sample
PM-1779_LC479A_5primes3://dp-lab-data/collaborators/rudin...4006human10X_5primePre-_post-IO NSCLCGRCh38-3.0.0
\n", + "
" + ], + "text/plain": [ + " AWS_storage id species \\\n", + "Sample \n", + "PM-1779_LC479A_5prime s3://dp-lab-data/collaborators/rudin... 4006 human \n", + "\n", + " sc_tech project_id reference \n", + "Sample \n", + "PM-1779_LC479A_5prime 10X_5prime Pre-_post-IO NSCLC GRCh38-3.0.0 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can supply a list of IDs or request IDs\n", + "\n", + "# request_ids = ['PM-1779']\n", + "# samples = sample_scridb_info(request_ids, 'request_id', creds)\n", + "\n", + "sample_ids = [4006]\n", + "samples = sample_scridb_info(sample_ids, 'id', creds)\n", + "\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ea67344-a367-4906-9fa4-041c7f0b4d9c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "422c1151-9e53-4ff3-a178-7a37d69a586d", + "metadata": {}, + "source": [ + "IMPORTANT NOTE \n", + "\n", + "Check the name of the folder you are running. Typically the folder that is stored in the database is just the GEX. So if another library is generated (multiome ATAC, VDJ, hashtag, etc) then it needs to be manually changed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "502590c9-15ac-40e1-8d78-1aba15cb02cd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PM-1779_LC479A_5prime\n", + " PRE PM-1779_LC479A_5prime/\n", + " PRE PM-1779_LC479A_5prime_TCR_VDJ/\n", + "\n" + ] + } + ], + "source": [ + "# Check the name of the folder you are running\n", + "# Especially if there are multiple libraries (i.e ATAC, TCR_VDJ, etc.)\n", + "\n", + "for sample, row in samples.iterrows():\n", + " print(sample)\n", + " path = os.path.split(row['AWS_storage'])[0] + '/'\n", + " os.system(f'aws s3 ls {path} | grep {sample}')\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6b77de06-8a75-4131-a435-0aac440fdb80", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['s3://dp-lab-data/collaborators/rudin/PrePostIoNsclc/PM-1779_LC479A_5prime_TCR_VDJ']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Modify the path to be the correct folder if necessary\n", + "\n", + "samples['AWS_storage'] += '_TCR_VDJ'\n", + "list(samples['AWS_storage'].values)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e85154ba-47ad-4f3a-b2c8-9c81f2d33e23", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreferenceFASTQs
Sample
PM-1779_LC479A_5primes3://dp-lab-data/collaborators/rudin...4006human10X_5primePre-_post-IO NSCLCGRCh38-3.0.0{'I1': ['s3://dp-lab-data/collaborat...
\n", + "
" + ], + "text/plain": [ + " AWS_storage id species \\\n", + "Sample \n", + "PM-1779_LC479A_5prime s3://dp-lab-data/collaborators/rudin... 4006 human \n", + "\n", + " sc_tech project_id reference \\\n", + "Sample \n", + "PM-1779_LC479A_5prime 10X_5prime Pre-_post-IO NSCLC GRCh38-3.0.0 \n", + "\n", + " FASTQs \n", + "Sample \n", + "PM-1779_LC479A_5prime {'I1': ['s3://dp-lab-data/collaborat... " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples[\"FASTQs\"] = samples[\"AWS_storage\"].apply(lambda x: get_fastqs(x, fastq_map[prefix], \"FASTQ\"))\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2ac745a-92f1-4140-bb67-d904d7de581a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "45b907a9-53cc-4cf7-a678-5d2f4798e457", + "metadata": {}, + "source": [ + "IMPORTANT NOTE \n", + "\n", + "For CellRanger you need to supply an HTTPS path. So if you are using a custom genome stored on AWS, you must make the reference public !" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "16d0ca53-edc7-4e1b-9c9c-0d812a1ae1c7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreferenceFASTQs
Sample
PM-1779_LC479A_5primes3://dp-lab-data/collaborators/rudin...4006human10X_5primePre-_post-IO NSCLCGRCh38{'I1': ['s3://dp-lab-data/collaborat...
\n", + "
" + ], + "text/plain": [ + " AWS_storage id species \\\n", + "Sample \n", + "PM-1779_LC479A_5prime s3://dp-lab-data/collaborators/rudin... 4006 human \n", + "\n", + " sc_tech project_id reference \\\n", + "Sample \n", + "PM-1779_LC479A_5prime 10X_5prime Pre-_post-IO NSCLC GRCh38 \n", + "\n", + " FASTQs \n", + "Sample \n", + "PM-1779_LC479A_5prime {'I1': ['s3://dp-lab-data/collaborat... " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples = update_ref(samples, prefix)\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d58e81b0-43b9-433f-a462-85e9d99ed00f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "9d999472-93a6-4a4c-ab82-48fd5ade8894", + "metadata": {}, + "source": [ + "# Generate inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "ddcab3aa-023e-41bb-8de5-0d9c7454d7c1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CellRangerVdj.sampleNameCellRangerVdj.fastqNamesCellRangerVdj.referenceGenomeCellRangerVdj.inputFastqCellRangerVdj.chainCellRangerVdj.dockerRegistry
Sample
PM-1779_LC479A_5primePM-1779_LC479A_5prime4006_PM-1779_LC479A_5prime_TCR_VDJ_I...GRCh38[s3://dp-lab-data/collaborators/rudi...autoquay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " CellRangerVdj.sampleName \\\n", + "Sample \n", + "PM-1779_LC479A_5prime PM-1779_LC479A_5prime \n", + "\n", + " CellRangerVdj.fastqNames \\\n", + "Sample \n", + "PM-1779_LC479A_5prime 4006_PM-1779_LC479A_5prime_TCR_VDJ_I... \n", + "\n", + " CellRangerVdj.referenceGenome \\\n", + "Sample \n", + "PM-1779_LC479A_5prime GRCh38 \n", + "\n", + " CellRangerVdj.inputFastq \\\n", + "Sample \n", + "PM-1779_LC479A_5prime [s3://dp-lab-data/collaborators/rudi... \n", + "\n", + " CellRangerVdj.chain CellRangerVdj.dockerRegistry \n", + "Sample \n", + "PM-1779_LC479A_5prime auto quay.io/hisplan " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Standard inputs, modify fields as need be\n", + "\n", + "# Load minimum inputs and labels fields from templates\n", + "with open(f\"{config_dir}/template.inputs.json\") as f:\n", + " std_inputs_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields,)\n", + "inputs[f'{prefix}.sampleName'] = samples.index\n", + "inputs[f'{prefix}.inputFastq'] = samples[\"FASTQs\"].apply(lambda x: np.ravel(list(x.values())))\n", + "inputs[f'{prefix}.fastqNames'] = inputs[f\"{prefix}.inputFastq\"].apply(lambda x: get_fastqs_name(x))\n", + "inputs[f'{prefix}.referenceGenome'] = samples['reference']\n", + "inputs[f'{prefix}.chain'] = 'auto'\n", + "inputs[f'{prefix}.dockerRegistry'] = common_docker_registry\n", + "\n", + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "c30e1518-da11-4069-a501-32f9e04ef389", + "metadata": {}, + "outputs": [], + "source": [ + "# Modification for specific samples\n", + "\n", + "inputs[f'{prefix}.chain'] = 'TR'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5edf8be1-51c3-4163-a776-d673c6264fc3", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + 
"cell_type": "markdown", + "id": "133f5b48-cef9-480b-b824-7d96b253c909", + "metadata": {}, + "source": [ + "# Generate labels" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "59710506-35a6-487b-8e94-ca86513e29c1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
Sample
PM-1779_LC479A_5primeCellRangerVdjPre-_post-IO NSCLCPM-1779_LC479A_5primesohailns3://dp-lab-data/collaborators/rudin...-sohailn
\n", + "
" + ], + "text/plain": [ + " pipelineType project \\\n", + "Sample \n", + "PM-1779_LC479A_5prime CellRangerVdj Pre-_post-IO NSCLC \n", + "\n", + " sample owner \\\n", + "Sample \n", + "PM-1779_LC479A_5prime PM-1779_LC479A_5prime sohailn \n", + "\n", + " destination transfer \\\n", + "Sample \n", + "PM-1779_LC479A_5prime s3://dp-lab-data/collaborators/rudin... - \n", + "\n", + " comment \n", + "Sample \n", + "PM-1779_LC479A_5prime sohailn " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Standard labels, modify fields as need be\n", + "\n", + "with open(f\"{config_dir}/template.labels.json\") as f:\n", + " std_labels_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "labels = pd.DataFrame(index=samples.index, columns=std_labels_fields,)\n", + "\n", + "labels[\"pipelineType\"] = pipeline_type\n", + "labels[\"project\"] = samples['project_id']\n", + "labels[\"sample\"] = labels.index\n", + "labels[\"owner\"] = creds[\"user\"]\n", + "labels[\"destination\"] = samples['AWS_storage'] + \"/\" + output_dirname\n", + "labels[\"transfer\"] = \"-\"\n", + "labels[\"comment\"] = creds[\"user\"]\n", + "\n", + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ac67f52-2cf2-42f6-9d19-11e164021e40", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "51ea6068-d51b-494f-819b-4f9fda7a0ac6", + "metadata": {}, + "source": [ + "# Run samples" + ] + }, + { + "cell_type": "markdown", + "id": "3417414b-07c1-4bd3-886d-4751c6ec5693", + "metadata": {}, + "source": [ + "Look over the samples before submitting one last time" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "6a77a19c-37bf-4f6d-93ff-5b5d4f33fbc7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CellRangerVdj.sampleNameCellRangerVdj.fastqNamesCellRangerVdj.referenceGenomeCellRangerVdj.inputFastqCellRangerVdj.chainCellRangerVdj.dockerRegistry
Sample
PM-1779_LC479A_5primePM-1779_LC479A_5prime4006_PM-1779_LC479A_5prime_TCR_VDJ_I...GRCh38[s3://dp-lab-data/collaborators/rudi...TRquay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " CellRangerVdj.sampleName \\\n", + "Sample \n", + "PM-1779_LC479A_5prime PM-1779_LC479A_5prime \n", + "\n", + " CellRangerVdj.fastqNames \\\n", + "Sample \n", + "PM-1779_LC479A_5prime 4006_PM-1779_LC479A_5prime_TCR_VDJ_I... \n", + "\n", + " CellRangerVdj.referenceGenome \\\n", + "Sample \n", + "PM-1779_LC479A_5prime GRCh38 \n", + "\n", + " CellRangerVdj.inputFastq \\\n", + "Sample \n", + "PM-1779_LC479A_5prime [s3://dp-lab-data/collaborators/rudi... \n", + "\n", + " CellRangerVdj.chain CellRangerVdj.dockerRegistry \n", + "Sample \n", + "PM-1779_LC479A_5prime TR quay.io/hisplan " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "58110f32-95b2-4446-8cee-843985fc273e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
Sample
PM-1779_LC479A_5primeCellRangerVdjPre-_post-IO NSCLCPM-1779_LC479A_5primesohailns3://dp-lab-data/collaborators/rudin...-sohailn
\n", + "
" + ], + "text/plain": [ + " pipelineType project \\\n", + "Sample \n", + "PM-1779_LC479A_5prime CellRangerVdj Pre-_post-IO NSCLC \n", + "\n", + " sample owner \\\n", + "Sample \n", + "PM-1779_LC479A_5prime PM-1779_LC479A_5prime sohailn \n", + "\n", + " destination transfer \\\n", + "Sample \n", + "PM-1779_LC479A_5prime s3://dp-lab-data/collaborators/rudin... - \n", + "\n", + " comment \n", + "Sample \n", + "PM-1779_LC479A_5prime sohailn " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a0c8d8a-a6b8-4f03-b48d-9482d310d5ad", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "a58189c6-1fd1-4e6a-ae7b-7835530867f7", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2450bf742d3548c3a60eece5e6dfb53b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreference
Sample
ARN-1379_CP-18-21_multiomes3://dp-lab-data/collaborators/aboir...3358humanmultiomeHuman multiomeGRCh38
ARN-1379_CP-18-24_multiomes3://dp-lab-data/collaborators/aboir...3359humanmultiomeHuman multiomeGRCh38
ARN-1379_CP-18-21_multiome_ATACs3://dp-lab-data/collaborators/aboir...3360humanmultiomeHuman multiomeGRCh38
ARN-1379_CP-18-24_multiome_ATACs3://dp-lab-data/collaborators/aboir...3361humanmultiomeHuman multiomeGRCh38
ARN-1449_RA19-09_multiomes3://dp-lab-data/collaborators/aboir...3469humanmultiomeHuman multiomeGRCh38
ARN-1449_RA18-16_multiomes3://dp-lab-data/collaborators/aboir...3470humanmultiomeHuman multiomeGRCh38
ARN-1449_RA19-09_multiome_ATACs3://dp-lab-data/collaborators/aboir...3471humanmultiomeHuman multiomeGRCh38
ARN-1449_RA18-16_multiome_ATACs3://dp-lab-data/collaborators/aboir...3472humanmultiomeHuman multiomeGRCh38
\n", + "" + ], + "text/plain": [ + " AWS_storage \\\n", + "Sample \n", + "ARN-1379_CP-18-21_multiome s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1379_CP-18-24_multiome s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1379_CP-18-21_multiome_ATAC s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1379_CP-18-24_multiome_ATAC s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1449_RA19-09_multiome s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1449_RA18-16_multiome s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1449_RA19-09_multiome_ATAC s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1449_RA18-16_multiome_ATAC s3://dp-lab-data/collaborators/aboir... \n", + "\n", + " id species sc_tech project_id \\\n", + "Sample \n", + "ARN-1379_CP-18-21_multiome 3358 human multiome Human multiome \n", + "ARN-1379_CP-18-24_multiome 3359 human multiome Human multiome \n", + "ARN-1379_CP-18-21_multiome_ATAC 3360 human multiome Human multiome \n", + "ARN-1379_CP-18-24_multiome_ATAC 3361 human multiome Human multiome \n", + "ARN-1449_RA19-09_multiome 3469 human multiome Human multiome \n", + "ARN-1449_RA18-16_multiome 3470 human multiome Human multiome \n", + "ARN-1449_RA19-09_multiome_ATAC 3471 human multiome Human multiome \n", + "ARN-1449_RA18-16_multiome_ATAC 3472 human multiome Human multiome \n", + "\n", + " reference \n", + "Sample \n", + "ARN-1379_CP-18-21_multiome GRCh38 \n", + "ARN-1379_CP-18-24_multiome GRCh38 \n", + "ARN-1379_CP-18-21_multiome_ATAC GRCh38 \n", + "ARN-1379_CP-18-24_multiome_ATAC GRCh38 \n", + "ARN-1449_RA19-09_multiome GRCh38 \n", + "ARN-1449_RA18-16_multiome GRCh38 \n", + "ARN-1449_RA19-09_multiome_ATAC GRCh38 \n", + "ARN-1449_RA18-16_multiome_ATAC GRCh38 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can supply a list of IDs or request IDs\n", + "\n", + "request_ids = ['ARN-1449', 'ARN-1379']\n", + "samples = sample_scridb_info(request_ids, 'request_id', 
creds)\n", + "\n", + "# sample_ids = [4006]\n", + "# samples = sample_scridb_info(sample_ids, 'id', creds)\n", + "\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ea67344-a367-4906-9fa4-041c7f0b4d9c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "e85154ba-47ad-4f3a-b2c8-9c81f2d33e23", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreferenceFASTQs
Sample
ARN-1379_CP-18-21_multiomes3://dp-lab-data/collaborators/aboir...3358humanmultiomeHuman multiomeGRCh38[s3://dp-lab-data/collaborators/aboi...
ARN-1379_CP-18-24_multiomes3://dp-lab-data/collaborators/aboir...3359humanmultiomeHuman multiomeGRCh38[s3://dp-lab-data/collaborators/aboi...
ARN-1379_CP-18-21_multiome_ATACs3://dp-lab-data/collaborators/aboir...3360humanmultiomeHuman multiomeGRCh38[s3://dp-lab-data/collaborators/aboi...
ARN-1379_CP-18-24_multiome_ATACs3://dp-lab-data/collaborators/aboir...3361humanmultiomeHuman multiomeGRCh38[s3://dp-lab-data/collaborators/aboi...
ARN-1449_RA19-09_multiomes3://dp-lab-data/collaborators/aboir...3469humanmultiomeHuman multiomeGRCh38[s3://dp-lab-data/collaborators/aboi...
ARN-1449_RA18-16_multiomes3://dp-lab-data/collaborators/aboir...3470humanmultiomeHuman multiomeGRCh38[s3://dp-lab-data/collaborators/aboi...
ARN-1449_RA19-09_multiome_ATACs3://dp-lab-data/collaborators/aboir...3471humanmultiomeHuman multiomeGRCh38[s3://dp-lab-data/collaborators/aboi...
ARN-1449_RA18-16_multiome_ATACs3://dp-lab-data/collaborators/aboir...3472humanmultiomeHuman multiomeGRCh38[s3://dp-lab-data/collaborators/aboi...
\n", + "
" + ], + "text/plain": [ + " AWS_storage \\\n", + "Sample \n", + "ARN-1379_CP-18-21_multiome s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1379_CP-18-24_multiome s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1379_CP-18-21_multiome_ATAC s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1379_CP-18-24_multiome_ATAC s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1449_RA19-09_multiome s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1449_RA18-16_multiome s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1449_RA19-09_multiome_ATAC s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1449_RA18-16_multiome_ATAC s3://dp-lab-data/collaborators/aboir... \n", + "\n", + " id species sc_tech project_id \\\n", + "Sample \n", + "ARN-1379_CP-18-21_multiome 3358 human multiome Human multiome \n", + "ARN-1379_CP-18-24_multiome 3359 human multiome Human multiome \n", + "ARN-1379_CP-18-21_multiome_ATAC 3360 human multiome Human multiome \n", + "ARN-1379_CP-18-24_multiome_ATAC 3361 human multiome Human multiome \n", + "ARN-1449_RA19-09_multiome 3469 human multiome Human multiome \n", + "ARN-1449_RA18-16_multiome 3470 human multiome Human multiome \n", + "ARN-1449_RA19-09_multiome_ATAC 3471 human multiome Human multiome \n", + "ARN-1449_RA18-16_multiome_ATAC 3472 human multiome Human multiome \n", + "\n", + " reference \\\n", + "Sample \n", + "ARN-1379_CP-18-21_multiome GRCh38 \n", + "ARN-1379_CP-18-24_multiome GRCh38 \n", + "ARN-1379_CP-18-21_multiome_ATAC GRCh38 \n", + "ARN-1379_CP-18-24_multiome_ATAC GRCh38 \n", + "ARN-1449_RA19-09_multiome GRCh38 \n", + "ARN-1449_RA18-16_multiome GRCh38 \n", + "ARN-1449_RA19-09_multiome_ATAC GRCh38 \n", + "ARN-1449_RA18-16_multiome_ATAC GRCh38 \n", + "\n", + " FASTQs \n", + "Sample \n", + "ARN-1379_CP-18-21_multiome [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1379_CP-18-24_multiome [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1379_CP-18-21_multiome_ATAC [s3://dp-lab-data/collaborators/aboi... 
\n", + "ARN-1379_CP-18-24_multiome_ATAC [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1449_RA19-09_multiome [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1449_RA18-16_multiome [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1449_RA19-09_multiome_ATAC [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1449_RA18-16_multiome_ATAC [s3://dp-lab-data/collaborators/aboi... " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples[\"FASTQs\"] = samples[\"AWS_storage\"].apply(lambda x: get_all_fastqs(x, \"FASTQ\"))\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d58e81b0-43b9-433f-a462-85e9d99ed00f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "9d999472-93a6-4a4c-ab82-48fd5ade8894", + "metadata": {}, + "source": [ + "# Generate inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "ddcab3aa-023e-41bb-8de5-0d9c7454d7c1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FastQC.fastqFilesFastQC.dockerRegistry
Sample
ARN-1379_CP-18-21_multiome[s3://dp-lab-data/collaborators/aboi...quay.io/hisplan
ARN-1379_CP-18-24_multiome[s3://dp-lab-data/collaborators/aboi...quay.io/hisplan
ARN-1379_CP-18-21_multiome_ATAC[s3://dp-lab-data/collaborators/aboi...quay.io/hisplan
ARN-1379_CP-18-24_multiome_ATAC[s3://dp-lab-data/collaborators/aboi...quay.io/hisplan
ARN-1449_RA19-09_multiome[s3://dp-lab-data/collaborators/aboi...quay.io/hisplan
ARN-1449_RA18-16_multiome[s3://dp-lab-data/collaborators/aboi...quay.io/hisplan
ARN-1449_RA19-09_multiome_ATAC[s3://dp-lab-data/collaborators/aboi...quay.io/hisplan
ARN-1449_RA18-16_multiome_ATAC[s3://dp-lab-data/collaborators/aboi...quay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " FastQC.fastqFiles \\\n", + "Sample \n", + "ARN-1379_CP-18-21_multiome [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1379_CP-18-24_multiome [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1379_CP-18-21_multiome_ATAC [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1379_CP-18-24_multiome_ATAC [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1449_RA19-09_multiome [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1449_RA18-16_multiome [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1449_RA19-09_multiome_ATAC [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1449_RA18-16_multiome_ATAC [s3://dp-lab-data/collaborators/aboi... \n", + "\n", + " FastQC.dockerRegistry \n", + "Sample \n", + "ARN-1379_CP-18-21_multiome quay.io/hisplan \n", + "ARN-1379_CP-18-24_multiome quay.io/hisplan \n", + "ARN-1379_CP-18-21_multiome_ATAC quay.io/hisplan \n", + "ARN-1379_CP-18-24_multiome_ATAC quay.io/hisplan \n", + "ARN-1449_RA19-09_multiome quay.io/hisplan \n", + "ARN-1449_RA18-16_multiome quay.io/hisplan \n", + "ARN-1449_RA19-09_multiome_ATAC quay.io/hisplan \n", + "ARN-1449_RA18-16_multiome_ATAC quay.io/hisplan " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Standard inputs, modify fields as need be\n", + "\n", + "# Load minimum inputs and labels fields from templates\n", + "with open(f\"{config_dir}/template.inputs.json\") as f:\n", + " std_inputs_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields,)\n", + "inputs[f'{prefix}.fastqFiles'] = samples[\"FASTQs\"]\n", + "inputs[f'{prefix}.dockerRegistry'] = common_docker_registry\n", + "\n", + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5edf8be1-51c3-4163-a776-d673c6264fc3", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", 
+ "id": "133f5b48-cef9-480b-b824-7d96b253c909", + "metadata": {}, + "source": [ + "# Generate labels" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "59710506-35a6-487b-8e94-ca86513e29c1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
Sample
ARN-1379_CP-18-21_multiomeFastQCHuman multiomeARN-1379_CP-18-21_multiomesohailns3://dp-lab-data/collaborators/aboir...-sohailn
ARN-1379_CP-18-24_multiomeFastQCHuman multiomeARN-1379_CP-18-24_multiomesohailns3://dp-lab-data/collaborators/aboir...-sohailn
ARN-1379_CP-18-21_multiome_ATACFastQCHuman multiomeARN-1379_CP-18-21_multiome_ATACsohailns3://dp-lab-data/collaborators/aboir...-sohailn
ARN-1379_CP-18-24_multiome_ATACFastQCHuman multiomeARN-1379_CP-18-24_multiome_ATACsohailns3://dp-lab-data/collaborators/aboir...-sohailn
ARN-1449_RA19-09_multiomeFastQCHuman multiomeARN-1449_RA19-09_multiomesohailns3://dp-lab-data/collaborators/aboir...-sohailn
ARN-1449_RA18-16_multiomeFastQCHuman multiomeARN-1449_RA18-16_multiomesohailns3://dp-lab-data/collaborators/aboir...-sohailn
ARN-1449_RA19-09_multiome_ATACFastQCHuman multiomeARN-1449_RA19-09_multiome_ATACsohailns3://dp-lab-data/collaborators/aboir...-sohailn
ARN-1449_RA18-16_multiome_ATACFastQCHuman multiomeARN-1449_RA18-16_multiome_ATACsohailns3://dp-lab-data/collaborators/aboir...-sohailn
\n", + "
" + ], + "text/plain": [ + " pipelineType project \\\n", + "Sample \n", + "ARN-1379_CP-18-21_multiome FastQC Human multiome \n", + "ARN-1379_CP-18-24_multiome FastQC Human multiome \n", + "ARN-1379_CP-18-21_multiome_ATAC FastQC Human multiome \n", + "ARN-1379_CP-18-24_multiome_ATAC FastQC Human multiome \n", + "ARN-1449_RA19-09_multiome FastQC Human multiome \n", + "ARN-1449_RA18-16_multiome FastQC Human multiome \n", + "ARN-1449_RA19-09_multiome_ATAC FastQC Human multiome \n", + "ARN-1449_RA18-16_multiome_ATAC FastQC Human multiome \n", + "\n", + " sample owner \\\n", + "Sample \n", + "ARN-1379_CP-18-21_multiome ARN-1379_CP-18-21_multiome sohailn \n", + "ARN-1379_CP-18-24_multiome ARN-1379_CP-18-24_multiome sohailn \n", + "ARN-1379_CP-18-21_multiome_ATAC ARN-1379_CP-18-21_multiome_ATAC sohailn \n", + "ARN-1379_CP-18-24_multiome_ATAC ARN-1379_CP-18-24_multiome_ATAC sohailn \n", + "ARN-1449_RA19-09_multiome ARN-1449_RA19-09_multiome sohailn \n", + "ARN-1449_RA18-16_multiome ARN-1449_RA18-16_multiome sohailn \n", + "ARN-1449_RA19-09_multiome_ATAC ARN-1449_RA19-09_multiome_ATAC sohailn \n", + "ARN-1449_RA18-16_multiome_ATAC ARN-1449_RA18-16_multiome_ATAC sohailn \n", + "\n", + " destination \\\n", + "Sample \n", + "ARN-1379_CP-18-21_multiome s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1379_CP-18-24_multiome s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1379_CP-18-21_multiome_ATAC s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1379_CP-18-24_multiome_ATAC s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1449_RA19-09_multiome s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1449_RA18-16_multiome s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1449_RA19-09_multiome_ATAC s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1449_RA18-16_multiome_ATAC s3://dp-lab-data/collaborators/aboir... 
\n", + "\n", + " transfer comment \n", + "Sample \n", + "ARN-1379_CP-18-21_multiome - sohailn \n", + "ARN-1379_CP-18-24_multiome - sohailn \n", + "ARN-1379_CP-18-21_multiome_ATAC - sohailn \n", + "ARN-1379_CP-18-24_multiome_ATAC - sohailn \n", + "ARN-1449_RA19-09_multiome - sohailn \n", + "ARN-1449_RA18-16_multiome - sohailn \n", + "ARN-1449_RA19-09_multiome_ATAC - sohailn \n", + "ARN-1449_RA18-16_multiome_ATAC - sohailn " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Standard labels, modify fields as need be\n", + "\n", + "with open(f\"{config_dir}/template.labels.json\") as f:\n", + " std_labels_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "labels = pd.DataFrame(index=samples.index, columns=std_labels_fields,)\n", + "\n", + "labels[\"pipelineType\"] = pipeline_type\n", + "labels[\"project\"] = samples['project_id']\n", + "labels[\"sample\"] = labels.index\n", + "labels[\"owner\"] = creds[\"user\"]\n", + "labels[\"destination\"] = samples['AWS_storage'] + \"/\" + output_dirname\n", + "labels[\"transfer\"] = \"-\"\n", + "labels[\"comment\"] = creds[\"user\"]\n", + "\n", + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ac67f52-2cf2-42f6-9d19-11e164021e40", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "51ea6068-d51b-494f-819b-4f9fda7a0ac6", + "metadata": {}, + "source": [ + "# Run samples" + ] + }, + { + "cell_type": "markdown", + "id": "3417414b-07c1-4bd3-886d-4751c6ec5693", + "metadata": {}, + "source": [ + "Look over the samples before submitting one last time" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "6a77a19c-37bf-4f6d-93ff-5b5d4f33fbc7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FastQC.fastqFilesFastQC.dockerRegistry
Sample
ARN-1379_CP-18-21_multiome[s3://dp-lab-data/collaborators/aboi...quay.io/hisplan
ARN-1379_CP-18-24_multiome[s3://dp-lab-data/collaborators/aboi...quay.io/hisplan
ARN-1379_CP-18-21_multiome_ATAC[s3://dp-lab-data/collaborators/aboi...quay.io/hisplan
ARN-1379_CP-18-24_multiome_ATAC[s3://dp-lab-data/collaborators/aboi...quay.io/hisplan
ARN-1449_RA19-09_multiome[s3://dp-lab-data/collaborators/aboi...quay.io/hisplan
ARN-1449_RA18-16_multiome[s3://dp-lab-data/collaborators/aboi...quay.io/hisplan
ARN-1449_RA19-09_multiome_ATAC[s3://dp-lab-data/collaborators/aboi...quay.io/hisplan
ARN-1449_RA18-16_multiome_ATAC[s3://dp-lab-data/collaborators/aboi...quay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " FastQC.fastqFiles \\\n", + "Sample \n", + "ARN-1379_CP-18-21_multiome [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1379_CP-18-24_multiome [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1379_CP-18-21_multiome_ATAC [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1379_CP-18-24_multiome_ATAC [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1449_RA19-09_multiome [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1449_RA18-16_multiome [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1449_RA19-09_multiome_ATAC [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1449_RA18-16_multiome_ATAC [s3://dp-lab-data/collaborators/aboi... \n", + "\n", + " FastQC.dockerRegistry \n", + "Sample \n", + "ARN-1379_CP-18-21_multiome quay.io/hisplan \n", + "ARN-1379_CP-18-24_multiome quay.io/hisplan \n", + "ARN-1379_CP-18-21_multiome_ATAC quay.io/hisplan \n", + "ARN-1379_CP-18-24_multiome_ATAC quay.io/hisplan \n", + "ARN-1449_RA19-09_multiome quay.io/hisplan \n", + "ARN-1449_RA18-16_multiome quay.io/hisplan \n", + "ARN-1449_RA19-09_multiome_ATAC quay.io/hisplan \n", + "ARN-1449_RA18-16_multiome_ATAC quay.io/hisplan " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "58110f32-95b2-4446-8cee-843985fc273e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
Sample
ARN-1379_CP-18-21_multiomeFastQCHuman multiomeARN-1379_CP-18-21_multiomesohailns3://dp-lab-data/collaborators/aboir...-sohailn
ARN-1379_CP-18-24_multiomeFastQCHuman multiomeARN-1379_CP-18-24_multiomesohailns3://dp-lab-data/collaborators/aboir...-sohailn
ARN-1379_CP-18-21_multiome_ATACFastQCHuman multiomeARN-1379_CP-18-21_multiome_ATACsohailns3://dp-lab-data/collaborators/aboir...-sohailn
ARN-1379_CP-18-24_multiome_ATACFastQCHuman multiomeARN-1379_CP-18-24_multiome_ATACsohailns3://dp-lab-data/collaborators/aboir...-sohailn
ARN-1449_RA19-09_multiomeFastQCHuman multiomeARN-1449_RA19-09_multiomesohailns3://dp-lab-data/collaborators/aboir...-sohailn
ARN-1449_RA18-16_multiomeFastQCHuman multiomeARN-1449_RA18-16_multiomesohailns3://dp-lab-data/collaborators/aboir...-sohailn
ARN-1449_RA19-09_multiome_ATACFastQCHuman multiomeARN-1449_RA19-09_multiome_ATACsohailns3://dp-lab-data/collaborators/aboir...-sohailn
ARN-1449_RA18-16_multiome_ATACFastQCHuman multiomeARN-1449_RA18-16_multiome_ATACsohailns3://dp-lab-data/collaborators/aboir...-sohailn
\n", + "
" + ], + "text/plain": [ + " pipelineType project \\\n", + "Sample \n", + "ARN-1379_CP-18-21_multiome FastQC Human multiome \n", + "ARN-1379_CP-18-24_multiome FastQC Human multiome \n", + "ARN-1379_CP-18-21_multiome_ATAC FastQC Human multiome \n", + "ARN-1379_CP-18-24_multiome_ATAC FastQC Human multiome \n", + "ARN-1449_RA19-09_multiome FastQC Human multiome \n", + "ARN-1449_RA18-16_multiome FastQC Human multiome \n", + "ARN-1449_RA19-09_multiome_ATAC FastQC Human multiome \n", + "ARN-1449_RA18-16_multiome_ATAC FastQC Human multiome \n", + "\n", + " sample owner \\\n", + "Sample \n", + "ARN-1379_CP-18-21_multiome ARN-1379_CP-18-21_multiome sohailn \n", + "ARN-1379_CP-18-24_multiome ARN-1379_CP-18-24_multiome sohailn \n", + "ARN-1379_CP-18-21_multiome_ATAC ARN-1379_CP-18-21_multiome_ATAC sohailn \n", + "ARN-1379_CP-18-24_multiome_ATAC ARN-1379_CP-18-24_multiome_ATAC sohailn \n", + "ARN-1449_RA19-09_multiome ARN-1449_RA19-09_multiome sohailn \n", + "ARN-1449_RA18-16_multiome ARN-1449_RA18-16_multiome sohailn \n", + "ARN-1449_RA19-09_multiome_ATAC ARN-1449_RA19-09_multiome_ATAC sohailn \n", + "ARN-1449_RA18-16_multiome_ATAC ARN-1449_RA18-16_multiome_ATAC sohailn \n", + "\n", + " destination \\\n", + "Sample \n", + "ARN-1379_CP-18-21_multiome s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1379_CP-18-24_multiome s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1379_CP-18-21_multiome_ATAC s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1379_CP-18-24_multiome_ATAC s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1449_RA19-09_multiome s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1449_RA18-16_multiome s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1449_RA19-09_multiome_ATAC s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1449_RA18-16_multiome_ATAC s3://dp-lab-data/collaborators/aboir... 
\n", + "\n", + " transfer comment \n", + "Sample \n", + "ARN-1379_CP-18-21_multiome - sohailn \n", + "ARN-1379_CP-18-24_multiome - sohailn \n", + "ARN-1379_CP-18-21_multiome_ATAC - sohailn \n", + "ARN-1379_CP-18-24_multiome_ATAC - sohailn \n", + "ARN-1449_RA19-09_multiome - sohailn \n", + "ARN-1449_RA18-16_multiome - sohailn \n", + "ARN-1449_RA19-09_multiome_ATAC - sohailn \n", + "ARN-1449_RA18-16_multiome_ATAC - sohailn " + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a0c8d8a-a6b8-4f03-b48d-9482d310d5ad", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "a58189c6-1fd1-4e6a-ae7b-7835530867f7", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "58dc01dd19864fe8b228368cf811a994", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/8 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
S3_pathSample_ID
AV-1759_Ru1083_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3924
AV-1760_MSK_LX_1083c_T_2_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3925
AV-1761_POSIE_101920_T_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3926
AV-1762_Ru1083d_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3927
AV-1763_Ru1250C_T_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3928
AV-1764_MSK_LX_1250b_PM_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3929
AV-1764_Ru1250D_T_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3930
AV-1765_Ru1250e_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3931
AV-1766_MSK_LX_1250f_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3932
AV-1760_Ru263_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3933
\n", + "" + ], + "text/plain": [ + " S3_path \\\n", + "AV-1759_Ru1083_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1760_MSK_LX_1083c_T_2_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1761_POSIE_101920_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1762_Ru1083d_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1763_Ru1250C_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1764_MSK_LX_1250b_PM_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1764_Ru1250D_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1765_Ru1250e_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1766_MSK_LX_1250f_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1760_Ru263_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "\n", + " Sample_ID \n", + "AV-1759_Ru1083_MITO 3924 \n", + "AV-1760_MSK_LX_1083c_T_2_MITO 3925 \n", + "AV-1761_POSIE_101920_T_1_MITO 3926 \n", + "AV-1762_Ru1083d_MITO 3927 \n", + "AV-1763_Ru1250C_T_1_MITO 3928 \n", + "AV-1764_MSK_LX_1250b_PM_1_MITO 3929 \n", + "AV-1764_Ru1250D_T_1_MITO 3930 \n", + "AV-1765_Ru1250e_MITO 3931 \n", + "AV-1766_MSK_LX_1250f_MITO 3932 \n", + "AV-1760_Ru263_MITO 3933 " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Common query col: id, request_id, Sample\n", + "sample_id = list(range(3924, 3934))\n", + "\n", + "samples = format_sample_aws(sample_id, 'id', creds)\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "359fbd22-702d-4763-85c2-0410b0611bb6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
S3_pathSample_IDindexassaybarcode
RA19_10_13_FACs_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3008GRCh38-3.0.0CRNaN
RA19_10_14_FACs_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3009GRCh38-3.0.0CRNaN
RA19_10_17_FACs_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3010GRCh38-3.0.0CRNaN
RA19_10_18_FACs_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3011GRCh38-3.0.0CRNaN
RA19_10_23_FACs_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3012GRCh38-3.0.0CRNaN
RA19_10_14_FACS_citric_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3013GRCh38-3.0.0CRNaN
\n", + "
" + ], + "text/plain": [ + " S3_path \\\n", + "RA19_10_13_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_14_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_17_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_18_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_23_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_14_FACS_citric_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "\n", + " Sample_ID index assay barcode \n", + "RA19_10_13_FACs_MITO 3008 GRCh38-3.0.0 CR NaN \n", + "RA19_10_14_FACs_MITO 3009 GRCh38-3.0.0 CR NaN \n", + "RA19_10_17_FACs_MITO 3010 GRCh38-3.0.0 CR NaN \n", + "RA19_10_18_FACs_MITO 3011 GRCh38-3.0.0 CR NaN \n", + "RA19_10_23_FACs_MITO 3012 GRCh38-3.0.0 CR NaN \n", + "RA19_10_14_FACS_citric_MITO 3013 GRCh38-3.0.0 CR NaN " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples = format_assay_barcode(samples, creds)\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "a14b9418-dab4-4aaf-8abb-14f9e0534152", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
S3_pathSample_IDindexassaybarcodefastq
RA19_10_13_FACs_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3008GRCh38-3.0.0CRNaN{'R1': ['s3://dp-lab-data/SCRI_Proje...
RA19_10_14_FACs_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3009GRCh38-3.0.0CRNaN{'R1': ['s3://dp-lab-data/SCRI_Proje...
RA19_10_17_FACs_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3010GRCh38-3.0.0CRNaN{'R1': ['s3://dp-lab-data/SCRI_Proje...
RA19_10_18_FACs_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3011GRCh38-3.0.0CRNaN{'R1': ['s3://dp-lab-data/SCRI_Proje...
RA19_10_23_FACs_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3012GRCh38-3.0.0CRNaN{'R1': ['s3://dp-lab-data/SCRI_Proje...
RA19_10_14_FACS_citric_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3013GRCh38-3.0.0CRNaN{'R1': ['s3://dp-lab-data/SCRI_Proje...
\n", + "
" + ], + "text/plain": [ + " S3_path \\\n", + "RA19_10_13_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_14_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_17_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_18_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_23_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_14_FACS_citric_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "\n", + " Sample_ID index assay barcode \\\n", + "RA19_10_13_FACs_MITO 3008 GRCh38-3.0.0 CR NaN \n", + "RA19_10_14_FACs_MITO 3009 GRCh38-3.0.0 CR NaN \n", + "RA19_10_17_FACs_MITO 3010 GRCh38-3.0.0 CR NaN \n", + "RA19_10_18_FACs_MITO 3011 GRCh38-3.0.0 CR NaN \n", + "RA19_10_23_FACs_MITO 3012 GRCh38-3.0.0 CR NaN \n", + "RA19_10_14_FACS_citric_MITO 3013 GRCh38-3.0.0 CR NaN \n", + "\n", + " fastq \n", + "RA19_10_13_FACs_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "RA19_10_14_FACs_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "RA19_10_17_FACs_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "RA19_10_18_FACs_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "RA19_10_23_FACs_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "RA19_10_14_FACS_citric_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples = get_barcode_genomic_fastqs(samples)\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "7fd5ce73-8d7f-4d23-9784-4774c7da80a8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
S3_pathSample_IDindexassaybarcodefastq
RA19_10_14_FACs_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3009GRCh38-3.0.0CRNaN{'R1': ['s3://dp-lab-data/SCRI_Proje...
RA19_10_17_FACs_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3010GRCh38-3.0.0CRNaN{'R1': ['s3://dp-lab-data/SCRI_Proje...
RA19_10_18_FACs_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3011GRCh38-3.0.0CRNaN{'R1': ['s3://dp-lab-data/SCRI_Proje...
RA19_10_23_FACs_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3012GRCh38-3.0.0CRNaN{'R1': ['s3://dp-lab-data/SCRI_Proje...
RA19_10_14_FACS_citric_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3013GRCh38-3.0.0CRNaN{'R1': ['s3://dp-lab-data/SCRI_Proje...
\n", + "
" + ], + "text/plain": [ + " S3_path \\\n", + "RA19_10_14_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_17_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_18_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_23_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_14_FACS_citric_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "\n", + " Sample_ID index assay barcode \\\n", + "RA19_10_14_FACs_MITO 3009 GRCh38-3.0.0 CR NaN \n", + "RA19_10_17_FACs_MITO 3010 GRCh38-3.0.0 CR NaN \n", + "RA19_10_18_FACs_MITO 3011 GRCh38-3.0.0 CR NaN \n", + "RA19_10_23_FACs_MITO 3012 GRCh38-3.0.0 CR NaN \n", + "RA19_10_14_FACS_citric_MITO 3013 GRCh38-3.0.0 CR NaN \n", + "\n", + " fastq \n", + "RA19_10_14_FACs_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "RA19_10_17_FACs_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "RA19_10_18_FACs_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "RA19_10_23_FACs_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "RA19_10_14_FACS_citric_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This sample has already been processed\n", + "samples = samples[samples.index != 'RA19_10_13_FACs_MITO']\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9992a85d-6af9-449c-9eec-1ad4fc8aa2b5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "85065328-a68e-46a7-9304-88953148416e", + "metadata": { + "tags": [] + }, + "source": [ + "## Make input file" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "ce68536f-7029-4689-83a7-239421284398", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MitoTracing.sampleNameMitoTracing.fastqNameMitoTracing.fastqR1MitoTracing.fastqR2MitoTracing.referenceMitoTracing.includeIntronsMitoTracing.expectCellsMitoTracing.numCoresMitoTracing.memoryMitoTracing.whitelistMitoTracing.dockerRegistry
RA19_10_14_FACs_MITONaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
RA19_10_17_FACs_MITONaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
RA19_10_18_FACs_MITONaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
RA19_10_23_FACs_MITONaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
RA19_10_14_FACS_citric_MITONaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " MitoTracing.sampleName MitoTracing.fastqName \\\n", + "RA19_10_14_FACs_MITO NaN NaN \n", + "RA19_10_17_FACs_MITO NaN NaN \n", + "RA19_10_18_FACs_MITO NaN NaN \n", + "RA19_10_23_FACs_MITO NaN NaN \n", + "RA19_10_14_FACS_citric_MITO NaN NaN \n", + "\n", + " MitoTracing.fastqR1 MitoTracing.fastqR2 \\\n", + "RA19_10_14_FACs_MITO NaN NaN \n", + "RA19_10_17_FACs_MITO NaN NaN \n", + "RA19_10_18_FACs_MITO NaN NaN \n", + "RA19_10_23_FACs_MITO NaN NaN \n", + "RA19_10_14_FACS_citric_MITO NaN NaN \n", + "\n", + " MitoTracing.reference MitoTracing.includeIntrons \\\n", + "RA19_10_14_FACs_MITO NaN NaN \n", + "RA19_10_17_FACs_MITO NaN NaN \n", + "RA19_10_18_FACs_MITO NaN NaN \n", + "RA19_10_23_FACs_MITO NaN NaN \n", + "RA19_10_14_FACS_citric_MITO NaN NaN \n", + "\n", + " MitoTracing.expectCells MitoTracing.numCores \\\n", + "RA19_10_14_FACs_MITO NaN NaN \n", + "RA19_10_17_FACs_MITO NaN NaN \n", + "RA19_10_18_FACs_MITO NaN NaN \n", + "RA19_10_23_FACs_MITO NaN NaN \n", + "RA19_10_14_FACS_citric_MITO NaN NaN \n", + "\n", + " MitoTracing.memory MitoTracing.whitelist \\\n", + "RA19_10_14_FACs_MITO NaN NaN \n", + "RA19_10_17_FACs_MITO NaN NaN \n", + "RA19_10_18_FACs_MITO NaN NaN \n", + "RA19_10_23_FACs_MITO NaN NaN \n", + "RA19_10_14_FACS_citric_MITO NaN NaN \n", + "\n", + " MitoTracing.dockerRegistry \n", + "RA19_10_14_FACs_MITO NaN \n", + "RA19_10_17_FACs_MITO NaN \n", + "RA19_10_18_FACs_MITO NaN \n", + "RA19_10_23_FACs_MITO NaN \n", + "RA19_10_14_FACS_citric_MITO NaN " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load minimum inputs and labels fields from input template\n", + "with open(f\"{config_dir}/{template_prefix}.inputs.json\") as f:\n", + " std_inputs_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields,)\n", + "inputs" + ] + }, + { + "cell_type": 
"code", + "execution_count": 21, + "id": "bd8a0e97-49ad-4295-a4ff-ca83e90732c8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MitoTracing.sampleNameMitoTracing.fastqNameMitoTracing.fastqR1MitoTracing.fastqR2MitoTracing.referenceMitoTracing.includeIntronsMitoTracing.expectCellsMitoTracing.numCoresMitoTracing.memoryMitoTracing.whitelistMitoTracing.dockerRegistry
RA19_10_14_FACs_MITONaNNaNNaNNaNNaNFalse500016128NaNquay.io/hisplan
RA19_10_17_FACs_MITONaNNaNNaNNaNNaNFalse500016128NaNquay.io/hisplan
RA19_10_18_FACs_MITONaNNaNNaNNaNNaNFalse500016128NaNquay.io/hisplan
RA19_10_23_FACs_MITONaNNaNNaNNaNNaNFalse500016128NaNquay.io/hisplan
RA19_10_14_FACS_citric_MITONaNNaNNaNNaNNaNFalse500016128NaNquay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " MitoTracing.sampleName MitoTracing.fastqName \\\n", + "RA19_10_14_FACs_MITO NaN NaN \n", + "RA19_10_17_FACs_MITO NaN NaN \n", + "RA19_10_18_FACs_MITO NaN NaN \n", + "RA19_10_23_FACs_MITO NaN NaN \n", + "RA19_10_14_FACS_citric_MITO NaN NaN \n", + "\n", + " MitoTracing.fastqR1 MitoTracing.fastqR2 \\\n", + "RA19_10_14_FACs_MITO NaN NaN \n", + "RA19_10_17_FACs_MITO NaN NaN \n", + "RA19_10_18_FACs_MITO NaN NaN \n", + "RA19_10_23_FACs_MITO NaN NaN \n", + "RA19_10_14_FACS_citric_MITO NaN NaN \n", + "\n", + " MitoTracing.reference MitoTracing.includeIntrons \\\n", + "RA19_10_14_FACs_MITO NaN False \n", + "RA19_10_17_FACs_MITO NaN False \n", + "RA19_10_18_FACs_MITO NaN False \n", + "RA19_10_23_FACs_MITO NaN False \n", + "RA19_10_14_FACS_citric_MITO NaN False \n", + "\n", + " MitoTracing.expectCells MitoTracing.numCores \\\n", + "RA19_10_14_FACs_MITO 5000 16 \n", + "RA19_10_17_FACs_MITO 5000 16 \n", + "RA19_10_18_FACs_MITO 5000 16 \n", + "RA19_10_23_FACs_MITO 5000 16 \n", + "RA19_10_14_FACS_citric_MITO 5000 16 \n", + "\n", + " MitoTracing.memory MitoTracing.whitelist \\\n", + "RA19_10_14_FACs_MITO 128 NaN \n", + "RA19_10_17_FACs_MITO 128 NaN \n", + "RA19_10_18_FACs_MITO 128 NaN \n", + "RA19_10_23_FACs_MITO 128 NaN \n", + "RA19_10_14_FACS_citric_MITO 128 NaN \n", + "\n", + " MitoTracing.dockerRegistry \n", + "RA19_10_14_FACs_MITO quay.io/hisplan \n", + "RA19_10_17_FACs_MITO quay.io/hisplan \n", + "RA19_10_18_FACs_MITO quay.io/hisplan \n", + "RA19_10_23_FACs_MITO quay.io/hisplan \n", + "RA19_10_14_FACS_citric_MITO quay.io/hisplan " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Default inputs\n", + "inputs[f\"{prefix}.includeIntrons\"] = False\n", + "inputs[f\"{prefix}.expectCells\"] = 5000\n", + "inputs[f\"{prefix}.numCores\"] = 16\n", + "inputs[f\"{prefix}.memory\"] = 128\n", + "\n", + "inputs[f\"{prefix}.dockerRegistry\"] = common_docker_registry\n", + "\n", + "inputs" + ] + }, + 
{ + "cell_type": "code", + "execution_count": 22, + "id": "db50eb55-40dc-4840-bc61-63525c2afb2b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MitoTracing.sampleNameMitoTracing.fastqNameMitoTracing.fastqR1MitoTracing.fastqR2MitoTracing.referenceMitoTracing.includeIntronsMitoTracing.expectCellsMitoTracing.numCoresMitoTracing.memoryMitoTracing.whitelistMitoTracing.dockerRegistry
RA19_10_14_FACs_MITORA19_10_14_FACs_MITO3009_RA19_10_14_FACs_MITO_IGO_12411_5[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...NaNFalse500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
RA19_10_17_FACs_MITORA19_10_17_FACs_MITO3010_RA19_10_17_FACs_MITO_IGO_12411_6[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...NaNFalse500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
RA19_10_18_FACs_MITORA19_10_18_FACs_MITO3011_RA19_10_18_FACs_MITO_IGO_12411_7[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...NaNFalse500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
RA19_10_23_FACs_MITORA19_10_23_FACs_MITO3012_RA19_10_23_FACs_MITO_IGO_12411_8[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...NaNFalse500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
RA19_10_14_FACS_citric_MITORA19_10_14_FACS_citric_MITO3013_RA19_10_14_FACS_citric_MITO_IGO...[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...NaNFalse500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " MitoTracing.sampleName \\\n", + "RA19_10_14_FACs_MITO RA19_10_14_FACs_MITO \n", + "RA19_10_17_FACs_MITO RA19_10_17_FACs_MITO \n", + "RA19_10_18_FACs_MITO RA19_10_18_FACs_MITO \n", + "RA19_10_23_FACs_MITO RA19_10_23_FACs_MITO \n", + "RA19_10_14_FACS_citric_MITO RA19_10_14_FACS_citric_MITO \n", + "\n", + " MitoTracing.fastqName \\\n", + "RA19_10_14_FACs_MITO 3009_RA19_10_14_FACs_MITO_IGO_12411_5 \n", + "RA19_10_17_FACs_MITO 3010_RA19_10_17_FACs_MITO_IGO_12411_6 \n", + "RA19_10_18_FACs_MITO 3011_RA19_10_18_FACs_MITO_IGO_12411_7 \n", + "RA19_10_23_FACs_MITO 3012_RA19_10_23_FACs_MITO_IGO_12411_8 \n", + "RA19_10_14_FACS_citric_MITO 3013_RA19_10_14_FACS_citric_MITO_IGO... \n", + "\n", + " MitoTracing.fastqR1 \\\n", + "RA19_10_14_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_17_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_18_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_23_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_14_FACS_citric_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "\n", + " MitoTracing.fastqR2 \\\n", + "RA19_10_14_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_17_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_18_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_23_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_14_FACS_citric_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... 
\n", + "\n", + " MitoTracing.reference MitoTracing.includeIntrons \\\n", + "RA19_10_14_FACs_MITO NaN False \n", + "RA19_10_17_FACs_MITO NaN False \n", + "RA19_10_18_FACs_MITO NaN False \n", + "RA19_10_23_FACs_MITO NaN False \n", + "RA19_10_14_FACS_citric_MITO NaN False \n", + "\n", + " MitoTracing.expectCells MitoTracing.numCores \\\n", + "RA19_10_14_FACs_MITO 5000 16 \n", + "RA19_10_17_FACs_MITO 5000 16 \n", + "RA19_10_18_FACs_MITO 5000 16 \n", + "RA19_10_23_FACs_MITO 5000 16 \n", + "RA19_10_14_FACS_citric_MITO 5000 16 \n", + "\n", + " MitoTracing.memory \\\n", + "RA19_10_14_FACs_MITO 128 \n", + "RA19_10_17_FACs_MITO 128 \n", + "RA19_10_18_FACs_MITO 128 \n", + "RA19_10_23_FACs_MITO 128 \n", + "RA19_10_14_FACS_citric_MITO 128 \n", + "\n", + " MitoTracing.whitelist \\\n", + "RA19_10_14_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_17_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_18_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_23_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_14_FACS_citric_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... 
\n", + "\n", + " MitoTracing.dockerRegistry \n", + "RA19_10_14_FACs_MITO quay.io/hisplan \n", + "RA19_10_17_FACs_MITO quay.io/hisplan \n", + "RA19_10_18_FACs_MITO quay.io/hisplan \n", + "RA19_10_23_FACs_MITO quay.io/hisplan \n", + "RA19_10_14_FACS_citric_MITO quay.io/hisplan " + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Sample information\n", + "inputs[f\"{prefix}.sampleName\"] = samples.index.tolist()\n", + "\n", + "inputs[f\"{prefix}.fastqR1\"] = samples['fastq'].str['R1']\n", + "inputs[f\"{prefix}.fastqR2\"] = samples['fastq'].str['R2']\n", + "\n", + "for sample in inputs.index:\n", + " fastqR1_name = get_fastqs_name(inputs.loc[sample, 'MitoTracing.fastqR1'])\n", + " fastqR2_name = get_fastqs_name(inputs.loc[sample, 'MitoTracing.fastqR2'])\n", + " \n", + " assert(fastqR1_name == fastqR1_name)\n", + " inputs.loc[sample, f\"{prefix}.fastqName\"] = fastqR1_name\n", + " \n", + " # Check the whitelist to make sure that barcodes are in nucleotide format\n", + " inputs.loc[sample, f\"{prefix}.whitelist\"] = get_whitelist(samples.loc[sample, 'S3_path'])\n", + "\n", + "\n", + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "101eb0d4-4ca6-4639-bb67-afe47109a251", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MitoTracing.sampleNameMitoTracing.fastqNameMitoTracing.fastqR1MitoTracing.fastqR2MitoTracing.referenceMitoTracing.includeIntronsMitoTracing.expectCellsMitoTracing.numCoresMitoTracing.memoryMitoTracing.whitelistMitoTracing.dockerRegistry
RA19_10_14_FACs_MITORA19_10_14_FACs_MITO3009_RA19_10_14_FACs_MITO_IGO_12411_5[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
RA19_10_17_FACs_MITORA19_10_17_FACs_MITO3010_RA19_10_17_FACs_MITO_IGO_12411_6[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
RA19_10_18_FACs_MITORA19_10_18_FACs_MITO3011_RA19_10_18_FACs_MITO_IGO_12411_7[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
RA19_10_23_FACs_MITORA19_10_23_FACs_MITO3012_RA19_10_23_FACs_MITO_IGO_12411_8[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
RA19_10_14_FACS_citric_MITORA19_10_14_FACS_citric_MITO3013_RA19_10_14_FACS_citric_MITO_IGO...[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " MitoTracing.sampleName \\\n", + "RA19_10_14_FACs_MITO RA19_10_14_FACs_MITO \n", + "RA19_10_17_FACs_MITO RA19_10_17_FACs_MITO \n", + "RA19_10_18_FACs_MITO RA19_10_18_FACs_MITO \n", + "RA19_10_23_FACs_MITO RA19_10_23_FACs_MITO \n", + "RA19_10_14_FACS_citric_MITO RA19_10_14_FACS_citric_MITO \n", + "\n", + " MitoTracing.fastqName \\\n", + "RA19_10_14_FACs_MITO 3009_RA19_10_14_FACs_MITO_IGO_12411_5 \n", + "RA19_10_17_FACs_MITO 3010_RA19_10_17_FACs_MITO_IGO_12411_6 \n", + "RA19_10_18_FACs_MITO 3011_RA19_10_18_FACs_MITO_IGO_12411_7 \n", + "RA19_10_23_FACs_MITO 3012_RA19_10_23_FACs_MITO_IGO_12411_8 \n", + "RA19_10_14_FACS_citric_MITO 3013_RA19_10_14_FACS_citric_MITO_IGO... \n", + "\n", + " MitoTracing.fastqR1 \\\n", + "RA19_10_14_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_17_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_18_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_23_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_14_FACS_citric_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "\n", + " MitoTracing.fastqR2 \\\n", + "RA19_10_14_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_17_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_18_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_23_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_14_FACS_citric_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "\n", + " MitoTracing.reference \\\n", + "RA19_10_14_FACs_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "RA19_10_17_FACs_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "RA19_10_18_FACs_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "RA19_10_23_FACs_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "RA19_10_14_FACS_citric_MITO {'cellRangerRefPkg': 's3://dp-lab-da... 
\n", + "\n", + " MitoTracing.includeIntrons \\\n", + "RA19_10_14_FACs_MITO False \n", + "RA19_10_17_FACs_MITO False \n", + "RA19_10_18_FACs_MITO False \n", + "RA19_10_23_FACs_MITO False \n", + "RA19_10_14_FACS_citric_MITO False \n", + "\n", + " MitoTracing.expectCells MitoTracing.numCores \\\n", + "RA19_10_14_FACs_MITO 5000 16 \n", + "RA19_10_17_FACs_MITO 5000 16 \n", + "RA19_10_18_FACs_MITO 5000 16 \n", + "RA19_10_23_FACs_MITO 5000 16 \n", + "RA19_10_14_FACS_citric_MITO 5000 16 \n", + "\n", + " MitoTracing.memory \\\n", + "RA19_10_14_FACs_MITO 128 \n", + "RA19_10_17_FACs_MITO 128 \n", + "RA19_10_18_FACs_MITO 128 \n", + "RA19_10_23_FACs_MITO 128 \n", + "RA19_10_14_FACS_citric_MITO 128 \n", + "\n", + " MitoTracing.whitelist \\\n", + "RA19_10_14_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_17_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_18_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_23_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_14_FACS_citric_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... 
\n", + "\n", + " MitoTracing.dockerRegistry \n", + "RA19_10_14_FACs_MITO quay.io/hisplan \n", + "RA19_10_17_FACs_MITO quay.io/hisplan \n", + "RA19_10_18_FACs_MITO quay.io/hisplan \n", + "RA19_10_23_FACs_MITO quay.io/hisplan \n", + "RA19_10_14_FACS_citric_MITO quay.io/hisplan " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Reference created by Jaeyoung\n", + "cellRangerRefPkg=\"s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/reference-packages/refdata-gex-mito-GRCh38-ensemble98.tar.gz\"\n", + "mitoFastaRefPkg=\"s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/reference-packages/mito-fasta-GRCh38.tar.gz\"\n", + "\n", + "reference = [dict(cellRangerRefPkg=cellRangerRefPkg,\n", + " mitoFastaRefPkg=mitoFastaRefPkg)] * len(inputs)\n", + "inputs[f\"{prefix}.reference\"] = reference\n", + "\n", + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8f12841-72fe-49c6-b91c-2381f7fc62be", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "0f737133-5115-4ee6-bf93-5443e5c652d1", + "metadata": { + "tags": [] + }, + "source": [ + "## Make label file" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "6ca8c556-8254-4051-93ee-5447d43abfbb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
RA19_10_14_FACs_MITONaNNaNNaNNaNNaNNaNNaN
RA19_10_17_FACs_MITONaNNaNNaNNaNNaNNaNNaN
RA19_10_18_FACs_MITONaNNaNNaNNaNNaNNaNNaN
RA19_10_23_FACs_MITONaNNaNNaNNaNNaNNaNNaN
RA19_10_14_FACS_citric_MITONaNNaNNaNNaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " pipelineType project sample owner destination \\\n", + "RA19_10_14_FACs_MITO NaN NaN NaN NaN NaN \n", + "RA19_10_17_FACs_MITO NaN NaN NaN NaN NaN \n", + "RA19_10_18_FACs_MITO NaN NaN NaN NaN NaN \n", + "RA19_10_23_FACs_MITO NaN NaN NaN NaN NaN \n", + "RA19_10_14_FACS_citric_MITO NaN NaN NaN NaN NaN \n", + "\n", + " transfer comment \n", + "RA19_10_14_FACs_MITO NaN NaN \n", + "RA19_10_17_FACs_MITO NaN NaN \n", + "RA19_10_18_FACs_MITO NaN NaN \n", + "RA19_10_23_FACs_MITO NaN NaN \n", + "RA19_10_14_FACS_citric_MITO NaN NaN " + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load minimum inputs and labels fields from labels template\n", + "with open(f\"{config_dir}/{template_prefix}.labels.json\") as f:\n", + " std_labels_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow labels\n", + "labels = pd.DataFrame(index=samples.index, columns=std_labels_fields,)\n", + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "0731efde-30bb-419b-b5a4-b8420911f309", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
RA19_10_14_FACs_MITOMitoTracingHTAN_CITEseqRA19_10_14_FACs_MITOsohailns3://dp-lab-data/SCRI_Projects/HTA/M...-sohailn
RA19_10_17_FACs_MITOMitoTracingHTAN_CITEseqRA19_10_17_FACs_MITOsohailns3://dp-lab-data/SCRI_Projects/HTA/M...-sohailn
RA19_10_18_FACs_MITOMitoTracingHTAN_CITEseqRA19_10_18_FACs_MITOsohailns3://dp-lab-data/SCRI_Projects/HTA/M...-sohailn
RA19_10_23_FACs_MITOMitoTracingHTAN_CITEseqRA19_10_23_FACs_MITOsohailns3://dp-lab-data/SCRI_Projects/HTA/M...-sohailn
RA19_10_14_FACS_citric_MITOMitoTracingHTAN_CITEseqRA19_10_14_FACS_citric_MITOsohailns3://dp-lab-data/SCRI_Projects/HTA/M...-sohailn
\n", + "
" + ], + "text/plain": [ + " pipelineType project \\\n", + "RA19_10_14_FACs_MITO MitoTracing HTAN_CITEseq \n", + "RA19_10_17_FACs_MITO MitoTracing HTAN_CITEseq \n", + "RA19_10_18_FACs_MITO MitoTracing HTAN_CITEseq \n", + "RA19_10_23_FACs_MITO MitoTracing HTAN_CITEseq \n", + "RA19_10_14_FACS_citric_MITO MitoTracing HTAN_CITEseq \n", + "\n", + " sample owner \\\n", + "RA19_10_14_FACs_MITO RA19_10_14_FACs_MITO sohailn \n", + "RA19_10_17_FACs_MITO RA19_10_17_FACs_MITO sohailn \n", + "RA19_10_18_FACs_MITO RA19_10_18_FACs_MITO sohailn \n", + "RA19_10_23_FACs_MITO RA19_10_23_FACs_MITO sohailn \n", + "RA19_10_14_FACS_citric_MITO RA19_10_14_FACS_citric_MITO sohailn \n", + "\n", + " destination transfer \\\n", + "RA19_10_14_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... - \n", + "RA19_10_17_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... - \n", + "RA19_10_18_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... - \n", + "RA19_10_23_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... - \n", + "RA19_10_14_FACS_citric_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... 
- \n", + "\n", + " comment \n", + "RA19_10_14_FACs_MITO sohailn \n", + "RA19_10_17_FACs_MITO sohailn \n", + "RA19_10_18_FACs_MITO sohailn \n", + "RA19_10_23_FACs_MITO sohailn \n", + "RA19_10_14_FACS_citric_MITO sohailn " + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Annotate labels\n", + "labels[\"pipelineType\"] = pipeline_type\n", + "labels[\"project\"] = samples[\"Sample_ID\"].apply(lambda x: get_project_id(x, creds))\n", + "labels[\"sample\"] = labels.index\n", + "labels[\"owner\"] = creds[\"user\"]\n", + "labels[\"destination\"] = samples['S3_path'] + \"/\" + output_dirname\n", + "labels[\"transfer\"] = \"-\"\n", + "labels[\"comment\"] = creds[\"user\"]\n", + "\n", + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5973e09-e9bf-4415-acee-9e1809afdf5b", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "84007e1e-ae11-4665-ba47-aab5b803af27", + "metadata": {}, + "source": [ + "# Submit job" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "68d260a1-cca0-41d6-9c84-d5f1aef94128", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MitoTracing.sampleNameMitoTracing.fastqNameMitoTracing.fastqR1MitoTracing.fastqR2MitoTracing.referenceMitoTracing.includeIntronsMitoTracing.expectCellsMitoTracing.numCoresMitoTracing.memoryMitoTracing.whitelistMitoTracing.dockerRegistry
RA19_10_14_FACs_MITORA19_10_14_FACs_MITO3009_RA19_10_14_FACs_MITO_IGO_12411_5[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
RA19_10_17_FACs_MITORA19_10_17_FACs_MITO3010_RA19_10_17_FACs_MITO_IGO_12411_6[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
RA19_10_18_FACs_MITORA19_10_18_FACs_MITO3011_RA19_10_18_FACs_MITO_IGO_12411_7[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
RA19_10_23_FACs_MITORA19_10_23_FACs_MITO3012_RA19_10_23_FACs_MITO_IGO_12411_8[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
RA19_10_14_FACS_citric_MITORA19_10_14_FACS_citric_MITO3013_RA19_10_14_FACS_citric_MITO_IGO...[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " MitoTracing.sampleName \\\n", + "RA19_10_14_FACs_MITO RA19_10_14_FACs_MITO \n", + "RA19_10_17_FACs_MITO RA19_10_17_FACs_MITO \n", + "RA19_10_18_FACs_MITO RA19_10_18_FACs_MITO \n", + "RA19_10_23_FACs_MITO RA19_10_23_FACs_MITO \n", + "RA19_10_14_FACS_citric_MITO RA19_10_14_FACS_citric_MITO \n", + "\n", + " MitoTracing.fastqName \\\n", + "RA19_10_14_FACs_MITO 3009_RA19_10_14_FACs_MITO_IGO_12411_5 \n", + "RA19_10_17_FACs_MITO 3010_RA19_10_17_FACs_MITO_IGO_12411_6 \n", + "RA19_10_18_FACs_MITO 3011_RA19_10_18_FACs_MITO_IGO_12411_7 \n", + "RA19_10_23_FACs_MITO 3012_RA19_10_23_FACs_MITO_IGO_12411_8 \n", + "RA19_10_14_FACS_citric_MITO 3013_RA19_10_14_FACS_citric_MITO_IGO... \n", + "\n", + " MitoTracing.fastqR1 \\\n", + "RA19_10_14_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_17_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_18_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_23_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_14_FACS_citric_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "\n", + " MitoTracing.fastqR2 \\\n", + "RA19_10_14_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_17_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_18_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_23_FACs_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "RA19_10_14_FACS_citric_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "\n", + " MitoTracing.reference \\\n", + "RA19_10_14_FACs_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "RA19_10_17_FACs_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "RA19_10_18_FACs_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "RA19_10_23_FACs_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "RA19_10_14_FACS_citric_MITO {'cellRangerRefPkg': 's3://dp-lab-da... 
\n", + "\n", + " MitoTracing.includeIntrons \\\n", + "RA19_10_14_FACs_MITO False \n", + "RA19_10_17_FACs_MITO False \n", + "RA19_10_18_FACs_MITO False \n", + "RA19_10_23_FACs_MITO False \n", + "RA19_10_14_FACS_citric_MITO False \n", + "\n", + " MitoTracing.expectCells MitoTracing.numCores \\\n", + "RA19_10_14_FACs_MITO 5000 16 \n", + "RA19_10_17_FACs_MITO 5000 16 \n", + "RA19_10_18_FACs_MITO 5000 16 \n", + "RA19_10_23_FACs_MITO 5000 16 \n", + "RA19_10_14_FACS_citric_MITO 5000 16 \n", + "\n", + " MitoTracing.memory \\\n", + "RA19_10_14_FACs_MITO 128 \n", + "RA19_10_17_FACs_MITO 128 \n", + "RA19_10_18_FACs_MITO 128 \n", + "RA19_10_23_FACs_MITO 128 \n", + "RA19_10_14_FACS_citric_MITO 128 \n", + "\n", + " MitoTracing.whitelist \\\n", + "RA19_10_14_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_17_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_18_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_23_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "RA19_10_14_FACS_citric_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "\n", + " MitoTracing.dockerRegistry \n", + "RA19_10_14_FACs_MITO quay.io/hisplan \n", + "RA19_10_17_FACs_MITO quay.io/hisplan \n", + "RA19_10_18_FACs_MITO quay.io/hisplan \n", + "RA19_10_23_FACs_MITO quay.io/hisplan \n", + "RA19_10_14_FACS_citric_MITO quay.io/hisplan " + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "c5713d9e-a923-446e-9df8-fb9619590923", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
RA19_10_14_FACs_MITOMitoTracingHTAN_CITEseqRA19_10_14_FACs_MITOsohailns3://dp-lab-data/SCRI_Projects/HTA/M...-sohailn
RA19_10_17_FACs_MITOMitoTracingHTAN_CITEseqRA19_10_17_FACs_MITOsohailns3://dp-lab-data/SCRI_Projects/HTA/M...-sohailn
RA19_10_18_FACs_MITOMitoTracingHTAN_CITEseqRA19_10_18_FACs_MITOsohailns3://dp-lab-data/SCRI_Projects/HTA/M...-sohailn
RA19_10_23_FACs_MITOMitoTracingHTAN_CITEseqRA19_10_23_FACs_MITOsohailns3://dp-lab-data/SCRI_Projects/HTA/M...-sohailn
RA19_10_14_FACS_citric_MITOMitoTracingHTAN_CITEseqRA19_10_14_FACS_citric_MITOsohailns3://dp-lab-data/SCRI_Projects/HTA/M...-sohailn
\n", + "
" + ], + "text/plain": [ + " pipelineType project \\\n", + "RA19_10_14_FACs_MITO MitoTracing HTAN_CITEseq \n", + "RA19_10_17_FACs_MITO MitoTracing HTAN_CITEseq \n", + "RA19_10_18_FACs_MITO MitoTracing HTAN_CITEseq \n", + "RA19_10_23_FACs_MITO MitoTracing HTAN_CITEseq \n", + "RA19_10_14_FACS_citric_MITO MitoTracing HTAN_CITEseq \n", + "\n", + " sample owner \\\n", + "RA19_10_14_FACs_MITO RA19_10_14_FACs_MITO sohailn \n", + "RA19_10_17_FACs_MITO RA19_10_17_FACs_MITO sohailn \n", + "RA19_10_18_FACs_MITO RA19_10_18_FACs_MITO sohailn \n", + "RA19_10_23_FACs_MITO RA19_10_23_FACs_MITO sohailn \n", + "RA19_10_14_FACS_citric_MITO RA19_10_14_FACS_citric_MITO sohailn \n", + "\n", + " destination transfer \\\n", + "RA19_10_14_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... - \n", + "RA19_10_17_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... - \n", + "RA19_10_18_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... - \n", + "RA19_10_23_FACs_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... - \n", + "RA19_10_14_FACS_citric_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... 
- \n", + "\n", + " comment \n", + "RA19_10_14_FACs_MITO sohailn \n", + "RA19_10_17_FACs_MITO sohailn \n", + "RA19_10_18_FACs_MITO sohailn \n", + "RA19_10_23_FACs_MITO sohailn \n", + "RA19_10_14_FACS_citric_MITO sohailn " + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "3bd30985-9d8f-4a6a-afb4-0f93357ad25b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1b6df2f77e1741a0bbb3b86940980c55", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/5 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
S3_pathSample_ID
AV-1759_Ru1083_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3924
AV-1760_MSK_LX_1083c_T_2_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3925
AV-1761_POSIE_101920_T_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3926
AV-1762_Ru1083d_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3927
AV-1763_Ru1250C_T_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3928
AV-1764_MSK_LX_1250b_PM_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3929
AV-1764_Ru1250D_T_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3930
AV-1765_Ru1250e_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3931
AV-1766_MSK_LX_1250f_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3932
AV-1760_Ru263_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3933
\n", + "" + ], + "text/plain": [ + " S3_path \\\n", + "AV-1759_Ru1083_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1760_MSK_LX_1083c_T_2_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1761_POSIE_101920_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1762_Ru1083d_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1763_Ru1250C_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1764_MSK_LX_1250b_PM_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1764_Ru1250D_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1765_Ru1250e_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1766_MSK_LX_1250f_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1760_Ru263_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "\n", + " Sample_ID \n", + "AV-1759_Ru1083_MITO 3924 \n", + "AV-1760_MSK_LX_1083c_T_2_MITO 3925 \n", + "AV-1761_POSIE_101920_T_1_MITO 3926 \n", + "AV-1762_Ru1083d_MITO 3927 \n", + "AV-1763_Ru1250C_T_1_MITO 3928 \n", + "AV-1764_MSK_LX_1250b_PM_1_MITO 3929 \n", + "AV-1764_Ru1250D_T_1_MITO 3930 \n", + "AV-1765_Ru1250e_MITO 3931 \n", + "AV-1766_MSK_LX_1250f_MITO 3932 \n", + "AV-1760_Ru263_MITO 3933 " + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Common query col: id, request_id, Sample\n", + "sample_id = list(range(3924, 3934))\n", + "\n", + "samples = format_sample_aws(sample_id, 'id', creds)\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "40640f5a-1e75-4947-98ba-05aec427e5e0", + "metadata": {}, + "outputs": [], + "source": [ + "# # Modification for Joe's samples\n", + "# samples.loc['Ru581b_T1_MITO', 'S3_path'] = 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581b_T1_MITO'\n", + "# samples.loc['Ru581c-LN1_MITO', 'S3_path'] = 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581c-LN1_MITO'\n", + "\n", + "# samples" + ] + }, + { + "cell_type": "code", + 
"execution_count": 29, + "id": "dc80d4e5-c699-4be7-96f4-0475e3f1f12d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
S3_pathSample_ID
AV-1765_Ru1250e_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3931
\n", + "
" + ], + "text/plain": [ + " S3_path Sample_ID\n", + "AV-1765_Ru1250e_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... 3931" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Re-running some of the samples\n", + "# samples = samples.loc[samples.index.isin(['AV-1760_Ru263_MITO', 'AV-1762_Ru1083d_MITO'])]\n", + "\n", + "# barcodes_in_RNA.non-epithelial.RU1083_ST.txt\n", + "# samples = samples.loc[samples.index.isin(['AV-1762_Ru1083d_MITO'])]\n", + "\n", + "\n", + "# barcodes_in_RNA.epithelial.RU1250_ASC1.txt\n", + "samples = samples.loc[samples.index.isin(['AV-1765_Ru1250e_MITO'])]\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "359fbd22-702d-4763-85c2-0410b0611bb6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/9v/1lnyrkxd0yq8l2pgl002wh1s7c6fqv/T/ipykernel_34982/2360911148.py:110: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " samples['index'] = np.nan\n", + "/var/folders/9v/1lnyrkxd0yq8l2pgl002wh1s7c6fqv/T/ipykernel_34982/2360911148.py:112: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " samples['assay'] = np.nan\n", + "/var/folders/9v/1lnyrkxd0yq8l2pgl002wh1s7c6fqv/T/ipykernel_34982/2360911148.py:113: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value 
instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " samples['barcode'] = np.nan\n", + "/var/folders/9v/1lnyrkxd0yq8l2pgl002wh1s7c6fqv/T/ipykernel_34982/2360911148.py:125: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " samples.loc[sample, 'index'] = index\n", + "/var/folders/9v/1lnyrkxd0yq8l2pgl002wh1s7c6fqv/T/ipykernel_34982/2360911148.py:127: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " samples.loc[sample, 'assay'] = assay\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
S3_pathSample_IDindexassaybarcode
AV-1765_Ru1250e_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3931GRCh38-3.0.0CRNaN
\n", + "
" + ], + "text/plain": [ + " S3_path Sample_ID \\\n", + "AV-1765_Ru1250e_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... 3931 \n", + "\n", + " index assay barcode \n", + "AV-1765_Ru1250e_MITO GRCh38-3.0.0 CR NaN " + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples = format_assay_barcode(samples, creds)\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "a14b9418-dab4-4aaf-8abb-14f9e0534152", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/9v/1lnyrkxd0yq8l2pgl002wh1s7c6fqv/T/ipykernel_34982/638862739.py:3: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " samples['fastq'] = np.empty((len(samples), 0)).tolist()\n", + "/var/folders/9v/1lnyrkxd0yq8l2pgl002wh1s7c6fqv/T/ipykernel_34982/638862739.py:15: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " samples['fastq'] = fastqs\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
S3_pathSample_IDindexassaybarcodefastq
AV-1765_Ru1250e_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3931GRCh38-3.0.0CRNaN{'R1': ['s3://dp-lab-data/SCRI_Proje...
\n", + "
" + ], + "text/plain": [ + " S3_path Sample_ID \\\n", + "AV-1765_Ru1250e_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... 3931 \n", + "\n", + " index assay barcode \\\n", + "AV-1765_Ru1250e_MITO GRCh38-3.0.0 CR NaN \n", + "\n", + " fastq \n", + "AV-1765_Ru1250e_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples = get_barcode_genomic_fastqs(samples)\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "bc16e8ed-5503-421b-9ce9-b31496670188", + "metadata": {}, + "outputs": [], + "source": [ + "# samples = pd.DataFrame(samples.loc['Ru581b_T1_MITO']).T\n", + "# samples" + ] + }, + { + "cell_type": "markdown", + "id": "85065328-a68e-46a7-9304-88953148416e", + "metadata": { + "tags": [] + }, + "source": [ + "## Make input file" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "ce68536f-7029-4689-83a7-239421284398", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MitoTracing.sampleNameMitoTracing.fastqNameMitoTracing.fastqR1MitoTracing.fastqR2MitoTracing.referenceMitoTracing.includeIntronsMitoTracing.expectCellsMitoTracing.numCoresMitoTracing.memoryMitoTracing.whitelistMitoTracing.dockerRegistry
AV-1765_Ru1250e_MITONaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " MitoTracing.sampleName MitoTracing.fastqName \\\n", + "AV-1765_Ru1250e_MITO NaN NaN \n", + "\n", + " MitoTracing.fastqR1 MitoTracing.fastqR2 \\\n", + "AV-1765_Ru1250e_MITO NaN NaN \n", + "\n", + " MitoTracing.reference MitoTracing.includeIntrons \\\n", + "AV-1765_Ru1250e_MITO NaN NaN \n", + "\n", + " MitoTracing.expectCells MitoTracing.numCores \\\n", + "AV-1765_Ru1250e_MITO NaN NaN \n", + "\n", + " MitoTracing.memory MitoTracing.whitelist \\\n", + "AV-1765_Ru1250e_MITO NaN NaN \n", + "\n", + " MitoTracing.dockerRegistry \n", + "AV-1765_Ru1250e_MITO NaN " + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load minimum inputs and labels fields from input template\n", + "with open(f\"{config_dir}/{template_prefix}.inputs.json\") as f:\n", + " std_inputs_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields,)\n", + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "bd8a0e97-49ad-4295-a4ff-ca83e90732c8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MitoTracing.sampleNameMitoTracing.fastqNameMitoTracing.fastqR1MitoTracing.fastqR2MitoTracing.referenceMitoTracing.includeIntronsMitoTracing.expectCellsMitoTracing.numCoresMitoTracing.memoryMitoTracing.whitelistMitoTracing.dockerRegistry
AV-1765_Ru1250e_MITONaNNaNNaNNaNNaNFalse500016256NaNquay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " MitoTracing.sampleName MitoTracing.fastqName \\\n", + "AV-1765_Ru1250e_MITO NaN NaN \n", + "\n", + " MitoTracing.fastqR1 MitoTracing.fastqR2 \\\n", + "AV-1765_Ru1250e_MITO NaN NaN \n", + "\n", + " MitoTracing.reference MitoTracing.includeIntrons \\\n", + "AV-1765_Ru1250e_MITO NaN False \n", + "\n", + " MitoTracing.expectCells MitoTracing.numCores \\\n", + "AV-1765_Ru1250e_MITO 5000 16 \n", + "\n", + " MitoTracing.memory MitoTracing.whitelist \\\n", + "AV-1765_Ru1250e_MITO 256 NaN \n", + "\n", + " MitoTracing.dockerRegistry \n", + "AV-1765_Ru1250e_MITO quay.io/hisplan " + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Default inputs\n", + "inputs[f\"{prefix}.includeIntrons\"] = False\n", + "inputs[f\"{prefix}.expectCells\"] = 5000\n", + "inputs[f\"{prefix}.numCores\"] = 16\n", + "# inputs[f\"{prefix}.memory\"] = 128\n", + "inputs[f\"{prefix}.memory\"] = 256 \n", + "\n", + "inputs[f\"{prefix}.dockerRegistry\"] = common_docker_registry\n", + "\n", + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "db50eb55-40dc-4840-bc61-63525c2afb2b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MitoTracing.sampleNameMitoTracing.fastqNameMitoTracing.fastqR1MitoTracing.fastqR2MitoTracing.referenceMitoTracing.includeIntronsMitoTracing.expectCellsMitoTracing.numCoresMitoTracing.memoryMitoTracing.whitelistMitoTracing.dockerRegistry
AV-1765_Ru1250e_MITOAV-1765_Ru1250e_MITO3931_AV-1765_Ru1250e_MITO_IGO_13388_9[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...NaNFalse500016256NaNquay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " MitoTracing.sampleName \\\n", + "AV-1765_Ru1250e_MITO AV-1765_Ru1250e_MITO \n", + "\n", + " MitoTracing.fastqName \\\n", + "AV-1765_Ru1250e_MITO 3931_AV-1765_Ru1250e_MITO_IGO_13388_9 \n", + "\n", + " MitoTracing.fastqR1 \\\n", + "AV-1765_Ru1250e_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "\n", + " MitoTracing.fastqR2 \\\n", + "AV-1765_Ru1250e_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "\n", + " MitoTracing.reference MitoTracing.includeIntrons \\\n", + "AV-1765_Ru1250e_MITO NaN False \n", + "\n", + " MitoTracing.expectCells MitoTracing.numCores \\\n", + "AV-1765_Ru1250e_MITO 5000 16 \n", + "\n", + " MitoTracing.memory MitoTracing.whitelist \\\n", + "AV-1765_Ru1250e_MITO 256 NaN \n", + "\n", + " MitoTracing.dockerRegistry \n", + "AV-1765_Ru1250e_MITO quay.io/hisplan " + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Sample information\n", + "inputs[f\"{prefix}.sampleName\"] = samples.index.tolist()\n", + "\n", + "inputs[f\"{prefix}.fastqR1\"] = samples['fastq'].str['R1']\n", + "inputs[f\"{prefix}.fastqR2\"] = samples['fastq'].str['R2']\n", + "\n", + "for sample in inputs.index:\n", + " fastqR1_name = get_fastqs_name(inputs.loc[sample, 'MitoTracing.fastqR1'])\n", + " fastqR2_name = get_fastqs_name(inputs.loc[sample, 'MitoTracing.fastqR2'])\n", + " \n", + " assert(fastqR1_name == fastqR1_name)\n", + " inputs.loc[sample, f\"{prefix}.fastqName\"] = fastqR1_name\n", + "\n", + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "101eb0d4-4ca6-4639-bb67-afe47109a251", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MitoTracing.sampleNameMitoTracing.fastqNameMitoTracing.fastqR1MitoTracing.fastqR2MitoTracing.referenceMitoTracing.includeIntronsMitoTracing.expectCellsMitoTracing.numCoresMitoTracing.memoryMitoTracing.whitelistMitoTracing.dockerRegistry
AV-1765_Ru1250e_MITOAV-1765_Ru1250e_MITO3931_AV-1765_Ru1250e_MITO_IGO_13388_9[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016256NaNquay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " MitoTracing.sampleName \\\n", + "AV-1765_Ru1250e_MITO AV-1765_Ru1250e_MITO \n", + "\n", + " MitoTracing.fastqName \\\n", + "AV-1765_Ru1250e_MITO 3931_AV-1765_Ru1250e_MITO_IGO_13388_9 \n", + "\n", + " MitoTracing.fastqR1 \\\n", + "AV-1765_Ru1250e_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "\n", + " MitoTracing.fastqR2 \\\n", + "AV-1765_Ru1250e_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "\n", + " MitoTracing.reference \\\n", + "AV-1765_Ru1250e_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "\n", + " MitoTracing.includeIntrons MitoTracing.expectCells \\\n", + "AV-1765_Ru1250e_MITO False 5000 \n", + "\n", + " MitoTracing.numCores MitoTracing.memory \\\n", + "AV-1765_Ru1250e_MITO 16 256 \n", + "\n", + " MitoTracing.whitelist MitoTracing.dockerRegistry \n", + "AV-1765_Ru1250e_MITO NaN quay.io/hisplan " + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Reference package created by Jaeyoung\n", + "\n", + "cellRangerRefPkg=\"s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/reference-packages/refdata-gex-mito-GRCh38-ensemble98.tar.gz\"\n", + "mitoFastaRefPkg=\"s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/reference-packages/mito-fasta-GRCh38.tar.gz\"\n", + "\n", + "reference = [dict(cellRangerRefPkg=cellRangerRefPkg,\n", + " mitoFastaRefPkg=mitoFastaRefPkg)] * len(inputs)\n", + "inputs[f\"{prefix}.reference\"] = reference\n", + "\n", + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8f12841-72fe-49c6-b91c-2381f7fc62be", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "6581c13e-4156-4082-b9a0-14bdfd420b1e", + "metadata": {}, + "source": [ + "## Check whitelist" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "03cf4850-3447-4f8a-acc0-87de499ac2f3", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": 
"1084d07a-a0e7-4274-81eb-2c2d3de4ecdf", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "46cd6c61-b55a-4c3b-b00a-0f88df9a3b93", + "metadata": {}, + "outputs": [], + "source": [ + "epithelial = True\n", + "for sample in inputs.index:\n", + " for w in whitelist[sample]:\n", + " \n", + " if epithelial:\n", + " if '.epithelial.' in w:\n", + " sample_whitelist = w\n", + " else:\n", + " if '.non-epithelial.' in w:\n", + " sample_whitelist = w\n", + " \n", + " inputs.loc[sample, f\"{prefix}.whitelist\"] = sample_whitelist" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "5bd15e40-eeae-4b8f-80fa-1975f89ba0b2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MitoTracing.sampleNameMitoTracing.fastqNameMitoTracing.fastqR1MitoTracing.fastqR2MitoTracing.referenceMitoTracing.includeIntronsMitoTracing.expectCellsMitoTracing.numCoresMitoTracing.memoryMitoTracing.whitelistMitoTracing.dockerRegistry
AV-1765_Ru1250e_MITOAV-1765_Ru1250e_MITO3931_AV-1765_Ru1250e_MITO_IGO_13388_9[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016256s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " MitoTracing.sampleName \\\n", + "AV-1765_Ru1250e_MITO AV-1765_Ru1250e_MITO \n", + "\n", + " MitoTracing.fastqName \\\n", + "AV-1765_Ru1250e_MITO 3931_AV-1765_Ru1250e_MITO_IGO_13388_9 \n", + "\n", + " MitoTracing.fastqR1 \\\n", + "AV-1765_Ru1250e_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "\n", + " MitoTracing.fastqR2 \\\n", + "AV-1765_Ru1250e_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "\n", + " MitoTracing.reference \\\n", + "AV-1765_Ru1250e_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "\n", + " MitoTracing.includeIntrons MitoTracing.expectCells \\\n", + "AV-1765_Ru1250e_MITO False 5000 \n", + "\n", + " MitoTracing.numCores MitoTracing.memory \\\n", + "AV-1765_Ru1250e_MITO 16 256 \n", + "\n", + " MitoTracing.whitelist \\\n", + "AV-1765_Ru1250e_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "\n", + " MitoTracing.dockerRegistry \n", + "AV-1765_Ru1250e_MITO quay.io/hisplan " + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3371ad80-d08d-4d2a-aeba-8442cd8a16e8", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "1ae5805b-8806-4846-a9ed-3b7400e99e2d", + "metadata": {}, + "outputs": [], + "source": [ + "# non_epithelial = ['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581b_T1_MITO/barcodes_in_RNA.non-epithelial.RU581_Ta.txt',\n", + "# 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581c-LN1_MITO/barcodes_in_RNA.non-epithelial.RU581_LNa.txt',\n", + "# 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581D_MITO/barcodes_in_RNA.non-epithelial.RU581_LIV.txt']\n", + "\n", + "# epithelial = ['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581b_T1_MITO/barcodes_in_RNA.RU581_Ta.txt',\n", + "# 
's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581c-LN1_MITO/barcodes_in_RNA.RU581_LNa.txt',\n", + "# 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581D_MITO/barcodes_in_RNA.RU581_LIV.txt']" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "b07e2dd7-8ed4-4dd1-9faa-a3fe7bfd99b2", + "metadata": {}, + "outputs": [], + "source": [ + "# # Whitelist needs to be in SEQC barcodes (nucleotide) format rather than CR format\n", + "# # Path to download the whitelist file\n", + "# path_whitelist = '/Users/sohailn/scing/bin/wdl-mito-tracing/whitelist/'\n", + "\n", + "# # for path_s3 in epithelial:\n", + "# for path_s3 in non_epithelial:\n", + "# path_s3_dirname = os.path.dirname(path_s3)\n", + "# sample_name = os.path.basename(path_s3_dirname)\n", + "\n", + "# filename_in = os.path.basename(path_s3)\n", + "# filename_out = filename_in.replace('.txt', '.seqc.txt')\n", + "\n", + "# # Download barcode file\n", + "# cmd = f'aws s3 cp {path_s3} {path_whitelist}{filename_in}'\n", + "# os.system(cmd)\n", + "\n", + "# barcodes = pd.read_csv(f'{path_whitelist}{filename_in}', header=None)[0].tolist()\n", + "\n", + "# from seqc.sequence.encodings import DNA3Bit\n", + "# dna3bit = DNA3Bit()\n", + "# barcodes = [dna3bit.decode(x) for x in barcodes]\n", + " \n", + "# with open(f'{path_whitelist}{filename_out}', \"w\") as output:\n", + "# for barcode in barcodes:\n", + "# output.write(f'{barcode.decode(\"utf-8\")}-1\\n')\n", + "\n", + "# cmd = f'aws s3 cp {path_whitelist}{filename_out} {path_s3_dirname}/{filename_out}'\n", + "# inputs.loc[sample_name, f\"{prefix}.whitelist\"] = f'{path_s3_dirname}/{filename_out}'\n", + " \n", + "# os.system(cmd)\n", + "# print()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "3c3633c6-cc9a-43f0-a4fa-b5b2cfc2a76e", + "metadata": {}, + "outputs": [], + "source": [ + "# inputs.loc['Ru581b_T1_MITO', f\"{prefix}.whitelist\"] = 
's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581b_T1_MITO/barcodes_in_RNA.RU581_Ta.seqc.txt'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46fa392b-d4f4-4c47-aa8f-d8c7d980f62f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "b019afa4-15b1-4e1d-8eb1-9ba176088dee", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1765_Ru1250e_MITO/barcodes_in_RNA.epithelial.RU1250_ASC1.txt'],\n", + " dtype=object)" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs[f\"{prefix}.whitelist\"].values" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "eee5a4c1-8dce-4975-811c-57ed30da1994", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['epi_AV-1765_Ru1250e_MITO'], dtype=object)" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "if epithelial:\n", + " inputs[f\"{prefix}.sampleName\"] = 'epi_' + inputs[f\"{prefix}.sampleName\"] \n", + "else:\n", + " inputs[f\"{prefix}.sampleName\"] = 'non_epi_' + inputs[f\"{prefix}.sampleName\"]\n", + "inputs[f\"{prefix}.sampleName\"].values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9af5c62-35fe-4d20-a198-9819aeb2c680", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "0f737133-5115-4ee6-bf93-5443e5c652d1", + "metadata": { + "tags": [] + }, + "source": [ + "## Make label file" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "6ca8c556-8254-4051-93ee-5447d43abfbb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
AV-1765_Ru1250e_MITONaNNaNNaNNaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " pipelineType project sample owner destination transfer \\\n", + "AV-1765_Ru1250e_MITO NaN NaN NaN NaN NaN NaN \n", + "\n", + " comment \n", + "AV-1765_Ru1250e_MITO NaN " + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load minimum inputs and labels fields from labels template\n", + "with open(f\"{config_dir}/{template_prefix}.labels.json\") as f:\n", + " std_labels_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow labels\n", + "labels = pd.DataFrame(index=samples.index, columns=std_labels_fields,)\n", + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "0731efde-30bb-419b-b5a4-b8420911f309", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
AV-1765_Ru1250e_MITOMitoTracingLung Tumor AtlasAV-1765_Ru1250e_MITOsohailns3://dp-lab-data/SCRI_Projects/HTA/M...-sohailn
\n", + "
" + ], + "text/plain": [ + " pipelineType project sample \\\n", + "AV-1765_Ru1250e_MITO MitoTracing Lung Tumor Atlas AV-1765_Ru1250e_MITO \n", + "\n", + " owner destination \\\n", + "AV-1765_Ru1250e_MITO sohailn s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "\n", + " transfer comment \n", + "AV-1765_Ru1250e_MITO - sohailn " + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Annotate labels\n", + "labels[\"pipelineType\"] = pipeline_type\n", + "labels[\"project\"] = samples[\"Sample_ID\"].apply(lambda x: get_project_id(x, creds))\n", + "labels[\"sample\"] = labels.index\n", + "labels[\"owner\"] = creds[\"user\"]\n", + "# labels[\"destination\"] = samples['S3_path'] + \"/\" + output_dirname\n", + "\n", + "if epithelial:\n", + " labels[\"destination\"] = samples['S3_path'] + \"/\" + 'epi_' + output_dirname\n", + "else:\n", + " labels[\"destination\"] = samples['S3_path'] + \"/\" + 'non_epi_' + output_dirname\n", + "\n", + "labels[\"transfer\"] = \"-\"\n", + "labels[\"comment\"] = creds[\"user\"]\n", + "\n", + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "82ab0b61-8acf-43d3-97ae-3e53c528e890", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1765_Ru1250e_MITO/epi_mito-tracing-outs'],\n", + " dtype=object)" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# labels.loc['Ru581D_MITO', 'destination'] = 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581D_MITO/non_epi_mito-tracing-outs'\n", + "# labels.loc['Ru581D_MITO', 'destination'] = 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581D_MITO/epi_mito-tracing-outs'\n", + "\n", + "labels['destination'].values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5973e09-e9bf-4415-acee-9e1809afdf5b", + "metadata": {}, + "outputs": [], + 
"source": [] + }, + { + "cell_type": "markdown", + "id": "84007e1e-ae11-4665-ba47-aab5b803af27", + "metadata": {}, + "source": [ + "# Submit job" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "id": "eb40233f-7d95-4e66-91b8-933d77265ee7", + "metadata": {}, + "outputs": [], + "source": [ + "inputs_all = inputs.copy()\n", + "labels_all = labels.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "56fb192f-46d2-401d-bf8e-7bf65551364c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['AV-1759_Ru1083_MITO',\n", + " 'AV-1760_MSK_LX_1083c_T_2_MITO',\n", + " 'AV-1760_Ru263_MITO',\n", + " 'AV-1761_POSIE_101920_T_1_MITO',\n", + " 'AV-1762_Ru1083d_MITO',\n", + " 'AV-1763_Ru1250C_T_1_MITO',\n", + " 'AV-1764_MSK_LX_1250b_PM_1_MITO',\n", + " 'AV-1764_Ru1250D_T_1_MITO',\n", + " 'AV-1765_Ru1250e_MITO',\n", + " 'AV-1766_MSK_LX_1250f_MITO']" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_names = inputs.index.tolist()\n", + "sample_names.sort()\n", + "sample_names" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "ca0f8bc8-c862-4398-89b2-152ad55bdec9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MitoTracing.sampleNameMitoTracing.fastqNameMitoTracing.fastqR1MitoTracing.fastqR2MitoTracing.referenceMitoTracing.includeIntronsMitoTracing.expectCellsMitoTracing.numCoresMitoTracing.memoryMitoTracing.whitelistMitoTracing.dockerRegistry
AV-1759_Ru1083_MITOnon_epi_AV-1759_Ru1083_MITO3924_AV-1759_Ru1083_MITO_IGO_13388_1[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
AV-1760_MSK_LX_1083c_T_2_MITOnon_epi_AV-1760_MSK_LX_1083c_T_2_MITO3925_AV-1760_MSK_LX_1083c_T_2_MITO_I...[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
AV-1760_Ru263_MITOnon_epi_AV-1760_Ru263_MITO3933_AV-1760_Ru263_MITO_IGO_13388_7[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
AV-1761_POSIE_101920_T_1_MITOnon_epi_AV-1761_POSIE_101920_T_1_MITO3926_AV-1761_POSIE_101920_T_1_MITO_I...[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
AV-1762_Ru1083d_MITOnon_epi_AV-1762_Ru1083d_MITO3927_AV-1762_Ru1083d_MITO_IGO_13388_4[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
AV-1763_Ru1250C_T_1_MITOnon_epi_AV-1763_Ru1250C_T_1_MITO3928_AV-1763_Ru1250C_T_1_MITO_IGO_13...[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
AV-1764_MSK_LX_1250b_PM_1_MITOnon_epi_AV-1764_MSK_LX_1250b_PM_1_MITO3929_AV-1764_MSK_LX_1250b_PM_1_MITO_...[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
AV-1764_Ru1250D_T_1_MITOnon_epi_AV-1764_Ru1250D_T_1_MITO3930_AV-1764_Ru1250D_T_1_MITO_IGO_13...[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
AV-1765_Ru1250e_MITOnon_epi_AV-1765_Ru1250e_MITO3931_AV-1765_Ru1250e_MITO_IGO_13388_9[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
AV-1766_MSK_LX_1250f_MITOnon_epi_AV-1766_MSK_LX_1250f_MITO3932_AV-1766_MSK_LX_1250f_MITO_IGO_1...[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " MitoTracing.sampleName \\\n", + "AV-1759_Ru1083_MITO non_epi_AV-1759_Ru1083_MITO \n", + "AV-1760_MSK_LX_1083c_T_2_MITO non_epi_AV-1760_MSK_LX_1083c_T_2_MITO \n", + "AV-1760_Ru263_MITO non_epi_AV-1760_Ru263_MITO \n", + "AV-1761_POSIE_101920_T_1_MITO non_epi_AV-1761_POSIE_101920_T_1_MITO \n", + "AV-1762_Ru1083d_MITO non_epi_AV-1762_Ru1083d_MITO \n", + "AV-1763_Ru1250C_T_1_MITO non_epi_AV-1763_Ru1250C_T_1_MITO \n", + "AV-1764_MSK_LX_1250b_PM_1_MITO non_epi_AV-1764_MSK_LX_1250b_PM_1_MITO \n", + "AV-1764_Ru1250D_T_1_MITO non_epi_AV-1764_Ru1250D_T_1_MITO \n", + "AV-1765_Ru1250e_MITO non_epi_AV-1765_Ru1250e_MITO \n", + "AV-1766_MSK_LX_1250f_MITO non_epi_AV-1766_MSK_LX_1250f_MITO \n", + "\n", + " MitoTracing.fastqName \\\n", + "AV-1759_Ru1083_MITO 3924_AV-1759_Ru1083_MITO_IGO_13388_1 \n", + "AV-1760_MSK_LX_1083c_T_2_MITO 3925_AV-1760_MSK_LX_1083c_T_2_MITO_I... \n", + "AV-1760_Ru263_MITO 3933_AV-1760_Ru263_MITO_IGO_13388_7 \n", + "AV-1761_POSIE_101920_T_1_MITO 3926_AV-1761_POSIE_101920_T_1_MITO_I... \n", + "AV-1762_Ru1083d_MITO 3927_AV-1762_Ru1083d_MITO_IGO_13388_4 \n", + "AV-1763_Ru1250C_T_1_MITO 3928_AV-1763_Ru1250C_T_1_MITO_IGO_13... \n", + "AV-1764_MSK_LX_1250b_PM_1_MITO 3929_AV-1764_MSK_LX_1250b_PM_1_MITO_... \n", + "AV-1764_Ru1250D_T_1_MITO 3930_AV-1764_Ru1250D_T_1_MITO_IGO_13... \n", + "AV-1765_Ru1250e_MITO 3931_AV-1765_Ru1250e_MITO_IGO_13388_9 \n", + "AV-1766_MSK_LX_1250f_MITO 3932_AV-1766_MSK_LX_1250f_MITO_IGO_1... \n", + "\n", + " MitoTracing.fastqR1 \\\n", + "AV-1759_Ru1083_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "AV-1760_MSK_LX_1083c_T_2_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "AV-1760_Ru263_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "AV-1761_POSIE_101920_T_1_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "AV-1762_Ru1083d_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "AV-1763_Ru1250C_T_1_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... 
\n", + "AV-1764_MSK_LX_1250b_PM_1_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "AV-1764_Ru1250D_T_1_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "AV-1765_Ru1250e_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "AV-1766_MSK_LX_1250f_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "\n", + " MitoTracing.fastqR2 \\\n", + "AV-1759_Ru1083_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "AV-1760_MSK_LX_1083c_T_2_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "AV-1760_Ru263_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "AV-1761_POSIE_101920_T_1_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "AV-1762_Ru1083d_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "AV-1763_Ru1250C_T_1_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "AV-1764_MSK_LX_1250b_PM_1_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "AV-1764_Ru1250D_T_1_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "AV-1765_Ru1250e_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "AV-1766_MSK_LX_1250f_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "\n", + " MitoTracing.reference \\\n", + "AV-1759_Ru1083_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "AV-1760_MSK_LX_1083c_T_2_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "AV-1760_Ru263_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "AV-1761_POSIE_101920_T_1_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "AV-1762_Ru1083d_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "AV-1763_Ru1250C_T_1_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "AV-1764_MSK_LX_1250b_PM_1_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "AV-1764_Ru1250D_T_1_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "AV-1765_Ru1250e_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "AV-1766_MSK_LX_1250f_MITO {'cellRangerRefPkg': 's3://dp-lab-da... 
\n", + "\n", + " MitoTracing.includeIntrons \\\n", + "AV-1759_Ru1083_MITO False \n", + "AV-1760_MSK_LX_1083c_T_2_MITO False \n", + "AV-1760_Ru263_MITO False \n", + "AV-1761_POSIE_101920_T_1_MITO False \n", + "AV-1762_Ru1083d_MITO False \n", + "AV-1763_Ru1250C_T_1_MITO False \n", + "AV-1764_MSK_LX_1250b_PM_1_MITO False \n", + "AV-1764_Ru1250D_T_1_MITO False \n", + "AV-1765_Ru1250e_MITO False \n", + "AV-1766_MSK_LX_1250f_MITO False \n", + "\n", + " MitoTracing.expectCells MitoTracing.numCores \\\n", + "AV-1759_Ru1083_MITO 5000 16 \n", + "AV-1760_MSK_LX_1083c_T_2_MITO 5000 16 \n", + "AV-1760_Ru263_MITO 5000 16 \n", + "AV-1761_POSIE_101920_T_1_MITO 5000 16 \n", + "AV-1762_Ru1083d_MITO 5000 16 \n", + "AV-1763_Ru1250C_T_1_MITO 5000 16 \n", + "AV-1764_MSK_LX_1250b_PM_1_MITO 5000 16 \n", + "AV-1764_Ru1250D_T_1_MITO 5000 16 \n", + "AV-1765_Ru1250e_MITO 5000 16 \n", + "AV-1766_MSK_LX_1250f_MITO 5000 16 \n", + "\n", + " MitoTracing.memory \\\n", + "AV-1759_Ru1083_MITO 128 \n", + "AV-1760_MSK_LX_1083c_T_2_MITO 128 \n", + "AV-1760_Ru263_MITO 128 \n", + "AV-1761_POSIE_101920_T_1_MITO 128 \n", + "AV-1762_Ru1083d_MITO 128 \n", + "AV-1763_Ru1250C_T_1_MITO 128 \n", + "AV-1764_MSK_LX_1250b_PM_1_MITO 128 \n", + "AV-1764_Ru1250D_T_1_MITO 128 \n", + "AV-1765_Ru1250e_MITO 128 \n", + "AV-1766_MSK_LX_1250f_MITO 128 \n", + "\n", + " MitoTracing.whitelist \\\n", + "AV-1759_Ru1083_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1760_MSK_LX_1083c_T_2_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1760_Ru263_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1761_POSIE_101920_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1762_Ru1083d_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1763_Ru1250C_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1764_MSK_LX_1250b_PM_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1764_Ru1250D_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... 
\n", + "AV-1765_Ru1250e_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "AV-1766_MSK_LX_1250f_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "\n", + " MitoTracing.dockerRegistry \n", + "AV-1759_Ru1083_MITO quay.io/hisplan \n", + "AV-1760_MSK_LX_1083c_T_2_MITO quay.io/hisplan \n", + "AV-1760_Ru263_MITO quay.io/hisplan \n", + "AV-1761_POSIE_101920_T_1_MITO quay.io/hisplan \n", + "AV-1762_Ru1083d_MITO quay.io/hisplan \n", + "AV-1763_Ru1250C_T_1_MITO quay.io/hisplan \n", + "AV-1764_MSK_LX_1250b_PM_1_MITO quay.io/hisplan \n", + "AV-1764_Ru1250D_T_1_MITO quay.io/hisplan \n", + "AV-1765_Ru1250e_MITO quay.io/hisplan \n", + "AV-1766_MSK_LX_1250f_MITO quay.io/hisplan " + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs = pd.DataFrame(inputs_all.loc[sample_names[0:]])\n", + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "c25489ef-ec76-4e19-b6d3-fafb53e5fd23", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([list(['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1759_Ru1083_MITO/FASTQ/3924_AV-1759_Ru1083_MITO_IGO_13388_1_S1_L001_R1_001.fastq.gz', 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1759_Ru1083_MITO/FASTQ/3924_AV-1759_Ru1083_MITO_IGO_13388_1_S1_L002_R1_001.fastq.gz']),\n", + " list(['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_MSK_LX_1083c_T_2_MITO/FASTQ/3925_AV-1760_MSK_LX_1083c_T_2_MITO_IGO_13388_2_S2_L001_R1_001.fastq.gz', 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_MSK_LX_1083c_T_2_MITO/FASTQ/3925_AV-1760_MSK_LX_1083c_T_2_MITO_IGO_13388_2_S2_L002_R1_001.fastq.gz']),\n", + " list(['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_Ru263_MITO/FASTQ/3933_AV-1760_Ru263_MITO_IGO_13388_7_S6_L001_R1_001.fastq.gz', 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_Ru263_MITO/FASTQ/3933_AV-1760_Ru263_MITO_IGO_13388_7_S6_L002_R1_001.fastq.gz']),\n", + " 
list(['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1761_POSIE_101920_T_1_MITO/FASTQ/3926_AV-1761_POSIE_101920_T_1_MITO_IGO_13388_3_S8_L001_R1_001.fastq.gz', 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1761_POSIE_101920_T_1_MITO/FASTQ/3926_AV-1761_POSIE_101920_T_1_MITO_IGO_13388_3_S8_L002_R1_001.fastq.gz']),\n", + " list(['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1762_Ru1083d_MITO/FASTQ/3927_AV-1762_Ru1083d_MITO_IGO_13388_4_S3_L001_R1_001.fastq.gz', 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1762_Ru1083d_MITO/FASTQ/3927_AV-1762_Ru1083d_MITO_IGO_13388_4_S3_L002_R1_001.fastq.gz']),\n", + " list(['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1763_Ru1250C_T_1_MITO/FASTQ/3928_AV-1763_Ru1250C_T_1_MITO_IGO_13388_5_S4_L001_R1_001.fastq.gz', 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1763_Ru1250C_T_1_MITO/FASTQ/3928_AV-1763_Ru1250C_T_1_MITO_IGO_13388_5_S4_L002_R1_001.fastq.gz']),\n", + " list(['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1764_MSK_LX_1250b_PM_1_MITO/FASTQ/3929_AV-1764_MSK_LX_1250b_PM_1_MITO_IGO_13388_6_S5_L001_R1_001.fastq.gz', 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1764_MSK_LX_1250b_PM_1_MITO/FASTQ/3929_AV-1764_MSK_LX_1250b_PM_1_MITO_IGO_13388_6_S5_L002_R1_001.fastq.gz']),\n", + " list(['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1764_Ru1250D_T_1_MITO/FASTQ/3930_AV-1764_Ru1250D_T_1_MITO_IGO_13388_8_S9_L001_R1_001.fastq.gz', 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1764_Ru1250D_T_1_MITO/FASTQ/3930_AV-1764_Ru1250D_T_1_MITO_IGO_13388_8_S9_L002_R1_001.fastq.gz']),\n", + " list(['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1765_Ru1250e_MITO/FASTQ/3931_AV-1765_Ru1250e_MITO_IGO_13388_9_S10_L001_R1_001.fastq.gz', 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1765_Ru1250e_MITO/FASTQ/3931_AV-1765_Ru1250e_MITO_IGO_13388_9_S10_L002_R1_001.fastq.gz']),\n", + " 
list(['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1766_MSK_LX_1250f_MITO/FASTQ/3932_AV-1766_MSK_LX_1250f_MITO_IGO_13388_10_S7_L001_R1_001.fastq.gz', 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1766_MSK_LX_1250f_MITO/FASTQ/3932_AV-1766_MSK_LX_1250f_MITO_IGO_13388_10_S7_L002_R1_001.fastq.gz'])],\n", + " dtype=object)" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs[f'{prefix}.fastqR1'].values" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "id": "248dbbb9-ee1d-4ae8-88e5-ecfecc7cbdd7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1759_Ru1083_MITO/barcodes_in_RNA.non-epithelial.RU1083_LIV.txt',\n", + " 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_MSK_LX_1083c_T_2_MITO/barcodes_in_RNA.non-epithelial.RU1083_T2.txt',\n", + " 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_Ru263_MITO/barcodes_in_RNA.non-epithelial.RU263_PDX.txt',\n", + " 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1761_POSIE_101920_T_1_MITO/barcodes_in_RNA.non-epithelial.RU1083_T1.txt',\n", + " 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1762_Ru1083d_MITO/barcodes_in_RNA.non-epithelial.RU1083_ST.txt',\n", + " 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1763_Ru1250C_T_1_MITO/barcodes_in_RNA.non-epithelial.RU1250_T1.txt',\n", + " 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1764_MSK_LX_1250b_PM_1_MITO/barcodes_in_RNA.non-epithelial.RU1250_PL.txt',\n", + " 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1764_Ru1250D_T_1_MITO/barcodes_in_RNA.non-epithelial.RU1250_T2.txt',\n", + " 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1765_Ru1250e_MITO/barcodes_in_RNA.non-epithelial.RU1250_ASC1.txt',\n", + " 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1766_MSK_LX_1250f_MITO/barcodes_in_RNA.non-epithelial.RU1250_ASC2.txt'],\n", + " dtype=object)" + ] + }, + 
"execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs[f'{prefix}.whitelist'].values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0eafa67-d6cc-49c9-be93-50957248ded1", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "a9ee54c6-1a2c-46f5-a450-75bcf2a7cfb5", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'labels_all' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Input \u001b[0;32mIn [33]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m labels \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame(\u001b[43mlabels_all\u001b[49m\u001b[38;5;241m.\u001b[39mloc[sample_names[\u001b[38;5;241m0\u001b[39m:]])\n\u001b[1;32m 2\u001b[0m labels\n", + "\u001b[0;31mNameError\u001b[0m: name 'labels_all' is not defined" + ] + } + ], + "source": [ + "labels = pd.DataFrame(labels_all.loc[sample_names[0:]])\n", + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9008381e-ad79-4dab-8585-a59d20756c73", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "db50837c-9648-4400-af53-aa436bb93adc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MitoTracing.sampleNameMitoTracing.fastqNameMitoTracing.fastqR1MitoTracing.fastqR2MitoTracing.referenceMitoTracing.includeIntronsMitoTracing.expectCellsMitoTracing.numCoresMitoTracing.memoryMitoTracing.whitelistMitoTracing.dockerRegistry
AV-1765_Ru1250e_MITOepi_AV-1765_Ru1250e_MITO3931_AV-1765_Ru1250e_MITO_IGO_13388_9[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016256s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " MitoTracing.sampleName \\\n", + "AV-1765_Ru1250e_MITO epi_AV-1765_Ru1250e_MITO \n", + "\n", + " MitoTracing.fastqName \\\n", + "AV-1765_Ru1250e_MITO 3931_AV-1765_Ru1250e_MITO_IGO_13388_9 \n", + "\n", + " MitoTracing.fastqR1 \\\n", + "AV-1765_Ru1250e_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "\n", + " MitoTracing.fastqR2 \\\n", + "AV-1765_Ru1250e_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "\n", + " MitoTracing.reference \\\n", + "AV-1765_Ru1250e_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "\n", + " MitoTracing.includeIntrons MitoTracing.expectCells \\\n", + "AV-1765_Ru1250e_MITO False 5000 \n", + "\n", + " MitoTracing.numCores MitoTracing.memory \\\n", + "AV-1765_Ru1250e_MITO 16 256 \n", + "\n", + " MitoTracing.whitelist \\\n", + "AV-1765_Ru1250e_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "\n", + " MitoTracing.dockerRegistry \n", + "AV-1765_Ru1250e_MITO quay.io/hisplan " + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "b7dff356-8581-4638-a8e2-9e93dde1a251", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
AV-1765_Ru1250e_MITOMitoTracingLung Tumor AtlasAV-1765_Ru1250e_MITOsohailns3://dp-lab-data/SCRI_Projects/HTA/M...-sohailn
\n", + "
" + ], + "text/plain": [ + " pipelineType project sample \\\n", + "AV-1765_Ru1250e_MITO MitoTracing Lung Tumor Atlas AV-1765_Ru1250e_MITO \n", + "\n", + " owner destination \\\n", + "AV-1765_Ru1250e_MITO sohailn s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "\n", + " transfer comment \n", + "AV-1765_Ru1250e_MITO - sohailn " + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "bf7782ae-982a-45d9-bef1-5e6dede154cd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1765_Ru1250e_MITO/epi_mito-tracing-outs'],\n", + " dtype=object)" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labels['destination'].values" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "cb6579bc-ee47-41fb-b4a7-66435d46188f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'/Users/sohailn/scing/bin/wdl-mito-tracing/MitoTracing.options.aws.json'" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Setting cache to true to hopefully save some time\n", + "path_to_options" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "3bd30985-9d8f-4a6a-afb4-0f93357ad25b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "918936fec41b4e9b94b952b0ef943203", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0joe_sample_nameAWS_storage
sample
Ru1083_MITO0RU1083_LIVs3://dp-lab-data/SCRI_Projects/HTA/M...
MSK_LX_1083c_T_2_MITO1RU1083_T2s3://dp-lab-data/SCRI_Projects/HTA/M...
Ru263_MITO2RU263_PDXs3://dp-lab-data/SCRI_Projects/HTA/M...
POSIE_101920_T_1_MITO3RU1083_T1s3://dp-lab-data/SCRI_Projects/HTA/M...
Ru1083d_MITO4RU1083_STs3://dp-lab-data/SCRI_Projects/HTA/M...
Ru1250C_T_1_MITO5RU1250_T1s3://dp-lab-data/SCRI_Projects/HTA/M...
MSK_LX_1250b_PM_1_MITO6RU1250_PLs3://dp-lab-data/SCRI_Projects/HTA/M...
Ru1250D_T_1_MITO7RU1250_T2s3://dp-lab-data/SCRI_Projects/HTA/M...
Ru1250e_MITO8RU1250_ASC1s3://dp-lab-data/SCRI_Projects/HTA/M...
MSK_LX_1250f_MITO9RU1250_ASC2s3://dp-lab-data/SCRI_Projects/HTA/M...
Ru581D_MITO10RU581_LIVs3://dp-lab-data/SCRI_Projects/HTA/M...
Ru581b_T1_MITO11RU581_Tas3://dp-lab-data/SCRI_Projects/HTA/M...
Ru581c-LN1_MITO13RU581_LNas3://dp-lab-data/SCRI_Projects/HTA/M...
\n", + "" + ], + "text/plain": [ + " Unnamed: 0 joe_sample_name \\\n", + "sample \n", + "Ru1083_MITO 0 RU1083_LIV \n", + "MSK_LX_1083c_T_2_MITO 1 RU1083_T2 \n", + "Ru263_MITO 2 RU263_PDX \n", + "POSIE_101920_T_1_MITO 3 RU1083_T1 \n", + "Ru1083d_MITO 4 RU1083_ST \n", + "Ru1250C_T_1_MITO 5 RU1250_T1 \n", + "MSK_LX_1250b_PM_1_MITO 6 RU1250_PL \n", + "Ru1250D_T_1_MITO 7 RU1250_T2 \n", + "Ru1250e_MITO 8 RU1250_ASC1 \n", + "MSK_LX_1250f_MITO 9 RU1250_ASC2 \n", + "Ru581D_MITO 10 RU581_LIV \n", + "Ru581b_T1_MITO 11 RU581_Ta \n", + "Ru581c-LN1_MITO 13 RU581_LNa \n", + "\n", + " AWS_storage \n", + "sample \n", + "Ru1083_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "MSK_LX_1083c_T_2_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru263_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "POSIE_101920_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru1083d_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru1250C_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "MSK_LX_1250b_PM_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru1250D_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru1250e_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "MSK_LX_1250f_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru581D_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru581b_T1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru581c-LN1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... 
" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can supply a list of IDs or request IDs\n", + "\n", + "# sample_ids = list(range(3924, 3934))\n", + "\n", + "# epi = sample_scridb_info(sample_ids, 'id', creds)\n", + "# epi.index = 'epi_' + epi.index\n", + "\n", + "# non_epi = sample_scridb_info(sample_ids, 'id', creds)\n", + "# non_epi.index = 'non_epi_' + non_epi.index\n", + "\n", + "samples = pd.read_csv('joe_samples_unarchive.txt', index_col='sample')\n", + "samples['AWS_storage'] = samples['AWS_storage'].str.strip('/')\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "e85154ba-47ad-4f3a-b2c8-9c81f2d33e23", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0joe_sample_nameAWS_storageFASTQs
sample
Ru1083_MITO0RU1083_LIVs3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...
MSK_LX_1083c_T_2_MITO1RU1083_T2s3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...
Ru263_MITO2RU263_PDXs3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...
POSIE_101920_T_1_MITO3RU1083_T1s3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...
Ru1083d_MITO4RU1083_STs3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...
Ru1250C_T_1_MITO5RU1250_T1s3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...
MSK_LX_1250b_PM_1_MITO6RU1250_PLs3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...
Ru1250D_T_1_MITO7RU1250_T2s3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...
Ru1250e_MITO8RU1250_ASC1s3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...
MSK_LX_1250f_MITO9RU1250_ASC2s3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...
Ru581D_MITO10RU581_LIVs3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...
Ru581b_T1_MITO11RU581_Tas3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...
Ru581c-LN1_MITO13RU581_LNas3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 joe_sample_name \\\n", + "sample \n", + "Ru1083_MITO 0 RU1083_LIV \n", + "MSK_LX_1083c_T_2_MITO 1 RU1083_T2 \n", + "Ru263_MITO 2 RU263_PDX \n", + "POSIE_101920_T_1_MITO 3 RU1083_T1 \n", + "Ru1083d_MITO 4 RU1083_ST \n", + "Ru1250C_T_1_MITO 5 RU1250_T1 \n", + "MSK_LX_1250b_PM_1_MITO 6 RU1250_PL \n", + "Ru1250D_T_1_MITO 7 RU1250_T2 \n", + "Ru1250e_MITO 8 RU1250_ASC1 \n", + "MSK_LX_1250f_MITO 9 RU1250_ASC2 \n", + "Ru581D_MITO 10 RU581_LIV \n", + "Ru581b_T1_MITO 11 RU581_Ta \n", + "Ru581c-LN1_MITO 13 RU581_LNa \n", + "\n", + " AWS_storage \\\n", + "sample \n", + "Ru1083_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "MSK_LX_1083c_T_2_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru263_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "POSIE_101920_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru1083d_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru1250C_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "MSK_LX_1250b_PM_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru1250D_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru1250e_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "MSK_LX_1250f_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru581D_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru581b_T1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru581c-LN1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "\n", + " FASTQs \n", + "sample \n", + "Ru1083_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "MSK_LX_1083c_T_2_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "Ru263_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "POSIE_101920_T_1_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "Ru1083d_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "Ru1250C_T_1_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "MSK_LX_1250b_PM_1_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "Ru1250D_T_1_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... 
\n", + "Ru1250e_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "MSK_LX_1250f_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "Ru581D_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "Ru581b_T1_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "Ru581c-LN1_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples[\"FASTQs\"] = samples[\"AWS_storage\"].apply(lambda x: get_fastqs(x, ['R1', 'R2'], \"FASTQ\"))\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a59060ca-b11f-4e74-9edf-5ce3e863a064", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "16d0ca53-edc7-4e1b-9c9c-0d812a1ae1c7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0joe_sample_nameAWS_storageFASTQsreference
sample
Ru1083_MITO0RU1083_LIVs3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...
MSK_LX_1083c_T_2_MITO1RU1083_T2s3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...
Ru263_MITO2RU263_PDXs3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...
POSIE_101920_T_1_MITO3RU1083_T1s3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...
Ru1083d_MITO4RU1083_STs3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...
Ru1250C_T_1_MITO5RU1250_T1s3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...
MSK_LX_1250b_PM_1_MITO6RU1250_PLs3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...
Ru1250D_T_1_MITO7RU1250_T2s3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...
Ru1250e_MITO8RU1250_ASC1s3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...
MSK_LX_1250f_MITO9RU1250_ASC2s3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...
Ru581D_MITO10RU581_LIVs3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...
Ru581b_T1_MITO11RU581_Tas3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...
Ru581c-LN1_MITO13RU581_LNas3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 joe_sample_name \\\n", + "sample \n", + "Ru1083_MITO 0 RU1083_LIV \n", + "MSK_LX_1083c_T_2_MITO 1 RU1083_T2 \n", + "Ru263_MITO 2 RU263_PDX \n", + "POSIE_101920_T_1_MITO 3 RU1083_T1 \n", + "Ru1083d_MITO 4 RU1083_ST \n", + "Ru1250C_T_1_MITO 5 RU1250_T1 \n", + "MSK_LX_1250b_PM_1_MITO 6 RU1250_PL \n", + "Ru1250D_T_1_MITO 7 RU1250_T2 \n", + "Ru1250e_MITO 8 RU1250_ASC1 \n", + "MSK_LX_1250f_MITO 9 RU1250_ASC2 \n", + "Ru581D_MITO 10 RU581_LIV \n", + "Ru581b_T1_MITO 11 RU581_Ta \n", + "Ru581c-LN1_MITO 13 RU581_LNa \n", + "\n", + " AWS_storage \\\n", + "sample \n", + "Ru1083_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "MSK_LX_1083c_T_2_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru263_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "POSIE_101920_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru1083d_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru1250C_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "MSK_LX_1250b_PM_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru1250D_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru1250e_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "MSK_LX_1250f_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru581D_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru581b_T1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru581c-LN1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "\n", + " FASTQs \\\n", + "sample \n", + "Ru1083_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "MSK_LX_1083c_T_2_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "Ru263_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "POSIE_101920_T_1_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "Ru1083d_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "Ru1250C_T_1_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "MSK_LX_1250b_PM_1_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "Ru1250D_T_1_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... 
\n", + "Ru1250e_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "MSK_LX_1250f_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "Ru581D_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "Ru581b_T1_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "Ru581c-LN1_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "\n", + " reference \n", + "sample \n", + "Ru1083_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "MSK_LX_1083c_T_2_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "Ru263_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "POSIE_101920_T_1_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "Ru1083d_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "Ru1250C_T_1_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "MSK_LX_1250b_PM_1_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "Ru1250D_T_1_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "Ru1250e_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "MSK_LX_1250f_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "Ru581D_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "Ru581b_T1_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "Ru581c-LN1_MITO {'cellRangerRefPkg': 's3://dp-lab-da... 
" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Reference package created by Jaeyoung\n", + "\n", + "# cellRangerRefPkg=\"s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/reference-packages/refdata-gex-mito-GRCh38-ensemble98.tar.gz\"\n", + "cellRangerRefPkg=\"s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/reference-packages/GRCh38_gex_mito_mask.tar.gz\"\n", + "mitoFastaRefPkg=\"s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/reference-packages/chrM-fasta-GRCh38.tar.gz\"\n", + "# mitoFastaRefPkg=\"s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/reference-packages/mito-fasta-GRCh38.tar.gz\"\n", + "\n", + "reference = [dict(cellRangerRefPkg=cellRangerRefPkg,\n", + " mitoFastaRefPkg=mitoFastaRefPkg)] * len(samples)\n", + "samples[\"reference\"] = reference\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56ea3b42-967e-49aa-8344-93420514c6e1", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "c5c4180c-0d82-415e-8516-4174947fd567", + "metadata": {}, + "source": [ + "# Get whitelist" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "69d70985-4fb6-42c6-a87f-a3ec6b9b6a85", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0joe_sample_nameAWS_storageFASTQsreferencewhitelist
sample
Ru1083_MITO0RU1083_LIVs3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...s3://dp-lab-data/SCRI_Projects/HTA/M...
MSK_LX_1083c_T_2_MITO1RU1083_T2s3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...s3://dp-lab-data/SCRI_Projects/HTA/M...
Ru263_MITO2RU263_PDXs3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...s3://dp-lab-data/SCRI_Projects/HTA/M...
POSIE_101920_T_1_MITO3RU1083_T1s3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...s3://dp-lab-data/SCRI_Projects/HTA/M...
Ru1083d_MITO4RU1083_STs3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...s3://dp-lab-data/SCRI_Projects/HTA/M...
Ru1250C_T_1_MITO5RU1250_T1s3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...s3://dp-lab-data/SCRI_Projects/HTA/M...
MSK_LX_1250b_PM_1_MITO6RU1250_PLs3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...s3://dp-lab-data/SCRI_Projects/HTA/M...
Ru1250D_T_1_MITO7RU1250_T2s3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...s3://dp-lab-data/SCRI_Projects/HTA/M...
Ru1250e_MITO8RU1250_ASC1s3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...s3://dp-lab-data/SCRI_Projects/HTA/M...
MSK_LX_1250f_MITO9RU1250_ASC2s3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...s3://dp-lab-data/SCRI_Projects/HTA/M...
Ru581D_MITO10RU581_LIVs3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...s3://dp-lab-data/SCRI_Projects/HTA/M...
Ru581b_T1_MITO11RU581_Tas3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...s3://dp-lab-data/SCRI_Projects/HTA/M...
Ru581c-LN1_MITO13RU581_LNas3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...s3://dp-lab-data/SCRI_Projects/HTA/M...
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 joe_sample_name \\\n", + "sample \n", + "Ru1083_MITO 0 RU1083_LIV \n", + "MSK_LX_1083c_T_2_MITO 1 RU1083_T2 \n", + "Ru263_MITO 2 RU263_PDX \n", + "POSIE_101920_T_1_MITO 3 RU1083_T1 \n", + "Ru1083d_MITO 4 RU1083_ST \n", + "Ru1250C_T_1_MITO 5 RU1250_T1 \n", + "MSK_LX_1250b_PM_1_MITO 6 RU1250_PL \n", + "Ru1250D_T_1_MITO 7 RU1250_T2 \n", + "Ru1250e_MITO 8 RU1250_ASC1 \n", + "MSK_LX_1250f_MITO 9 RU1250_ASC2 \n", + "Ru581D_MITO 10 RU581_LIV \n", + "Ru581b_T1_MITO 11 RU581_Ta \n", + "Ru581c-LN1_MITO 13 RU581_LNa \n", + "\n", + " AWS_storage \\\n", + "sample \n", + "Ru1083_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "MSK_LX_1083c_T_2_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru263_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "POSIE_101920_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru1083d_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru1250C_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "MSK_LX_1250b_PM_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru1250D_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru1250e_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "MSK_LX_1250f_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru581D_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru581b_T1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru581c-LN1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "\n", + " FASTQs \\\n", + "sample \n", + "Ru1083_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "MSK_LX_1083c_T_2_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "Ru263_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "POSIE_101920_T_1_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "Ru1083d_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "Ru1250C_T_1_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "MSK_LX_1250b_PM_1_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "Ru1250D_T_1_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... 
\n", + "Ru1250e_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "MSK_LX_1250f_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "Ru581D_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "Ru581b_T1_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "Ru581c-LN1_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "\n", + " reference \\\n", + "sample \n", + "Ru1083_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "MSK_LX_1083c_T_2_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "Ru263_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "POSIE_101920_T_1_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "Ru1083d_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "Ru1250C_T_1_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "MSK_LX_1250b_PM_1_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "Ru1250D_T_1_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "Ru1250e_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "MSK_LX_1250f_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "Ru581D_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "Ru581b_T1_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "Ru581c-LN1_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "\n", + " whitelist \n", + "sample \n", + "Ru1083_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "MSK_LX_1083c_T_2_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru263_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "POSIE_101920_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru1083d_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru1250C_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "MSK_LX_1250b_PM_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru1250D_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru1250e_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "MSK_LX_1250f_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru581D_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... 
\n", + "Ru581b_T1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "Ru581c-LN1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples['whitelist'] = np.nan\n", + "for sample, row in samples.iterrows():\n", + " samples.loc[sample, 'whitelist'] = f's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/masked_mito_reference/{sample}/{sample}.txt'\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e845f640-0716-49ca-8e27-8d987b2cf586", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "6e7dacea-ff91-416b-9558-6200b5d5cae2", + "metadata": {}, + "source": [ + "# Subset" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "805eab1f-e70a-4e7e-8a54-dd679a30a5f3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0joe_sample_nameAWS_storageFASTQsreferencewhitelist
sample
Ru1083_MITO0RU1083_LIVs3://dp-lab-data/SCRI_Projects/HTA/M...{'R1': ['s3://dp-lab-data/SCRI_Proje...{'cellRangerRefPkg': 's3://dp-lab-da...s3://dp-lab-data/SCRI_Projects/HTA/M...
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 joe_sample_name \\\n", + "sample \n", + "Ru1083_MITO 0 RU1083_LIV \n", + "\n", + " AWS_storage \\\n", + "sample \n", + "Ru1083_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "\n", + " FASTQs \\\n", + "sample \n", + "Ru1083_MITO {'R1': ['s3://dp-lab-data/SCRI_Proje... \n", + "\n", + " reference \\\n", + "sample \n", + "Ru1083_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "\n", + " whitelist \n", + "sample \n", + "Ru1083_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples_ = samples.copy()\n", + "\n", + "samples = samples.iloc[0:1]\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c41c631-45c8-4239-87a3-fd88ed67e3f1", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "c8852971-6bdd-403b-8ea4-b4ebfa3639f0", + "metadata": {}, + "source": [ + "# Generate inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "157fd6b4-859d-4b48-8812-04691f9b097e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MitoTracing.sampleNameMitoTracing.fastqNameMitoTracing.fastqR1MitoTracing.fastqR2MitoTracing.referenceMitoTracing.includeIntronsMitoTracing.expectCellsMitoTracing.numCoresMitoTracing.memoryMitoTracing.whitelistMitoTracing.dockerRegistry
sample
Ru1083_MITORu1083_MITO3924_AV-1759_Ru1083_MITO_IGO_13388_1[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " MitoTracing.sampleName MitoTracing.fastqName \\\n", + "sample \n", + "Ru1083_MITO Ru1083_MITO 3924_AV-1759_Ru1083_MITO_IGO_13388_1 \n", + "\n", + " MitoTracing.fastqR1 \\\n", + "sample \n", + "Ru1083_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "\n", + " MitoTracing.fastqR2 \\\n", + "sample \n", + "Ru1083_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "\n", + " MitoTracing.reference \\\n", + "sample \n", + "Ru1083_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "\n", + " MitoTracing.includeIntrons MitoTracing.expectCells \\\n", + "sample \n", + "Ru1083_MITO False 5000 \n", + "\n", + " MitoTracing.numCores MitoTracing.memory \\\n", + "sample \n", + "Ru1083_MITO 16 128 \n", + "\n", + " MitoTracing.whitelist \\\n", + "sample \n", + "Ru1083_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "\n", + " MitoTracing.dockerRegistry \n", + "sample \n", + "Ru1083_MITO quay.io/hisplan " + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Standard inputs, modify fields as need be\n", + "\n", + "# Load minimum inputs and labels fields from templates\n", + "with open(f\"{config_dir}/template.inputs.json\") as f:\n", + " std_inputs_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields,)\n", + "\n", + "# Default inputs\n", + "inputs[f\"{prefix}.includeIntrons\"] = False\n", + "inputs[f\"{prefix}.expectCells\"] = 5000\n", + "inputs[f\"{prefix}.numCores\"] = 16\n", + "inputs[f\"{prefix}.memory\"] = 128\n", + "# inputs[f\"{prefix}.dockerRegistry\"] = \"docker.io/sailmskcc\"\n", + "inputs[f\"{prefix}.dockerRegistry\"] = common_docker_registry\n", + "\n", + "\n", + "# Sample information\n", + "inputs[f\"{prefix}.sampleName\"] = samples.index.tolist()\n", + "inputs[f\"{prefix}.fastqR1\"] = samples['FASTQs'].str['R1']\n", + "inputs[f\"{prefix}.fastqR2\"] = 
samples['FASTQs'].str['R2']\n", + "\n", + "for sample in inputs.index:\n", + " fastqR1_name = get_fastqs_name(inputs.loc[sample, 'MitoTracing.fastqR1'])\n", + " fastqR2_name = get_fastqs_name(inputs.loc[sample, 'MitoTracing.fastqR2'])\n", + " \n", + " assert(fastqR1_name == fastqR2_name)\n", + " inputs.loc[sample, f\"{prefix}.fastqName\"] = fastqR1_name\n", + "\n", + "inputs[f\"{prefix}.whitelist\"] = samples['whitelist']\n", + "inputs[f\"{prefix}.reference\"] = samples['reference']\n", + "\n", + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9785f91c-a79e-44ad-b7df-403b557c7d74", + "metadata": {}, + "outputs": [], + "source": [ + "# Modification for specific samples\n", + "\n", + "# inputs[f\"{prefix}.memory\"] = 256 " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1db167f4-30fa-4b6c-b254-f8ca24448786", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "130c8072-b98e-4e5c-819b-5fd18cfb27b0", + "metadata": {}, + "source": [ + "# Generate labels" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "03d5abbb-6bf0-4636-8bb7-edd2463439c7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
sample
Ru1083_MITOMitoTracingLung Tumor AtlasRu1083_MITOsohailnNaN-sohailn
\n", + "
" + ], + "text/plain": [ + " pipelineType project sample owner destination \\\n", + "sample \n", + "Ru1083_MITO MitoTracing Lung Tumor Atlas Ru1083_MITO sohailn NaN \n", + "\n", + " transfer comment \n", + "sample \n", + "Ru1083_MITO - sohailn " + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Standard labels, modify fields as need be\n", + "\n", + "with open(f\"{config_dir}/template.labels.json\") as f:\n", + " std_labels_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "labels = pd.DataFrame(index=samples.index, columns=std_labels_fields,)\n", + "\n", + "labels[\"pipelineType\"] = pipeline_type\n", + "labels[\"project\"] = 'Lung Tumor Atlas'\n", + "labels[\"sample\"] = labels.index\n", + "labels[\"owner\"] = creds[\"user\"]\n", + "# labels[\"destination\"] = samples['AWS_storage'] + \"/\" + output_dirname\n", + "labels[\"transfer\"] = \"-\"\n", + "labels[\"comment\"] = creds[\"user\"]\n", + "\n", + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "bce9baca-918f-4264-9afd-809ef64e9d9e", + "metadata": {}, + "outputs": [], + "source": [ + "labels[\"destination\"] = 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/masked_mito_reference/' + labels.index + '/' + output_dirname" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d99c1d72-d584-4a2b-97c6-c80de3d66b5c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "dce78f39-c948-4ab2-a62d-bee0a08c1564", + "metadata": {}, + "source": [ + "# Run samples" + ] + }, + { + "cell_type": "markdown", + "id": "bd2bf1b2-4b5f-48c0-8909-354fe6b313e3", + "metadata": {}, + "source": [ + "Look over the samples before submitting one last time" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "ee601aed-3a19-4317-a918-4120db7ee7bf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + 
"
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MitoTracing.sampleNameMitoTracing.fastqNameMitoTracing.fastqR1MitoTracing.fastqR2MitoTracing.referenceMitoTracing.includeIntronsMitoTracing.expectCellsMitoTracing.numCoresMitoTracing.memoryMitoTracing.whitelistMitoTracing.dockerRegistry
sample
Ru1083_MITORu1083_MITO3924_AV-1759_Ru1083_MITO_IGO_13388_1[s3://dp-lab-data/SCRI_Projects/HTA/...[s3://dp-lab-data/SCRI_Projects/HTA/...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/SCRI_Projects/HTA/M...quay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " MitoTracing.sampleName MitoTracing.fastqName \\\n", + "sample \n", + "Ru1083_MITO Ru1083_MITO 3924_AV-1759_Ru1083_MITO_IGO_13388_1 \n", + "\n", + " MitoTracing.fastqR1 \\\n", + "sample \n", + "Ru1083_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "\n", + " MitoTracing.fastqR2 \\\n", + "sample \n", + "Ru1083_MITO [s3://dp-lab-data/SCRI_Projects/HTA/... \n", + "\n", + " MitoTracing.reference \\\n", + "sample \n", + "Ru1083_MITO {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "\n", + " MitoTracing.includeIntrons MitoTracing.expectCells \\\n", + "sample \n", + "Ru1083_MITO False 5000 \n", + "\n", + " MitoTracing.numCores MitoTracing.memory \\\n", + "sample \n", + "Ru1083_MITO 16 128 \n", + "\n", + " MitoTracing.whitelist \\\n", + "sample \n", + "Ru1083_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "\n", + " MitoTracing.dockerRegistry \n", + "sample \n", + "Ru1083_MITO quay.io/hisplan " + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "91318853-0886-4c2b-8b87-76fa8e5ab30d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
sample
Ru1083_MITOMitoTracingLung Tumor AtlasRu1083_MITOsohailns3://dp-lab-data/SCRI_Projects/HTA/M...-sohailn
\n", + "
" + ], + "text/plain": [ + " pipelineType project sample owner \\\n", + "sample \n", + "Ru1083_MITO MitoTracing Lung Tumor Atlas Ru1083_MITO sohailn \n", + "\n", + " destination transfer comment \n", + "sample \n", + "Ru1083_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... - sohailn " + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acd1c04b-d582-481b-b8d4-bd9a9a8f97c4", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "54ef3d9c-2361-4b8e-bcfe-bc9c3878876b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c0008f7bacfb48a2ac0978f4ff894f57", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreference
Sample
RB-2041_WildType_DOGMAseqs3://dp-lab-data/collaborators/sfeir...4440human10X_scATACscATAC-seq pilotGRCh38-1.1.0
RB-2041_mRB54_1003_DOGMAseqs3://dp-lab-data/collaborators/sfeir...4441human10X_scATACscATAC-seq pilotGRCh38-1.1.0
\n", + "" + ], + "text/plain": [ + " AWS_storage id \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq s3://dp-lab-data/collaborators/sfeir... 4440 \n", + "RB-2041_mRB54_1003_DOGMAseq s3://dp-lab-data/collaborators/sfeir... 4441 \n", + "\n", + " species sc_tech project_id \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq human 10X_scATAC scATAC-seq pilot \n", + "RB-2041_mRB54_1003_DOGMAseq human 10X_scATAC scATAC-seq pilot \n", + "\n", + " reference \n", + "Sample \n", + "RB-2041_WildType_DOGMAseq GRCh38-1.1.0 \n", + "RB-2041_mRB54_1003_DOGMAseq GRCh38-1.1.0 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can supply a list of IDs or request IDs\n", + "\n", + "sample_ids = [4440, 4441]\n", + "\n", + "samples = sample_scridb_info(sample_ids, 'id', creds)\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e85154ba-47ad-4f3a-b2c8-9c81f2d33e23", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreferenceFASTQs
Sample
RB-2041_WildType_DOGMAseqs3://dp-lab-data/collaborators/sfeir...4440human10X_scATACscATAC-seq pilotGRCh38-1.1.0{'R1': ['s3://dp-lab-data/collaborat...
RB-2041_mRB54_1003_DOGMAseqs3://dp-lab-data/collaborators/sfeir...4441human10X_scATACscATAC-seq pilotGRCh38-1.1.0{'R1': ['s3://dp-lab-data/collaborat...
\n", + "
" + ], + "text/plain": [ + " AWS_storage id \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq s3://dp-lab-data/collaborators/sfeir... 4440 \n", + "RB-2041_mRB54_1003_DOGMAseq s3://dp-lab-data/collaborators/sfeir... 4441 \n", + "\n", + " species sc_tech project_id \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq human 10X_scATAC scATAC-seq pilot \n", + "RB-2041_mRB54_1003_DOGMAseq human 10X_scATAC scATAC-seq pilot \n", + "\n", + " reference \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq GRCh38-1.1.0 \n", + "RB-2041_mRB54_1003_DOGMAseq GRCh38-1.1.0 \n", + "\n", + " FASTQs \n", + "Sample \n", + "RB-2041_WildType_DOGMAseq {'R1': ['s3://dp-lab-data/collaborat... \n", + "RB-2041_mRB54_1003_DOGMAseq {'R1': ['s3://dp-lab-data/collaborat... " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples[\"FASTQs\"] = samples[\"AWS_storage\"].apply(lambda x: get_fastqs(x, fastq_map[prefix], \"FASTQ\"))\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a59060ca-b11f-4e74-9edf-5ce3e863a064", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "16d0ca53-edc7-4e1b-9c9c-0d812a1ae1c7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreferenceFASTQs
Sample
RB-2041_WildType_DOGMAseqs3://dp-lab-data/collaborators/sfeir...4440human10X_scATACscATAC-seq pilot{'cellRangerRefPkg': 's3://dp-lab-da...{'R1': ['s3://dp-lab-data/collaborat...
RB-2041_mRB54_1003_DOGMAseqs3://dp-lab-data/collaborators/sfeir...4441human10X_scATACscATAC-seq pilot{'cellRangerRefPkg': 's3://dp-lab-da...{'R1': ['s3://dp-lab-data/collaborat...
\n", + "
" + ], + "text/plain": [ + " AWS_storage id \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq s3://dp-lab-data/collaborators/sfeir... 4440 \n", + "RB-2041_mRB54_1003_DOGMAseq s3://dp-lab-data/collaborators/sfeir... 4441 \n", + "\n", + " species sc_tech project_id \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq human 10X_scATAC scATAC-seq pilot \n", + "RB-2041_mRB54_1003_DOGMAseq human 10X_scATAC scATAC-seq pilot \n", + "\n", + " reference \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "RB-2041_mRB54_1003_DOGMAseq {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "\n", + " FASTQs \n", + "Sample \n", + "RB-2041_WildType_DOGMAseq {'R1': ['s3://dp-lab-data/collaborat... \n", + "RB-2041_mRB54_1003_DOGMAseq {'R1': ['s3://dp-lab-data/collaborat... " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Reference package created by Jaeyoung\n", + "\n", + "# cellRangerRefPkg=\"s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/reference-packages/refdata-gex-mito-GRCh38-ensemble98.tar.gz\"\n", + "cellRangerRefPkg=\"s3://dp-lab-data/collaborators/sfeira/ScatacSeqPilot/GRCh38_atac_mito_mask_reference/GRCh38_atac_mito_mask_reference.tar.gz\"\n", + "mitoFastaRefPkg=\"s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/reference-packages/mito-fasta-GRCh38.tar.gz\"\n", + "\n", + "reference = [dict(cellRangerRefPkg=cellRangerRefPkg,\n", + " mitoFastaRefPkg=mitoFastaRefPkg)] * len(samples)\n", + "samples[\"reference\"] = reference\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56ea3b42-967e-49aa-8344-93420514c6e1", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "c5c4180c-0d82-415e-8516-4174947fd567", + "metadata": {}, + "source": [ + "# Get whitelist" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "600f5e5b-228c-42f4-9297-a6a2b4035d60", + "metadata": {}, + "outputs": [ + 
{ + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreferenceFASTQswhitelist
Sample
RB-2041_WildType_DOGMAseqs3://dp-lab-data/collaborators/sfeir...4440human10X_scATACscATAC-seq pilot{'cellRangerRefPkg': 's3://dp-lab-da...{'R1': ['s3://dp-lab-data/collaborat...s3://dp-lab-data/collaborators/sfeir...
RB-2041_mRB54_1003_DOGMAseqs3://dp-lab-data/collaborators/sfeir...4441human10X_scATACscATAC-seq pilot{'cellRangerRefPkg': 's3://dp-lab-da...{'R1': ['s3://dp-lab-data/collaborat...s3://dp-lab-data/collaborators/sfeir...
\n", + "
" + ], + "text/plain": [ + " AWS_storage id \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq s3://dp-lab-data/collaborators/sfeir... 4440 \n", + "RB-2041_mRB54_1003_DOGMAseq s3://dp-lab-data/collaborators/sfeir... 4441 \n", + "\n", + " species sc_tech project_id \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq human 10X_scATAC scATAC-seq pilot \n", + "RB-2041_mRB54_1003_DOGMAseq human 10X_scATAC scATAC-seq pilot \n", + "\n", + " reference \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "RB-2041_mRB54_1003_DOGMAseq {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "\n", + " FASTQs \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq {'R1': ['s3://dp-lab-data/collaborat... \n", + "RB-2041_mRB54_1003_DOGMAseq {'R1': ['s3://dp-lab-data/collaborat... \n", + "\n", + " whitelist \n", + "Sample \n", + "RB-2041_WildType_DOGMAseq s3://dp-lab-data/collaborators/sfeir... \n", + "RB-2041_mRB54_1003_DOGMAseq s3://dp-lab-data/collaborators/sfeir... " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples['whitelist'] = np.nan\n", + "for sample, row in samples.iterrows(): \n", + " samples.loc[sample, 'whitelist'] = f\"{row['AWS_storage']}/cr-atac-mito-results/filtered_peak_bc_matrix/barcodes.tsv\"\n", + " \n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c41c631-45c8-4239-87a3-fd88ed67e3f1", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "c8852971-6bdd-403b-8ea4-b4ebfa3639f0", + "metadata": {}, + "source": [ + "# Generate inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "157fd6b4-859d-4b48-8812-04691f9b097e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MitoTracing.sampleNameMitoTracing.fastqNameMitoTracing.fastqR1MitoTracing.fastqR2MitoTracing.referenceMitoTracing.includeIntronsMitoTracing.expectCellsMitoTracing.numCoresMitoTracing.memoryMitoTracing.whitelistMitoTracing.dockerRegistry
Sample
RB-2041_WildType_DOGMAseqRB-2041_WildType_DOGMAseq4440_RB-2041_WildType_DOGMAseq_IGO_1...[s3://dp-lab-data/collaborators/sfei...[s3://dp-lab-data/collaborators/sfei...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/collaborators/sfeir...quay.io/hisplan
RB-2041_mRB54_1003_DOGMAseqRB-2041_mRB54_1003_DOGMAseq4441_RB-2041_mRB54_1003_DOGMAseq_IGO...[s3://dp-lab-data/collaborators/sfei...[s3://dp-lab-data/collaborators/sfei...{'cellRangerRefPkg': 's3://dp-lab-da...False500016128s3://dp-lab-data/collaborators/sfeir...quay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " MitoTracing.sampleName \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq RB-2041_WildType_DOGMAseq \n", + "RB-2041_mRB54_1003_DOGMAseq RB-2041_mRB54_1003_DOGMAseq \n", + "\n", + " MitoTracing.fastqName \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq 4440_RB-2041_WildType_DOGMAseq_IGO_1... \n", + "RB-2041_mRB54_1003_DOGMAseq 4441_RB-2041_mRB54_1003_DOGMAseq_IGO... \n", + "\n", + " MitoTracing.fastqR1 \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq [s3://dp-lab-data/collaborators/sfei... \n", + "RB-2041_mRB54_1003_DOGMAseq [s3://dp-lab-data/collaborators/sfei... \n", + "\n", + " MitoTracing.fastqR2 \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq [s3://dp-lab-data/collaborators/sfei... \n", + "RB-2041_mRB54_1003_DOGMAseq [s3://dp-lab-data/collaborators/sfei... \n", + "\n", + " MitoTracing.reference \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "RB-2041_mRB54_1003_DOGMAseq {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "\n", + " MitoTracing.includeIntrons \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq False \n", + "RB-2041_mRB54_1003_DOGMAseq False \n", + "\n", + " MitoTracing.expectCells MitoTracing.numCores \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq 5000 16 \n", + "RB-2041_mRB54_1003_DOGMAseq 5000 16 \n", + "\n", + " MitoTracing.memory \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq 128 \n", + "RB-2041_mRB54_1003_DOGMAseq 128 \n", + "\n", + " MitoTracing.whitelist \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq s3://dp-lab-data/collaborators/sfeir... \n", + "RB-2041_mRB54_1003_DOGMAseq s3://dp-lab-data/collaborators/sfeir... 
\n", + "\n", + " MitoTracing.dockerRegistry \n", + "Sample \n", + "RB-2041_WildType_DOGMAseq quay.io/hisplan \n", + "RB-2041_mRB54_1003_DOGMAseq quay.io/hisplan " + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Standard inputs, modify fields as need be\n", + "\n", + "# Load minimum inputs and labels fields from templates\n", + "with open(f\"{config_dir}/template.inputs.json\") as f:\n", + " std_inputs_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields,)\n", + "\n", + "# Default inputs\n", + "inputs[f\"{prefix}.includeIntrons\"] = False\n", + "inputs[f\"{prefix}.expectCells\"] = 5000\n", + "inputs[f\"{prefix}.numCores\"] = 16\n", + "inputs[f\"{prefix}.memory\"] = 128\n", + "inputs[f\"{prefix}.dockerRegistry\"] = common_docker_registry\n", + "\n", + "# Sample information\n", + "inputs[f\"{prefix}.sampleName\"] = samples.index.tolist()\n", + "inputs[f\"{prefix}.fastqR1\"] = samples['FASTQs'].str['R1']\n", + "inputs[f\"{prefix}.fastqR2\"] = samples['FASTQs'].str['R2']\n", + "\n", + "for sample in inputs.index:\n", + " fastqR1_name = get_fastqs_name(inputs.loc[sample, 'MitoTracing.fastqR1'])\n", + " fastqR2_name = get_fastqs_name(inputs.loc[sample, 'MitoTracing.fastqR2'])\n", + " \n", + " assert(fastqR1_name == fastqR2_name)\n", + " inputs.loc[sample, f\"{prefix}.fastqName\"] = fastqR1_name\n", + "\n", + "inputs[f\"{prefix}.whitelist\"] = samples['whitelist']\n", + "inputs[f\"{prefix}.reference\"] = samples['reference']\n", + "\n", + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "9785f91c-a79e-44ad-b7df-403b557c7d74", + "metadata": {}, + "outputs": [], + "source": [ + "# Modification for specific samples\n", + "inputs[f\"{prefix}.memory\"] = 256 " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"1db167f4-30fa-4b6c-b254-f8ca24448786", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "130c8072-b98e-4e5c-819b-5fd18cfb27b0", + "metadata": {}, + "source": [ + "# Generate labels" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "03d5abbb-6bf0-4636-8bb7-edd2463439c7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
Sample
RB-2041_WildType_DOGMAseqMitoTracingscATAC-seq pilotRB-2041_WildType_DOGMAseqsohailns3://dp-lab-data/collaborators/sfeir...-sohailn
RB-2041_mRB54_1003_DOGMAseqMitoTracingscATAC-seq pilotRB-2041_mRB54_1003_DOGMAseqsohailns3://dp-lab-data/collaborators/sfeir...-sohailn
\n", + "
" + ], + "text/plain": [ + " pipelineType project \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq MitoTracing scATAC-seq pilot \n", + "RB-2041_mRB54_1003_DOGMAseq MitoTracing scATAC-seq pilot \n", + "\n", + " sample owner \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq RB-2041_WildType_DOGMAseq sohailn \n", + "RB-2041_mRB54_1003_DOGMAseq RB-2041_mRB54_1003_DOGMAseq sohailn \n", + "\n", + " destination transfer \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq s3://dp-lab-data/collaborators/sfeir... - \n", + "RB-2041_mRB54_1003_DOGMAseq s3://dp-lab-data/collaborators/sfeir... - \n", + "\n", + " comment \n", + "Sample \n", + "RB-2041_WildType_DOGMAseq sohailn \n", + "RB-2041_mRB54_1003_DOGMAseq sohailn " + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Standard labels, modify fields as need be\n", + "\n", + "with open(f\"{config_dir}/template.labels.json\") as f:\n", + " std_labels_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "labels = pd.DataFrame(index=samples.index, columns=std_labels_fields,)\n", + "\n", + "labels[\"pipelineType\"] = pipeline_type\n", + "labels[\"project\"] = samples['project_id']\n", + "labels[\"sample\"] = labels.index\n", + "labels[\"owner\"] = creds[\"user\"]\n", + "labels[\"destination\"] = samples['AWS_storage'] + \"/\" + output_dirname\n", + "labels[\"transfer\"] = \"-\"\n", + "labels[\"comment\"] = creds[\"user\"]\n", + "\n", + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d99c1d72-d584-4a2b-97c6-c80de3d66b5c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "dce78f39-c948-4ab2-a62d-bee0a08c1564", + "metadata": {}, + "source": [ + "# Run samples" + ] + }, + { + "cell_type": "markdown", + "id": "bd2bf1b2-4b5f-48c0-8909-354fe6b313e3", + "metadata": {}, + "source": [ + "Look over the samples before submitting one last time" + 
] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "ee601aed-3a19-4317-a918-4120db7ee7bf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MitoTracing.sampleNameMitoTracing.fastqNameMitoTracing.fastqR1MitoTracing.fastqR2MitoTracing.referenceMitoTracing.includeIntronsMitoTracing.expectCellsMitoTracing.numCoresMitoTracing.memoryMitoTracing.whitelistMitoTracing.dockerRegistry
Sample
RB-2041_WildType_DOGMAseqRB-2041_WildType_DOGMAseq4440_RB-2041_WildType_DOGMAseq_IGO_1...[s3://dp-lab-data/collaborators/sfei...[s3://dp-lab-data/collaborators/sfei...{'cellRangerRefPkg': 's3://dp-lab-da...False500016256s3://dp-lab-data/collaborators/sfeir...quay.io/hisplan
RB-2041_mRB54_1003_DOGMAseqRB-2041_mRB54_1003_DOGMAseq4441_RB-2041_mRB54_1003_DOGMAseq_IGO...[s3://dp-lab-data/collaborators/sfei...[s3://dp-lab-data/collaborators/sfei...{'cellRangerRefPkg': 's3://dp-lab-da...False500016256s3://dp-lab-data/collaborators/sfeir...quay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " MitoTracing.sampleName \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq RB-2041_WildType_DOGMAseq \n", + "RB-2041_mRB54_1003_DOGMAseq RB-2041_mRB54_1003_DOGMAseq \n", + "\n", + " MitoTracing.fastqName \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq 4440_RB-2041_WildType_DOGMAseq_IGO_1... \n", + "RB-2041_mRB54_1003_DOGMAseq 4441_RB-2041_mRB54_1003_DOGMAseq_IGO... \n", + "\n", + " MitoTracing.fastqR1 \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq [s3://dp-lab-data/collaborators/sfei... \n", + "RB-2041_mRB54_1003_DOGMAseq [s3://dp-lab-data/collaborators/sfei... \n", + "\n", + " MitoTracing.fastqR2 \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq [s3://dp-lab-data/collaborators/sfei... \n", + "RB-2041_mRB54_1003_DOGMAseq [s3://dp-lab-data/collaborators/sfei... \n", + "\n", + " MitoTracing.reference \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "RB-2041_mRB54_1003_DOGMAseq {'cellRangerRefPkg': 's3://dp-lab-da... \n", + "\n", + " MitoTracing.includeIntrons \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq False \n", + "RB-2041_mRB54_1003_DOGMAseq False \n", + "\n", + " MitoTracing.expectCells MitoTracing.numCores \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq 5000 16 \n", + "RB-2041_mRB54_1003_DOGMAseq 5000 16 \n", + "\n", + " MitoTracing.memory \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq 256 \n", + "RB-2041_mRB54_1003_DOGMAseq 256 \n", + "\n", + " MitoTracing.whitelist \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq s3://dp-lab-data/collaborators/sfeir... \n", + "RB-2041_mRB54_1003_DOGMAseq s3://dp-lab-data/collaborators/sfeir... 
\n", + "\n", + " MitoTracing.dockerRegistry \n", + "Sample \n", + "RB-2041_WildType_DOGMAseq quay.io/hisplan \n", + "RB-2041_mRB54_1003_DOGMAseq quay.io/hisplan " + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "91318853-0886-4c2b-8b87-76fa8e5ab30d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
Sample
RB-2041_WildType_DOGMAseqMitoTracingscATAC-seq pilotRB-2041_WildType_DOGMAseqsohailns3://dp-lab-data/collaborators/sfeir...-sohailn
RB-2041_mRB54_1003_DOGMAseqMitoTracingscATAC-seq pilotRB-2041_mRB54_1003_DOGMAseqsohailns3://dp-lab-data/collaborators/sfeir...-sohailn
\n", + "
" + ], + "text/plain": [ + " pipelineType project \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq MitoTracing scATAC-seq pilot \n", + "RB-2041_mRB54_1003_DOGMAseq MitoTracing scATAC-seq pilot \n", + "\n", + " sample owner \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq RB-2041_WildType_DOGMAseq sohailn \n", + "RB-2041_mRB54_1003_DOGMAseq RB-2041_mRB54_1003_DOGMAseq sohailn \n", + "\n", + " destination transfer \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq s3://dp-lab-data/collaborators/sfeir... - \n", + "RB-2041_mRB54_1003_DOGMAseq s3://dp-lab-data/collaborators/sfeir... - \n", + "\n", + " comment \n", + "Sample \n", + "RB-2041_WildType_DOGMAseq sohailn \n", + "RB-2041_mRB54_1003_DOGMAseq sohailn " + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acd1c04b-d582-481b-b8d4-bd9a9a8f97c4", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "54ef3d9c-2361-4b8e-bcfe-bc9c3878876b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0d10068356bb4441b4ddbf1fa0b95419", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreference
Sample
ARN-1167_M-4T1s3://dp-lab-data/collaborators/aboir...3249mouse10X_V3.1Leptomeningeal metastasis heterogeneitys3://seqc-public/genomes/mm38_long_p...
ARN-1167_Normals3://dp-lab-data/collaborators/aboir...3250mouse10X_V3.1Leptomeningeal metastasis heterogeneitys3://seqc-public/genomes/mm38_long_p...
ARN-1167_PM-4T1s3://dp-lab-data/collaborators/aboir...3251mouse10X_V3.1Leptomeningeal metastasis heterogeneitys3://seqc-public/genomes/mm38_long_p...
\n", + "" + ], + "text/plain": [ + " AWS_storage id species \\\n", + "Sample \n", + "ARN-1167_M-4T1 s3://dp-lab-data/collaborators/aboir... 3249 mouse \n", + "ARN-1167_Normal s3://dp-lab-data/collaborators/aboir... 3250 mouse \n", + "ARN-1167_PM-4T1 s3://dp-lab-data/collaborators/aboir... 3251 mouse \n", + "\n", + " sc_tech project_id \\\n", + "Sample \n", + "ARN-1167_M-4T1 10X_V3.1 Leptomeningeal metastasis heterogeneity \n", + "ARN-1167_Normal 10X_V3.1 Leptomeningeal metastasis heterogeneity \n", + "ARN-1167_PM-4T1 10X_V3.1 Leptomeningeal metastasis heterogeneity \n", + "\n", + " reference \n", + "Sample \n", + "ARN-1167_M-4T1 s3://seqc-public/genomes/mm38_long_p... \n", + "ARN-1167_Normal s3://seqc-public/genomes/mm38_long_p... \n", + "ARN-1167_PM-4T1 s3://seqc-public/genomes/mm38_long_p... " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can supply a list of IDs or request IDs or even AWS paths\n", + "\n", + "request_ids = ['ARN-1167']\n", + "samples = sample_scridb_info(request_ids, 'request_id', creds)\n", + "\n", + "# sample_ids = [3970]\n", + "# samples = sample_scridb_info(sample_ids, 'id', creds)\n", + "\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "IMPORTANT NOTE \n", + "\n", + "Check the name of the folder you are running. Typically the folder that is stored in the database is just the GEX. So if another library is generated (multiome ATAC, VDJ, hashtag, etc) then it needs to be manually changed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ARN-1167_M-4T1\n", + " PRE ARN-1167_M-4T1/\n", + " PRE ARN-1167_M-4T1_HTO/\n", + "\n", + "ARN-1167_Normal\n", + " PRE ARN-1167_Normal/\n", + " PRE ARN-1167_Normal_HTO/\n", + "\n", + "ARN-1167_PM-4T1\n", + " PRE ARN-1167_PM-4T1/\n", + " PRE ARN-1167_PM-4T1_HTO/\n", + "\n" + ] + } + ], + "source": [ + "# Check the name of the folder you are running\n", + "# Especially if there are multiple libraries (i.e ATAC, TCR_VDJ, etc.)\n", + "\n", + "for sample, row in samples.iterrows():\n", + " print(sample)\n", + " path = os.path.split(row['AWS_storage'])[0] + '/'\n", + " os.system(f'aws s3 ls {path} | grep {sample}')\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "samples[\"FASTQs\"] = samples[\"AWS_storage\"].apply(lambda x: get_fastqs(x, fastq_map[prefix], \"FASTQ\"))\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "IMPORTANT NOTE\n", + "\n", + "Make sure that your files are not archived. The following command will print any FASTQ file that is archived. Unarchive the files and then come back to processing the sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for sample, row in samples.iterrows():\n", + " print(sample)\n", + " fastqs = np.ravel(list(row['FASTQs'].values()))\n", + " dirnames = set([os.path.dirname(x) for x in fastqs])\n", + " \n", + " for dirname in dirnames:\n", + " file = dirname.replace('s3://', '')\n", + " \n", + " bucket = file.split('/')[0]\n", + " pre = file.replace(f'{bucket}/', '')\n", + " \n", + " !aws s3api list-objects-v2 --bucket $bucket --prefix $pre --query \"Contents[?StorageClass!='STANDARD'].Key\" --output text " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "IMPORTANT NOTE \n", + "\n", + "For CellRanger you need to supply an HTTPS path. So if you are using a custom genome stored on AWS, you must make the reference public ! Be sure to manually change the \"reference\" argument if it has not been updated correctly!!!!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "samples = update_ref(samples, prefix)\n", + "\n", + "if not samples['reference'].isna().any():\n", + " samples[\"reference\"].apply(lambda x: {\n", + " \"name\": re.match(r'.*refdata-cellranger-arc-(.*).tar.gz', x)[1],\n", + " \"location\": x,\n", + " }) \n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "samples['reference'] = [\n", + " {\n", + " 'name' : 'GRCz11_atac',\n", + " 'location' : \"https://dp-lab-data.s3.amazonaws.com/collaborators/whiter/YM-1704_transgene_reference/refdata-cellranger/GRCz11_atac.tar.gz\"\n", + " }] * len(samples)\n", + "samples['reference']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generate inputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Standard inputs, modify fields as need be\n", + "\n", + "# Load minimum inputs and labels fields from templates\n", + "with open(f\"{config_dir}/template.inputs.json\") as f:\n", + " std_inputs_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields,)\n", + "\n", + "# Annotate inputs\n", + "inputs[f\"{prefix}.sampleName\"] = samples.index\n", + "inputs[f\"{prefix}.fastqFiles\"] = samples[\"FASTQs\"].apply(lambda x: np.ravel(list(x.values())))\n", + "inputs[f\"{prefix}.fastqNames\"] = inputs[f\"{prefix}.fastqFiles\"].apply(lambda x: get_fastqs_name(x))\n", + "inputs[f\"{prefix}.referenceGenome\"] = samples[\"reference\"] \n", + "inputs[f\"{prefix}.dockerRegistry\"] = common_docker_registry\n", + "\n", + "inputs" + ] + } + ], + "metadata": { + "kernelspec": { + 
"display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/Run_SeqcAda_WIP.ipynb b/notebooks/Run_SeqcAda_WIP.ipynb index 3e8022e..0a1891e 100644 --- a/notebooks/Run_SeqcAda_WIP.ipynb +++ b/notebooks/Run_SeqcAda_WIP.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 8, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -18,16 +18,374 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from utils.utils import *" ] }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import glob" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "# Functions" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## List FASTQs" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# FASTQ reads/indices required for each workflow\n", + "fastq_map = {\n", + " 'genomic': ['R2'],\n", + " 'barcode': ['R1']\n", + "}\n", + "\n", + "# Get fastq file paths on S3 for each file id\n", + "# Returns dictionary from id to s3 path\n", + "# Throws exception if FASTQs don't exist for any id\n", + "def get_fastqs(\n", + " path: str, # path to directory containing FASTQ files\n", + " fastq_file_ids: list, # FASTQ file ids needed for this run type (e.g. 
I1, R1, R2, etc.)\n", + "\n", + "):\n", + " fastq_map = dict()\n", + " _, bucket, key, _, _ = urllib.parse.urlsplit(path)\n", + " for fid in fastq_file_ids:\n", + " files = get_s3_objects(\n", + " bucket, key.lstrip(\"/\"),\n", + " re.compile(f\"_{fid}_\\d{{3}}.fastq.gz$\")\n", + " )\n", + " try:\n", + " assert files, f\"AssertionError: Missing `{fid}` archives!\"\n", + " fastq_map[fid] = [os.path.join(\"s3://\", bucket, str(f)) for f in files]\n", + " except AssertionError as err:\n", + " logging.warning(\"%s\\n\\t %s\", err, path)\n", + " return\n", + " return fastq_map" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def get_barcode_genomic_fastqs(samples):\n", + " \n", + " samples['fastqBarcode'] = np.empty((len(samples), 0)).tolist()\n", + " samples['fastqGenomic'] = np.empty((len(samples), 0)).tolist()\n", + " \n", + " for sample, row in samples.iterrows():\n", + " S3_path = row['S3_path']\n", + " \n", + " # Barcode FASTQ files\n", + " fastq_file_ids = fastq_map['barcode'] \n", + " barcode_path = f\"{S3_path}/barcode/\"\n", + " fastqBarcode = get_fastqs(barcode_path, fastq_file_ids)['R1']\n", + " \n", + " # Genomic FASTQ files\n", + " fastq_file_ids = fastq_map['genomic'] \n", + " genomic_path = f\"{S3_path}/genomic/\"\n", + " fastqGenomic = get_fastqs(genomic_path, fastq_file_ids)['R2']\n", + " \n", + " samples.loc[sample, 'fastqBarcode'] += fastqBarcode\n", + " samples.loc[sample, 'fastqGenomic'] += fastqGenomic\n", + " \n", + " return samples" + ] + }, { "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## DB queries" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Common query col: id, request_id, Sample\n", + "def get_sample_name(query, query_col, creds):\n", + " \n", + " user = creds['user']\n", + " password = creds['password']\n", + " \n", + " try:\n", + " table_sample_data = 
\"peer_lab_db.sample_data\"\n", + " query = f\"\"\"\n", + " SELECT {table_sample_data}.Sample\n", + " FROM {table_sample_data}\n", + " WHERE {table_sample_data}.{query_col}=\"{query}\"\n", + " \"\"\"\n", + " sample_names = []\n", + " results = execute_query(query, user, password)\n", + " for result in results:\n", + " sample_names.append(result[0])\n", + " return sample_names\n", + " except Error as e:\n", + " print(f\"Error: {e}\")\n", + " \n", + " \n", + "def get_aws_path(query, query_col, creds):\n", + " \n", + " user = creds['user']\n", + " password = creds['password']\n", + " \n", + " try:\n", + " table_sample_data = \"peer_lab_db.sample_data\"\n", + " query = f\"\"\"\n", + " SELECT {table_sample_data}.AWS_storage\n", + " FROM {table_sample_data}\n", + " WHERE {table_sample_data}.{query_col}=\"{query}\"\n", + " \"\"\"\n", + " sample_paths = []\n", + " results = execute_query(query, user, password)\n", + " for result in results:\n", + " sample_paths.append(result[0])\n", + " return sample_paths\n", + " except Error as e:\n", + " print(f\"Error: {e}\")\n", + " \n", + " \n", + "def get_sample_id(query, query_col, creds):\n", + " \n", + " user = creds['user']\n", + " password = creds['password']\n", + " \n", + " try:\n", + " table_sample_data = \"peer_lab_db.sample_data\"\n", + " query = f\"\"\"\n", + " SELECT {table_sample_data}.id\n", + " FROM {table_sample_data}\n", + " WHERE {table_sample_data}.{query_col}=\"{query}\"\n", + " \"\"\"\n", + " sample_ids = []\n", + " results = execute_query(query, user, password)\n", + " for result in results:\n", + " sample_ids.append(result[0])\n", + " return sample_ids\n", + " except Error as e:\n", + " print(f\"Error: {e}\")\n", + " \n", + "def format_sample_aws(querys, query_col, creds):\n", + " sample_names = []\n", + " sample_paths = []\n", + " sample_ids = []\n", + " \n", + " for query in querys:\n", + " sample_names += get_sample_name(query, query_col, creds)\n", + " sample_paths += get_aws_path(query, query_col, 
creds)\n", + " sample_ids += get_sample_id(query, query_col, creds)\n", + " \n", + " sample_paths = [s.strip('/') for s in sample_paths] # remove trailing slash if exists\n", + " \n", + " samples = pd.DataFrame(\n", + " dict(S3_path=sample_paths, Sample_ID=sample_ids),\n", + " index=sample_names,\n", + " dtype=str,\n", + " )\n", + " return samples" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def get_genomeIndex_id(query, query_col, creds):\n", + " user = creds['user']\n", + " password = creds['password']\n", + " \n", + " try:\n", + " table_sample_data = \"peer_lab_db.sample_data\"\n", + " query = f\"\"\"\n", + " SELECT {table_sample_data}.genomeIndex_id\n", + " FROM {table_sample_data}\n", + " WHERE {table_sample_data}.{query_col}=\"{query}\"\n", + " \"\"\"\n", + " \n", + " results = execute_query(query, user, password)\n", + " for result in results:\n", + " genomeIndex_id = result[0]\n", + " \n", + " return genomeIndex_id\n", + " \n", + " except Error as e:\n", + " print(f\"Error: {e}\")\n", + " \n", + "def get_scTech_id(query, query_col, creds):\n", + " user = creds['user']\n", + " password = creds['password']\n", + " \n", + " try:\n", + " table_sample_data = \"peer_lab_db.genome_index\"\n", + " query = f\"\"\"\n", + " SELECT {table_sample_data}.scTech_id\n", + " FROM {table_sample_data}\n", + " WHERE {table_sample_data}.{query_col}=\"{query}\"\n", + " \"\"\"\n", + " \n", + " results = execute_query(query, user, password)\n", + " for result in results:\n", + " scTech_id = result[0]\n", + " \n", + " return scTech_id\n", + " \n", + " except Error as e:\n", + " print(f\"Error: {e}\")\n", + " \n", + "def get_index(query, query_col, creds):\n", + " user = creds['user']\n", + " password = creds['password']\n", + " \n", + " try:\n", + " table_sample_data = \"peer_lab_db.genome_index\"\n", + " query = f\"\"\"\n", + " SELECT {table_sample_data}.gIndex\n", + " FROM {table_sample_data}\n", + " WHERE 
{table_sample_data}.{query_col}=\"{query}\"\n", + " \"\"\"\n", + " \n", + " results = execute_query(query, user, password)\n", + " for result in results:\n", + " index = result[0]\n", + " \n", + " return index\n", + " \n", + " except Error as e:\n", + " print(f\"Error: {e}\")\n", + " \n", + "def get_assay(query, query_col, creds):\n", + " user = creds['user']\n", + " password = creds['password']\n", + " \n", + " try:\n", + " table_sample_data = \"peer_lab_db.sc_tech\"\n", + " query = f\"\"\"\n", + " SELECT {table_sample_data}.Run_name\n", + " FROM {table_sample_data}\n", + " WHERE {table_sample_data}.{query_col}=\"{query}\"\n", + " \"\"\"\n", + " \n", + " results = execute_query(query, user, password)\n", + " for result in results:\n", + " assay = result[0]\n", + " \n", + " return assay\n", + " \n", + " except Error as e:\n", + " print(f\"Error: {e}\")\n", + " \n", + "def get_barcode(query, query_col, creds):\n", + " user = creds['user']\n", + " password = creds['password']\n", + " \n", + " try:\n", + " table_sample_data = \"peer_lab_db.sc_tech\"\n", + " query = f\"\"\"\n", + " SELECT {table_sample_data}.barcodes\n", + " FROM {table_sample_data}\n", + " WHERE {table_sample_data}.{query_col}=\"{query}\"\n", + " \"\"\"\n", + " \n", + " results = execute_query(query, user, password)\n", + " for result in results:\n", + " barcode = result[0]\n", + " \n", + " return barcode\n", + " \n", + " except Error as e:\n", + " print(f\"Error: {e}\")\n", + " \n", + " \n", + "def format_assay_barcode(samples, creds):\n", + " \n", + " # samples['genomeIndex_id'] = np.nan\n", + " samples['index'] = np.nan\n", + " # samples['scTech_id'] = np.nan\n", + " samples['assay'] = np.nan\n", + " samples['barcode'] = np.nan\n", + " \n", + " for sample, row in samples.iterrows():\n", + " sample_id = row['Sample_ID']\n", + " \n", + " genomeIndex_id = get_genomeIndex_id(sample_id, 'id', creds)\n", + " index = get_index(genomeIndex_id, 'id', creds)\n", + " scTech_id = get_scTech_id(genomeIndex_id, 
'id', creds)\n", + " assay = get_assay(scTech_id, 'id', creds)\n", + " barcode = get_barcode(scTech_id, 'id', creds)\n", + " \n", + " # samples.loc[sample, 'genomeIndex_id'] = genomeIndex_id\n", + " samples.loc[sample, 'index'] = index\n", + " # samples.loc[sample, 'scTech_id'] = scTech_id\n", + " samples.loc[sample, 'assay'] = assay\n", + " samples.loc[sample, 'barcode'] = barcode\n", + "\n", + " return samples" + ] + }, + { + "cell_type": "code", + "execution_count": 8, "metadata": {}, + "outputs": [], + "source": [ + "def get_project_id(sample_id, creds):\n", + " \n", + " user = creds['user']\n", + " password = creds['password']\n", + " \n", + " try:\n", + " table_sample_data = \"peer_lab_db.sample_data\"\n", + " table_project_data = \"peer_lab_db.project_data\"\n", + " query = f\"\"\"\n", + " SELECT {table_project_data}.projectName\n", + " FROM {table_project_data}\n", + " LEFT JOIN {table_sample_data}\n", + " ON {table_project_data}.id = {table_sample_data}.projectData_id\n", + " WHERE {table_sample_data}.id = {sample_id}\n", + " \"\"\"\n", + " result = execute_query(query, user, password)[0][0]\n", + " return result\n", + " except Error as e:\n", + " print(f\"Error: {e}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, "source": [ "# Process Samples" ] @@ -41,91 +399,1057 @@ "## Setup" ] }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Maps from .wdl name (prefix) to results dirname\n", + "results_dirs = {\n", + " \"SeqcAda\": \"seqc-ada-results\",\n", + "}\n", + "\n", + "# Maps from .wdl name (prefix) to shell script\n", + "sh_files = {\n", + " \"SeqcAda\": \"submit.sh\",\n", + "}" + ] + }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "# Location of docker files\n", - "common_docker_registry = \"quay.io/hisplan\"\n", + "# Location of docker files\n", + "common_docker_registry = \"quay.io/hisplan\"\n", + "\n", + "prefix = 
\"SeqcAda\" # Workflow to run; also .wdl filename prefix\n", + "pipeline_type = prefix # field in *.labels.json\n", + "output_dirname = results_dirs[prefix]\n", + "\n", + "# If need to add comment, put here\n", + "email = '20noor.sohail@gmail.com'\n", + "comment = \"sohailn\"" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Locations of workflow-related directories and files\n", + "path_to_cromwell_secrets = f\"{Path.home()}/.cromwell/cromwell-secrets.json\" # CHANGE THIS\n", + "workflow_dir = glob.glob(f\"{Path.home()}/scing/bin/*ada*\")[0]\n", + "path_to_exec = sh_files[prefix]\n", + "config_dir = f\"{workflow_dir}/configs\"\n", + "path_to_options = glob.glob(f\"{workflow_dir}/*.options.aws.json\")[0]\n", + "\n", + "# Other file locations\n", + "db_credentials_path = f\"{Path.home()}/.config.json\" # CHANGE THIS" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Set credentials based on SCRIdb CLI config file\n", + "with open(db_credentials_path) as f:\n", + " creds = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: AWS_ACCESS_KEY_ID=ASIAYPY66CWMIDDZ3PPV\n", + "env: AWS_SECRET_ACCESS_KEY=uhvOE6Kdnl/ThtSguHWeJjtbQW4rQzt1o+bIzeNM\n", + "env: 
AWS_SESSION_TOKEN=IQoJb3JpZ2luX2VjEIH//////////wEaCXVzLWVhc3QtMSJHMEUCIEeCdu3sOyxjT3+hs8k2Ion/GZbxj94S8aDUlOBT1vGdAiEAu5gaGCZtCwZQEVdEbBiel2i6RKBx/sU5Lf3IBR1CEjkq7wEIehADGgw1ODM2NDM1Njc1MTIiDN6EiN42czmycbSfdirMAY7lrzMUFFrxPNsu/5mT5z0tpi+KEB9JYZIJa0tGg7ZDhlT/t07LpL/Er+45A4/m5g65jxzNRNHnoEUdQVh4A5xVSeWiorFYkJN3bjnplO3FZ0EzzMEaDqyqQakuOT0TBv9v/7zQ7MJMDayLKc83fOf4N7ISJCtn1IVQTO2eenBL+669Enp4kc7mNlwQ0HQaVrYFn9RCxdbtT18692pET+LjcfyKzAkcJK7pdaH8ZxOjMjUKE+UHCFjty2GmEDiKEAE7NnOsDdjJIoBMMzC6+uiUBjqYAX9ZfBDDdsPNSNTHOsGuY+0pXrT6xdLn+n5F5s/JFGfulywUfyefg6VArCxjlnYWN55U/kuimwcbdTivUqrF5gXoa4ZnnxY5IR7IldWIsmBSTMPt3yiWUvKDR/RE5w43dzH8J1483gDSKjbzdSsxxB7kdL+BbYmoS4GkvXAa/YX+QJWXXdvGuQNhYRGB1ur5+n+sQoRvpxz2\n" + ] + } + ], + "source": [ + "%env AWS_ACCESS_KEY_ID=ASIAYPY66CWMIDDZ3PPV\n", + "%env AWS_SECRET_ACCESS_KEY=uhvOE6Kdnl/ThtSguHWeJjtbQW4rQzt1o+bIzeNM\n", + "%env AWS_SESSION_TOKEN=IQoJb3JpZ2luX2VjEIH//////////wEaCXVzLWVhc3QtMSJHMEUCIEeCdu3sOyxjT3+hs8k2Ion/GZbxj94S8aDUlOBT1vGdAiEAu5gaGCZtCwZQEVdEbBiel2i6RKBx/sU5Lf3IBR1CEjkq7wEIehADGgw1ODM2NDM1Njc1MTIiDN6EiN42czmycbSfdirMAY7lrzMUFFrxPNsu/5mT5z0tpi+KEB9JYZIJa0tGg7ZDhlT/t07LpL/Er+45A4/m5g65jxzNRNHnoEUdQVh4A5xVSeWiorFYkJN3bjnplO3FZ0EzzMEaDqyqQakuOT0TBv9v/7zQ7MJMDayLKc83fOf4N7ISJCtn1IVQTO2eenBL+669Enp4kc7mNlwQ0HQaVrYFn9RCxdbtT18692pET+LjcfyKzAkcJK7pdaH8ZxOjMjUKE+UHCFjty2GmEDiKEAE7NnOsDdjJIoBMMzC6+uiUBjqYAX9ZfBDDdsPNSNTHOsGuY+0pXrT6xdLn+n5F5s/JFGfulywUfyefg6VArCxjlnYWN55U/kuimwcbdTivUqrF5gXoa4ZnnxY5IR7IldWIsmBSTMPt3yiWUvKDR/RE5w43dzH8J1483gDSKjbzdSsxxB7kdL+BbYmoS4GkvXAa/YX+QJWXXdvGuQNhYRGB1ur5+n+sQoRvpxz2" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2021-10-07 15:31:32 agc-583643567512-us-east-1\n", + "2021-10-07 15:28:07 cdktoolkit-stagingbucket-d49u1xfb0sc4\n", + "2019-10-09 13:04:06 cf-templates-umiwbnq3566w-us-east-1\n", + "2019-10-10 12:46:54 dp-daily-reports\n", + "2017-09-26 10:15:15 dp-lab-data\n", + "2019-05-23 
12:34:58 dp-lab-data-public\n", + "2018-03-01 13:29:07 dp-lab-glacier\n", + "2020-11-06 23:02:27 dp-lab-gwf-core\n", + "2021-07-15 18:14:01 dp-lab-gwf-core2\n", + "2017-09-26 10:17:18 dp-lab-home\n", + "2019-08-14 16:42:43 dp-lab-test\n", + "2019-04-25 12:35:35 elasticbeanstalk-us-east-1-583643567512\n", + "2019-12-13 15:53:57 scri-computational\n", + "2017-09-26 10:23:50 seqc-public\n" + ] + } + ], + "source": [ + "!aws s3 ls" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Execution" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## Sample information" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
S3_pathSample_ID
ARN-1167_M-4T1s3://dp-lab-data/collaborators/aboir...3249
ARN-1167_Normals3://dp-lab-data/collaborators/aboir...3250
ARN-1167_PM-4T1s3://dp-lab-data/collaborators/aboir...3251
\n", + "
" + ], + "text/plain": [ + " S3_path Sample_ID\n", + "ARN-1167_M-4T1 s3://dp-lab-data/collaborators/aboir... 3249\n", + "ARN-1167_Normal s3://dp-lab-data/collaborators/aboir... 3250\n", + "ARN-1167_PM-4T1 s3://dp-lab-data/collaborators/aboir... 3251" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Common query col: id, request_id, Sample\n", + "request_ids = ['ARN-1167']\n", + "\n", + "samples = format_sample_aws(request_ids, 'request_id', creds)\n", + "samples.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
S3_pathSample_IDindexassaybarcode
ARN-1167_M-4T1s3://dp-lab-data/collaborators/aboir...3249s3://seqc-public/genomes/mm38_long_p...ten_x_v3s3://seqc-public/barcodes/ten_x_v3/f...
ARN-1167_Normals3://dp-lab-data/collaborators/aboir...3250s3://seqc-public/genomes/mm38_long_p...ten_x_v3s3://seqc-public/barcodes/ten_x_v3/f...
ARN-1167_PM-4T1s3://dp-lab-data/collaborators/aboir...3251s3://seqc-public/genomes/mm38_long_p...ten_x_v3s3://seqc-public/barcodes/ten_x_v3/f...
\n", + "
" + ], + "text/plain": [ + " S3_path Sample_ID \\\n", + "ARN-1167_M-4T1 s3://dp-lab-data/collaborators/aboir... 3249 \n", + "ARN-1167_Normal s3://dp-lab-data/collaborators/aboir... 3250 \n", + "ARN-1167_PM-4T1 s3://dp-lab-data/collaborators/aboir... 3251 \n", + "\n", + " index assay \\\n", + "ARN-1167_M-4T1 s3://seqc-public/genomes/mm38_long_p... ten_x_v3 \n", + "ARN-1167_Normal s3://seqc-public/genomes/mm38_long_p... ten_x_v3 \n", + "ARN-1167_PM-4T1 s3://seqc-public/genomes/mm38_long_p... ten_x_v3 \n", + "\n", + " barcode \n", + "ARN-1167_M-4T1 s3://seqc-public/barcodes/ten_x_v3/f... \n", + "ARN-1167_Normal s3://seqc-public/barcodes/ten_x_v3/f... \n", + "ARN-1167_PM-4T1 s3://seqc-public/barcodes/ten_x_v3/f... " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples = format_assay_barcode(samples, creds)\n", + "samples.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
S3_pathSample_IDindexassaybarcodefastqBarcodefastqGenomic
ARN-1167_M-4T1s3://dp-lab-data/collaborators/aboir...3249s3://seqc-public/genomes/mm38_long_p...ten_x_v3s3://seqc-public/barcodes/ten_x_v3/f...[s3://dp-lab-data/collaborators/aboi...[s3://dp-lab-data/collaborators/aboi...
ARN-1167_Normals3://dp-lab-data/collaborators/aboir...3250s3://seqc-public/genomes/mm38_long_p...ten_x_v3s3://seqc-public/barcodes/ten_x_v3/f...[s3://dp-lab-data/collaborators/aboi...[s3://dp-lab-data/collaborators/aboi...
ARN-1167_PM-4T1s3://dp-lab-data/collaborators/aboir...3251s3://seqc-public/genomes/mm38_long_p...ten_x_v3s3://seqc-public/barcodes/ten_x_v3/f...[s3://dp-lab-data/collaborators/aboi...[s3://dp-lab-data/collaborators/aboi...
\n", + "
" + ], + "text/plain": [ + " S3_path Sample_ID \\\n", + "ARN-1167_M-4T1 s3://dp-lab-data/collaborators/aboir... 3249 \n", + "ARN-1167_Normal s3://dp-lab-data/collaborators/aboir... 3250 \n", + "ARN-1167_PM-4T1 s3://dp-lab-data/collaborators/aboir... 3251 \n", + "\n", + " index assay \\\n", + "ARN-1167_M-4T1 s3://seqc-public/genomes/mm38_long_p... ten_x_v3 \n", + "ARN-1167_Normal s3://seqc-public/genomes/mm38_long_p... ten_x_v3 \n", + "ARN-1167_PM-4T1 s3://seqc-public/genomes/mm38_long_p... ten_x_v3 \n", + "\n", + " barcode \\\n", + "ARN-1167_M-4T1 s3://seqc-public/barcodes/ten_x_v3/f... \n", + "ARN-1167_Normal s3://seqc-public/barcodes/ten_x_v3/f... \n", + "ARN-1167_PM-4T1 s3://seqc-public/barcodes/ten_x_v3/f... \n", + "\n", + " fastqBarcode \\\n", + "ARN-1167_M-4T1 [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1167_Normal [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1167_PM-4T1 [s3://dp-lab-data/collaborators/aboi... \n", + "\n", + " fastqGenomic \n", + "ARN-1167_M-4T1 [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1167_Normal [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1167_PM-4T1 [s3://dp-lab-data/collaborators/aboi... " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples = get_barcode_genomic_fastqs(samples)\n", + "samples.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## Make input file" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SeqcAda.versionSeqcAda.assaySeqcAda.indexSeqcAda.barcodeFilesSeqcAda.fastqBarcodeSeqcAda.fastqGenomicSeqcAda.filterModeSeqcAda.outputPrefixSeqcAda.starArgumentsSeqcAda.emailSeqcAda.dockerRegistry
ARN-1167_M-4T1NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
ARN-1167_NormalNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
ARN-1167_PM-4T1NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " SeqcAda.version SeqcAda.assay SeqcAda.index \\\n", + "ARN-1167_M-4T1 NaN NaN NaN \n", + "ARN-1167_Normal NaN NaN NaN \n", + "ARN-1167_PM-4T1 NaN NaN NaN \n", + "\n", + " SeqcAda.barcodeFiles SeqcAda.fastqBarcode \\\n", + "ARN-1167_M-4T1 NaN NaN \n", + "ARN-1167_Normal NaN NaN \n", + "ARN-1167_PM-4T1 NaN NaN \n", + "\n", + " SeqcAda.fastqGenomic SeqcAda.filterMode SeqcAda.outputPrefix \\\n", + "ARN-1167_M-4T1 NaN NaN NaN \n", + "ARN-1167_Normal NaN NaN NaN \n", + "ARN-1167_PM-4T1 NaN NaN NaN \n", + "\n", + " SeqcAda.starArguments SeqcAda.email SeqcAda.dockerRegistry \n", + "ARN-1167_M-4T1 NaN NaN NaN \n", + "ARN-1167_Normal NaN NaN NaN \n", + "ARN-1167_PM-4T1 NaN NaN NaN " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "template_prefix = 'template'\n", + "# Load minimum inputs and labels fields from input template\n", + "with open(f\"{config_dir}/{template_prefix}.inputs.json\") as f:\n", + " std_inputs_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields,)\n", + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SeqcAda.versionSeqcAda.assaySeqcAda.indexSeqcAda.barcodeFilesSeqcAda.fastqBarcodeSeqcAda.fastqGenomicSeqcAda.filterModeSeqcAda.outputPrefixSeqcAda.starArgumentsSeqcAda.emailSeqcAda.dockerRegistry
ARN-1167_M-4T10.2.11NaNNaNNaNNaNNaNscRNA-seqNaNrunRNGseed=020noor.sohail@gmail.comquay.io/hisplan
ARN-1167_Normal0.2.11NaNNaNNaNNaNNaNscRNA-seqNaNrunRNGseed=020noor.sohail@gmail.comquay.io/hisplan
ARN-1167_PM-4T10.2.11NaNNaNNaNNaNNaNscRNA-seqNaNrunRNGseed=020noor.sohail@gmail.comquay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " SeqcAda.version SeqcAda.assay SeqcAda.index \\\n", + "ARN-1167_M-4T1 0.2.11 NaN NaN \n", + "ARN-1167_Normal 0.2.11 NaN NaN \n", + "ARN-1167_PM-4T1 0.2.11 NaN NaN \n", + "\n", + " SeqcAda.barcodeFiles SeqcAda.fastqBarcode \\\n", + "ARN-1167_M-4T1 NaN NaN \n", + "ARN-1167_Normal NaN NaN \n", + "ARN-1167_PM-4T1 NaN NaN \n", + "\n", + " SeqcAda.fastqGenomic SeqcAda.filterMode SeqcAda.outputPrefix \\\n", + "ARN-1167_M-4T1 NaN scRNA-seq NaN \n", + "ARN-1167_Normal NaN scRNA-seq NaN \n", + "ARN-1167_PM-4T1 NaN scRNA-seq NaN \n", + "\n", + " SeqcAda.starArguments SeqcAda.email \\\n", + "ARN-1167_M-4T1 runRNGseed=0 20noor.sohail@gmail.com \n", + "ARN-1167_Normal runRNGseed=0 20noor.sohail@gmail.com \n", + "ARN-1167_PM-4T1 runRNGseed=0 20noor.sohail@gmail.com \n", + "\n", + " SeqcAda.dockerRegistry \n", + "ARN-1167_M-4T1 quay.io/hisplan \n", + "ARN-1167_Normal quay.io/hisplan \n", + "ARN-1167_PM-4T1 quay.io/hisplan " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Default inputs\n", + "inputs[f\"{prefix}.version\"] = \"0.2.11\"\n", + "inputs[f\"{prefix}.filterMode\"] = \"scRNA-seq\"\n", + "inputs[f\"{prefix}.starArguments\"] = \"runRNGseed=0\"\n", + "inputs[f\"{prefix}.email\"] = email\n", + "inputs[f\"{prefix}.dockerRegistry\"] = common_docker_registry\n", + "\n", + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SeqcAda.versionSeqcAda.assaySeqcAda.indexSeqcAda.barcodeFilesSeqcAda.fastqBarcodeSeqcAda.fastqGenomicSeqcAda.filterModeSeqcAda.outputPrefixSeqcAda.starArgumentsSeqcAda.emailSeqcAda.dockerRegistry
ARN-1167_M-4T10.2.11ten_x_v3s3://seqc-public/genomes/mm38_long_p...s3://seqc-public/barcodes/ten_x_v3/f...[s3://dp-lab-data/collaborators/aboi...[s3://dp-lab-data/collaborators/aboi...scRNA-seqARN-1167_M-4T1runRNGseed=020noor.sohail@gmail.comquay.io/hisplan
ARN-1167_Normal0.2.11ten_x_v3s3://seqc-public/genomes/mm38_long_p...s3://seqc-public/barcodes/ten_x_v3/f...[s3://dp-lab-data/collaborators/aboi...[s3://dp-lab-data/collaborators/aboi...scRNA-seqARN-1167_NormalrunRNGseed=020noor.sohail@gmail.comquay.io/hisplan
ARN-1167_PM-4T10.2.11ten_x_v3s3://seqc-public/genomes/mm38_long_p...s3://seqc-public/barcodes/ten_x_v3/f...[s3://dp-lab-data/collaborators/aboi...[s3://dp-lab-data/collaborators/aboi...scRNA-seqARN-1167_PM-4T1runRNGseed=020noor.sohail@gmail.comquay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " SeqcAda.version SeqcAda.assay \\\n", + "ARN-1167_M-4T1 0.2.11 ten_x_v3 \n", + "ARN-1167_Normal 0.2.11 ten_x_v3 \n", + "ARN-1167_PM-4T1 0.2.11 ten_x_v3 \n", + "\n", + " SeqcAda.index \\\n", + "ARN-1167_M-4T1 s3://seqc-public/genomes/mm38_long_p... \n", + "ARN-1167_Normal s3://seqc-public/genomes/mm38_long_p... \n", + "ARN-1167_PM-4T1 s3://seqc-public/genomes/mm38_long_p... \n", + "\n", + " SeqcAda.barcodeFiles \\\n", + "ARN-1167_M-4T1 s3://seqc-public/barcodes/ten_x_v3/f... \n", + "ARN-1167_Normal s3://seqc-public/barcodes/ten_x_v3/f... \n", + "ARN-1167_PM-4T1 s3://seqc-public/barcodes/ten_x_v3/f... \n", + "\n", + " SeqcAda.fastqBarcode \\\n", + "ARN-1167_M-4T1 [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1167_Normal [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1167_PM-4T1 [s3://dp-lab-data/collaborators/aboi... \n", + "\n", + " SeqcAda.fastqGenomic SeqcAda.filterMode \\\n", + "ARN-1167_M-4T1 [s3://dp-lab-data/collaborators/aboi... scRNA-seq \n", + "ARN-1167_Normal [s3://dp-lab-data/collaborators/aboi... scRNA-seq \n", + "ARN-1167_PM-4T1 [s3://dp-lab-data/collaborators/aboi... 
scRNA-seq \n", + "\n", + " SeqcAda.outputPrefix SeqcAda.starArguments \\\n", + "ARN-1167_M-4T1 ARN-1167_M-4T1 runRNGseed=0 \n", + "ARN-1167_Normal ARN-1167_Normal runRNGseed=0 \n", + "ARN-1167_PM-4T1 ARN-1167_PM-4T1 runRNGseed=0 \n", + "\n", + " SeqcAda.email SeqcAda.dockerRegistry \n", + "ARN-1167_M-4T1 20noor.sohail@gmail.com quay.io/hisplan \n", + "ARN-1167_Normal 20noor.sohail@gmail.com quay.io/hisplan \n", + "ARN-1167_PM-4T1 20noor.sohail@gmail.com quay.io/hisplan " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Sample inputs\n", + "inputs[f\"{prefix}.assay\"] = samples[\"assay\"]\n", + "inputs[f\"{prefix}.index\"] = samples[\"index\"]\n", + "inputs[f\"{prefix}.barcodeFiles\"] = samples[\"barcode\"]\n", + "inputs[f\"{prefix}.fastqBarcode\"] = samples[\"fastqBarcode\"]\n", + "inputs[f\"{prefix}.fastqGenomic\"] = samples[\"fastqGenomic\"]\n", "\n", - "prefix = \"SeqcAda\" # Workflow to run; also .wdl filename prefix\n", - "pipeline_type = prefix # field in *.labels.json\n", - "output_dirname = \"\"\n", + "inputs[f\"{prefix}.outputPrefix\"] = samples.index.tolist()\n", "\n", - "# If need to add comment, put here\n", - "comment = \"\"" + "inputs" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SeqcAda.versionSeqcAda.assaySeqcAda.indexSeqcAda.barcodeFilesSeqcAda.fastqBarcodeSeqcAda.fastqGenomicSeqcAda.filterModeSeqcAda.outputPrefixSeqcAda.starArgumentsSeqcAda.emailSeqcAda.dockerRegistry
ARN-1167_M-4T10.2.11ten_x_v3s3://dp-lab-data/collaborators/aboir...s3://seqc-public/barcodes/ten_x_v3/f...[s3://dp-lab-data/collaborators/aboi...[s3://dp-lab-data/collaborators/aboi...scRNA-seqARN-1167_M-4T1runRNGseed=020noor.sohail@gmail.comquay.io/hisplan
ARN-1167_Normal0.2.11ten_x_v3s3://dp-lab-data/collaborators/aboir...s3://seqc-public/barcodes/ten_x_v3/f...[s3://dp-lab-data/collaborators/aboi...[s3://dp-lab-data/collaborators/aboi...scRNA-seqARN-1167_NormalrunRNGseed=020noor.sohail@gmail.comquay.io/hisplan
ARN-1167_PM-4T10.2.11ten_x_v3s3://dp-lab-data/collaborators/aboir...s3://seqc-public/barcodes/ten_x_v3/f...[s3://dp-lab-data/collaborators/aboi...[s3://dp-lab-data/collaborators/aboi...scRNA-seqARN-1167_PM-4T1runRNGseed=020noor.sohail@gmail.comquay.io/hisplan
\n", + "
" + ], + "text/plain": [ + " SeqcAda.version SeqcAda.assay \\\n", + "ARN-1167_M-4T1 0.2.11 ten_x_v3 \n", + "ARN-1167_Normal 0.2.11 ten_x_v3 \n", + "ARN-1167_PM-4T1 0.2.11 ten_x_v3 \n", + "\n", + " SeqcAda.index \\\n", + "ARN-1167_M-4T1 s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1167_Normal s3://dp-lab-data/collaborators/aboir... \n", + "ARN-1167_PM-4T1 s3://dp-lab-data/collaborators/aboir... \n", + "\n", + " SeqcAda.barcodeFiles \\\n", + "ARN-1167_M-4T1 s3://seqc-public/barcodes/ten_x_v3/f... \n", + "ARN-1167_Normal s3://seqc-public/barcodes/ten_x_v3/f... \n", + "ARN-1167_PM-4T1 s3://seqc-public/barcodes/ten_x_v3/f... \n", + "\n", + " SeqcAda.fastqBarcode \\\n", + "ARN-1167_M-4T1 [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1167_Normal [s3://dp-lab-data/collaborators/aboi... \n", + "ARN-1167_PM-4T1 [s3://dp-lab-data/collaborators/aboi... \n", + "\n", + " SeqcAda.fastqGenomic SeqcAda.filterMode \\\n", + "ARN-1167_M-4T1 [s3://dp-lab-data/collaborators/aboi... scRNA-seq \n", + "ARN-1167_Normal [s3://dp-lab-data/collaborators/aboi... scRNA-seq \n", + "ARN-1167_PM-4T1 [s3://dp-lab-data/collaborators/aboi... 
scRNA-seq \n", + "\n", + " SeqcAda.outputPrefix SeqcAda.starArguments \\\n", + "ARN-1167_M-4T1 ARN-1167_M-4T1 runRNGseed=0 \n", + "ARN-1167_Normal ARN-1167_Normal runRNGseed=0 \n", + "ARN-1167_PM-4T1 ARN-1167_PM-4T1 runRNGseed=0 \n", + "\n", + " SeqcAda.email SeqcAda.dockerRegistry \n", + "ARN-1167_M-4T1 20noor.sohail@gmail.com quay.io/hisplan \n", + "ARN-1167_Normal 20noor.sohail@gmail.com quay.io/hisplan \n", + "ARN-1167_PM-4T1 20noor.sohail@gmail.com quay.io/hisplan " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Locations of workflow-related directories and files\n", - "path_to_cromwell_secrets = f\"{Path.home()}/.cromwell/cromwell-secrets.json\" # CHANGE THIS\n", - "workflow_dir = f\"{Path.home()}/scing/bin/seqc-ada-0.0.3\" # CHANGE THIS\n", - "path_to_exec = f\"{workflow_dir}/submit.sh\" # CHANGE THIS FOR SHARP\n", - "config_dir = f\"{workflow_dir}/configs\"\n", - "path_to_options = f\"{workflow_dir}/{prefix}.options.aws.json\"\n", - "\n", - "# Other file locations\n", - "db_credentials_path = f\"{Path.home()}/.config.json\" # CHANGE THIS" + "# Additional changes\n", + "inputs[f\"{prefix}.index\"] = \"s3://dp-lab-data/collaborators/aboire/LeptomeningealMetHeterogeneity/transgene_reference/refdata-seqc/STAR-index/\"\n", + "inputs" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Set credentials based on SCRIdb CLI config file\n", - "with open(db_credentials_path) as f:\n", - " creds = json.load(f)" - ] + "source": [] }, { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], + "cell_type": "markdown", + "metadata": { + "tags": [] + }, "source": [ - "# Samples on which to run CellRangerATAC\n", - "# Note: Assumes data is transferred to AWS S3 (this should be an s3 location)\n", - "# Note: Assumes directory name is name of sample\n", - "common_dir = 
\"s3://dp-lab-data/collaborators/VanDenBrink\"\n", - "samples = [\n", - " 'Thymic_regeneration/231_no_XRT_rep',\n", - "]\n", - "sample_paths = [\n", - " f\"{common_dir}/{sample}\" for sample in samples\n", - "]" + "## Make label file" ] }, { "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "# Set path to transgene reference S3\n", - "# Note: This is an exceptional case\n", - "# path_to_reference = f\"{common_dir}/transgene_reference/refdata-cellranger/3PS19_SNSEQ-GRCm38-Ensembl-87-transgenes.tar.gz\"\n", - "path_to_reference = \"https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-mm10-2020-A.tar.gz\"" - ] - }, - { - "cell_type": "markdown", + "execution_count": 22, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
ARN-1167_M-4T1NaNNaNNaNNaNNaNNaNNaN
ARN-1167_NormalNaNNaNNaNNaNNaNNaNNaN
ARN-1167_PM-4T1NaNNaNNaNNaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " pipelineType project sample owner destination transfer comment\n", + "ARN-1167_M-4T1 NaN NaN NaN NaN NaN NaN NaN\n", + "ARN-1167_Normal NaN NaN NaN NaN NaN NaN NaN\n", + "ARN-1167_PM-4T1 NaN NaN NaN NaN NaN NaN NaN" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "## Execution" + "# Load minimum inputs and labels fields from labels template\n", + "with open(f\"{config_dir}/{template_prefix}.labels.json\") as f:\n", + " std_labels_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow labels\n", + "labels = pd.DataFrame(index=samples.index, columns=std_labels_fields,)\n", + "labels" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -149,121 +1473,133 @@ " \n", " \n", " \n", - " S3_Path\n", - " Sample_ID\n", - " FASTQs\n", + " pipelineType\n", + " project\n", + " sample\n", + " owner\n", + " destination\n", + " transfer\n", + " comment\n", " \n", " \n", " \n", " \n", - " 231_no_XRT_rep\n", - " s3://dp-lab-data/collaborators/VanDe...\n", - " 2070\n", - " {'All': ['s3://dp-lab-data/collabora...\n", + " ARN-1167_M-4T1\n", + " SeqcAda\n", + " Leptomeningeal metastasis heterogeneity\n", + " ARN-1167_M-4T1\n", + " sohailn\n", + " s3://dp-lab-data/collaborators/aboir...\n", + " -\n", + " sohailn\n", + " \n", + " \n", + " ARN-1167_Normal\n", + " SeqcAda\n", + " Leptomeningeal metastasis heterogeneity\n", + " ARN-1167_Normal\n", + " sohailn\n", + " s3://dp-lab-data/collaborators/aboir...\n", + " -\n", + " sohailn\n", + " \n", + " \n", + " ARN-1167_PM-4T1\n", + " SeqcAda\n", + " Leptomeningeal metastasis heterogeneity\n", + " ARN-1167_PM-4T1\n", + " sohailn\n", + " s3://dp-lab-data/collaborators/aboir...\n", + " -\n", + " sohailn\n", " \n", " \n", "\n", "" ], "text/plain": [ - " S3_Path Sample_ID \\\n", - "231_no_XRT_rep s3://dp-lab-data/collaborators/VanDe... 
2070 \n", + " pipelineType project \\\n", + "ARN-1167_M-4T1 SeqcAda Leptomeningeal metastasis heterogeneity \n", + "ARN-1167_Normal SeqcAda Leptomeningeal metastasis heterogeneity \n", + "ARN-1167_PM-4T1 SeqcAda Leptomeningeal metastasis heterogeneity \n", "\n", - " FASTQs \n", - "231_no_XRT_rep {'All': ['s3://dp-lab-data/collabora... " + " sample owner \\\n", + "ARN-1167_M-4T1 ARN-1167_M-4T1 sohailn \n", + "ARN-1167_Normal ARN-1167_Normal sohailn \n", + "ARN-1167_PM-4T1 ARN-1167_PM-4T1 sohailn \n", + "\n", + " destination transfer comment \n", + "ARN-1167_M-4T1 s3://dp-lab-data/collaborators/aboir... - sohailn \n", + "ARN-1167_Normal s3://dp-lab-data/collaborators/aboir... - sohailn \n", + "ARN-1167_PM-4T1 s3://dp-lab-data/collaborators/aboir... - sohailn " ] }, - "execution_count": 30, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "samples" + "# Annotate labels\n", + "labels[\"pipelineType\"] = pipeline_type\n", + "labels[\"project\"] = samples[\"Sample_ID\"].apply(lambda x: get_project_id(x, creds))\n", + "labels[\"sample\"] = labels.index\n", + "labels[\"owner\"] = creds[\"user\"]\n", + "labels[\"destination\"] = samples['S3_path'] + \"/\" + output_dirname\n", + "labels[\"transfer\"] = \"-\"\n", + "labels[\"comment\"] = creds[\"user\"]\n", + "\n", + "labels" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['231_no_XRT_rep']\n" - ] - } - ], "source": [ - "# Get information for all samples\n", - "sample_paths = [s.strip('/') for s in sample_paths] # remove trailing slash if exists\n", - "sample_names = [os.path.basename(s) for s in sample_paths]\n", - "\n", - "print(sample_names)\n", - "samples = pd.DataFrame(\n", - " sample_paths,\n", - " index=sample_names,\n", - " columns=[\"S3_Path\"],\n", - " 
dtype=str,\n", - ")\n", - "samples[\"Sample_ID\"] = pd.Series(samples.index).apply(\n", - " lambda x: get_sample_id(x, creds['user'], creds['password'])\n", - ").values\n", - "\n", - "# Get FASTQ paths from S3\n", - "# Note: Uses same FASTQ file ids for all samples\n", - "#fastq_file_ids = fastq_map[prefix]\n", - "samples[\"FASTQs\"] = samples[\"S3_Path\"].apply(lambda x: get_fastqs(x, folder=\"barcode\"))\n", - "\n", - "# Get reference genome location\n", - "#samples[\"Reference\"] = samples[\"Sample_ID\"].apply(lambda x: get_cr_reference(x, prefix, creds[\"user\"], creds[\"password\"]))\n", - "#samples[\"Reference\"] = path_to_reference" + "# Submit job" ] }, { "cell_type": "code", - "execution_count": 145, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ - "# Load minimum inputs and labels fields from templates\n", - "with open(f\"{config_dir}/template.inputs.json\") as f:\n", - " std_inputs_fields = list(json.load(f).keys())\n", - " \n", - "with open(f\"{config_dir}/template.labels.json\") as f:\n", - " std_labels_fields = list(json.load(f).keys())\n", - " \n", - "# Annotate all samples with workflow inputs and labels\n", - "inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields,)\n", - "labels = pd.DataFrame(index=samples.index, columns=std_labels_fields,)\n", - "\n", - "# Annotate inputs\n", - "inputs[f\"{prefix}.sampleName\"] = inputs.index # may need to change\n", - "inputs[f\"{prefix}.inputFastq\"] = samples[\"FASTQs\"].apply(lambda x: np.ravel(list(x.values())))\n", - "inputs[f\"{prefix}.fastqName\"] = inputs[f\"{prefix}.inputFastq\"].apply(lambda x: get_fastqs_name(x))\n", - "inputs[f\"{prefix}.referenceUrl\"] = samples[\"Reference\"] \n", - "inputs[f\"{prefix}.includeIntrons\"] = False\n", - "inputs[f\"{prefix}.expectCells\"] = 5000\n", - "inputs[f\"{prefix}.memory\"] = 256\n", - "inputs[f\"{prefix}.dockerRegistry\"] = common_docker_registry\n", - "\n", - "# Annotate labels\n", - "labels[\"pipelineType\"] = 
pipeline_type\n", - "labels[\"project\"] = samples[\"Sample_ID\"].apply(lambda x: get_project_id(x, creds[\"user\"], creds[\"password\"]))\n", - "labels[\"sample\"] = labels.index\n", - "labels[\"owner\"] = creds[\"user\"]\n", - "labels[\"destination\"] = samples['S3_Path'] + \"/\" + output_dirname\n", - "labels[\"transfer\"] = \"-\"\n", - "labels[\"comment\"] = creds[\"user\"]\n", - "\n", - "assert (std_inputs_fields == list(inputs.columns)) & (inputs.notna().values.all())\n", - "assert (std_labels_fields == list(labels.columns)) & (labels.notna().values.all())" + "inputs_all = inputs.copy()\n", + "labels_all = labels.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ARN-1167_M-4T1', 'ARN-1167_Normal', 'ARN-1167_PM-4T1']" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_names = inputs.index.tolist()\n", + "sample_names" ] }, { "cell_type": "code", - "execution_count": 146, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -287,26 +1623,32 @@ " \n", " \n", " \n", - " CellRangerGex.sampleName\n", - " CellRangerGex.fastqName\n", - " CellRangerGex.inputFastq\n", - " CellRangerGex.referenceUrl\n", - " CellRangerGex.includeIntrons\n", - " CellRangerGex.expectCells\n", - " CellRangerGex.memory\n", - " CellRangerGex.dockerRegistry\n", + " SeqcAda.version\n", + " SeqcAda.assay\n", + " SeqcAda.index\n", + " SeqcAda.barcodeFiles\n", + " SeqcAda.fastqBarcode\n", + " SeqcAda.fastqGenomic\n", + " SeqcAda.filterMode\n", + " SeqcAda.outputPrefix\n", + " SeqcAda.starArguments\n", + " SeqcAda.email\n", + " SeqcAda.dockerRegistry\n", " \n", " \n", " \n", " \n", - " SU-1358_C10_T2_on_treatment\n", - " SU-1358_C10_T2_on_treatment\n", - " 3447_SU-1358_C10_T2_on_treatment_IGO...\n", - " [s3://dp-lab-data/sc-seq/Project_124...\n", - " https://cf.10xgenomics.com/supp/cell...\n", - " False\n", - " 5000\n", - " 256\n", + " 
ARN-1167_M-4T1\n", + " 0.2.11\n", + " ten_x_v3\n", + " s3://dp-lab-data/collaborators/aboir...\n", + " s3://seqc-public/barcodes/ten_x_v3/f...\n", + " [s3://dp-lab-data/collaborators/aboi...\n", + " [s3://dp-lab-data/collaborators/aboi...\n", + " scRNA-seq\n", + " ARN-1167_M-4T1\n", + " runRNGseed=0\n", + " 20noor.sohail@gmail.com\n", " quay.io/hisplan\n", " \n", " \n", @@ -314,40 +1656,41 @@ "" ], "text/plain": [ - " CellRangerGex.sampleName \\\n", - "SU-1358_C10_T2_on_treatment SU-1358_C10_T2_on_treatment \n", + " SeqcAda.version SeqcAda.assay \\\n", + "ARN-1167_M-4T1 0.2.11 ten_x_v3 \n", "\n", - " CellRangerGex.fastqName \\\n", - "SU-1358_C10_T2_on_treatment 3447_SU-1358_C10_T2_on_treatment_IGO... \n", + " SeqcAda.index \\\n", + "ARN-1167_M-4T1 s3://dp-lab-data/collaborators/aboir... \n", "\n", - " CellRangerGex.inputFastq \\\n", - "SU-1358_C10_T2_on_treatment [s3://dp-lab-data/sc-seq/Project_124... \n", + " SeqcAda.barcodeFiles \\\n", + "ARN-1167_M-4T1 s3://seqc-public/barcodes/ten_x_v3/f... \n", "\n", - " CellRangerGex.referenceUrl \\\n", - "SU-1358_C10_T2_on_treatment https://cf.10xgenomics.com/supp/cell... \n", + " SeqcAda.fastqBarcode \\\n", + "ARN-1167_M-4T1 [s3://dp-lab-data/collaborators/aboi... \n", "\n", - " CellRangerGex.includeIntrons \\\n", - "SU-1358_C10_T2_on_treatment False \n", + " SeqcAda.fastqGenomic SeqcAda.filterMode \\\n", + "ARN-1167_M-4T1 [s3://dp-lab-data/collaborators/aboi... 
scRNA-seq \n", "\n", - " CellRangerGex.expectCells CellRangerGex.memory \\\n", - "SU-1358_C10_T2_on_treatment 5000 256 \n", + " SeqcAda.outputPrefix SeqcAda.starArguments \\\n", + "ARN-1167_M-4T1 ARN-1167_M-4T1 runRNGseed=0 \n", "\n", - " CellRangerGex.dockerRegistry \n", - "SU-1358_C10_T2_on_treatment quay.io/hisplan " + " SeqcAda.email SeqcAda.dockerRegistry \n", + "ARN-1167_M-4T1 20noor.sohail@gmail.com quay.io/hisplan " ] }, - "execution_count": 146, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "inputs = pd.DataFrame(inputs_all.loc[sample_names[0]]).T\n", "inputs" ] }, { "cell_type": "code", - "execution_count": 149, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -382,53 +1725,49 @@ " \n", " \n", " \n", - " SU-1358_C10_T2_on_treatment\n", - " CellRangerGex\n", - " POLAR\n", - " SU-1358_C10_T2_on_treatment\n", - " moormana\n", - " s3://dp-lab-data/sc-seq/Project_1243...\n", + " ARN-1167_M-4T1\n", + " SeqcAda\n", + " Leptomeningeal metastasis heterogeneity\n", + " ARN-1167_M-4T1\n", + " sohailn\n", + " s3://dp-lab-data/collaborators/aboir...\n", " -\n", - " moormana\n", + " sohailn\n", " \n", " \n", "\n", "" ], "text/plain": [ - " pipelineType project \\\n", - "SU-1358_C10_T2_on_treatment CellRangerGex POLAR \n", + " pipelineType project \\\n", + "ARN-1167_M-4T1 SeqcAda Leptomeningeal metastasis heterogeneity \n", "\n", - " sample owner \\\n", - "SU-1358_C10_T2_on_treatment SU-1358_C10_T2_on_treatment moormana \n", + " sample owner \\\n", + "ARN-1167_M-4T1 ARN-1167_M-4T1 sohailn \n", "\n", - " destination transfer \\\n", - "SU-1358_C10_T2_on_treatment s3://dp-lab-data/sc-seq/Project_1243... - \n", - "\n", - " comment \n", - "SU-1358_C10_T2_on_treatment moormana " + " destination transfer comment \n", + "ARN-1167_M-4T1 s3://dp-lab-data/collaborators/aboir... 
- sohailn " ] }, - "execution_count": 149, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "labels = pd.DataFrame(labels_all.loc[sample_names[0]]).T\n", "labels" ] }, { "cell_type": "code", - "execution_count": 150, - "metadata": { - "tags": [] - }, + "execution_count": 28, + "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c261084afa9f437daabaecb8054fea46", + "model_id": "919e1efd3ca045b1a50f15ecc4b5432e", "version_major": 2, "version_minor": 0 }, @@ -441,26 +1780,29 @@ } ], "source": [ + "assert (std_inputs_fields == list(inputs.columns)) & (inputs.notna().values.all())\n", + "assert (std_labels_fields == list(labels.columns)) & (labels.notna().values.all())\n", + "\n", "stdouts = [] # to store all outputs\n", "process = True\n", "\n", - "with tqdm(samples.index) as t:\n", + "with tqdm(inputs.index) as t:\n", "\n", " for sample_name in t:\n", "\n", " # Write inputs and labels to file\n", - " path_to_inputs = f\"{config_dir}/{sample_name}.inputs.json\"\n", + " path_to_inputs = f\"{config_dir}/{sample_name}_{prefix}.inputs.json\"\n", " with open(path_to_inputs, \"w\") as f_inputs:\n", " json.dump(inputs.loc[sample_name].to_dict(), f_inputs, indent=4, cls=NpEncoder)\n", "\n", - " path_to_labels = f\"{config_dir}/{sample_name}.labels.json\"\n", + " path_to_labels = f\"{config_dir}/{sample_name}_{prefix}.labels.json\"\n", " with open(path_to_labels, \"w\") as f_labels:\n", " json.dump(labels.loc[sample_name].to_dict(), f_labels, indent=4, cls=NpEncoder)\n", "\n", " if process:\n", " stdouts.append(run(\n", " workflow_path = workflow_dir,\n", - " execp = \"submit.sh\",\n", + " execp = path_to_exec,\n", " secrets = path_to_cromwell_secrets,\n", " inputs = path_to_inputs,\n", " labels = path_to_labels,\n", @@ -470,27 +1812,46 @@ }, { "cell_type": "code", - "execution_count": 151, + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "/Users/sohailn/scing/bin/seqc-ada-0.0.4/configs/ARN-1167_M-4T1_SeqcAda.inputs.json\n", + "/Users/sohailn/scing/bin/seqc-ada-0.0.4/configs/ARN-1167_M-4T1_SeqcAda.labels.json\n" + ] + } + ], + "source": [ + "print(path_to_inputs)\n", + "print(path_to_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[{'args': ['/Users/moormana/scing/bin/cellranger-gex-6.1.2/submit.sh',\n", + "[{'args': ['/Users/sohailn/scing/bin/seqc-ada-0.0.4/submit.sh',\n", " '-k',\n", - " '/Users/moormana/.cromwell/cromwell-secrets.json',\n", + " '/Users/sohailn/.cromwell/cromwell-secrets.json',\n", " '-i',\n", - " '/Users/moormana/scing/bin/cellranger-gex-6.1.2/configs/SU-1358_C10_T2_on_treatment.inputs.json',\n", + " '/Users/sohailn/scing/bin/seqc-ada-0.0.4/configs/ARN-1167_M-4T1_SeqcAda.inputs.json',\n", " '-l',\n", - " '/Users/moormana/scing/bin/cellranger-gex-6.1.2/configs/SU-1358_C10_T2_on_treatment.labels.json',\n", + " '/Users/sohailn/scing/bin/seqc-ada-0.0.4/configs/ARN-1167_M-4T1_SeqcAda.labels.json',\n", " '-o',\n", - " '/Users/moormana/scing/bin/cellranger-gex-6.1.2/CellRangerGex.options.aws.json'],\n", + " '/Users/sohailn/scing/bin/seqc-ada-0.0.4/SeqcAda.options.aws.json'],\n", " 'returncode': 0,\n", - " 'stdout': '{\"id\":\"1931681b-646b-4ba9-ac7e-01816b9d30cc\",\"status\":\"Submitted\"}\\n',\n", + " 'stdout': '{\"id\":\"d7016156-8b79-4e9c-9f28-eaeaefb4f2d5\",\"status\":\"Submitted\"}\\n',\n", " 'stderr': ''}]" ] }, - "execution_count": 151, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -498,13 +1859,6 @@ "source": [ "stdouts" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/notebooks/Run_Sharp.ipynb b/notebooks/Run_Sharp.ipynb index 9e87325..7da1bd4 100644 --- a/notebooks/Run_Sharp.ipynb +++ b/notebooks/Run_Sharp.ipynb @@ -2,7 +2,7 @@ 
"cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -19,7 +19,6 @@ { "cell_type": "markdown", "metadata": { - "jp-MarkdownHeadingCollapsed": true, "tags": [], "toc-hr-collapsed": true }, @@ -29,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 45, "metadata": { "tags": [] }, @@ -59,7 +58,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -73,7 +72,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -102,7 +101,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 48, "metadata": { "tags": [] }, @@ -150,7 +149,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 49, "metadata": { "tags": [] }, @@ -196,7 +195,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 50, "metadata": { "tags": [] }, @@ -223,7 +222,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ @@ -241,7 +240,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -282,7 +281,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ @@ -302,7 +301,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ @@ -321,7 +320,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 55, "metadata": { "tags": [] }, @@ -344,7 +343,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 56, "metadata": { "tags": [] }, @@ -365,7 +364,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 57, "metadata": { "tags": [] }, @@ -396,7 +395,7 @@ }, { "cell_type": "code", - 
"execution_count": 15, + "execution_count": 58, "metadata": { "tags": [] }, @@ -427,7 +426,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 59, "metadata": { "tags": [] }, @@ -452,7 +451,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 60, "metadata": { "tags": [] }, @@ -480,7 +479,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ @@ -496,7 +495,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ @@ -517,7 +516,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ @@ -536,7 +535,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ @@ -555,7 +554,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 65, "metadata": { "tags": [] }, @@ -586,7 +585,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 66, "metadata": { "tags": [] }, @@ -622,7 +621,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 67, "metadata": { "tags": [] }, @@ -649,7 +648,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 68, "metadata": { "tags": [] }, @@ -723,7 +722,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ @@ -748,7 +747,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -765,7 +764,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ @@ -775,13 +774,13 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "# Locations of workflow-related directories and files\n", 
"path_to_cromwell_secrets = f\"{Path.home()}/.cromwell/cromwell-secrets.json\" # CHANGE THIS\n", - "workflow_dir = f\"{Path.home()}/scing/bin/sharp-0.0.13\" # CHANGE THIS\n", + "workflow_dir = f\"{Path.home()}/scing/bin/sharp-0.1.1\" # CHANGE THIS\n", "path_to_exec = sh_files[prefix]\n", "config_dir = f\"{workflow_dir}/configs\"\n", "path_to_options = f\"{workflow_dir}/{options_prefix}.options.aws.json\"\n", @@ -793,7 +792,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ @@ -804,7 +803,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ @@ -813,11 +812,54 @@ "# - Assumes data is transferred to AWS s3 (this should be an s3 location)\n", "# - Assumes directory name is also name of sample\n", "# - Workflows above will be run on all samples below\n", - "sample_paths = [\n", - " \"s3://dp-lab-data/SCRI_Projects/HTAN_CITEseq/CI210127_CD45pos_citeseq_CITE\",\n", - "]" + "# sample_paths = [\n", + "# \"s3://dp-lab-data/SCRI_Projects/HTAN_CITEseq/CI210127_CD45pos_citeseq_CITE\",\n", + "# ]" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: AWS_ACCESS_KEY_ID=ASIAYPY66CWMKHJQGBB4\n", + "env: AWS_SECRET_ACCESS_KEY=0nbHN00aFVulHp4+YCSy0RvhgCGM727gYP3RGqSZ\n", + "env: 
AWS_SESSION_TOKEN=IQoJb3JpZ2luX2VjEGgaCXVzLWVhc3QtMSJIMEYCIQDxy4jpyfEB/jM9OAPA8MaqUXoBwb85rfdmM2jnCMsbLQIhAK1wUjV40LfkU8YVMww7NEXIwHxZ0EgOHYguj2eMFfJxKvgBCMD//////////wEQAxoMNTgzNjQzNTY3NTEyIgyXMJrVvZpPwzJN68EqzAEvkprlhoEsf/wIOuRrDT2dxOc8apURXE7FxyGu21YIeQ1uWx70qKA6oWYjMzLvihBS5hc7h+LeDbfKH6tnA93f2L/X5gzbDkFjGZAETOMhvbkwuco3Ly7120Maf7BzxFhI2icfYAaMUUTMcazjA/Pvg6nGsAckCvDAjJYHCYqlCtsWBZp4h/6qKdoCUttuW1zBzMYhMLkaehJHVn0XJRT4km41FSjc+tUMwP/n6qLg9Wm8201qRdjQo2Cn8mUHzLaJ4doyXKlibIflvWgwq+a0lwY6lwG2EC5bGDs0YEb0+XMzFdKcct+fiPnnqgxZd/i3lUhLxv1imL6GP8Hu3toPJgkazFpoPN5+iCBOTavcSTnlKWSzX6+TGtJta85foMoS7+N9qTwem3MJBKZoW8I5kLcmBtMcqig6x3x+NZOHox6xrG/ddwngg1sEc5tzkozICNL9RqzW8HH8ZJqqQ6BQySfbKFoj3u5F0EBm\n", + "2021-10-07 15:31:32 agc-583643567512-us-east-1\n", + "2021-10-07 15:28:07 cdktoolkit-stagingbucket-d49u1xfb0sc4\n", + "2019-10-09 13:04:06 cf-templates-umiwbnq3566w-us-east-1\n", + "2019-10-10 12:46:54 dp-daily-reports\n", + "2017-09-26 10:15:15 dp-lab-data\n", + "2019-05-23 12:34:58 dp-lab-data-public\n", + "2018-03-01 13:29:07 dp-lab-glacier\n", + "2020-11-06 23:02:27 dp-lab-gwf-core\n", + "2021-07-15 18:14:01 dp-lab-gwf-core2\n", + "2017-09-26 10:17:18 dp-lab-home\n", + "2019-08-14 16:42:43 dp-lab-test\n", + "2019-04-25 12:35:35 elasticbeanstalk-us-east-1-583643567512\n", + "2019-12-13 15:53:57 scri-computational\n", + "2017-09-26 10:23:50 seqc-public\n" + ] + } + ], + "source": [ + "%env AWS_ACCESS_KEY_ID=ASIAYPY66CWMKHJQGBB4\n", + "%env AWS_SECRET_ACCESS_KEY=0nbHN00aFVulHp4+YCSy0RvhgCGM727gYP3RGqSZ\n", + "%env 
AWS_SESSION_TOKEN=IQoJb3JpZ2luX2VjEGgaCXVzLWVhc3QtMSJIMEYCIQDxy4jpyfEB/jM9OAPA8MaqUXoBwb85rfdmM2jnCMsbLQIhAK1wUjV40LfkU8YVMww7NEXIwHxZ0EgOHYguj2eMFfJxKvgBCMD//////////wEQAxoMNTgzNjQzNTY3NTEyIgyXMJrVvZpPwzJN68EqzAEvkprlhoEsf/wIOuRrDT2dxOc8apURXE7FxyGu21YIeQ1uWx70qKA6oWYjMzLvihBS5hc7h+LeDbfKH6tnA93f2L/X5gzbDkFjGZAETOMhvbkwuco3Ly7120Maf7BzxFhI2icfYAaMUUTMcazjA/Pvg6nGsAckCvDAjJYHCYqlCtsWBZp4h/6qKdoCUttuW1zBzMYhMLkaehJHVn0XJRT4km41FSjc+tUMwP/n6qLg9Wm8201qRdjQo2Cn8mUHzLaJ4doyXKlibIflvWgwq+a0lwY6lwG2EC5bGDs0YEb0+XMzFdKcct+fiPnnqgxZd/i3lUhLxv1imL6GP8Hu3toPJgkazFpoPN5+iCBOTavcSTnlKWSzX6+TGtJta85foMoS7+N9qTwem3MJBKZoW8I5kLcmBtMcqig6x3x+NZOHox6xrG/ddwngg1sEc5tzkozICNL9RqzW8HH8ZJqqQ6BQySfbKFoj3u5F0EBm\n", + "!aws s3 ls" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": { @@ -838,30 +880,333 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ - "# Get information for all samples\n", - "sample_paths = [s.strip('/') for s in sample_paths] # remove trailing slash if exists\n", - "sample_names = [os.path.basename(s) for s in sample_paths]\n", - "sample_names = [re.match(r'(.*)_.+$', s)[1] for s in sample_names] # remove library suffix (e.g. 
_CITE, _HTO, etc.)\n", - "# TODO: assert basename is in peer_lab_db.sample_data.Sample\n", - "# assert(all(check_sample_name(s) for s in sample_names))\n", - "samples = pd.DataFrame(\n", - " sample_paths,\n", - " index=sample_names,\n", - " columns=[\"S3_Path\"],\n", - " dtype=str,\n", - ")\n", + "# Common query col: id, request_id, Sample\n", + "def get_sample_name(query, query_col, creds):\n", + " \n", + " user = creds['user']\n", + " password = creds['password']\n", + " \n", + " try:\n", + " table_sample_data = \"peer_lab_db.sample_data\"\n", + " query = f\"\"\"\n", + " SELECT {table_sample_data}.Sample\n", + " FROM {table_sample_data}\n", + " WHERE {table_sample_data}.{query_col}=\"{query}\"\n", + " \"\"\"\n", + " sample_names = []\n", + " results = execute_query(query, user, password)\n", + " for result in results:\n", + " sample_names.append(result[0])\n", + " return sample_names\n", + " except Error as e:\n", + " print(f\"Error: {e}\")\n", + " \n", + " \n", + "def get_aws_path(query, query_col, creds):\n", + " \n", + " user = creds['user']\n", + " password = creds['password']\n", + " \n", + " try:\n", + " table_sample_data = \"peer_lab_db.sample_data\"\n", + " query = f\"\"\"\n", + " SELECT {table_sample_data}.AWS_storage\n", + " FROM {table_sample_data}\n", + " WHERE {table_sample_data}.{query_col}=\"{query}\"\n", + " \"\"\"\n", + " sample_paths = []\n", + " results = execute_query(query, user, password)\n", + " for result in results:\n", + " sample_paths.append(result[0])\n", + " return sample_paths\n", + " except Error as e:\n", + " print(f\"Error: {e}\")\n", + " \n", + " \n", + "def get_sample_id(query, query_col, creds):\n", + " \n", + " user = creds['user']\n", + " password = creds['password']\n", + " \n", + " try:\n", + " table_sample_data = \"peer_lab_db.sample_data\"\n", + " query = f\"\"\"\n", + " SELECT {table_sample_data}.id\n", + " FROM {table_sample_data}\n", + " WHERE {table_sample_data}.{query_col}=\"{query}\"\n", + " \"\"\"\n", + " 
sample_ids = []\n", + " results = execute_query(query, user, password)\n", + " for result in results:\n", + " sample_ids.append(result[0])\n", + " return sample_ids\n", + " except Error as e:\n", + " print(f\"Error: {e}\")\n", + " \n", + "def format_sample_aws(querys, query_col, creds):\n", + " sample_names = []\n", + " sample_paths = []\n", + " sample_ids = []\n", + " \n", + " for query in querys:\n", + " sample_names += get_sample_name(query, query_col, creds)\n", + " sample_paths += get_aws_path(query, query_col, creds)\n", + " sample_ids += get_sample_id(query, query_col, creds)\n", + " \n", + " sample_paths = [s.strip('/') for s in sample_paths] # remove trailing slash if exists\n", + " \n", + " samples = pd.DataFrame(\n", + " dict(S3_path=sample_paths, Sample_ID=sample_ids),\n", + " index=sample_names,\n", + " dtype=str,\n", + " )\n", + " return samples" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
S3_pathSample_ID
SV-1721_SV_LN11s3://dp-lab-data/collaborators/vardh...3947
SV-1721_SV_LN12s3://dp-lab-data/collaborators/vardh...3948
SV-1723_SV_LN1s3://dp-lab-data/collaborators/vardh...3949
SV-1723_SV_LN2s3://dp-lab-data/collaborators/vardh...3950
SV-1723_SV_LN3s3://dp-lab-data/collaborators/vardh...3951
\n", + "
" + ], + "text/plain": [ + " S3_path Sample_ID\n", + "SV-1721_SV_LN11 s3://dp-lab-data/collaborators/vardh... 3947\n", + "SV-1721_SV_LN12 s3://dp-lab-data/collaborators/vardh... 3948\n", + "SV-1723_SV_LN1 s3://dp-lab-data/collaborators/vardh... 3949\n", + "SV-1723_SV_LN2 s3://dp-lab-data/collaborators/vardh... 3950\n", + "SV-1723_SV_LN3 s3://dp-lab-data/collaborators/vardh... 3951" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Common query col: id, request_id, Sample\n", + "sample_id = list(range(3947, 3952))\n", + "\n", + "samples = format_sample_aws(sample_id, 'id', creds)\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "samples = samples.rename(columns={'S3_path': 'S3_Path'})\n", + "if prefix == 'Hashtag':\n", + " samples['S3_Path'] += '_HTO'" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
S3_PathSample_IDFASTQsWhitelist_ParamsBarcode_ParamsBarcodes
SV-1721_SV_LN11s3://dp-lab-data/collaborators/vardh...3947{'R1': ['s3://dp-lab-data/collaborat...{'uri': 's3://dp-lab-data/collaborat...{'cb': 16, 'umi': 28, 'conjugation':...[(GTCAACTCTTTAGCG, C0251, Pt19_Tumor...
SV-1721_SV_LN12s3://dp-lab-data/collaborators/vardh...3948{'R1': ['s3://dp-lab-data/collaborat...{'uri': 's3://dp-lab-data/collaborat...{'cb': 16, 'umi': 28, 'conjugation':...[(GTCAACTCTTTAGCG, C0251, Pt60_LN, 1...
SV-1723_SV_LN1s3://dp-lab-data/collaborators/vardh...3949{'R1': ['s3://dp-lab-data/collaborat...{'uri': 's3://dp-lab-data/collaborat...{'cb': 16, 'umi': 28, 'conjugation':...[(GTCAACTCTTTAGCG, C0251, Pt37_A, 10...
SV-1723_SV_LN2s3://dp-lab-data/collaborators/vardh...3950{'R1': ['s3://dp-lab-data/collaborat...{'uri': 's3://dp-lab-data/collaborat...{'cb': 16, 'umi': 28, 'conjugation':...[(GTCAACTCTTTAGCG, C0251, Pt33_B, 10...
SV-1723_SV_LN3s3://dp-lab-data/collaborators/vardh...3951{'R1': ['s3://dp-lab-data/collaborat...{'uri': 's3://dp-lab-data/collaborat...{'cb': 16, 'umi': 28, 'conjugation':...[(GTCAACTCTTTAGCG, C0251, Pt42_A, 10...
\n", + "
" + ], + "text/plain": [ + " S3_Path Sample_ID \\\n", + "SV-1721_SV_LN11 s3://dp-lab-data/collaborators/vardh... 3947 \n", + "SV-1721_SV_LN12 s3://dp-lab-data/collaborators/vardh... 3948 \n", + "SV-1723_SV_LN1 s3://dp-lab-data/collaborators/vardh... 3949 \n", + "SV-1723_SV_LN2 s3://dp-lab-data/collaborators/vardh... 3950 \n", + "SV-1723_SV_LN3 s3://dp-lab-data/collaborators/vardh... 3951 \n", + "\n", + " FASTQs \\\n", + "SV-1721_SV_LN11 {'R1': ['s3://dp-lab-data/collaborat... \n", + "SV-1721_SV_LN12 {'R1': ['s3://dp-lab-data/collaborat... \n", + "SV-1723_SV_LN1 {'R1': ['s3://dp-lab-data/collaborat... \n", + "SV-1723_SV_LN2 {'R1': ['s3://dp-lab-data/collaborat... \n", + "SV-1723_SV_LN3 {'R1': ['s3://dp-lab-data/collaborat... \n", + "\n", + " Whitelist_Params \\\n", + "SV-1721_SV_LN11 {'uri': 's3://dp-lab-data/collaborat... \n", + "SV-1721_SV_LN12 {'uri': 's3://dp-lab-data/collaborat... \n", + "SV-1723_SV_LN1 {'uri': 's3://dp-lab-data/collaborat... \n", + "SV-1723_SV_LN2 {'uri': 's3://dp-lab-data/collaborat... \n", + "SV-1723_SV_LN3 {'uri': 's3://dp-lab-data/collaborat... \n", + "\n", + " Barcode_Params \\\n", + "SV-1721_SV_LN11 {'cb': 16, 'umi': 28, 'conjugation':... \n", + "SV-1721_SV_LN12 {'cb': 16, 'umi': 28, 'conjugation':... \n", + "SV-1723_SV_LN1 {'cb': 16, 'umi': 28, 'conjugation':... \n", + "SV-1723_SV_LN2 {'cb': 16, 'umi': 28, 'conjugation':... \n", + "SV-1723_SV_LN3 {'cb': 16, 'umi': 28, 'conjugation':... \n", + "\n", + " Barcodes \n", + "SV-1721_SV_LN11 [(GTCAACTCTTTAGCG, C0251, Pt19_Tumor... \n", + "SV-1721_SV_LN12 [(GTCAACTCTTTAGCG, C0251, Pt60_LN, 1... \n", + "SV-1723_SV_LN1 [(GTCAACTCTTTAGCG, C0251, Pt37_A, 10... \n", + "SV-1723_SV_LN2 [(GTCAACTCTTTAGCG, C0251, Pt33_B, 10... \n", + "SV-1723_SV_LN3 [(GTCAACTCTTTAGCG, C0251, Pt42_A, 10... 
" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# # Get information for all samples\n", + "# sample_paths = [s.strip('/') for s in sample_paths] # remove trailing slash if exists\n", + "# sample_names = [os.path.basename(s) for s in sample_paths]\n", + "# sample_names = [re.match(r'(.*)_.+$', s)[1] for s in sample_names] # remove library suffix (e.g. _CITE, _HTO, etc.)\n", + "# # TODO: assert basename is in peer_lab_db.sample_data.Sample\n", + "# # assert(all(check_sample_name(s) for s in sample_names))\n", + "# samples = pd.DataFrame(\n", + "# sample_paths,\n", + "# index=sample_names,\n", + "# columns=[\"S3_Path\"],\n", + "# dtype=str,\n", + "# )\n", "# Get FASTQ paths from S3\n", "# Note: Uses same FASTQ file ids for all samples\n", "fastq_file_ids = fastq_map[prefix]\n", "samples[\"FASTQs\"] = samples[\"S3_Path\"].apply(lambda x: get_fastqs(x, fastq_file_ids))\n", "\n", - "samples[\"Sample_ID\"] = pd.Series(samples.index).apply(\n", - " lambda x: get_sample_id(x, creds['user'], creds['password'])\n", - ").values\n", + "# samples[\"Sample_ID\"] = pd.Series(samples.index).apply(\n", + "# lambda x: get_sample_id(x, creds['user'], creds['password'])\n", + "# ).values\n", "samples[\"Whitelist_Params\"] = samples['Sample_ID'].apply(\n", " lambda x: get_wl_params(x, creds['user'], creds['password'])\n", ")\n", @@ -870,23 +1215,32 @@ ")\n", "samples[\"Barcodes\"] = samples['Sample_ID'].apply(\n", " lambda x: get_bcs(x, creds['user'], creds['password'])\n", - ")" + ")\n", + "\n", + "samples" ] }, { "cell_type": "code", - "execution_count": 64, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "45fece038e5d4f05b3c125dfea249457", + "model_id": "fa6d1f8820a443d7b22bfecedb97f2e1", "version_major": 2, "version_minor": 0 }, 
"text/plain": [ - " 0%| | 0/1 [00:00\n", " \n", " \n", - " CiteSeq.uriFastqR1\n", - " CiteSeq.uriFastqR2\n", - " CiteSeq.sampleName\n", - " CiteSeq.scRnaSeqPlatform\n", - " CiteSeq.lengthR1\n", - " CiteSeq.lengthR2\n", - " CiteSeq.cellBarcodeWhitelistUri\n", - " CiteSeq.cellBarcodeWhiteListMethod\n", - " CiteSeq.tagList\n", - " CiteSeq.cbStartPos\n", + " Hashtag.uriFastqR1\n", + " Hashtag.uriFastqR2\n", + " Hashtag.sampleName\n", + " Hashtag.scRnaSeqPlatform\n", + " Hashtag.lengthR1\n", + " Hashtag.lengthR2\n", + " Hashtag.cellBarcodeWhitelistUri\n", + " Hashtag.cellBarcodeWhiteListMethod\n", + " Hashtag.hashTagList\n", + " Hashtag.cbStartPos\n", " ...\n", - " CiteSeq.umiStartPos\n", - " CiteSeq.umiEndPos\n", - " CiteSeq.trimPos\n", - " CiteSeq.slidingWindowSearch\n", - " CiteSeq.translate10XBarcodes\n", - " CiteSeq.cbCollapsingDistance\n", - " CiteSeq.umiCollapsingDistance\n", - " CiteSeq.numExpectedCells\n", - " CiteSeq.resourceSpec\n", - " CiteSeq.dockerRegistry\n", + " Hashtag.umiEndPos\n", + " Hashtag.trimPos\n", + " Hashtag.slidingWindowSearch\n", + " Hashtag.translate10XBarcodes\n", + " Hashtag.cbCollapsingDistance\n", + " Hashtag.umiCollapsingDistance\n", + " Hashtag.numExpectedCells\n", + " Hashtag.minCount\n", + " Hashtag.resourceSpec\n", + " Hashtag.dockerRegistry\n", " \n", " \n", " \n", " \n", - " CI210127_CD45pos_citeseq\n", - " [s3://dp-lab-data/SCRI_Projects/HTAN...\n", - " [s3://dp-lab-data/SCRI_Projects/HTAN...\n", - " CI210127_CD45pos_citeseq\n", + " SV-1721_SV_LN11\n", + " [s3://dp-lab-data/collaborators/vard...\n", + " [s3://dp-lab-data/collaborators/vard...\n", + " SV-1721_SV_LN11\n", " 10x\n", " 28\n", - " 15\n", - " s3://dp-lab-data/SCRI_Projects/HTAN_...\n", - " SeqcDenseCountsMatrixCsv\n", - " s3://dp-lab-data/SCRI_Projects/HTAN_...\n", + " 25\n", + " s3://dp-lab-data/collaborators/vardh...\n", + " 10x\n", + " s3://dp-lab-data/collaborators/vardh...\n", " 1\n", " ...\n", - " 17\n", " 28\n", + " 10\n", + " False\n", + " False\n", + " 1\n", 
+ " 1\n", " 0\n", + " 10\n", + " {'cpu': 32, 'memory': -1}\n", + " quay.io/hisplan\n", + " \n", + " \n", + " SV-1721_SV_LN12\n", + " [s3://dp-lab-data/collaborators/vard...\n", + " [s3://dp-lab-data/collaborators/vard...\n", + " SV-1721_SV_LN12\n", + " 10x\n", + " 28\n", + " 25\n", + " s3://dp-lab-data/collaborators/vardh...\n", + " 10x\n", + " s3://dp-lab-data/collaborators/vardh...\n", + " 1\n", + " ...\n", + " 28\n", + " 10\n", " False\n", " False\n", " 1\n", " 1\n", " 0\n", + " 10\n", " {'cpu': 32, 'memory': -1}\n", " quay.io/hisplan\n", " \n", - " \n", - "\n", - "

1 rows × 21 columns

\n", - "" - ], - "text/plain": [ - " CiteSeq.uriFastqR1 \\\n", - "CI210127_CD45pos_citeseq [s3://dp-lab-data/SCRI_Projects/HTAN... \n", - "\n", - " CiteSeq.uriFastqR2 \\\n", - "CI210127_CD45pos_citeseq [s3://dp-lab-data/SCRI_Projects/HTAN... \n", - "\n", - " CiteSeq.sampleName CiteSeq.scRnaSeqPlatform \\\n", - "CI210127_CD45pos_citeseq CI210127_CD45pos_citeseq 10x \n", - "\n", - " CiteSeq.lengthR1 CiteSeq.lengthR2 \\\n", - "CI210127_CD45pos_citeseq 28 15 \n", - "\n", - " CiteSeq.cellBarcodeWhitelistUri \\\n", - "CI210127_CD45pos_citeseq s3://dp-lab-data/SCRI_Projects/HTAN_... \n", - "\n", - " CiteSeq.cellBarcodeWhiteListMethod \\\n", - "CI210127_CD45pos_citeseq SeqcDenseCountsMatrixCsv \n", - "\n", - " CiteSeq.tagList \\\n", - "CI210127_CD45pos_citeseq s3://dp-lab-data/SCRI_Projects/HTAN_... \n", - "\n", - " CiteSeq.cbStartPos ... CiteSeq.umiStartPos \\\n", - "CI210127_CD45pos_citeseq 1 ... 17 \n", - "\n", - " CiteSeq.umiEndPos CiteSeq.trimPos \\\n", - "CI210127_CD45pos_citeseq 28 0 \n", - "\n", - " CiteSeq.slidingWindowSearch \\\n", - "CI210127_CD45pos_citeseq False \n", - "\n", - " CiteSeq.translate10XBarcodes \\\n", - "CI210127_CD45pos_citeseq False \n", - "\n", - " CiteSeq.cbCollapsingDistance \\\n", - "CI210127_CD45pos_citeseq 1 \n", - "\n", - " CiteSeq.umiCollapsingDistance \\\n", - "CI210127_CD45pos_citeseq 1 \n", - "\n", - " CiteSeq.numExpectedCells CiteSeq.resourceSpec \\\n", - "CI210127_CD45pos_citeseq 0 {'cpu': 32, 'memory': -1} \n", - "\n", - " CiteSeq.dockerRegistry \n", - "CI210127_CD45pos_citeseq quay.io/hisplan \n", - "\n", - "[1 rows x 21 columns]" - ] - }, - "execution_count": 68, - "metadata": {}, - "output_type": "execute_result" - } + " \n", + " SV-1723_SV_LN1\n", + " [s3://dp-lab-data/collaborators/vard...\n", + " [s3://dp-lab-data/collaborators/vard...\n", + " SV-1723_SV_LN1\n", + " 10x\n", + " 28\n", + " 25\n", + " s3://dp-lab-data/collaborators/vardh...\n", + " 10x\n", + " s3://dp-lab-data/collaborators/vardh...\n", + " 1\n", + " ...\n", + 
" 28\n", + " 10\n", + " False\n", + " False\n", + " 1\n", + " 1\n", + " 0\n", + " 10\n", + " {'cpu': 32, 'memory': -1}\n", + " quay.io/hisplan\n", + " \n", + " \n", + " SV-1723_SV_LN2\n", + " [s3://dp-lab-data/collaborators/vard...\n", + " [s3://dp-lab-data/collaborators/vard...\n", + " SV-1723_SV_LN2\n", + " 10x\n", + " 28\n", + " 25\n", + " s3://dp-lab-data/collaborators/vardh...\n", + " 10x\n", + " s3://dp-lab-data/collaborators/vardh...\n", + " 1\n", + " ...\n", + " 28\n", + " 10\n", + " False\n", + " False\n", + " 1\n", + " 1\n", + " 0\n", + " 10\n", + " {'cpu': 32, 'memory': -1}\n", + " quay.io/hisplan\n", + " \n", + " \n", + " SV-1723_SV_LN3\n", + " [s3://dp-lab-data/collaborators/vard...\n", + " [s3://dp-lab-data/collaborators/vard...\n", + " SV-1723_SV_LN3\n", + " 10x\n", + " 28\n", + " 25\n", + " s3://dp-lab-data/collaborators/vardh...\n", + " 10x\n", + " s3://dp-lab-data/collaborators/vardh...\n", + " 1\n", + " ...\n", + " 28\n", + " 10\n", + " False\n", + " False\n", + " 1\n", + " 1\n", + " 0\n", + " 10\n", + " {'cpu': 32, 'memory': -1}\n", + " quay.io/hisplan\n", + " \n", + " \n", + "\n", + "

5 rows × 22 columns

\n", + "" + ], + "text/plain": [ + " Hashtag.uriFastqR1 \\\n", + "SV-1721_SV_LN11 [s3://dp-lab-data/collaborators/vard... \n", + "SV-1721_SV_LN12 [s3://dp-lab-data/collaborators/vard... \n", + "SV-1723_SV_LN1 [s3://dp-lab-data/collaborators/vard... \n", + "SV-1723_SV_LN2 [s3://dp-lab-data/collaborators/vard... \n", + "SV-1723_SV_LN3 [s3://dp-lab-data/collaborators/vard... \n", + "\n", + " Hashtag.uriFastqR2 Hashtag.sampleName \\\n", + "SV-1721_SV_LN11 [s3://dp-lab-data/collaborators/vard... SV-1721_SV_LN11 \n", + "SV-1721_SV_LN12 [s3://dp-lab-data/collaborators/vard... SV-1721_SV_LN12 \n", + "SV-1723_SV_LN1 [s3://dp-lab-data/collaborators/vard... SV-1723_SV_LN1 \n", + "SV-1723_SV_LN2 [s3://dp-lab-data/collaborators/vard... SV-1723_SV_LN2 \n", + "SV-1723_SV_LN3 [s3://dp-lab-data/collaborators/vard... SV-1723_SV_LN3 \n", + "\n", + " Hashtag.scRnaSeqPlatform Hashtag.lengthR1 Hashtag.lengthR2 \\\n", + "SV-1721_SV_LN11 10x 28 25 \n", + "SV-1721_SV_LN12 10x 28 25 \n", + "SV-1723_SV_LN1 10x 28 25 \n", + "SV-1723_SV_LN2 10x 28 25 \n", + "SV-1723_SV_LN3 10x 28 25 \n", + "\n", + " Hashtag.cellBarcodeWhitelistUri \\\n", + "SV-1721_SV_LN11 s3://dp-lab-data/collaborators/vardh... \n", + "SV-1721_SV_LN12 s3://dp-lab-data/collaborators/vardh... \n", + "SV-1723_SV_LN1 s3://dp-lab-data/collaborators/vardh... \n", + "SV-1723_SV_LN2 s3://dp-lab-data/collaborators/vardh... \n", + "SV-1723_SV_LN3 s3://dp-lab-data/collaborators/vardh... \n", + "\n", + " Hashtag.cellBarcodeWhiteListMethod \\\n", + "SV-1721_SV_LN11 10x \n", + "SV-1721_SV_LN12 10x \n", + "SV-1723_SV_LN1 10x \n", + "SV-1723_SV_LN2 10x \n", + "SV-1723_SV_LN3 10x \n", + "\n", + " Hashtag.hashTagList Hashtag.cbStartPos \\\n", + "SV-1721_SV_LN11 s3://dp-lab-data/collaborators/vardh... 1 \n", + "SV-1721_SV_LN12 s3://dp-lab-data/collaborators/vardh... 1 \n", + "SV-1723_SV_LN1 s3://dp-lab-data/collaborators/vardh... 1 \n", + "SV-1723_SV_LN2 s3://dp-lab-data/collaborators/vardh... 
1 \n", + "SV-1723_SV_LN3 s3://dp-lab-data/collaborators/vardh... 1 \n", + "\n", + " ... Hashtag.umiEndPos Hashtag.trimPos \\\n", + "SV-1721_SV_LN11 ... 28 10 \n", + "SV-1721_SV_LN12 ... 28 10 \n", + "SV-1723_SV_LN1 ... 28 10 \n", + "SV-1723_SV_LN2 ... 28 10 \n", + "SV-1723_SV_LN3 ... 28 10 \n", + "\n", + " Hashtag.slidingWindowSearch Hashtag.translate10XBarcodes \\\n", + "SV-1721_SV_LN11 False False \n", + "SV-1721_SV_LN12 False False \n", + "SV-1723_SV_LN1 False False \n", + "SV-1723_SV_LN2 False False \n", + "SV-1723_SV_LN3 False False \n", + "\n", + " Hashtag.cbCollapsingDistance Hashtag.umiCollapsingDistance \\\n", + "SV-1721_SV_LN11 1 1 \n", + "SV-1721_SV_LN12 1 1 \n", + "SV-1723_SV_LN1 1 1 \n", + "SV-1723_SV_LN2 1 1 \n", + "SV-1723_SV_LN3 1 1 \n", + "\n", + " Hashtag.numExpectedCells Hashtag.minCount \\\n", + "SV-1721_SV_LN11 0 10 \n", + "SV-1721_SV_LN12 0 10 \n", + "SV-1723_SV_LN1 0 10 \n", + "SV-1723_SV_LN2 0 10 \n", + "SV-1723_SV_LN3 0 10 \n", + "\n", + " Hashtag.resourceSpec Hashtag.dockerRegistry \n", + "SV-1721_SV_LN11 {'cpu': 32, 'memory': -1} quay.io/hisplan \n", + "SV-1721_SV_LN12 {'cpu': 32, 'memory': -1} quay.io/hisplan \n", + "SV-1723_SV_LN1 {'cpu': 32, 'memory': -1} quay.io/hisplan \n", + "SV-1723_SV_LN2 {'cpu': 32, 'memory': -1} quay.io/hisplan \n", + "SV-1723_SV_LN3 {'cpu': 32, 'memory': -1} quay.io/hisplan \n", + "\n", + "[5 rows x 22 columns]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } ], "source": [ "inputs" @@ -1138,7 +1620,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -1173,31 +1655,83 @@ " \n", " \n", " \n", - " CI210127_CD45pos_citeseq\n", - " CITE-seq\n", - " HTAN_CITEseq\n", - " CI210127_CD45pos_citeseq\n", - " moormana\n", - " s3://dp-lab-data/SCRI_Projects/HTAN_...\n", + " SV-1721_SV_LN11\n", + " Hashtag\n", + " Locally advanced gastric cancer\n", + " SV-1721_SV_LN11\n", + " sohailn\n", + " 
s3://dp-lab-data/collaborators/vardh...\n", + " -\n", + " sohailn\n", + " \n", + " \n", + " SV-1721_SV_LN12\n", + " Hashtag\n", + " Locally advanced gastric cancer\n", + " SV-1721_SV_LN12\n", + " sohailn\n", + " s3://dp-lab-data/collaborators/vardh...\n", " -\n", - " moormana\n", + " sohailn\n", + " \n", + " \n", + " SV-1723_SV_LN1\n", + " Hashtag\n", + " single cell immune profiling of PBMC...\n", + " SV-1723_SV_LN1\n", + " sohailn\n", + " s3://dp-lab-data/collaborators/vardh...\n", + " -\n", + " sohailn\n", + " \n", + " \n", + " SV-1723_SV_LN2\n", + " Hashtag\n", + " single cell immune profiling of PBMC...\n", + " SV-1723_SV_LN2\n", + " sohailn\n", + " s3://dp-lab-data/collaborators/vardh...\n", + " -\n", + " sohailn\n", + " \n", + " \n", + " SV-1723_SV_LN3\n", + " Hashtag\n", + " single cell immune profiling of PBMC...\n", + " SV-1723_SV_LN3\n", + " sohailn\n", + " s3://dp-lab-data/collaborators/vardh...\n", + " -\n", + " sohailn\n", " \n", " \n", "\n", "" ], "text/plain": [ - " pipelineType project sample \\\n", - "CI210127_CD45pos_citeseq CITE-seq HTAN_CITEseq CI210127_CD45pos_citeseq \n", + " pipelineType project \\\n", + "SV-1721_SV_LN11 Hashtag Locally advanced gastric cancer \n", + "SV-1721_SV_LN12 Hashtag Locally advanced gastric cancer \n", + "SV-1723_SV_LN1 Hashtag single cell immune profiling of PBMC... \n", + "SV-1723_SV_LN2 Hashtag single cell immune profiling of PBMC... \n", + "SV-1723_SV_LN3 Hashtag single cell immune profiling of PBMC... \n", "\n", - " owner destination \\\n", - "CI210127_CD45pos_citeseq moormana s3://dp-lab-data/SCRI_Projects/HTAN_... 
\n", + " sample owner \\\n", + "SV-1721_SV_LN11 SV-1721_SV_LN11 sohailn \n", + "SV-1721_SV_LN12 SV-1721_SV_LN12 sohailn \n", + "SV-1723_SV_LN1 SV-1723_SV_LN1 sohailn \n", + "SV-1723_SV_LN2 SV-1723_SV_LN2 sohailn \n", + "SV-1723_SV_LN3 SV-1723_SV_LN3 sohailn \n", "\n", - " transfer comment \n", - "CI210127_CD45pos_citeseq - moormana " + " destination transfer comment \n", + "SV-1721_SV_LN11 s3://dp-lab-data/collaborators/vardh... - sohailn \n", + "SV-1721_SV_LN12 s3://dp-lab-data/collaborators/vardh... - sohailn \n", + "SV-1723_SV_LN1 s3://dp-lab-data/collaborators/vardh... - sohailn \n", + "SV-1723_SV_LN2 s3://dp-lab-data/collaborators/vardh... - sohailn \n", + "SV-1723_SV_LN3 s3://dp-lab-data/collaborators/vardh... - sohailn " ] }, - "execution_count": 67, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -1208,7 +1742,45 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['s3://dp-lab-data/collaborators/vardhans/LocallyAdvancedGastricCancer/SV-1721_SV_LN11_HTO/Hashtag-results',\n", + " 's3://dp-lab-data/collaborators/vardhans/LocallyAdvancedGastricCancer/SV-1721_SV_LN12_HTO/Hashtag-results',\n", + " 's3://dp-lab-data/collaborators/vardhans/scImmuneProfilingPbmcsNAcTreatedCovidPts/SV-1723_SV_LN1_HTO/Hashtag-results',\n", + " 's3://dp-lab-data/collaborators/vardhans/scImmuneProfilingPbmcsNAcTreatedCovidPts/SV-1723_SV_LN2_HTO/Hashtag-results',\n", + " 's3://dp-lab-data/collaborators/vardhans/scImmuneProfilingPbmcsNAcTreatedCovidPts/SV-1723_SV_LN3_HTO/Hashtag-results']" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labels['destination'].tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + 
"cell_type": "code", + "execution_count": 58, "metadata": { "tags": [] }, @@ -1216,12 +1788,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "559a5c48486540ad9aa62559bd608099", + "model_id": "c623c42539d348d5a97720b9a4d7315f", "version_major": 2, "version_minor": 0 }, "text/plain": [ - " 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
S3_pathSample_ID
SV-1721_SV_LN11s3://dp-lab-data/collaborators/vardh...3947
SV-1721_SV_LN12s3://dp-lab-data/collaborators/vardh...3948
SV-1723_SV_LN1s3://dp-lab-data/collaborators/vardh...3949
SV-1723_SV_LN2s3://dp-lab-data/collaborators/vardh...3950
SV-1723_SV_LN3s3://dp-lab-data/collaborators/vardh...3951
\n", + "" + ], + "text/plain": [ + " S3_path Sample_ID\n", + "SV-1721_SV_LN11 s3://dp-lab-data/collaborators/vardh... 3947\n", + "SV-1721_SV_LN12 s3://dp-lab-data/collaborators/vardh... 3948\n", + "SV-1723_SV_LN1 s3://dp-lab-data/collaborators/vardh... 3949\n", + "SV-1723_SV_LN2 s3://dp-lab-data/collaborators/vardh... 3950\n", + "SV-1723_SV_LN3 s3://dp-lab-data/collaborators/vardh... 3951" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Common query col: id, request_id, Sample\n", + "sample_id = list(range(3947, 3952))\n", + "\n", + "samples = format_sample_aws(sample_id, 'id', creds)\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ - "# Get information for all samples\n", - "sample_paths = [s.strip('/') for s in sample_paths] # remove trailing slash if exists\n", - "sample_names = [os.path.basename(s) for s in sample_paths]\n", - "sample_names = [re.match(r'(.*)_.+$', s)[1] for s in sample_names] # remove library suffix (e.g. 
_CITE, _HTO, etc.)\n", - "# TODO: assert basename is in peer_lab_db.sample_data.Sample\n", - "# assert(all(check_sample_name(s) for s in sample_names))\n", - "samples = pd.DataFrame(\n", - " sample_paths,\n", - " index=sample_names,\n", - " columns=[\"S3_Path\"],\n", - " dtype=str,\n", - ")\n", - "samples[\"Sample_ID\"] = pd.Series(samples.index).apply(\n", - " lambda x: get_sample_id(x, creds['user'], creds['password'])\n", - ").values" + "samples = samples.rename(columns={'S3_path': 'S3_Path'})\n", + "samples['S3_Path'] += '_HTO'\n", + "# if prefix == 'Hashtag':\n", + "# samples['S3_Path'] += '_HTO'\n", + " \n", + "# elif prefix == 'CiteSEq':\n", + "# samples['S3_Path'] += '_CITE'" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 103, "metadata": {}, "outputs": [ { @@ -1365,49 +2062,76 @@ " \n", " \n", " \n", - " BF-1402_SI\n", - " s3://dp-lab-data/collaborators/arude...\n", - " 3555\n", + " SV-1721_SV_LN11\n", + " s3://dp-lab-data/collaborators/vardh...\n", + " 3947\n", " \n", " \n", - " BF-1472_LI\n", - " s3://dp-lab-data/collaborators/arude...\n", - " 3571\n", - " \n", - " \n", - " BD-1495_1\n", - " s3://dp-lab-data/collaborators/arude...\n", - " 3557\n", + " SV-1721_SV_LN12\n", + " s3://dp-lab-data/collaborators/vardh...\n", + " 3948\n", " \n", " \n", "\n", "" ], "text/plain": [ - " S3_Path Sample_ID\n", - "BF-1402_SI s3://dp-lab-data/collaborators/arude... 3555\n", - "BF-1472_LI s3://dp-lab-data/collaborators/arude... 3571\n", - "BD-1495_1 s3://dp-lab-data/collaborators/arude... 3557" + " S3_Path Sample_ID\n", + "SV-1721_SV_LN11 s3://dp-lab-data/collaborators/vardh... 3947\n", + "SV-1721_SV_LN12 s3://dp-lab-data/collaborators/vardh... 
3948" ] }, - "execution_count": 35, + "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "samples = samples.loc[samples.index.str.startswith('SV-1721')]\n", + "# samples = samples.loc[samples.index.str.startswith('SV-1723')]\n", + "\n", "samples" ] }, { "cell_type": "code", - "execution_count": 103, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "# # Get information for all samples\n", + "# # sample_paths = [s.strip('/') for s in sample_paths] # remove trailing slash if exists\n", + "# # sample_names = [os.path.basename(s) for s in sample_paths]\n", + "# # sample_names = [re.match(r'(.*)_.+$', s)[1] for s in sample_names] # remove library suffix (e.g. _CITE, _HTO, etc.)\n", + "# # TODO: assert basename is in peer_lab_db.sample_data.Sample\n", + "# # assert(all(check_sample_name(s) for s in sample_names))\n", + "# samples = pd.DataFrame(\n", + "# sample_paths,\n", + "# index=sample_names,\n", + "# columns=[\"S3_Path\"],\n", + "# dtype=str,\n", + "# )\n", + "# samples[\"Sample_ID\"] = pd.Series(samples.index).apply(\n", + "# lambda x: get_sample_id(x, creds['user'], creds['password'])\n", + "# ).values" + ] + }, + { + "cell_type": "code", + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ "# Read barcodes from file\n", "# Note: Must be subset to HTO or CITE barcodes before next step!\n", - "barcodes = get_bcs_manual(path_to_excel)\n", + "# barcodes = get_bcs_manual(path_to_excel)\n", "# barcodes = barcodes[\n", "# barcodes[\"Description\"].str.contains(\"SS1\") |\n", "# barcodes[\"Description\"].str.contains(\"SS2\")\n", @@ -1419,6 +2143,13 @@ "execution_count": 104, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "~/Downloads/SV-1721 CITEseq.xlsx\n" + ] + }, { "data": { "text/html": [ @@ -1450,35 +2181,35 @@ " \n", " 0\n", " 
GTCTTTGTCAGTGCA\n", - " A0006\n", + " C0006\n", " anti-human CD86\n", " 0\n", " \n", " \n", " 1\n", " GTTGTCCGACAATAC\n", - " A0007\n", + " C0007\n", " anti-human CD274 (B7-H1, PD-L1)\n", " 0\n", " \n", " \n", " 2\n", " TGATAGAAACAGACC\n", - " A0020\n", + " C0020\n", " anti-human CD270 (HVEM, TR2)\n", " 0\n", " \n", " \n", " 3\n", " ATCACATCGTTGCCA\n", - " A0023\n", + " C0023\n", " anti-human CD155 (PVR)\n", " 0\n", " \n", " \n", " 4\n", " AACCTTCCGTCTAAG\n", - " A0024\n", + " C0024\n", " anti-human CD112 (Nectin-2)\n", " 0\n", " \n", @@ -1490,60 +2221,60 @@ " ...\n", " \n", " \n", - " 165\n", - " AACTTCTGTGGTAGC\n", - " A0584\n", - " anti-human TCR V{LATIN CAPITAL LIGAT...\n", + " 132\n", + " GAGTCGAGAAATCAT\n", + " C0918\n", + " anti-human HLA-E\n", " 0\n", " \n", " \n", - " 166\n", - " CTTCCGATTCATTCA\n", - " A0139\n", - " anti-human TCR {LATIN CAPITAL LIGATU...\n", + " 133\n", + " TCCCACTTCCGCTTT\n", + " C0920\n", + " anti-human CD82\n", " 0\n", " \n", " \n", - " 167\n", - " AGCTGTAAGTTTCGG\n", - " A0166\n", - " anti-human CD66b\n", + " 134\n", + " CTACTTCCCTGTCAA\n", + " C0944\n", + " anti-human CD101 (BB27)\n", " 0\n", " \n", " \n", - " 168\n", - " AAGTGATGGTATCTG\n", - " A0583\n", - " anti-human TCR V{LATIN CAPITAL LIGAT...\n", + " 135\n", + " GCCGCATGAGAAACA\n", + " C1046\n", + " anti-human CD88 (C5aR)\n", " 0\n", " \n", " \n", - " 169\n", - " TCACCAGTACCTAGT\n", - " A0392\n", - " anti-human CD15 (SSEA-1)\n", + " 136\n", + " CTGATGAGATGTCAG\n", + " C1052\n", + " anti-human CD224\n", " 0\n", " \n", " \n", "\n", - "

170 rows × 4 columns

\n", + "

137 rows × 4 columns

\n", "" ], "text/plain": [ - " Barcode DNA_ID Description BP Shift\n", - "0 GTCTTTGTCAGTGCA A0006 anti-human CD86 0\n", - "1 GTTGTCCGACAATAC A0007 anti-human CD274 (B7-H1, PD-L1) 0\n", - "2 TGATAGAAACAGACC A0020 anti-human CD270 (HVEM, TR2) 0\n", - "3 ATCACATCGTTGCCA A0023 anti-human CD155 (PVR) 0\n", - "4 AACCTTCCGTCTAAG A0024 anti-human CD112 (Nectin-2) 0\n", - ".. ... ... ... ...\n", - "165 AACTTCTGTGGTAGC A0584 anti-human TCR V{LATIN CAPITAL LIGAT... 0\n", - "166 CTTCCGATTCATTCA A0139 anti-human TCR {LATIN CAPITAL LIGATU... 0\n", - "167 AGCTGTAAGTTTCGG A0166 anti-human CD66b 0\n", - "168 AAGTGATGGTATCTG A0583 anti-human TCR V{LATIN CAPITAL LIGAT... 0\n", - "169 TCACCAGTACCTAGT A0392 anti-human CD15 (SSEA-1) 0\n", + " Barcode DNA_ID Description BP Shift\n", + "0 GTCTTTGTCAGTGCA C0006 anti-human CD86 0\n", + "1 GTTGTCCGACAATAC C0007 anti-human CD274 (B7-H1, PD-L1) 0\n", + "2 TGATAGAAACAGACC C0020 anti-human CD270 (HVEM, TR2) 0\n", + "3 ATCACATCGTTGCCA C0023 anti-human CD155 (PVR) 0\n", + "4 AACCTTCCGTCTAAG C0024 anti-human CD112 (Nectin-2) 0\n", + ".. ... ... ... 
...\n", + "132 GAGTCGAGAAATCAT C0918 anti-human HLA-E 0\n", + "133 TCCCACTTCCGCTTT C0920 anti-human CD82 0\n", + "134 CTACTTCCCTGTCAA C0944 anti-human CD101 (BB27) 0\n", + "135 GCCGCATGAGAAACA C1046 anti-human CD88 (C5aR) 0\n", + "136 CTGATGAGATGTCAG C1052 anti-human CD224 0\n", "\n", - "[170 rows x 4 columns]" + "[137 rows x 4 columns]" ] }, "execution_count": 104, @@ -1552,6 +2283,13 @@ } ], "source": [ + "barcodes = pd.read_excel(path_to_excel, header=None) \n", + "print(path_to_excel)\n", + "barcodes.columns = ['Description', 'Barcode', 'DNA_ID', 'description', 'citeseq']\n", + "replace = lambda x: x.encode('ascii', 'namereplace').decode().replace(\"\\\\N\", \"\")\n", + "barcodes[\"Description\"] = barcodes[\"Description\"].apply(replace)\n", + "barcodes[\"BP Shift\"] = 0\n", + "barcodes = barcodes[[\"Barcode\", \"DNA_ID\", \"Description\", \"BP Shift\"]]\n", "barcodes" ] }, @@ -1565,25 +2303,35 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "dd85b56be9894e31b2f4214b4f9058b6", + "model_id": "39230bb1a70941beb108a2e3ee0d331e", "version_major": 2, "version_minor": 0 }, "text/plain": [ - " 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
S3_PathSample_IDWhitelist_Params
SV-1721_SV_LN11s3://dp-lab-data/collaborators/vardh...3947{'uri': 's3://dp-lab-data/collaborat...
SV-1721_SV_LN12s3://dp-lab-data/collaborators/vardh...3948{'uri': 's3://dp-lab-data/collaborat...
\n", + "" + ], + "text/plain": [ + " S3_Path Sample_ID \\\n", + "SV-1721_SV_LN11 s3://dp-lab-data/collaborators/vardh... 3947 \n", + "SV-1721_SV_LN12 s3://dp-lab-data/collaborators/vardh... 3948 \n", + "\n", + " Whitelist_Params \n", + "SV-1721_SV_LN11 {'uri': 's3://dp-lab-data/collaborat... \n", + "SV-1721_SV_LN12 {'uri': 's3://dp-lab-data/collaborat... " + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Note: Assumes GEX data is recorded in database\n", "samples[\"Whitelist_Params\"] = samples[\"Sample_ID\"].apply(\n", - " lambda x: get_wl_params(x, creds)\n", + " lambda x: get_wl_params(x, creds['user'], creds['password'])\n", ")\n", - "assert ~samples[\"Whitelist_Params\"].isna().any()" + "assert ~samples[\"Whitelist_Params\"].isna().any()\n", + "samples" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
S3_PathSample_IDWhitelist_ParamsBC_Params
SV-1721_SV_LN11s3://dp-lab-data/collaborators/vardh...3947{'uri': 's3://dp-lab-data/collaborat...{'conjugation': 'C', 'bp_shift': 10,...
SV-1721_SV_LN12s3://dp-lab-data/collaborators/vardh...3948{'uri': 's3://dp-lab-data/collaborat...{'conjugation': 'C', 'bp_shift': 10,...
\n", + "
" + ], + "text/plain": [ + " S3_Path Sample_ID \\\n", + "SV-1721_SV_LN11 s3://dp-lab-data/collaborators/vardh... 3947 \n", + "SV-1721_SV_LN12 s3://dp-lab-data/collaborators/vardh... 3948 \n", + "\n", + " Whitelist_Params \\\n", + "SV-1721_SV_LN11 {'uri': 's3://dp-lab-data/collaborat... \n", + "SV-1721_SV_LN12 {'uri': 's3://dp-lab-data/collaborat... \n", + "\n", + " BC_Params \n", + "SV-1721_SV_LN11 {'conjugation': 'C', 'bp_shift': 10,... \n", + "SV-1721_SV_LN12 {'conjugation': 'C', 'bp_shift': 10,... " + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Note: Adds same barcode parameters for all samples\n", "samples[\"BC_Params\"] = samples[\"Sample_ID\"].apply(\n", " lambda x: get_bc_params_manual(barcodes, prefix, platform, creds)\n", ")\n", - "assert ~samples[\"BC_Params\"].isna().any()" + "assert ~samples[\"BC_Params\"].isna().any()\n", + "samples" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
S3_PathSample_IDWhitelist_ParamsBC_ParamsFASTQs
SV-1721_SV_LN11s3://dp-lab-data/collaborators/vardh...3947{'uri': 's3://dp-lab-data/collaborat...{'conjugation': 'C', 'bp_shift': 10,...{'R1': ['s3://dp-lab-data/collaborat...
SV-1721_SV_LN12s3://dp-lab-data/collaborators/vardh...3948{'uri': 's3://dp-lab-data/collaborat...{'conjugation': 'C', 'bp_shift': 10,...{'R1': ['s3://dp-lab-data/collaborat...
\n", + "
" + ], + "text/plain": [ + " S3_Path Sample_ID \\\n", + "SV-1721_SV_LN11 s3://dp-lab-data/collaborators/vardh... 3947 \n", + "SV-1721_SV_LN12 s3://dp-lab-data/collaborators/vardh... 3948 \n", + "\n", + " Whitelist_Params \\\n", + "SV-1721_SV_LN11 {'uri': 's3://dp-lab-data/collaborat... \n", + "SV-1721_SV_LN12 {'uri': 's3://dp-lab-data/collaborat... \n", + "\n", + " BC_Params \\\n", + "SV-1721_SV_LN11 {'conjugation': 'C', 'bp_shift': 10,... \n", + "SV-1721_SV_LN12 {'conjugation': 'C', 'bp_shift': 10,... \n", + "\n", + " FASTQs \n", + "SV-1721_SV_LN11 {'R1': ['s3://dp-lab-data/collaborat... \n", + "SV-1721_SV_LN12 {'R1': ['s3://dp-lab-data/collaborat... " + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Get FASTQ paths from S3\n", "# Note: Uses same FASTQ file ids for all samples\n", "fastq_file_ids = fastq_map[prefix]\n", - "samples[\"FASTQs\"] = samples[\"S3_Path\"].apply(lambda x: get_fastqs(x, fastq_file_ids))" + "samples[\"FASTQs\"] = samples[\"S3_Path\"].apply(lambda x: get_fastqs(x, fastq_file_ids))\n", + "samples" ] }, { "cell_type": "code", "execution_count": 109, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'R1': ['s3://dp-lab-data/collaborators/vardhans/LocallyAdvancedGastricCancer/SV-1721_SV_LN11_HTO/FASTQ/3947_SV-1721_SV_LN11_HTO_IGO_12437_AS_7_S25_L001_R1_001.fastq.gz',\n", + " 's3://dp-lab-data/collaborators/vardhans/LocallyAdvancedGastricCancer/SV-1721_SV_LN11_HTO/FASTQ/3947_SV-1721_SV_LN11_HTO_IGO_12437_AS_7_S25_L002_R1_001.fastq.gz',\n", + " 's3://dp-lab-data/collaborators/vardhans/LocallyAdvancedGastricCancer/SV-1721_SV_LN11_HTO/FASTQ/3947_SV-1721_SV_LN11_HTO_IGO_12437_AS_7_S25_L003_R1_001.fastq.gz',\n", + " 's3://dp-lab-data/collaborators/vardhans/LocallyAdvancedGastricCancer/SV-1721_SV_LN11_HTO/FASTQ/3947_SV-1721_SV_LN11_HTO_IGO_12437_AS_7_S25_L004_R1_001.fastq.gz'],\n", + " 'R2': 
['s3://dp-lab-data/collaborators/vardhans/LocallyAdvancedGastricCancer/SV-1721_SV_LN11_HTO/FASTQ/3947_SV-1721_SV_LN11_HTO_IGO_12437_AS_7_S25_L001_R2_001.fastq.gz',\n", + " 's3://dp-lab-data/collaborators/vardhans/LocallyAdvancedGastricCancer/SV-1721_SV_LN11_HTO/FASTQ/3947_SV-1721_SV_LN11_HTO_IGO_12437_AS_7_S25_L002_R2_001.fastq.gz',\n", + " 's3://dp-lab-data/collaborators/vardhans/LocallyAdvancedGastricCancer/SV-1721_SV_LN11_HTO/FASTQ/3947_SV-1721_SV_LN11_HTO_IGO_12437_AS_7_S25_L003_R2_001.fastq.gz',\n", + " 's3://dp-lab-data/collaborators/vardhans/LocallyAdvancedGastricCancer/SV-1721_SV_LN11_HTO/FASTQ/3947_SV-1721_SV_LN11_HTO_IGO_12437_AS_7_S25_L004_R2_001.fastq.gz']}" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples.iloc[0]['FASTQs']" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, "outputs": [], "source": [ "# Load minimum inputs and labels fields from templates\n", @@ -1647,7 +2620,7 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 111, "metadata": {}, "outputs": [], "source": [ @@ -1706,7 +2679,7 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 112, "metadata": {}, "outputs": [ { @@ -1755,21 +2728,45 @@ " \n", " \n", " \n", - " IM-1356_Ru553B_1\n", - " [s3://dp-lab-data/SCRI_Projects/HTAN...\n", - " [s3://dp-lab-data/SCRI_Projects/HTAN...\n", - " IM-1356_Ru553B_1\n", + " SV-1721_SV_LN11\n", + " [s3://dp-lab-data/collaborators/vard...\n", + " [s3://dp-lab-data/collaborators/vard...\n", + " SV-1721_SV_LN11\n", " 10x_v3\n", " 28\n", - " 15\n", - " s3://dp-lab-data/SCRI_Projects/HTAN_...\n", - " SeqcDenseCountsMatrixCsv\n", - " s3://dp-lab-data/SCRI_Projects/HTAN_...\n", + " 25\n", + " s3://dp-lab-data/collaborators/vardh...\n", + " 10x\n", + " s3://dp-lab-data/collaborators/vardh...\n", " 1\n", " ...\n", " 17\n", " 28\n", + " 10\n", + " False\n", + " False\n", + " 1\n", + " 1\n", " 0\n", + " {'cpu': 32, 
'memory': -1}\n", + " quay.io/hisplan\n", + " \n", + " \n", + " SV-1721_SV_LN12\n", + " [s3://dp-lab-data/collaborators/vard...\n", + " [s3://dp-lab-data/collaborators/vard...\n", + " SV-1721_SV_LN12\n", + " 10x_v3\n", + " 28\n", + " 25\n", + " s3://dp-lab-data/collaborators/vardh...\n", + " 10x\n", + " s3://dp-lab-data/collaborators/vardh...\n", + " 1\n", + " ...\n", + " 17\n", + " 28\n", + " 10\n", " False\n", " False\n", " 1\n", @@ -1780,47 +2777,58 @@ " \n", " \n", "\n", - "

1 rows × 21 columns

\n", + "

2 rows × 21 columns

\n", "" ], "text/plain": [ - " CiteSeq.uriFastqR1 \\\n", - "IM-1356_Ru553B_1 [s3://dp-lab-data/SCRI_Projects/HTAN... \n", + " CiteSeq.uriFastqR1 \\\n", + "SV-1721_SV_LN11 [s3://dp-lab-data/collaborators/vard... \n", + "SV-1721_SV_LN12 [s3://dp-lab-data/collaborators/vard... \n", "\n", - " CiteSeq.uriFastqR2 CiteSeq.sampleName \\\n", - "IM-1356_Ru553B_1 [s3://dp-lab-data/SCRI_Projects/HTAN... IM-1356_Ru553B_1 \n", + " CiteSeq.uriFastqR2 CiteSeq.sampleName \\\n", + "SV-1721_SV_LN11 [s3://dp-lab-data/collaborators/vard... SV-1721_SV_LN11 \n", + "SV-1721_SV_LN12 [s3://dp-lab-data/collaborators/vard... SV-1721_SV_LN12 \n", "\n", - " CiteSeq.scRnaSeqPlatform CiteSeq.lengthR1 CiteSeq.lengthR2 \\\n", - "IM-1356_Ru553B_1 10x_v3 28 15 \n", + " CiteSeq.scRnaSeqPlatform CiteSeq.lengthR1 CiteSeq.lengthR2 \\\n", + "SV-1721_SV_LN11 10x_v3 28 25 \n", + "SV-1721_SV_LN12 10x_v3 28 25 \n", "\n", - " CiteSeq.cellBarcodeWhitelistUri \\\n", - "IM-1356_Ru553B_1 s3://dp-lab-data/SCRI_Projects/HTAN_... \n", + " CiteSeq.cellBarcodeWhitelistUri \\\n", + "SV-1721_SV_LN11 s3://dp-lab-data/collaborators/vardh... \n", + "SV-1721_SV_LN12 s3://dp-lab-data/collaborators/vardh... \n", "\n", - " CiteSeq.cellBarcodeWhiteListMethod \\\n", - "IM-1356_Ru553B_1 SeqcDenseCountsMatrixCsv \n", + " CiteSeq.cellBarcodeWhiteListMethod \\\n", + "SV-1721_SV_LN11 10x \n", + "SV-1721_SV_LN12 10x \n", "\n", - " CiteSeq.tagList CiteSeq.cbStartPos \\\n", - "IM-1356_Ru553B_1 s3://dp-lab-data/SCRI_Projects/HTAN_... 1 \n", + " CiteSeq.tagList CiteSeq.cbStartPos \\\n", + "SV-1721_SV_LN11 s3://dp-lab-data/collaborators/vardh... 1 \n", + "SV-1721_SV_LN12 s3://dp-lab-data/collaborators/vardh... 1 \n", "\n", - " ... CiteSeq.umiStartPos CiteSeq.umiEndPos \\\n", - "IM-1356_Ru553B_1 ... 17 28 \n", + " ... CiteSeq.umiStartPos CiteSeq.umiEndPos CiteSeq.trimPos \\\n", + "SV-1721_SV_LN11 ... 17 28 10 \n", + "SV-1721_SV_LN12 ... 
17 28 10 \n", "\n", - " CiteSeq.trimPos CiteSeq.slidingWindowSearch \\\n", - "IM-1356_Ru553B_1 0 False \n", + " CiteSeq.slidingWindowSearch CiteSeq.translate10XBarcodes \\\n", + "SV-1721_SV_LN11 False False \n", + "SV-1721_SV_LN12 False False \n", "\n", - " CiteSeq.translate10XBarcodes CiteSeq.cbCollapsingDistance \\\n", - "IM-1356_Ru553B_1 False 1 \n", + " CiteSeq.cbCollapsingDistance CiteSeq.umiCollapsingDistance \\\n", + "SV-1721_SV_LN11 1 1 \n", + "SV-1721_SV_LN12 1 1 \n", "\n", - " CiteSeq.umiCollapsingDistance CiteSeq.numExpectedCells \\\n", - "IM-1356_Ru553B_1 1 0 \n", + " CiteSeq.numExpectedCells CiteSeq.resourceSpec \\\n", + "SV-1721_SV_LN11 0 {'cpu': 32, 'memory': -1} \n", + "SV-1721_SV_LN12 0 {'cpu': 32, 'memory': -1} \n", "\n", - " CiteSeq.resourceSpec CiteSeq.dockerRegistry \n", - "IM-1356_Ru553B_1 {'cpu': 32, 'memory': -1} quay.io/hisplan \n", + " CiteSeq.dockerRegistry \n", + "SV-1721_SV_LN11 quay.io/hisplan \n", + "SV-1721_SV_LN12 quay.io/hisplan \n", "\n", - "[1 rows x 21 columns]" + "[2 rows x 21 columns]" ] }, - "execution_count": 111, + "execution_count": 112, "metadata": {}, "output_type": "execute_result" } @@ -1831,7 +2839,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 113, "metadata": {}, "outputs": [ { @@ -1866,28 +2874,44 @@ " \n", " \n", " \n", - " IM-1356_Ru553B_1\n", + " SV-1721_SV_LN11\n", + " CITE-seq\n", + " Locally advanced gastric cancer\n", + " SV-1721_SV_LN11\n", + " sohailn\n", + " s3://dp-lab-data/collaborators/vardh...\n", + " -\n", + " sohailn\n", + " \n", + " \n", + " SV-1721_SV_LN12\n", " CITE-seq\n", - " HTAN_CITEseq\n", - " IM-1356_Ru553B_1\n", - " moormana\n", - " s3://dp-lab-data/SCRI_Projects/HTAN_...\n", + " Locally advanced gastric cancer\n", + " SV-1721_SV_LN12\n", + " sohailn\n", + " s3://dp-lab-data/collaborators/vardh...\n", " -\n", - " moormana\n", + " sohailn\n", " \n", " \n", "\n", "" ], "text/plain": [ - " pipelineType project sample owner \\\n", - "IM-1356_Ru553B_1 CITE-seq 
HTAN_CITEseq IM-1356_Ru553B_1 moormana \n", + " pipelineType project \\\n", + "SV-1721_SV_LN11 CITE-seq Locally advanced gastric cancer \n", + "SV-1721_SV_LN12 CITE-seq Locally advanced gastric cancer \n", "\n", - " destination transfer comment \n", - "IM-1356_Ru553B_1 s3://dp-lab-data/SCRI_Projects/HTAN_... - moormana " + " sample owner \\\n", + "SV-1721_SV_LN11 SV-1721_SV_LN11 sohailn \n", + "SV-1721_SV_LN12 SV-1721_SV_LN12 sohailn \n", + "\n", + " destination transfer comment \n", + "SV-1721_SV_LN11 s3://dp-lab-data/collaborators/vardh... - sohailn \n", + "SV-1721_SV_LN12 s3://dp-lab-data/collaborators/vardh... - sohailn " ] }, - "execution_count": 112, + "execution_count": 113, "metadata": {}, "output_type": "execute_result" } @@ -1898,7 +2922,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 115, "metadata": { "tags": [] }, @@ -1906,12 +2930,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "8ecadb35db68428887f127dc03248d55", + "model_id": "f2653ca7da3a4aa8b6d437310e123962", "version_major": 2, "version_minor": 0 }, "text/plain": [ - " 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreference
Sample
AT-1447_Ret_R1s3://dp-lab-data/collaborators/priya...3409mouse10X_V3.1Memory consolidation VRs3://seqc-public/genomes/mm38_long_p...
AT-1447_Ret_R2s3://dp-lab-data/collaborators/priya...3410mouse10X_V3.1Memory consolidation VRs3://seqc-public/genomes/mm38_long_p...
\n", + "" + ], + "text/plain": [ + " AWS_storage id species \\\n", + "Sample \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... 3409 mouse \n", + "AT-1447_Ret_R2 s3://dp-lab-data/collaborators/priya... 3410 mouse \n", + "\n", + " sc_tech project_id \\\n", + "Sample \n", + "AT-1447_Ret_R1 10X_V3.1 Memory consolidation VR \n", + "AT-1447_Ret_R2 10X_V3.1 Memory consolidation VR \n", + "\n", + " reference \n", + "Sample \n", + "AT-1447_Ret_R1 s3://seqc-public/genomes/mm38_long_p... \n", + "AT-1447_Ret_R2 s3://seqc-public/genomes/mm38_long_p... " + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can supply a list of IDs or request IDs\n", + "\n", + "request_ids = ['AT-1447']\n", + "samples = sample_scridb_info(request_ids, 'request_id', creds)\n", + "\n", + "# sample_ids = [3872]\n", + "# samples = sample_scridb_info(sample_ids, 'id', creds)\n", + "\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreference
Sample
AT-1447_Ret_R1s3://dp-lab-data/collaborators/priya...3409mouse10X_V3.1Memory consolidation VRs3://seqc-public/genomes/mm38_long_p...
\n", + "
" + ], + "text/plain": [ + " AWS_storage id species \\\n", + "Sample \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... 3409 mouse \n", + "\n", + " sc_tech project_id \\\n", + "Sample \n", + "AT-1447_Ret_R1 10X_V3.1 Memory consolidation VR \n", + "\n", + " reference \n", + "Sample \n", + "AT-1447_Ret_R1 s3://seqc-public/genomes/mm38_long_p... " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples = samples.loc[samples.index == 'AT-1447_Ret_R1']\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/9v/1lnyrkxd0yq8l2pgl002wh1s7c6fqv/T/ipykernel_96156/3820246244.py:11: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " samples.loc[sample, 'denseCountMatrix'] = get_denseCountMatrix(row['AWS_storage'])[0]\n", + "/var/folders/9v/1lnyrkxd0yq8l2pgl002wh1s7c6fqv/T/ipykernel_96156/3820246244.py:14: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " samples[\"Whitelist_Params\"] = wl_params\n", + "/var/folders/9v/1lnyrkxd0yq8l2pgl002wh1s7c6fqv/T/ipykernel_96156/3820246244.py:15: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: 
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " samples[\"Barcode_Params\"] = bc_params\n", + "/var/folders/9v/1lnyrkxd0yq8l2pgl002wh1s7c6fqv/T/ipykernel_96156/3820246244.py:16: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " samples[\"Barcodes\"] = bcs\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreferencedenseCountMatrixWhitelist_ParamsBarcode_ParamsBarcodes
Sample
AT-1447_Ret_R1s3://dp-lab-data/collaborators/priya...3409mouse10X_V3.1Memory consolidation VRs3://seqc-public/genomes/mm38_long_p...s3://dp-lab-data/collaborators/priya...{'uri': 's3://dp-lab-data/collaborat...{'cb': 16, 'umi': 28, 'conjugation':...[(ATGAGGAATTCCTGC, A0301, m16, 0), (...
\n", + "
" + ], + "text/plain": [ + " AWS_storage id species \\\n", + "Sample \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... 3409 mouse \n", + "\n", + " sc_tech project_id \\\n", + "Sample \n", + "AT-1447_Ret_R1 10X_V3.1 Memory consolidation VR \n", + "\n", + " reference \\\n", + "Sample \n", + "AT-1447_Ret_R1 s3://seqc-public/genomes/mm38_long_p... \n", + "\n", + " denseCountMatrix \\\n", + "Sample \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... \n", + "\n", + " Whitelist_Params \\\n", + "Sample \n", + "AT-1447_Ret_R1 {'uri': 's3://dp-lab-data/collaborat... \n", + "\n", + " Barcode_Params \\\n", + "Sample \n", + "AT-1447_Ret_R1 {'cb': 16, 'umi': 28, 'conjugation':... \n", + "\n", + " Barcodes \n", + "Sample \n", + "AT-1447_Ret_R1 [(ATGAGGAATTCCTGC, A0301, m16, 0), (... " + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wl_params = []\n", + "bc_params = []\n", + "bcs = []\n", + "\n", + "for sample, row in samples.iterrows():\n", + " idx = row['id']\n", + " \n", + " wl_params.append(get_wl_params(idx, creds, prefix, row['AWS_storage']))\n", + " bc_params.append(get_bc_params(idx, creds))\n", + " bcs.append(get_bcs(idx, creds))\n", + " samples.loc[sample, 'denseCountMatrix'] = get_denseCountMatrix(row['AWS_storage'])[0]\n", + "\n", + "\n", + "samples[\"Whitelist_Params\"] = wl_params\n", + "samples[\"Barcode_Params\"] = bc_params\n", + "samples[\"Barcodes\"] = bcs\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "IMPORTANT NOTE \n", + "\n", + "Check the name of the folder you are running. Typically the folder that is stored in the database is just the GEX. So if another library is generated (multiome ATAC, VDJ, hashtag, etc) then it needs to be manually changed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AT-1447_Ret_R1\n", + " PRE AT-1447_Ret_R1/\n", + " PRE AT-1447_Ret_R1_CPL/\n", + "\n", + "AT-1447_Ret_R2\n", + " PRE AT-1447_Ret_R2/\n", + " PRE AT-1447_Ret_R2_CPL/\n", + "\n" + ] + } + ], + "source": [ + "# Check the name of the folder you are running\n", + "# Especially if there are multiple libraries (i.e ATAC, TCR_VDJ, etc.)\n", + "\n", + "for sample, row in samples.iterrows():\n", + " print(sample)\n", + " path = os.path.split(row['AWS_storage'])[0] + '/'\n", + " os.system(f'aws s3 ls {path} | grep {sample}')\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['s3://dp-lab-data/collaborators/priya/MemConsolidationVr/AT-1447_Ret_R1_CPL',\n", + " 's3://dp-lab-data/collaborators/priya/MemConsolidationVr/AT-1447_Ret_R2_CPL']" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples['AWS_storage'] = samples['AWS_storage'] + '_CPL'\n", + "samples['AWS_storage'].tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/9v/1lnyrkxd0yq8l2pgl002wh1s7c6fqv/T/ipykernel_96156/4168958876.py:1: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " samples[\"FASTQs\"] = samples[\"AWS_storage\"].apply(lambda x: get_fastqs(x, fastq_map[prefix], \"FASTQ\"))\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreferencedenseCountMatrixWhitelist_ParamsBarcode_ParamsBarcodesFASTQs
Sample
AT-1447_Ret_R1s3://dp-lab-data/collaborators/priya...3409mouse10X_V3.1Memory consolidation VRs3://seqc-public/genomes/mm38_long_p...s3://dp-lab-data/collaborators/priya...{'uri': 's3://dp-lab-data/collaborat...{'cb': 16, 'umi': 28, 'conjugation':...[(ATGAGGAATTCCTGC, A0301, m16, 0), (...{'R1': ['s3://dp-lab-data/collaborat...
\n", + "
" + ], + "text/plain": [ + " AWS_storage id species \\\n", + "Sample \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... 3409 mouse \n", + "\n", + " sc_tech project_id \\\n", + "Sample \n", + "AT-1447_Ret_R1 10X_V3.1 Memory consolidation VR \n", + "\n", + " reference \\\n", + "Sample \n", + "AT-1447_Ret_R1 s3://seqc-public/genomes/mm38_long_p... \n", + "\n", + " denseCountMatrix \\\n", + "Sample \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... \n", + "\n", + " Whitelist_Params \\\n", + "Sample \n", + "AT-1447_Ret_R1 {'uri': 's3://dp-lab-data/collaborat... \n", + "\n", + " Barcode_Params \\\n", + "Sample \n", + "AT-1447_Ret_R1 {'cb': 16, 'umi': 28, 'conjugation':... \n", + "\n", + " Barcodes \\\n", + "Sample \n", + "AT-1447_Ret_R1 [(ATGAGGAATTCCTGC, A0301, m16, 0), (... \n", + "\n", + " FASTQs \n", + "Sample \n", + "AT-1447_Ret_R1 {'R1': ['s3://dp-lab-data/collaborat... " + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples[\"FASTQs\"] = samples[\"AWS_storage\"].apply(lambda x: get_fastqs(x, fastq_map[prefix], \"FASTQ\"))\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "IMPORTANT NOTE\n", + "\n", + "Make sure that your files are not archived. The following command will print any FASTQ file that is archived. Unarchive the files and then come back to processing the sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AT-1447_Ret_R1\n", + "collaborators/priya/MemConsolidationVr/AT-1447_Ret_R1_CPL/FASTQ/3409_AT-1447_Ret_R1_CPL_IGO_12437_Q_3_S32_L003_I1_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R1_CPL/FASTQ/3409_AT-1447_Ret_R1_CPL_IGO_12437_Q_3_S32_L003_I2_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R1_CPL/FASTQ/3409_AT-1447_Ret_R1_CPL_IGO_12437_Q_3_S32_L003_R1_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R1_CPL/FASTQ/3409_AT-1447_Ret_R1_CPL_IGO_12437_Q_3_S32_L003_R2_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R1_CPL/FASTQ/3409_AT-1447_Ret_R1_CPL_IGO_12437_Q_3_S32_L004_I1_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R1_CPL/FASTQ/3409_AT-1447_Ret_R1_CPL_IGO_12437_Q_3_S32_L004_I2_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R1_CPL/FASTQ/3409_AT-1447_Ret_R1_CPL_IGO_12437_Q_3_S32_L004_R1_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R1_CPL/FASTQ/3409_AT-1447_Ret_R1_CPL_IGO_12437_Q_3_S32_L004_R2_001.fastq.gz\n", + "\n", + "AT-1447_Ret_R2\n", + 
"collaborators/priya/MemConsolidationVr/AT-1447_Ret_R2_CPL/FASTQ/3410_AT-1447_Ret_R2_CPL_IGO_12437_Q_4_S33_L003_I1_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R2_CPL/FASTQ/3410_AT-1447_Ret_R2_CPL_IGO_12437_Q_4_S33_L003_I2_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R2_CPL/FASTQ/3410_AT-1447_Ret_R2_CPL_IGO_12437_Q_4_S33_L003_R1_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R2_CPL/FASTQ/3410_AT-1447_Ret_R2_CPL_IGO_12437_Q_4_S33_L003_R2_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R2_CPL/FASTQ/3410_AT-1447_Ret_R2_CPL_IGO_12437_Q_4_S33_L004_I1_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R2_CPL/FASTQ/3410_AT-1447_Ret_R2_CPL_IGO_12437_Q_4_S33_L004_I2_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R2_CPL/FASTQ/3410_AT-1447_Ret_R2_CPL_IGO_12437_Q_4_S33_L004_R1_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R2_CPL/FASTQ/3410_AT-1447_Ret_R2_CPL_IGO_12437_Q_4_S33_L004_R2_001.fastq.gz\n", + "\n" + ] + } + ], + "source": [ + "for sample, row in samples.iterrows():\n", + " print(sample)\n", + " fastqs = np.ravel(list(row['FASTQs'].values()))\n", + " dirnames = set([os.path.dirname(x) for x in fastqs])\n", + " \n", + " for dirname in dirnames:\n", + " file = dirname.replace('s3://', '')\n", + " \n", + " bucket = file.split('/')[0]\n", + " pre = file.replace(f'{bucket}/', '')\n", + " \n", + " !aws s3api list-objects-v2 --bucket $bucket --prefix $pre --query \"Contents[?StorageClass!='STANDARD'].Key\" --output text\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [], + "source": [ + "for sample, row in samples.iterrows():\n", + " fastqs = row['FASTQs']\n", + " fastqs = list(np.ravel(list(fastqs.values())))\n", + " \n", + " for fastq in fastqs:\n", + " file = fastq.replace('s3://', '')\n", + " bucket = file.split('/')[0]\n", + " key = file.replace(f'{bucket}/', '')\n", + "\n", + " cmd = 
f'aws s3api restore-object --bucket {bucket} --key {key} --restore-request '\n", + " cmd += '\\'{\"Days\":25, \"GlacierJobParameters\":{\"Tier\":\"Standard\"}}\\''\n", + " os.system(cmd)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Upload the barcodes to AWS" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "045b7fc884bc488f950b3464c6dfa532", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00IMPORTANT NOTE\n", + "\n", + "Check what version of 10x you are using if you are using the outputs of SEQC to generate your whitelist!" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['10X_V3.1']" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples['sc_tech'].unique().tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "platform = '10x_v3'" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Hashtag.uriFastqR1Hashtag.uriFastqR2Hashtag.sampleNameHashtag.scRnaSeqPlatformHashtag.lengthR1Hashtag.lengthR2Hashtag.cellBarcodeWhitelistUriHashtag.cellBarcodeWhiteListMethodHashtag.translate10XBarcodesHashtag.hashTagList...Hashtag.umiEndPosHashtag.slidingWindowSearchHashtag.cbCollapsingDistanceHashtag.umiCollapsingDistanceHashtag.numExpectedCellsHashtag.minCountHashtag.denseCountMatrixHashtag.resourceSpecHashtag.demuxModeHashtag.dockerRegistry
Sample
AT-1447_Ret_R1[s3://dp-lab-data/collaborators/priy...[s3://dp-lab-data/collaborators/priy...AT-1447_Ret_R110x_v32815s3://dp-lab-data/collaborators/priya...SeqcDenseCountsMatrixCsvTrues3://dp-lab-data/collaborators/priya......28False11010s3://dp-lab-data/collaborators/priya...{'cpu': 32, 'memory': -1}1quay.io/hisplan
\n", + "

1 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " Hashtag.uriFastqR1 \\\n", + "Sample \n", + "AT-1447_Ret_R1 [s3://dp-lab-data/collaborators/priy... \n", + "\n", + " Hashtag.uriFastqR2 Hashtag.sampleName \\\n", + "Sample \n", + "AT-1447_Ret_R1 [s3://dp-lab-data/collaborators/priy... AT-1447_Ret_R1 \n", + "\n", + " Hashtag.scRnaSeqPlatform Hashtag.lengthR1 Hashtag.lengthR2 \\\n", + "Sample \n", + "AT-1447_Ret_R1 10x_v3 28 15 \n", + "\n", + " Hashtag.cellBarcodeWhitelistUri \\\n", + "Sample \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... \n", + "\n", + " Hashtag.cellBarcodeWhiteListMethod \\\n", + "Sample \n", + "AT-1447_Ret_R1 SeqcDenseCountsMatrixCsv \n", + "\n", + " Hashtag.translate10XBarcodes \\\n", + "Sample \n", + "AT-1447_Ret_R1 True \n", + "\n", + " Hashtag.hashTagList ... \\\n", + "Sample ... \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... ... \n", + "\n", + " Hashtag.umiEndPos Hashtag.slidingWindowSearch \\\n", + "Sample \n", + "AT-1447_Ret_R1 28 False \n", + "\n", + " Hashtag.cbCollapsingDistance Hashtag.umiCollapsingDistance \\\n", + "Sample \n", + "AT-1447_Ret_R1 1 1 \n", + "\n", + " Hashtag.numExpectedCells Hashtag.minCount \\\n", + "Sample \n", + "AT-1447_Ret_R1 0 10 \n", + "\n", + " Hashtag.denseCountMatrix \\\n", + "Sample \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... 
\n", + "\n", + " Hashtag.resourceSpec Hashtag.demuxMode \\\n", + "Sample \n", + "AT-1447_Ret_R1 {'cpu': 32, 'memory': -1} 1 \n", + "\n", + " Hashtag.dockerRegistry \n", + "Sample \n", + "AT-1447_Ret_R1 quay.io/hisplan \n", + "\n", + "[1 rows x 23 columns]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Standard inputs, modify fields as need be\n", + "\n", + "# Load minimum inputs and labels fields from templates\n", + "with open(f\"{config_dir}/cellplex.inputs.json\") as f:\n", + " std_inputs_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields,)\n", + "fastq_file_ids = fastq_map[prefix]\n", + "\n", + "# Annotate inputs\n", + "inputs[f\"{prefix}.sampleName\"] = samples.index\n", + "inputs[f\"{prefix}.scRnaSeqPlatform\"] = platform \n", + "\n", + "inputs[f\"{prefix}.lengthR1\"] = samples[\"Barcode_Params\"].apply(lambda x: x[\"umi\"])\n", + "inputs[f\"{prefix}.lengthR2\"] = samples[\"Barcode_Params\"].apply(lambda x: x[\"seq_length\"])\n", + "\n", + "inputs[f\"{prefix}.cbStartPos\"] = 1\n", + "inputs[f\"{prefix}.cbEndPos\"] = samples[\"Barcode_Params\"].apply(lambda x: x[\"cb\"])\n", + "inputs[f\"{prefix}.umiEndPos\"] = samples[\"Barcode_Params\"].apply(lambda x: x[\"umi\"])\n", + "inputs[f\"{prefix}.umiStartPos\"] = inputs[f\"{prefix}.cbEndPos\"] + 1\n", + "\n", + "# inputs[f\"{prefix}.trimPos\"] = samples[\"Barcode_Params\"].apply(lambda x: x[\"bp_shift\"])\n", + "\n", + "inputs[f\"{prefix}.cellBarcodeWhitelistUri\"] = samples[\"Whitelist_Params\"].apply(lambda x: x[\"uri\"])\n", + "inputs[f\"{prefix}.cellBarcodeWhiteListMethod\"] = samples[\"Whitelist_Params\"].apply(lambda x: x[\"method\"])\n", + "\n", + "inputs[f\"{prefix}.translate10XBarcodes\"] = True\n", + "\n", + "inputs[f\"{prefix}.hashTagList\"] = samples[\"AWS_storage\"] + f\"/{output_dirname}/tag-list.csv\" 
\n", + "inputs[f\"{prefix}.denseCountMatrix\"] = samples[\"denseCountMatrix\"]\n", + "\n", + "\n", + "for file_id in fastq_file_ids: # Set FASTQs\n", + " inputs[f\"{prefix}.uriFastq{file_id}\"] = samples[\"FASTQs\"].apply(lambda x: x[file_id])\n", + "\n", + "# ********************\n", + "# Defaults\n", + "# Note: These may need to be changed on a per-sample or per-execution basis\n", + "\n", + "inputs[f\"{prefix}.slidingWindowSearch\"] = False\n", + "inputs[f\"{prefix}.cbCollapsingDistance\"] = 1\n", + "inputs[f\"{prefix}.umiCollapsingDistance\"] = 1\n", + "inputs[f\"{prefix}.numExpectedCells\"] = 0\n", + "# Need trick to set dictionary for each row\n", + "common_resource_spec = {\n", + " \"cpu\": 32,\n", + " \"memory\": -1,\n", + "}\n", + "inputs[f\"{prefix}.resourceSpec\"] = inputs.iloc[:, 0].apply(lambda x: common_resource_spec)\n", + "if prefix == \"Hashtag\":\n", + " inputs[f\"{prefix}.minCount\"] = 10\n", + "inputs[f\"{prefix}.demuxMode\"] = 1\n", + "inputs[f\"{prefix}.dockerRegistry\"] = common_docker_registry\n", + "\n", + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['s3://dp-lab-data/collaborators/priya/MemConsolidationVr/AT-1447_Ret_R1/seqc-results/3409_AT-1447_Ret_R1_IGO_12437_P_10_dense.csv']" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(inputs[f'{prefix}.denseCountMatrix'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generate labels" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
Sample
AT-1447_Ret_R1HashtagMemory consolidation VRAT-1447_Ret_R1sohailns3://dp-lab-data/collaborators/priya...-sohailn
AT-1447_Ret_R2HashtagMemory consolidation VRAT-1447_Ret_R2sohailns3://dp-lab-data/collaborators/priya...-sohailn
\n", + "
" + ], + "text/plain": [ + " pipelineType project sample owner \\\n", + "Sample \n", + "AT-1447_Ret_R1 Hashtag Memory consolidation VR AT-1447_Ret_R1 sohailn \n", + "AT-1447_Ret_R2 Hashtag Memory consolidation VR AT-1447_Ret_R2 sohailn \n", + "\n", + " destination transfer comment \n", + "Sample \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... - sohailn \n", + "AT-1447_Ret_R2 s3://dp-lab-data/collaborators/priya... - sohailn " + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Standard labels, modify fields as need be\n", + "\n", + "with open(f\"{config_dir}/cellplex.labels.json\") as f:\n", + " std_labels_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "labels = pd.DataFrame(index=samples.index, columns=std_labels_fields,)\n", + "\n", + "labels[\"pipelineType\"] = pipeline_type\n", + "labels[\"project\"] = samples['project_id']\n", + "labels[\"sample\"] = labels.index\n", + "labels[\"owner\"] = creds[\"user\"]\n", + "labels[\"destination\"] = samples['AWS_storage'] + \"/\" + output_dirname\n", + "labels[\"transfer\"] = \"-\"\n", + "labels[\"comment\"] = creds[\"user\"]\n", + "\n", + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run samples" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Look over the samples before submitting one last time" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Hashtag.uriFastqR1Hashtag.uriFastqR2Hashtag.sampleNameHashtag.scRnaSeqPlatformHashtag.lengthR1Hashtag.lengthR2Hashtag.cellBarcodeWhitelistUriHashtag.cellBarcodeWhiteListMethodHashtag.translate10XBarcodesHashtag.hashTagList...Hashtag.umiEndPosHashtag.slidingWindowSearchHashtag.cbCollapsingDistanceHashtag.umiCollapsingDistanceHashtag.numExpectedCellsHashtag.minCountHashtag.denseCountMatrixHashtag.resourceSpecHashtag.demuxModeHashtag.dockerRegistry
Sample
AT-1447_Ret_R1[s3://dp-lab-data/collaborators/priy...[s3://dp-lab-data/collaborators/priy...AT-1447_Ret_R110x_v32815s3://dp-lab-data/collaborators/priya...SeqcDenseCountsMatrixCsvTrues3://dp-lab-data/collaborators/priya......28False11010s3://dp-lab-data/collaborators/priya...{'cpu': 32, 'memory': -1}1quay.io/hisplan
\n", + "

1 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " Hashtag.uriFastqR1 \\\n", + "Sample \n", + "AT-1447_Ret_R1 [s3://dp-lab-data/collaborators/priy... \n", + "\n", + " Hashtag.uriFastqR2 Hashtag.sampleName \\\n", + "Sample \n", + "AT-1447_Ret_R1 [s3://dp-lab-data/collaborators/priy... AT-1447_Ret_R1 \n", + "\n", + " Hashtag.scRnaSeqPlatform Hashtag.lengthR1 Hashtag.lengthR2 \\\n", + "Sample \n", + "AT-1447_Ret_R1 10x_v3 28 15 \n", + "\n", + " Hashtag.cellBarcodeWhitelistUri \\\n", + "Sample \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... \n", + "\n", + " Hashtag.cellBarcodeWhiteListMethod \\\n", + "Sample \n", + "AT-1447_Ret_R1 SeqcDenseCountsMatrixCsv \n", + "\n", + " Hashtag.translate10XBarcodes \\\n", + "Sample \n", + "AT-1447_Ret_R1 True \n", + "\n", + " Hashtag.hashTagList ... \\\n", + "Sample ... \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... ... \n", + "\n", + " Hashtag.umiEndPos Hashtag.slidingWindowSearch \\\n", + "Sample \n", + "AT-1447_Ret_R1 28 False \n", + "\n", + " Hashtag.cbCollapsingDistance Hashtag.umiCollapsingDistance \\\n", + "Sample \n", + "AT-1447_Ret_R1 1 1 \n", + "\n", + " Hashtag.numExpectedCells Hashtag.minCount \\\n", + "Sample \n", + "AT-1447_Ret_R1 0 10 \n", + "\n", + " Hashtag.denseCountMatrix \\\n", + "Sample \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... \n", + "\n", + " Hashtag.resourceSpec Hashtag.demuxMode \\\n", + "Sample \n", + "AT-1447_Ret_R1 {'cpu': 32, 'memory': -1} 1 \n", + "\n", + " Hashtag.dockerRegistry \n", + "Sample \n", + "AT-1447_Ret_R1 quay.io/hisplan \n", + "\n", + "[1 rows x 23 columns]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
Sample
AT-1447_Ret_R1HashtagMemory consolidation VRAT-1447_Ret_R1sohailns3://dp-lab-data/collaborators/priya...-sohailn
\n", + "
" + ], + "text/plain": [ + " pipelineType project sample owner \\\n", + "Sample \n", + "AT-1447_Ret_R1 Hashtag Memory consolidation VR AT-1447_Ret_R1 sohailn \n", + "\n", + " destination transfer comment \n", + "Sample \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... - sohailn " + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "28519fe475714697b8af5b257dac468d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00 $file_out" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AT-1447_Ret_R1\n", + "CITE-seq-Count Version: 1.4.3\n", + "Correction:\n", + " Cell barcodes collapsing threshold: 1\n", + " Cell barcodes corrected: 209632\n", + " UMI collapsing threshold: 1\n", + " UMIs corrected: 269339\n", + "Date: 2022-11-04\n", + "Percentage mapped: 98\n", + "Percentage unmapped: 2\n", + "Reads processed: 99797110\n", + "Running time: 2.0 hours, 10.0 minutes, 33.55 seconds\n", + "Uncorrected cells: 0\n", + "\n", + "\n" + ] + } + ], + "source": [ + "import yaml\n", + "\n", + "for sample, row in labels.iterrows():\n", + " file_out = f'web_summary/{sample}.run_report.yaml'\n", + " \n", + " with open(file_out, 'r') as stream:\n", + " try:\n", + " parsed_yaml = yaml.safe_load(stream)\n", + " except yaml.YAMLError as exc:\n", + " print(exc)\n", + " \n", + " print(sample)\n", + " del parsed_yaml['Run parameters']\n", + " print(yaml.dump(parsed_yaml, default_flow_style=False))\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + 
"outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/Run_Sharp_Hashtag.ipynb b/notebooks/Run_Sharp_Hashtag.ipynb new file mode 100644 index 0000000..3c6fad7 --- /dev/null +++ b/notebooks/Run_Sharp_Hashtag.ipynb @@ -0,0 +1,1655 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import re, subprocess, boto3, json, shlex, mysql, os, urllib, logging\n", + "import pandas as pd\n", + "import numpy as np\n", + "from s3path import S3Path\n", + "from pathlib import Path\n", + "from tqdm.notebook import tqdm\n", + "from packaging import version\n", + "\n", + "import glob\n", + "import os\n", + "\n", + "pd.set_option(\"display.max_colwidth\", 40)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from utils.utils import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# AWS setup" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: AWS_ACCESS_KEY_ID=ASIAYPY66CWMCCBMO5FL\n", + "env: AWS_SECRET_ACCESS_KEY=DG9oLl+A09X6+uLmnXevErupT/HEQad4zG/y5TTR\n", + "env: 
AWS_SESSION_TOKEN=IQoJb3JpZ2luX2VjECIaCXVzLWVhc3QtMSJIMEYCIQCpTwuT8/TR5SFwS2ZpI9wHgMePNQiGB9baiXqhz2SAbwIhAKmdXEC38Ab1PXRjmANETp0MZtr5FEAuzMK0OZ9ae/+8KvgBCNv//////////wEQBBoMNTgzNjQzNTY3NTEyIgw5HxnmXcMH0+MIrwQqzAGiga/yHEbRDYb135G/x96XUChjUvHSZHTH9jmq5M4inysP5wrGBk+8UWrGb7ruczmRHJOB9dIfpKeGo639AGiYHpycHW6w8S4+4wCVEzXId2j+lt887h31Qp6qoJoLwTKA3FhWL6moZmR7vdIApGoWmqG9HaaP+J1RPqM2eTFsqM2yefMQC/xPLzNxVQVIjqMG633ZYaETSLxmtOobSdHJpq/6KKhSmpNxZXyC2v8MRPmmbwr1RtoJHFAxBvjspxjITTTi2GTvqJNJcj4w9+XCoAY6lwEEdKTD9DinLSSszZ2XW9H0Yhk8ygREAdVYIErlav1UR+SV7MzA1jWAby2mlEx7xWLj7XjQYC3hYWCdo5f+5do633bae9UIOAoYrjxxOyOnS6+/Tc89lJEBETX9qMLtQ7p5zQqT3XQa6eEduIVFa5SS3dSOMNORPtf4HkNxuBvcfFRP5viWvQi8ISHlY0QyhtKC3ROcEcSJ\n", + "2021-10-07 15:31:32 agc-583643567512-us-east-1\n", + "2021-10-07 15:28:07 cdktoolkit-stagingbucket-d49u1xfb0sc4\n", + "2019-10-09 13:04:06 cf-templates-umiwbnq3566w-us-east-1\n", + "2019-10-10 12:46:54 dp-daily-reports\n", + "2017-09-26 10:15:15 dp-lab-data\n", + "2019-05-23 12:34:58 dp-lab-data-public\n", + "2018-03-01 13:29:07 dp-lab-glacier\n", + "2020-11-06 23:02:27 dp-lab-gwf-core\n", + "2021-07-15 18:14:01 dp-lab-gwf-core2\n", + "2017-09-26 10:17:18 dp-lab-home\n", + "2019-08-14 16:42:43 dp-lab-test\n", + "2019-04-25 12:35:35 elasticbeanstalk-us-east-1-583643567512\n", + "2019-12-13 15:53:57 scri-computational\n", + "2017-09-26 10:23:50 seqc-public\n" + ] + } + ], + "source": [ + "%env AWS_ACCESS_KEY_ID=ASIAYPY66CWMCCBMO5FL\n", + "%env AWS_SECRET_ACCESS_KEY=DG9oLl+A09X6+uLmnXevErupT/HEQad4zG/y5TTR\n", + "%env 
AWS_SESSION_TOKEN=IQoJb3JpZ2luX2VjECIaCXVzLWVhc3QtMSJIMEYCIQCpTwuT8/TR5SFwS2ZpI9wHgMePNQiGB9baiXqhz2SAbwIhAKmdXEC38Ab1PXRjmANETp0MZtr5FEAuzMK0OZ9ae/+8KvgBCNv//////////wEQBBoMNTgzNjQzNTY3NTEyIgw5HxnmXcMH0+MIrwQqzAGiga/yHEbRDYb135G/x96XUChjUvHSZHTH9jmq5M4inysP5wrGBk+8UWrGb7ruczmRHJOB9dIfpKeGo639AGiYHpycHW6w8S4+4wCVEzXId2j+lt887h31Qp6qoJoLwTKA3FhWL6moZmR7vdIApGoWmqG9HaaP+J1RPqM2eTFsqM2yefMQC/xPLzNxVQVIjqMG633ZYaETSLxmtOobSdHJpq/6KKhSmpNxZXyC2v8MRPmmbwr1RtoJHFAxBvjspxjITTTi2GTvqJNJcj4w9+XCoAY6lwEEdKTD9DinLSSszZ2XW9H0Yhk8ygREAdVYIErlav1UR+SV7MzA1jWAby2mlEx7xWLj7XjQYC3hYWCdo5f+5do633bae9UIOAoYrjxxOyOnS6+/Tc89lJEBETX9qMLtQ7p5zQqT3XQa6eEduIVFa5SS3dSOMNORPtf4HkNxuBvcfFRP5viWvQi8ISHlY0QyhtKC3ROcEcSJ\n", + "\n", + "!aws s3 ls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Which pipeline are you running\n", + "prefix = \"Hashtag\" # Workflow to run; also .wdl filename prefix\n", + "output_dirname = \"Hashtag-results\"\n", + "\n", + "workflow_dir = glob.glob(f\"{Path.home()}/scing/bin/sharp*\")[0]\n", + "path_to_exec = f\"{workflow_dir}/submit.sh\" # CHANGE THIS FOR SHARP\n", + "\n", + "# Locations of workflow-related directories and files\n", + "path_to_cromwell_secrets = f\"{Path.home()}/.cromwell/cromwell-secrets.json\"\n", + "db_credentials_path = f\"{Path.home()}/.config.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Location of docker files\n", + "common_docker_registry = \"quay.io/hisplan\"\n", + "pipeline_type = prefix # field in *.labels.json\n", + "comment = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Workflow file paths\n", + "config_dir = 
f\"{workflow_dir}/configs\"\n", + "path_to_options = f\"{workflow_dir}/Sharp.options.aws.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Set credentials based on SCRIdb CLI config file\n", + "with open(db_credentials_path) as f:\n", + " creds = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Sample information" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreference
Sample
AE-2116_mDA_neurons_DOGMA-seq_multiomes3://dp-lab-data/collaborators/strud...4531humanmultiomeDOGMAseq sample plusGRCh38
AE-2116_mDA_neurons_DOGMA-seq_multiome_mATACs3://dp-lab-data/collaborators/strud...4532humanmultiomeDOGMAseq sample plusGRCh38
\n", + "
" + ], + "text/plain": [ + " AWS_storage \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome s3://dp-lab-data/collaborators/strud... \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiom... s3://dp-lab-data/collaborators/strud... \n", + "\n", + " id species sc_tech \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome 4531 human multiome \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiom... 4532 human multiome \n", + "\n", + " project_id reference \n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome DOGMAseq sample plus GRCh38 \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiom... DOGMAseq sample plus GRCh38 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can supply a list of IDs or request IDs\n", + "\n", + "request_ids = ['AE-2116']\n", + "samples = sample_scridb_info(request_ids, 'request_id', creds)\n", + "\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreference
Sample
AE-2116_mDA_neurons_DOGMA-seq_multiomes3://dp-lab-data/collaborators/strud...4531humanmultiomeDOGMAseq sample plusGRCh38
\n", + "
" + ], + "text/plain": [ + " AWS_storage \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome s3://dp-lab-data/collaborators/strud... \n", + "\n", + " id species sc_tech \\\n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome 4531 human multiome \n", + "\n", + " project_id reference \n", + "Sample \n", + "AE-2116_mDA_neurons_DOGMA-seq_multiome DOGMAseq sample plus GRCh38 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples = samples.iloc[0:1]\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ERROR:root:Path to barcodes or counts matrix of GEX data is missing!\n" + ] + }, + { + "ename": "IndexError", + "evalue": "list index out of range", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", + "Input \u001b[0;32mIn [17]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 9\u001b[0m bc_params\u001b[38;5;241m.\u001b[39mappend(get_bc_params(idx, creds))\n\u001b[1;32m 10\u001b[0m bcs\u001b[38;5;241m.\u001b[39mappend(get_bcs(idx, creds))\n\u001b[0;32m---> 11\u001b[0m samples\u001b[38;5;241m.\u001b[39mloc[sample, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdenseCountMatrix\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mget_denseCountMatrix\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mAWS_storage\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m 
13\u001b[0m samples[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWhitelist_Params\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m wl_params\n\u001b[1;32m 14\u001b[0m samples[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mBarcode_Params\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m bc_params\n", + "\u001b[0;31mIndexError\u001b[0m: list index out of range" + ] + } + ], + "source": [ + "wl_params = []\n", + "bc_params = []\n", + "bcs = []\n", + "\n", + "for sample, row in samples.iterrows():\n", + " idx = row['id']\n", + " \n", + " wl_params.append(get_wl_params(idx, creds, prefix, row['AWS_storage']))\n", + " bc_params.append(get_bc_params(idx, creds))\n", + " bcs.append(get_bcs(idx, creds))\n", + " samples.loc[sample, 'denseCountMatrix'] = get_denseCountMatrix(row['AWS_storage'])[0]\n", + "\n", + "samples[\"Whitelist_Params\"] = wl_params\n", + "samples[\"Barcode_Params\"] = bc_params\n", + "samples[\"Barcodes\"] = bcs\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_denseCountMatrix(row['AWS_storage'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "IMPORTANT NOTE \n", + "\n", + "Check the name of the folder you are running. Typically the folder that is stored in the database is just the GEX. So if another library is generated (multiome ATAC, VDJ, hashtag, etc) then it needs to be manually changed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AE-2116_mDA_neurons_DOGMA-seq_multiome\n", + " PRE AE-2116_mDA_neurons_DOGMA-seq_multiome/\n", + " PRE AE-2116_mDA_neurons_DOGMA-seq_multiome_HTO/\n", + " PRE AE-2116_mDA_neurons_DOGMA-seq_multiome_mATAC/\n", + "\n" + ] + } + ], + "source": [ + "# Check the name of the folder you are running\n", + "# Especially if there are multiple libraries (i.e ATAC, TCR_VDJ, etc.)\n", + "\n", + "for sample, row in samples.iterrows():\n", + " print(sample)\n", + " path = os.path.split(row['AWS_storage'])[0] + '/'\n", + " os.system(f'aws s3 ls {path} | grep {sample}')\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/9v/1lnyrkxd0yq8l2pgl002wh1s7c6fqv/T/ipykernel_61990/2341054346.py:1: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " samples['AWS_storage'] = samples['AWS_storage'] + '_HTO'\n" + ] + }, + { + "data": { + "text/plain": [ + "['s3://dp-lab-data/collaborators/struder/DogmaseqSamplePlus/AE-2116_mDA_neurons_DOGMA-seq_multiome_HTO']" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples['AWS_storage'] = samples['AWS_storage'] + '_HTO'\n", + "samples['AWS_storage'].tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/var/folders/9v/1lnyrkxd0yq8l2pgl002wh1s7c6fqv/T/ipykernel_96156/4168958876.py:1: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " samples[\"FASTQs\"] = samples[\"AWS_storage\"].apply(lambda x: get_fastqs(x, fastq_map[prefix], \"FASTQ\"))\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreferencedenseCountMatrixWhitelist_ParamsBarcode_ParamsBarcodesFASTQs
Sample
AT-1447_Ret_R1s3://dp-lab-data/collaborators/priya...3409mouse10X_V3.1Memory consolidation VRs3://seqc-public/genomes/mm38_long_p...s3://dp-lab-data/collaborators/priya...{'uri': 's3://dp-lab-data/collaborat...{'cb': 16, 'umi': 28, 'conjugation':...[(ATGAGGAATTCCTGC, A0301, m16, 0), (...{'R1': ['s3://dp-lab-data/collaborat...
\n", + "
" + ], + "text/plain": [ + " AWS_storage id species \\\n", + "Sample \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... 3409 mouse \n", + "\n", + " sc_tech project_id \\\n", + "Sample \n", + "AT-1447_Ret_R1 10X_V3.1 Memory consolidation VR \n", + "\n", + " reference \\\n", + "Sample \n", + "AT-1447_Ret_R1 s3://seqc-public/genomes/mm38_long_p... \n", + "\n", + " denseCountMatrix \\\n", + "Sample \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... \n", + "\n", + " Whitelist_Params \\\n", + "Sample \n", + "AT-1447_Ret_R1 {'uri': 's3://dp-lab-data/collaborat... \n", + "\n", + " Barcode_Params \\\n", + "Sample \n", + "AT-1447_Ret_R1 {'cb': 16, 'umi': 28, 'conjugation':... \n", + "\n", + " Barcodes \\\n", + "Sample \n", + "AT-1447_Ret_R1 [(ATGAGGAATTCCTGC, A0301, m16, 0), (... \n", + "\n", + " FASTQs \n", + "Sample \n", + "AT-1447_Ret_R1 {'R1': ['s3://dp-lab-data/collaborat... " + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples[\"FASTQs\"] = samples[\"AWS_storage\"].apply(lambda x: get_fastqs(x, fastq_map[prefix], \"FASTQ\"))\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "IMPORTANT NOTE\n", + "\n", + "Make sure that your files are not archived. The following command will print any FASTQ file that is archived. Unarchive the files and then come back to processing the sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AT-1447_Ret_R1\n", + "collaborators/priya/MemConsolidationVr/AT-1447_Ret_R1_CPL/FASTQ/3409_AT-1447_Ret_R1_CPL_IGO_12437_Q_3_S32_L003_I1_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R1_CPL/FASTQ/3409_AT-1447_Ret_R1_CPL_IGO_12437_Q_3_S32_L003_I2_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R1_CPL/FASTQ/3409_AT-1447_Ret_R1_CPL_IGO_12437_Q_3_S32_L003_R1_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R1_CPL/FASTQ/3409_AT-1447_Ret_R1_CPL_IGO_12437_Q_3_S32_L003_R2_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R1_CPL/FASTQ/3409_AT-1447_Ret_R1_CPL_IGO_12437_Q_3_S32_L004_I1_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R1_CPL/FASTQ/3409_AT-1447_Ret_R1_CPL_IGO_12437_Q_3_S32_L004_I2_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R1_CPL/FASTQ/3409_AT-1447_Ret_R1_CPL_IGO_12437_Q_3_S32_L004_R1_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R1_CPL/FASTQ/3409_AT-1447_Ret_R1_CPL_IGO_12437_Q_3_S32_L004_R2_001.fastq.gz\n", + "\n", + "AT-1447_Ret_R2\n", + 
"collaborators/priya/MemConsolidationVr/AT-1447_Ret_R2_CPL/FASTQ/3410_AT-1447_Ret_R2_CPL_IGO_12437_Q_4_S33_L003_I1_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R2_CPL/FASTQ/3410_AT-1447_Ret_R2_CPL_IGO_12437_Q_4_S33_L003_I2_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R2_CPL/FASTQ/3410_AT-1447_Ret_R2_CPL_IGO_12437_Q_4_S33_L003_R1_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R2_CPL/FASTQ/3410_AT-1447_Ret_R2_CPL_IGO_12437_Q_4_S33_L003_R2_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R2_CPL/FASTQ/3410_AT-1447_Ret_R2_CPL_IGO_12437_Q_4_S33_L004_I1_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R2_CPL/FASTQ/3410_AT-1447_Ret_R2_CPL_IGO_12437_Q_4_S33_L004_I2_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R2_CPL/FASTQ/3410_AT-1447_Ret_R2_CPL_IGO_12437_Q_4_S33_L004_R1_001.fastq.gz\tcollaborators/priya/MemConsolidationVr/AT-1447_Ret_R2_CPL/FASTQ/3410_AT-1447_Ret_R2_CPL_IGO_12437_Q_4_S33_L004_R2_001.fastq.gz\n", + "\n" + ] + } + ], + "source": [ + "for sample, row in samples.iterrows():\n", + " print(sample)\n", + " fastqs = np.ravel(list(row['FASTQs'].values()))\n", + " dirnames = set([os.path.dirname(x) for x in fastqs])\n", + " \n", + " for dirname in dirnames:\n", + " file = dirname.replace('s3://', '')\n", + " \n", + " bucket = file.split('/')[0]\n", + " pre = file.replace(f'{bucket}/', '')\n", + " \n", + " !aws s3api list-objects-v2 --bucket $bucket --prefix $pre --query \"Contents[?StorageClass!='STANDARD'].Key\" --output text\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [], + "source": [ + "for sample, row in samples.iterrows():\n", + " fastqs = row['FASTQs']\n", + " fastqs = list(np.ravel(list(fastqs.values())))\n", + " \n", + " for fastq in fastqs:\n", + " file = fastq.replace('s3://', '')\n", + " bucket = file.split('/')[0]\n", + " key = file.replace(f'{bucket}/', '')\n", + "\n", + " cmd = 
f'aws s3api restore-object --bucket {bucket} --key {key} --restore-request '\n", + " cmd += '\\'{\"Days\":25, \"GlacierJobParameters\":{\"Tier\":\"Standard\"}}\\''\n", + " os.system(cmd)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Upload the barcodes to AWS" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "045b7fc884bc488f950b3464c6dfa532", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00IMPORTANT NOTE\n", + "\n", + "Check what version of 10x you are using if you are using the outputs of SEQC to generate your whitelist!" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['10X_V3.1']" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples['sc_tech'].unique().tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "platform = '10x_v3'" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Hashtag.uriFastqR1Hashtag.uriFastqR2Hashtag.sampleNameHashtag.scRnaSeqPlatformHashtag.lengthR1Hashtag.lengthR2Hashtag.cellBarcodeWhitelistUriHashtag.cellBarcodeWhiteListMethodHashtag.translate10XBarcodesHashtag.hashTagList...Hashtag.umiEndPosHashtag.slidingWindowSearchHashtag.cbCollapsingDistanceHashtag.umiCollapsingDistanceHashtag.numExpectedCellsHashtag.minCountHashtag.denseCountMatrixHashtag.resourceSpecHashtag.demuxModeHashtag.dockerRegistry
Sample
AT-1447_Ret_R1[s3://dp-lab-data/collaborators/priy...[s3://dp-lab-data/collaborators/priy...AT-1447_Ret_R110x_v32815s3://dp-lab-data/collaborators/priya...SeqcDenseCountsMatrixCsvTrues3://dp-lab-data/collaborators/priya......28False11010s3://dp-lab-data/collaborators/priya...{'cpu': 32, 'memory': -1}1quay.io/hisplan
\n", + "

1 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " Hashtag.uriFastqR1 \\\n", + "Sample \n", + "AT-1447_Ret_R1 [s3://dp-lab-data/collaborators/priy... \n", + "\n", + " Hashtag.uriFastqR2 Hashtag.sampleName \\\n", + "Sample \n", + "AT-1447_Ret_R1 [s3://dp-lab-data/collaborators/priy... AT-1447_Ret_R1 \n", + "\n", + " Hashtag.scRnaSeqPlatform Hashtag.lengthR1 Hashtag.lengthR2 \\\n", + "Sample \n", + "AT-1447_Ret_R1 10x_v3 28 15 \n", + "\n", + " Hashtag.cellBarcodeWhitelistUri \\\n", + "Sample \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... \n", + "\n", + " Hashtag.cellBarcodeWhiteListMethod \\\n", + "Sample \n", + "AT-1447_Ret_R1 SeqcDenseCountsMatrixCsv \n", + "\n", + " Hashtag.translate10XBarcodes \\\n", + "Sample \n", + "AT-1447_Ret_R1 True \n", + "\n", + " Hashtag.hashTagList ... \\\n", + "Sample ... \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... ... \n", + "\n", + " Hashtag.umiEndPos Hashtag.slidingWindowSearch \\\n", + "Sample \n", + "AT-1447_Ret_R1 28 False \n", + "\n", + " Hashtag.cbCollapsingDistance Hashtag.umiCollapsingDistance \\\n", + "Sample \n", + "AT-1447_Ret_R1 1 1 \n", + "\n", + " Hashtag.numExpectedCells Hashtag.minCount \\\n", + "Sample \n", + "AT-1447_Ret_R1 0 10 \n", + "\n", + " Hashtag.denseCountMatrix \\\n", + "Sample \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... 
\n", + "\n", + " Hashtag.resourceSpec Hashtag.demuxMode \\\n", + "Sample \n", + "AT-1447_Ret_R1 {'cpu': 32, 'memory': -1} 1 \n", + "\n", + " Hashtag.dockerRegistry \n", + "Sample \n", + "AT-1447_Ret_R1 quay.io/hisplan \n", + "\n", + "[1 rows x 23 columns]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Standard inputs, modify fields as need be\n", + "\n", + "# Load minimum inputs and labels fields from templates\n", + "with open(f\"{config_dir}/cellplex.inputs.json\") as f:\n", + " std_inputs_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields,)\n", + "fastq_file_ids = fastq_map[prefix]\n", + "\n", + "# Annotate inputs\n", + "inputs[f\"{prefix}.sampleName\"] = samples.index\n", + "inputs[f\"{prefix}.scRnaSeqPlatform\"] = platform \n", + "\n", + "inputs[f\"{prefix}.lengthR1\"] = samples[\"Barcode_Params\"].apply(lambda x: x[\"umi\"])\n", + "inputs[f\"{prefix}.lengthR2\"] = samples[\"Barcode_Params\"].apply(lambda x: x[\"seq_length\"])\n", + "\n", + "inputs[f\"{prefix}.cbStartPos\"] = 1\n", + "inputs[f\"{prefix}.cbEndPos\"] = samples[\"Barcode_Params\"].apply(lambda x: x[\"cb\"])\n", + "inputs[f\"{prefix}.umiEndPos\"] = samples[\"Barcode_Params\"].apply(lambda x: x[\"umi\"])\n", + "inputs[f\"{prefix}.umiStartPos\"] = inputs[f\"{prefix}.cbEndPos\"] + 1\n", + "\n", + "# inputs[f\"{prefix}.trimPos\"] = samples[\"Barcode_Params\"].apply(lambda x: x[\"bp_shift\"])\n", + "\n", + "inputs[f\"{prefix}.cellBarcodeWhitelistUri\"] = samples[\"Whitelist_Params\"].apply(lambda x: x[\"uri\"])\n", + "inputs[f\"{prefix}.cellBarcodeWhiteListMethod\"] = samples[\"Whitelist_Params\"].apply(lambda x: x[\"method\"])\n", + "\n", + "inputs[f\"{prefix}.translate10XBarcodes\"] = True\n", + "\n", + "inputs[f\"{prefix}.hashTagList\"] = samples[\"AWS_storage\"] + f\"/{output_dirname}/tag-list.csv\" 
\n", + "inputs[f\"{prefix}.denseCountMatrix\"] = samples[\"denseCountMatrix\"]\n", + "\n", + "\n", + "for file_id in fastq_file_ids: # Set FASTQs\n", + " inputs[f\"{prefix}.uriFastq{file_id}\"] = samples[\"FASTQs\"].apply(lambda x: x[file_id])\n", + "\n", + "# ********************\n", + "# Defaults\n", + "# Note: These may need to be changed on a per-sample or per-execution basis\n", + "\n", + "inputs[f\"{prefix}.slidingWindowSearch\"] = False\n", + "inputs[f\"{prefix}.cbCollapsingDistance\"] = 1\n", + "inputs[f\"{prefix}.umiCollapsingDistance\"] = 1\n", + "inputs[f\"{prefix}.numExpectedCells\"] = 0\n", + "# Need trick to set dictionary for each row\n", + "common_resource_spec = {\n", + " \"cpu\": 32,\n", + " \"memory\": -1,\n", + "}\n", + "inputs[f\"{prefix}.resourceSpec\"] = inputs.iloc[:, 0].apply(lambda x: common_resource_spec)\n", + "if prefix == \"Hashtag\":\n", + " inputs[f\"{prefix}.minCount\"] = 10\n", + "inputs[f\"{prefix}.demuxMode\"] = 1\n", + "inputs[f\"{prefix}.dockerRegistry\"] = common_docker_registry\n", + "\n", + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['s3://dp-lab-data/collaborators/priya/MemConsolidationVr/AT-1447_Ret_R1/seqc-results/3409_AT-1447_Ret_R1_IGO_12437_P_10_dense.csv']" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(inputs[f'{prefix}.denseCountMatrix'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generate labels" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
Sample
AT-1447_Ret_R1HashtagMemory consolidation VRAT-1447_Ret_R1sohailns3://dp-lab-data/collaborators/priya...-sohailn
AT-1447_Ret_R2HashtagMemory consolidation VRAT-1447_Ret_R2sohailns3://dp-lab-data/collaborators/priya...-sohailn
\n", + "
" + ], + "text/plain": [ + " pipelineType project sample owner \\\n", + "Sample \n", + "AT-1447_Ret_R1 Hashtag Memory consolidation VR AT-1447_Ret_R1 sohailn \n", + "AT-1447_Ret_R2 Hashtag Memory consolidation VR AT-1447_Ret_R2 sohailn \n", + "\n", + " destination transfer comment \n", + "Sample \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... - sohailn \n", + "AT-1447_Ret_R2 s3://dp-lab-data/collaborators/priya... - sohailn " + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Standard labels, modify fields as need be\n", + "\n", + "with open(f\"{config_dir}/cellplex.labels.json\") as f:\n", + " std_labels_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "labels = pd.DataFrame(index=samples.index, columns=std_labels_fields,)\n", + "\n", + "labels[\"pipelineType\"] = pipeline_type\n", + "labels[\"project\"] = samples['project_id']\n", + "labels[\"sample\"] = labels.index\n", + "labels[\"owner\"] = creds[\"user\"]\n", + "labels[\"destination\"] = samples['AWS_storage'] + \"/\" + output_dirname\n", + "labels[\"transfer\"] = \"-\"\n", + "labels[\"comment\"] = creds[\"user\"]\n", + "\n", + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run samples" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Look over the samples before submitting one last time" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Hashtag.uriFastqR1Hashtag.uriFastqR2Hashtag.sampleNameHashtag.scRnaSeqPlatformHashtag.lengthR1Hashtag.lengthR2Hashtag.cellBarcodeWhitelistUriHashtag.cellBarcodeWhiteListMethodHashtag.translate10XBarcodesHashtag.hashTagList...Hashtag.umiEndPosHashtag.slidingWindowSearchHashtag.cbCollapsingDistanceHashtag.umiCollapsingDistanceHashtag.numExpectedCellsHashtag.minCountHashtag.denseCountMatrixHashtag.resourceSpecHashtag.demuxModeHashtag.dockerRegistry
Sample
AT-1447_Ret_R1[s3://dp-lab-data/collaborators/priy...[s3://dp-lab-data/collaborators/priy...AT-1447_Ret_R110x_v32815s3://dp-lab-data/collaborators/priya...SeqcDenseCountsMatrixCsvTrues3://dp-lab-data/collaborators/priya......28False11010s3://dp-lab-data/collaborators/priya...{'cpu': 32, 'memory': -1}1quay.io/hisplan
\n", + "

1 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " Hashtag.uriFastqR1 \\\n", + "Sample \n", + "AT-1447_Ret_R1 [s3://dp-lab-data/collaborators/priy... \n", + "\n", + " Hashtag.uriFastqR2 Hashtag.sampleName \\\n", + "Sample \n", + "AT-1447_Ret_R1 [s3://dp-lab-data/collaborators/priy... AT-1447_Ret_R1 \n", + "\n", + " Hashtag.scRnaSeqPlatform Hashtag.lengthR1 Hashtag.lengthR2 \\\n", + "Sample \n", + "AT-1447_Ret_R1 10x_v3 28 15 \n", + "\n", + " Hashtag.cellBarcodeWhitelistUri \\\n", + "Sample \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... \n", + "\n", + " Hashtag.cellBarcodeWhiteListMethod \\\n", + "Sample \n", + "AT-1447_Ret_R1 SeqcDenseCountsMatrixCsv \n", + "\n", + " Hashtag.translate10XBarcodes \\\n", + "Sample \n", + "AT-1447_Ret_R1 True \n", + "\n", + " Hashtag.hashTagList ... \\\n", + "Sample ... \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... ... \n", + "\n", + " Hashtag.umiEndPos Hashtag.slidingWindowSearch \\\n", + "Sample \n", + "AT-1447_Ret_R1 28 False \n", + "\n", + " Hashtag.cbCollapsingDistance Hashtag.umiCollapsingDistance \\\n", + "Sample \n", + "AT-1447_Ret_R1 1 1 \n", + "\n", + " Hashtag.numExpectedCells Hashtag.minCount \\\n", + "Sample \n", + "AT-1447_Ret_R1 0 10 \n", + "\n", + " Hashtag.denseCountMatrix \\\n", + "Sample \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... \n", + "\n", + " Hashtag.resourceSpec Hashtag.demuxMode \\\n", + "Sample \n", + "AT-1447_Ret_R1 {'cpu': 32, 'memory': -1} 1 \n", + "\n", + " Hashtag.dockerRegistry \n", + "Sample \n", + "AT-1447_Ret_R1 quay.io/hisplan \n", + "\n", + "[1 rows x 23 columns]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pipelineTypeprojectsampleownerdestinationtransfercomment
Sample
AT-1447_Ret_R1HashtagMemory consolidation VRAT-1447_Ret_R1sohailns3://dp-lab-data/collaborators/priya...-sohailn
\n", + "
" + ], + "text/plain": [ + " pipelineType project sample owner \\\n", + "Sample \n", + "AT-1447_Ret_R1 Hashtag Memory consolidation VR AT-1447_Ret_R1 sohailn \n", + "\n", + " destination transfer comment \n", + "Sample \n", + "AT-1447_Ret_R1 s3://dp-lab-data/collaborators/priya... - sohailn " + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "28519fe475714697b8af5b257dac468d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00 $file_out" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AT-1447_Ret_R1\n", + "CITE-seq-Count Version: 1.4.3\n", + "Correction:\n", + " Cell barcodes collapsing threshold: 1\n", + " Cell barcodes corrected: 209632\n", + " UMI collapsing threshold: 1\n", + " UMIs corrected: 269339\n", + "Date: 2022-11-04\n", + "Percentage mapped: 98\n", + "Percentage unmapped: 2\n", + "Reads processed: 99797110\n", + "Running time: 2.0 hours, 10.0 minutes, 33.55 seconds\n", + "Uncorrected cells: 0\n", + "\n", + "\n" + ] + } + ], + "source": [ + "import yaml\n", + "\n", + "for sample, row in labels.iterrows():\n", + " file_out = f'web_summary/{sample}.run_report.yaml'\n", + " \n", + " with open(file_out, 'r') as stream:\n", + " try:\n", + " parsed_yaml = yaml.safe_load(stream)\n", + " except yaml.YAMLError as exc:\n", + " print(exc)\n", + " \n", + " print(sample)\n", + " del parsed_yaml['Run parameters']\n", + " print(yaml.dump(parsed_yaml, default_flow_style=False))\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + 
"outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/Run_Transgenes_new.ipynb b/notebooks/Run_Transgenes_new.ipynb new file mode 100644 index 0000000..426b428 --- /dev/null +++ b/notebooks/Run_Transgenes_new.ipynb @@ -0,0 +1,439 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "a480ca5b-5dad-4673-9cd4-b9fb5137f40c", + "metadata": {}, + "outputs": [], + "source": [ + "import re, subprocess, boto3, json, shlex, mysql, os, urllib, logging\n", + "import pandas as pd\n", + "import numpy as np\n", + "from s3path import S3Path\n", + "from pathlib import Path\n", + "from tqdm.notebook import tqdm\n", + "from packaging import version\n", + "\n", + "import glob\n", + "import os\n", + "\n", + "pd.set_option(\"display.max_colwidth\", 40)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "7b1b09b4-7cde-45a8-8ae4-a401972d0acd", + "metadata": {}, + "outputs": [], + "source": [ + "from utils.utils import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6cb85f7c-d187-4a57-b795-e07c002eddc6", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "10b1a35a-406c-4f66-882d-10eeaabce91b", + "metadata": {}, + "source": [ + "# AWS setup" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4370e93d-2565-48cf-92e1-3bd6eac525d2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: AWS_ACCESS_KEY_ID=ASIAYPY66CWMGNQHAG7V\n", + "env: 
AWS_SECRET_ACCESS_KEY=Zt6lmgFmhx1Qn1/mTajuJHxORPKjMd7zHo9yxCv3\n", + "env: AWS_SESSION_TOKEN=IQoJb3JpZ2luX2VjEL///////////wEaCXVzLWVhc3QtMSJHMEUCIBOn2n7FYhkwDrJArswLQBQzsB8bEcfdUZp6ZJWuQkNlAiEA/ZQVvnqwuRjHE0/2cTzrTEwF8bS0pfOp5sd5XU9QANMq+AEIqP//////////ARADGgw1ODM2NDM1Njc1MTIiDMFsXq9WBJoyLEdVQyrMAdIJg0FT0qo+cpTVLPKnFLvyQCjzH/7ZlWLd5w1NkTkMDIR4W/d+02fgfncbF3cGwglqlO18saOqvMTOjEUvvvhihRLRzuSajiF3sqG6wsdwu6IgCyk6xdYap84Wt0uo5LshyJAma2fK31yVPTip6n6nxoCcTmsJjhvFtMRGlgiC4bIAeW2lC8lqGHPmyz//tutqj9fZeUR0qmGtr1ium7Gg39WX5IzOgHAWTbb/HSspeP0+xDVfdzOHZyA47RVNc2EIwIXoHCntzPmUEjCNjYqbBjqYAcTJn0OZLrzdFL5RsIVXPaoQMedyiuxr27Bmz4QZoB32whcGu/0osFhwL4la0v+BUXuz2zGPB+byyrGkMB249QQitouR1DIZ/mCUnEz6HvCEQwdH/yIFkeNcY6p+rV/1sOa7op3p43reWd7s9ui3mlQ4QDi63ZJG/aBmGMwmPrCMQVgW9vJR5vr+LjV4YWxqVJ1PqOiZnGf+\n", + "2021-10-07 15:31:32 agc-583643567512-us-east-1\n", + "2021-10-07 15:28:07 cdktoolkit-stagingbucket-d49u1xfb0sc4\n", + "2019-10-09 13:04:06 cf-templates-umiwbnq3566w-us-east-1\n", + "2019-10-10 12:46:54 dp-daily-reports\n", + "2017-09-26 10:15:15 dp-lab-data\n", + "2019-05-23 12:34:58 dp-lab-data-public\n", + "2018-03-01 13:29:07 dp-lab-glacier\n", + "2020-11-06 23:02:27 dp-lab-gwf-core\n", + "2021-07-15 18:14:01 dp-lab-gwf-core2\n", + "2017-09-26 10:17:18 dp-lab-home\n", + "2019-08-14 16:42:43 dp-lab-test\n", + "2019-04-25 12:35:35 elasticbeanstalk-us-east-1-583643567512\n", + "2019-12-13 15:53:57 scri-computational\n", + "2017-09-26 10:23:50 seqc-public\n" + ] + } + ], + "source": [ + "%env AWS_ACCESS_KEY_ID=ASIAYPY66CWMGNQHAG7V\n", + "%env AWS_SECRET_ACCESS_KEY=Zt6lmgFmhx1Qn1/mTajuJHxORPKjMd7zHo9yxCv3\n", + "%env 
AWS_SESSION_TOKEN=IQoJb3JpZ2luX2VjEL///////////wEaCXVzLWVhc3QtMSJHMEUCIBOn2n7FYhkwDrJArswLQBQzsB8bEcfdUZp6ZJWuQkNlAiEA/ZQVvnqwuRjHE0/2cTzrTEwF8bS0pfOp5sd5XU9QANMq+AEIqP//////////ARADGgw1ODM2NDM1Njc1MTIiDMFsXq9WBJoyLEdVQyrMAdIJg0FT0qo+cpTVLPKnFLvyQCjzH/7ZlWLd5w1NkTkMDIR4W/d+02fgfncbF3cGwglqlO18saOqvMTOjEUvvvhihRLRzuSajiF3sqG6wsdwu6IgCyk6xdYap84Wt0uo5LshyJAma2fK31yVPTip6n6nxoCcTmsJjhvFtMRGlgiC4bIAeW2lC8lqGHPmyz//tutqj9fZeUR0qmGtr1ium7Gg39WX5IzOgHAWTbb/HSspeP0+xDVfdzOHZyA47RVNc2EIwIXoHCntzPmUEjCNjYqbBjqYAcTJn0OZLrzdFL5RsIVXPaoQMedyiuxr27Bmz4QZoB32whcGu/0osFhwL4la0v+BUXuz2zGPB+byyrGkMB249QQitouR1DIZ/mCUnEz6HvCEQwdH/yIFkeNcY6p+rV/1sOa7op3p43reWd7s9ui3mlQ4QDi63ZJG/aBmGMwmPrCMQVgW9vJR5vr+LjV4YWxqVJ1PqOiZnGf+\n", + "\n", + "!aws s3 ls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1be9e8c4-2c18-4cb5-85cc-ee33b510f23c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "7091dcda-0a18-43a2-9c3f-077662f3bbc3", + "metadata": { + "tags": [] + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "66d94c6b-ca49-4e15-be39-13ee21ec94e3", + "metadata": {}, + "outputs": [], + "source": [ + "# Maps from .wdl name (prefix) to results dirname\n", + "results_dirs = {\n", + " \"TransgenesSeqc\": \"refdata-seqc\",\n", + " \"TransgenesCellRanger\": \"refdata-cellranger\",\n", + "}\n", + "\n", + "# Maps from .wdl name (prefix) to shell script\n", + "sh_files = {\n", + " \"TransgenesSeqc\": \"submit-seqc.sh\",\n", + " \"TransgenesCellRanger\": \"submit-cellranger.sh\",\n", + "}\n", + "\n", + "# Version (CellRanger or Star) to use; Should be in database\n", + "latest_STAR = \"2.5.3a\"\n", + "latest_CellRanger = \"6.1.1\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f3d482b7-cc91-44f8-9b80-93847138c22c", + "metadata": {}, + "outputs": [], + "source": [ + "# Which pipeline are you running\n", + "prefix = \"TransgenesCellRanger\" # Workflow to run; also .wdl 
filename prefix\n", + "output_dirname = \"refdata-cellranger\"\n", + "\n", + "workflow_dir = glob.glob(f\"{Path.home()}/scing/bin/transgenes*\")[0]\n", + "path_to_exec = f\"{workflow_dir}/submit.sh\" # CHANGE THIS FOR SHARP\n", + "\n", + "# Locations of workflow-related directories and files\n", + "path_to_cromwell_secrets = f\"{Path.home()}/.cromwell/cromwell-secrets.json\"\n", + "db_credentials_path = f\"{Path.home()}/.config.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cca545a8-ccc3-4e15-8813-ae10e2d07ba6", + "metadata": {}, + "outputs": [], + "source": [ + "# Location of docker files\n", + "common_docker_registry = \"quay.io/hisplan\"\n", + "pipeline_type = prefix # field in *.labels.json\n", + "comment = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "bdb9c13b-3129-4069-81a6-19919996f094", + "metadata": {}, + "outputs": [], + "source": [ + "# Workflow file paths\n", + "config_dir = f\"{workflow_dir}/configs\"\n", + "path_to_options = f\"{workflow_dir}/Sharp.options.aws.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "48f3391c-cd0e-48da-b05c-4adcc4e44f87", + "metadata": {}, + "outputs": [], + "source": [ + "# Set credentials based on SCRIdb CLI config file\n", + "with open(db_credentials_path) as f:\n", + " creds = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6670e0cc-8731-41cc-8d31-761deb3f3a1b", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "e88c91c2-6044-463e-9bd9-14e1631963b4", + "metadata": {}, + "source": [ + "# Sample information" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "328b93a1-447a-4f30-9d5c-8cf82497fa43", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreference
Sample
ARN-1167_M-4T1s3://dp-lab-data/collaborators/aboir...3249mouse10X_V3.1Leptomeningeal metastasis heterogeneitys3://seqc-public/genomes/mm38_long_p...
ARN-1167_Normals3://dp-lab-data/collaborators/aboir...3250mouse10X_V3.1Leptomeningeal metastasis heterogeneitys3://seqc-public/genomes/mm38_long_p...
ARN-1167_PM-4T1s3://dp-lab-data/collaborators/aboir...3251mouse10X_V3.1Leptomeningeal metastasis heterogeneitys3://seqc-public/genomes/mm38_long_p...
\n", + "
" + ], + "text/plain": [ + " AWS_storage id species \\\n", + "Sample \n", + "ARN-1167_M-4T1 s3://dp-lab-data/collaborators/aboir... 3249 mouse \n", + "ARN-1167_Normal s3://dp-lab-data/collaborators/aboir... 3250 mouse \n", + "ARN-1167_PM-4T1 s3://dp-lab-data/collaborators/aboir... 3251 mouse \n", + "\n", + " sc_tech project_id \\\n", + "Sample \n", + "ARN-1167_M-4T1 10X_V3.1 Leptomeningeal metastasis heterogeneity \n", + "ARN-1167_Normal 10X_V3.1 Leptomeningeal metastasis heterogeneity \n", + "ARN-1167_PM-4T1 10X_V3.1 Leptomeningeal metastasis heterogeneity \n", + "\n", + " reference \n", + "Sample \n", + "ARN-1167_M-4T1 s3://seqc-public/genomes/mm38_long_p... \n", + "ARN-1167_Normal s3://seqc-public/genomes/mm38_long_p... \n", + "ARN-1167_PM-4T1 s3://seqc-public/genomes/mm38_long_p... " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can supply a list of IDs or request IDs\n", + "\n", + "request_ids = ['ARN-1167']\n", + "samples = sample_scridb_info(request_ids, 'request_id', creds)\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b837d999-a7e3-431b-ab24-1076ed1876ae", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "aa4f0df0-2405-4d1a-aaf2-d2f10246690e", + "metadata": {}, + "source": [ + "# Manually add AWS path to fasta files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d229087-fb55-479e-aa12-9a98f323e41d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2529ec18-37f3-42e9-a91a-8f0be5b0c2e4", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "5b371e6a-d0c1-4d6e-bf7e-0af3a3317e53", + "metadata": {}, + "source": [ + "# Make labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c6cfd3b-c96e-4605-8961-00f210584517", + "metadata": {}, + "outputs": 
[], + "source": [ + "# Load minimum inputs and labels fields from templates\n", + "pipeline_type = prefix.split('Transgenes')[-1].lower()\n", + "with open(f\"{config_dir}/template.{pipeline_type}.inputs.json\") as f:\n", + " std_inputs_fields = list(json.load(f).keys())\n", + " \n", + "with open(f\"{config_dir}/template.{pipeline_type}.labels.json\") as f:\n", + " std_labels_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "inputs = pd.DataFrame(index=submissions.index, columns=std_inputs_fields,)\n", + "labels = pd.DataFrame(index=submissions.index, columns=std_labels_fields,)\n", + "\n", + "# Annotate inputs\n", + "inputs[f\"{prefix}.referenceName\"] = \\\n", + " submissions[\"Project_Name\"] + \"-\" + \\\n", + " submissions[\"Parsed_Species\"].apply(lambda x: f\"{x[1]}-Ensembl-{x[2]}\") + \"-\" \\\n", + " \"transgenes\"\n", + "inputs[f\"{prefix}.genomeReferenceFasta\"] = \\\n", + " submissions[\"Parsed_Species\"].apply(lambda x: ref_map[x[1:]][1])\n", + "inputs[f\"{prefix}.annotationGtf\"] = \\\n", + " submissions[\"Parsed_Species\"].apply(lambda x: ref_map[x[1:]][0])\n", + "inputs[f\"{prefix}.customFastaFiles\"] = \\\n", + " submissions[\"AWS S3 location(s) of the FASTA sequence of your reporter genes\"].apply(lambda x: x.split(\"\\n\"))\n", + "inputs[f\"{prefix}.ensembleIdPrefix\"] = \\\n", + " submissions[\"Parsed_Species\"].apply(lambda x: ensembl_map[x[0]])\n", + "inputs[f\"{prefix}.ensembleIds\"] = submissions[\"Ensembl ID of transgene\"].apply(lambda x: x.split(\"\\n\"))\n", + "if pipeline_type == \"TransgenesSEQC\":\n", + " inputs[f\"{prefix}.starVersion\"] = latest_STAR\n", + "else:\n", + " inputs[f\"{prefix}.cellRangerVersion\"] = latest_CellRanger\n", + "standard_biotypes = [\n", + " \"protein_coding\", \"lincRNA\", \"antisense\",\n", + " \"IG_V_gene\", \"IG_D_gene\", \"IG_J_gene\", \"IG_C_gene\",\n", + " \"TR_V_gene\", \"TR_D_gene\", \"TR_J_gene\", \"TR_C_gene\",\n", + "]\n", + 
"inputs[f\"{prefix}.biotypes\"] = inputs.iloc[:, 0].apply(lambda x: standard_biotypes)\n", + "inputs[f\"{prefix}.dockerRegistry\"] = common_docker_registry" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be14a08e-71f3-41ea-b716-e9e69ee7c44c", + "metadata": {}, + "outputs": [], + "source": [ + "# Annotate labels\n", + "labels[\"pipelineType\"] = prefix\n", + "labels[\"project\"] = submissions[\"Project_Name\"]\n", + "labels[\"sample\"] = labels.index\n", + "labels[\"owner\"] = creds[\"user\"]\n", + "labels[\"destination\"] = \\\n", + " submissions[\"AWS S3 location where you want the newly built genome to be stored\"].str.strip(\"/\") + \\\n", + " \"/\" + output_dirname\n", + " \n", + "labels[\"transfer\"] = \"-\"\n", + "labels[\"comment\"] = creds[\"user\"]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/Run_Unarchive.ipynb b/notebooks/Run_Unarchive.ipynb new file mode 100644 index 0000000..46928cb --- /dev/null +++ b/notebooks/Run_Unarchive.ipynb @@ -0,0 +1,2316 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 33, + "id": "786ff4b8-3736-4713-ae19-bc6ff2a064b0", + "metadata": {}, + "outputs": [], + "source": [ + "import re, subprocess, boto3, json, shlex, mysql, os, urllib, logging\n", + "import pandas as pd\n", + "import numpy as np\n", + "from s3path import S3Path\n", + "from pathlib import Path\n", + "from tqdm.notebook import tqdm\n", + "from packaging import version\n", + "\n", + "import glob\n", + "import os\n", + "\n", + "pd.set_option(\"display.max_colwidth\", 40)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 34, + "id": "b376274a-736f-4b6f-8f00-1d59b50aa2d7", + "metadata": {}, + "outputs": [], + "source": [ + "from utils.utils import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c190d0ba-29b5-4ec3-a184-f4baa20de020", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "bc22f78b-0a39-44a1-8e73-adf0193aa2ae", + "metadata": {}, + "source": [ + "# AWS setup" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "fe926571-9c8b-4b9e-a18b-1dbf3d2f2b65", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: AWS_ACCESS_KEY_ID=ASIAYPY66CWMLCBOSPXR\n", + "env: AWS_SECRET_ACCESS_KEY=8d1QIWmMHqKC+EJogVNghypTHwJdYkh2ouE0AsfF\n", + "env: AWS_SESSION_TOKEN=IQoJb3JpZ2luX2VjEAwaCXVzLWVhc3QtMSJHMEUCIBKpQZUyEbGT/3d6MIci71Fmxb2kagr4rj2i31pSLgDmAiEAlOyH+ApwWLSYLHEEjVbGLUbJgjaVUE8DhuHq7C1ojmoq+AEI5f//////////ARAEGgw1ODM2NDM1Njc1MTIiDHuVIB+ajNumPYXAFSrMARZ+1iEQSZSyhpvFWsVgmKG0eEbXmXU9HcTqVpIZ7ZFJjoGlsDx/TA7KIR+jX0KY52BL5zUnBaZNHSLu46y2VoSQ90qQdOpf3+d6rd8yQ0suuJ09Ywpg6BINBkw5oHByT++xcd0+7thePo+26U3oZOH6H1OYM1YiHYEQ6syKw5SZ/mNh/soIFO9UOiy5Yq0ZVFq185bppVxKC7AQEbQWoSAF2n+PnOVHlBQK7ARVmJxyvUhRCG0A+TyQWp6NJ90kZdObIr/mxXlR7FMEmDDRvq6hBjqYAesJGkceqUfrzrYzZ1Mt9CQvTaAl7H6fDeVe1nn87E/Fy1PA32nbtaRcbk/5kvO5dIi0TMLAMJsH46yNsNdrtN/6wZqbLIwawkMiiYbCdqphmeVGe3Ks1B3iXN1OluG6OC+hhvwiKoAO+SnfvpsdodaN4/0J3y7fUi8M4O7MCOwFWwCl1lo/oSMw4i0w25ZWK55SRL/EbOmH\n", + "2021-10-07 15:31:32 agc-583643567512-us-east-1\n", + "2021-10-07 15:28:07 cdktoolkit-stagingbucket-d49u1xfb0sc4\n", + "2019-10-09 13:04:06 cf-templates-umiwbnq3566w-us-east-1\n", + "2019-10-10 12:46:54 dp-daily-reports\n", + "2017-09-26 10:15:15 dp-lab-data\n", + "2019-05-23 12:34:58 dp-lab-data-public\n", + "2018-03-01 13:29:07 dp-lab-glacier\n", + "2020-11-06 23:02:27 dp-lab-gwf-core\n", + "2021-07-15 18:14:01 dp-lab-gwf-core2\n", + "2017-09-26 10:17:18 dp-lab-home\n", + "2019-08-14 16:42:43 
dp-lab-test\n", + "2019-04-25 12:35:35 elasticbeanstalk-us-east-1-583643567512\n", + "2019-12-13 15:53:57 scri-computational\n", + "2017-09-26 10:23:50 seqc-public\n" + ] + } + ], + "source": [ + "# Load aws\n", + "\n", + "%env AWS_ACCESS_KEY_ID=ASIAYPY66CWMLCBOSPXR\n", + "%env AWS_SECRET_ACCESS_KEY=8d1QIWmMHqKC+EJogVNghypTHwJdYkh2ouE0AsfF\n", + "%env AWS_SESSION_TOKEN=IQoJb3JpZ2luX2VjEAwaCXVzLWVhc3QtMSJHMEUCIBKpQZUyEbGT/3d6MIci71Fmxb2kagr4rj2i31pSLgDmAiEAlOyH+ApwWLSYLHEEjVbGLUbJgjaVUE8DhuHq7C1ojmoq+AEI5f//////////ARAEGgw1ODM2NDM1Njc1MTIiDHuVIB+ajNumPYXAFSrMARZ+1iEQSZSyhpvFWsVgmKG0eEbXmXU9HcTqVpIZ7ZFJjoGlsDx/TA7KIR+jX0KY52BL5zUnBaZNHSLu46y2VoSQ90qQdOpf3+d6rd8yQ0suuJ09Ywpg6BINBkw5oHByT++xcd0+7thePo+26U3oZOH6H1OYM1YiHYEQ6syKw5SZ/mNh/soIFO9UOiy5Yq0ZVFq185bppVxKC7AQEbQWoSAF2n+PnOVHlBQK7ARVmJxyvUhRCG0A+TyQWp6NJ90kZdObIr/mxXlR7FMEmDDRvq6hBjqYAesJGkceqUfrzrYzZ1Mt9CQvTaAl7H6fDeVe1nn87E/Fy1PA32nbtaRcbk/5kvO5dIi0TMLAMJsH46yNsNdrtN/6wZqbLIwawkMiiYbCdqphmeVGe3Ks1B3iXN1OluG6OC+hhvwiKoAO+SnfvpsdodaN4/0J3y7fUi8M4O7MCOwFWwCl1lo/oSMw4i0w25ZWK55SRL/EbOmH\n", + "!aws s3 ls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa9818c3-5a2e-40eb-ac23-aed04f3d8835", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "f0b6b1b0-1ad6-4a09-aae9-1b297293198a", + "metadata": { + "tags": [] + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f3d482b7-cc91-44f8-9b80-93847138c22c", + "metadata": {}, + "outputs": [], + "source": [ + "# Locations of workflow-related directories and files\n", + "path_to_cromwell_secrets = f\"{Path.home()}/.cromwell/cromwell-secrets.json\"\n", + "db_credentials_path = f\"{Path.home()}/.config.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0b774596-c7ee-45b4-b3db-82df63451034", + "metadata": {}, + "outputs": [], + "source": [ + "# Set credentials based on SCRIdb CLI config file\n", + "with open(db_credentials_path) as f:\n", + " 
creds = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "167bd533-363f-4110-b163-b27664fb6f17", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "73d1fe27-747d-4bde-a37b-8a1b5c3bdd5c", + "metadata": {}, + "source": [ + "# Sample information" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "5a213cdc-45cc-4204-9060-b39dbf3b5c12", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreference
Sample
AV-1759_Ru1083_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3924humanCell_RangerLung Tumor AtlasGRCh38-3.0.0
AV-1760_MSK_LX_1083c_T_2_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3925humanCell_RangerLung Tumor AtlasGRCh38-3.0.0
AV-1761_POSIE_101920_T_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3926humanCell_RangerLung Tumor AtlasGRCh38-3.0.0
AV-1762_Ru1083d_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3927humanCell_RangerLung Tumor AtlasGRCh38-3.0.0
AV-1763_Ru1250C_T_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3928humanCell_RangerLung Tumor AtlasGRCh38-3.0.0
AV-1764_MSK_LX_1250b_PM_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3929humanCell_RangerLung Tumor AtlasGRCh38-3.0.0
AV-1764_Ru1250D_T_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3930humanCell_RangerLung Tumor AtlasGRCh38-3.0.0
AV-1765_Ru1250e_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3931humanCell_RangerLung Tumor AtlasGRCh38-3.0.0
AV-1766_MSK_LX_1250f_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3932humanCell_RangerLung Tumor AtlasGRCh38-3.0.0
AV-1760_Ru263_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...3933humanCell_RangerLung Tumor AtlasGRCh38-3.0.0
\n", + "
" + ], + "text/plain": [ + " AWS_storage id \\\n", + "Sample \n", + "AV-1759_Ru1083_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... 3924 \n", + "AV-1760_MSK_LX_1083c_T_2_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... 3925 \n", + "AV-1761_POSIE_101920_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... 3926 \n", + "AV-1762_Ru1083d_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... 3927 \n", + "AV-1763_Ru1250C_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... 3928 \n", + "AV-1764_MSK_LX_1250b_PM_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... 3929 \n", + "AV-1764_Ru1250D_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... 3930 \n", + "AV-1765_Ru1250e_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... 3931 \n", + "AV-1766_MSK_LX_1250f_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... 3932 \n", + "AV-1760_Ru263_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... 3933 \n", + "\n", + " species sc_tech project_id \\\n", + "Sample \n", + "AV-1759_Ru1083_MITO human Cell_Ranger Lung Tumor Atlas \n", + "AV-1760_MSK_LX_1083c_T_2_MITO human Cell_Ranger Lung Tumor Atlas \n", + "AV-1761_POSIE_101920_T_1_MITO human Cell_Ranger Lung Tumor Atlas \n", + "AV-1762_Ru1083d_MITO human Cell_Ranger Lung Tumor Atlas \n", + "AV-1763_Ru1250C_T_1_MITO human Cell_Ranger Lung Tumor Atlas \n", + "AV-1764_MSK_LX_1250b_PM_1_MITO human Cell_Ranger Lung Tumor Atlas \n", + "AV-1764_Ru1250D_T_1_MITO human Cell_Ranger Lung Tumor Atlas \n", + "AV-1765_Ru1250e_MITO human Cell_Ranger Lung Tumor Atlas \n", + "AV-1766_MSK_LX_1250f_MITO human Cell_Ranger Lung Tumor Atlas \n", + "AV-1760_Ru263_MITO human Cell_Ranger Lung Tumor Atlas \n", + "\n", + " reference \n", + "Sample \n", + "AV-1759_Ru1083_MITO GRCh38-3.0.0 \n", + "AV-1760_MSK_LX_1083c_T_2_MITO GRCh38-3.0.0 \n", + "AV-1761_POSIE_101920_T_1_MITO GRCh38-3.0.0 \n", + "AV-1762_Ru1083d_MITO GRCh38-3.0.0 \n", + "AV-1763_Ru1250C_T_1_MITO GRCh38-3.0.0 \n", + "AV-1764_MSK_LX_1250b_PM_1_MITO GRCh38-3.0.0 \n", + "AV-1764_Ru1250D_T_1_MITO GRCh38-3.0.0 \n", + "AV-1765_Ru1250e_MITO GRCh38-3.0.0 \n", + 
"AV-1766_MSK_LX_1250f_MITO GRCh38-3.0.0 \n", + "AV-1760_Ru263_MITO GRCh38-3.0.0 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can supply a list of IDs or request IDs\n", + "\n", + "# request_ids = ['BM-1408', 'BM-1286', ]\n", + "# request_ids = ['s3://dp-lab-data/collaborators/lowe/9P/IgG_A/', \n", + "# 's3://dp-lab-data/collaborators/lowe/9P/IgG_C/', \n", + "# 's3://dp-lab-data/collaborators/lowe/9P/IFNAR1_A/',\n", + "# 's3://dp-lab-data/collaborators/lowe/9P/IFNAR_A/'\n", + "# ]\n", + "# samples = sample_scridb_info(request_ids, 'AWS_storage', creds)\n", + "\n", + "sample_ids = list(range(3924, 3934))\n", + "samples = sample_scridb_info(sample_ids, 'id', creds)\n", + "\n", + "\n", + "# Specify the type of pipeline you are going to be running\n", + "# prefix = ''\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c966b3d-a2cf-4200-909b-9d485112ae4a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "5e2023cd-a58a-4b1c-8424-dbc30f5c8f88", + "metadata": {}, + "source": [ + "# FASTQs" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "3e8e1456-a502-4c35-a614-f1bc6e458d18", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
joe_sample_nameAWS_storagesample
0RU1083_LIVs3://dp-lab-data/SCRI_Projects/HTA/M...Ru1083_MITO
1RU1083_T2s3://dp-lab-data/SCRI_Projects/HTA/M...MSK_LX_1083c_T_2_MITO
2RU263_PDXs3://dp-lab-data/SCRI_Projects/HTA/M...Ru263_MITO
3RU1083_T1s3://dp-lab-data/SCRI_Projects/HTA/M...POSIE_101920_T_1_MITO
4RU1083_STs3://dp-lab-data/SCRI_Projects/HTA/M...Ru1083d_MITO
5RU1250_T1s3://dp-lab-data/SCRI_Projects/HTA/M...Ru1250C_T_1_MITO
6RU1250_PLs3://dp-lab-data/SCRI_Projects/HTA/M...MSK_LX_1250b_PM_1_MITO
7RU1250_T2s3://dp-lab-data/SCRI_Projects/HTA/M...Ru1250D_T_1_MITO
8RU1250_ASC1s3://dp-lab-data/SCRI_Projects/HTA/M...Ru1250e_MITO
9RU1250_ASC2s3://dp-lab-data/SCRI_Projects/HTA/M...MSK_LX_1250f_MITO
10RU581_LIVs3://dp-lab-data/SCRI_Projects/HTA/M...Ru581D_MITO
11RU581_Tas3://dp-lab-data/SCRI_Projects/HTA/M...Ru581b_T1_MITO
13RU581_LNas3://dp-lab-data/SCRI_Projects/HTA/M...Ru581c-LN1_MITO
\n", + "
" + ], + "text/plain": [ + " joe_sample_name AWS_storage \\\n", + "0 RU1083_LIV s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "1 RU1083_T2 s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "2 RU263_PDX s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "3 RU1083_T1 s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "4 RU1083_ST s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "5 RU1250_T1 s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "6 RU1250_PL s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "7 RU1250_T2 s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "8 RU1250_ASC1 s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "9 RU1250_ASC2 s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "10 RU581_LIV s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "11 RU581_Ta s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "13 RU581_LNa s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "\n", + " sample \n", + "0 Ru1083_MITO \n", + "1 MSK_LX_1083c_T_2_MITO \n", + "2 Ru263_MITO \n", + "3 POSIE_101920_T_1_MITO \n", + "4 Ru1083d_MITO \n", + "5 Ru1250C_T_1_MITO \n", + "6 MSK_LX_1250b_PM_1_MITO \n", + "7 Ru1250D_T_1_MITO \n", + "8 Ru1250e_MITO \n", + "9 MSK_LX_1250f_MITO \n", + "10 Ru581D_MITO \n", + "11 Ru581b_T1_MITO \n", + "13 Ru581c-LN1_MITO " + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples = pd.read_csv('joe_samples_unarchive.txt', index_col=0)\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "d89a8acf-4360-4b75-af0c-51af9be396a0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
joe_sample_nameAWS_storagesampleaws_pathFASTQs
0RU1083_LIVs3://dp-lab-data/SCRI_Projects/HTA/M...Ru1083_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...None
1RU1083_T2s3://dp-lab-data/SCRI_Projects/HTA/M...MSK_LX_1083c_T_2_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...None
2RU263_PDXs3://dp-lab-data/SCRI_Projects/HTA/M...Ru263_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...None
3RU1083_T1s3://dp-lab-data/SCRI_Projects/HTA/M...POSIE_101920_T_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...None
4RU1083_STs3://dp-lab-data/SCRI_Projects/HTA/M...Ru1083d_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...None
5RU1250_T1s3://dp-lab-data/SCRI_Projects/HTA/M...Ru1250C_T_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...None
6RU1250_PLs3://dp-lab-data/SCRI_Projects/HTA/M...MSK_LX_1250b_PM_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...None
7RU1250_T2s3://dp-lab-data/SCRI_Projects/HTA/M...Ru1250D_T_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...None
8RU1250_ASC1s3://dp-lab-data/SCRI_Projects/HTA/M...Ru1250e_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...None
9RU1250_ASC2s3://dp-lab-data/SCRI_Projects/HTA/M...MSK_LX_1250f_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...None
10RU581_LIVs3://dp-lab-data/SCRI_Projects/HTA/M...Ru581D_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...None
11RU581_Tas3://dp-lab-data/SCRI_Projects/HTA/M...Ru581b_T1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...None
13RU581_LNas3://dp-lab-data/SCRI_Projects/HTA/M...Ru581c-LN1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...None
\n", + "
" + ], + "text/plain": [ + " joe_sample_name AWS_storage \\\n", + "0 RU1083_LIV s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "1 RU1083_T2 s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "2 RU263_PDX s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "3 RU1083_T1 s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "4 RU1083_ST s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "5 RU1250_T1 s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "6 RU1250_PL s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "7 RU1250_T2 s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "8 RU1250_ASC1 s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "9 RU1250_ASC2 s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "10 RU581_LIV s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "11 RU581_Ta s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "13 RU581_LNa s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "\n", + " sample aws_path FASTQs \n", + "0 Ru1083_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... None \n", + "1 MSK_LX_1083c_T_2_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... None \n", + "2 Ru263_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... None \n", + "3 POSIE_101920_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... None \n", + "4 Ru1083d_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... None \n", + "5 Ru1250C_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... None \n", + "6 MSK_LX_1250b_PM_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... None \n", + "7 Ru1250D_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... None \n", + "8 Ru1250e_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... None \n", + "9 MSK_LX_1250f_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... None \n", + "10 Ru581D_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... None \n", + "11 Ru581b_T1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... None \n", + "13 Ru581c-LN1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... 
None " + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples['AWS_storage'] = samples['AWS_storage'].str.strip('/')\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "fec4f19e-a9c7-432a-b2ca-78b94c251eaf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
joe_sample_nameAWS_storagesampleaws_pathFASTQs
0RU1083_LIVs3://dp-lab-data/SCRI_Projects/HTA/M...Ru1083_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...{'R2': ['s3://dp-lab-data/SCRI_Proje...
1RU1083_T2s3://dp-lab-data/SCRI_Projects/HTA/M...MSK_LX_1083c_T_2_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...{'R2': ['s3://dp-lab-data/SCRI_Proje...
2RU263_PDXs3://dp-lab-data/SCRI_Projects/HTA/M...Ru263_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...{'R2': ['s3://dp-lab-data/SCRI_Proje...
3RU1083_T1s3://dp-lab-data/SCRI_Projects/HTA/M...POSIE_101920_T_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...{'R2': ['s3://dp-lab-data/SCRI_Proje...
4RU1083_STs3://dp-lab-data/SCRI_Projects/HTA/M...Ru1083d_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...{'R2': ['s3://dp-lab-data/SCRI_Proje...
5RU1250_T1s3://dp-lab-data/SCRI_Projects/HTA/M...Ru1250C_T_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...{'R2': ['s3://dp-lab-data/SCRI_Proje...
6RU1250_PLs3://dp-lab-data/SCRI_Projects/HTA/M...MSK_LX_1250b_PM_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...{'R2': ['s3://dp-lab-data/SCRI_Proje...
7RU1250_T2s3://dp-lab-data/SCRI_Projects/HTA/M...Ru1250D_T_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...{'R2': ['s3://dp-lab-data/SCRI_Proje...
8RU1250_ASC1s3://dp-lab-data/SCRI_Projects/HTA/M...Ru1250e_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...{'R2': ['s3://dp-lab-data/SCRI_Proje...
9RU1250_ASC2s3://dp-lab-data/SCRI_Projects/HTA/M...MSK_LX_1250f_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...{'R2': ['s3://dp-lab-data/SCRI_Proje...
10RU581_LIVs3://dp-lab-data/SCRI_Projects/HTA/M...Ru581D_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...{'R2': ['s3://dp-lab-data/SCRI_Proje...
11RU581_Tas3://dp-lab-data/SCRI_Projects/HTA/M...Ru581b_T1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...{'R2': ['s3://dp-lab-data/SCRI_Proje...
13RU581_LNas3://dp-lab-data/SCRI_Projects/HTA/M...Ru581c-LN1_MITOs3://dp-lab-data/SCRI_Projects/HTA/M...{'R2': ['s3://dp-lab-data/SCRI_Proje...
\n", + "
" + ], + "text/plain": [ + " joe_sample_name AWS_storage \\\n", + "0 RU1083_LIV s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "1 RU1083_T2 s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "2 RU263_PDX s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "3 RU1083_T1 s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "4 RU1083_ST s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "5 RU1250_T1 s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "6 RU1250_PL s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "7 RU1250_T2 s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "8 RU1250_ASC1 s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "9 RU1250_ASC2 s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "10 RU581_LIV s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "11 RU581_Ta s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "13 RU581_LNa s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "\n", + " sample aws_path \\\n", + "0 Ru1083_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "1 MSK_LX_1083c_T_2_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "2 Ru263_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "3 POSIE_101920_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "4 Ru1083d_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "5 Ru1250C_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "6 MSK_LX_1250b_PM_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "7 Ru1250D_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "8 Ru1250e_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "9 MSK_LX_1250f_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "10 Ru581D_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "11 Ru581b_T1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "13 Ru581c-LN1_MITO s3://dp-lab-data/SCRI_Projects/HTA/M... \n", + "\n", + " FASTQs \n", + "0 {'R2': ['s3://dp-lab-data/SCRI_Proje... \n", + "1 {'R2': ['s3://dp-lab-data/SCRI_Proje... \n", + "2 {'R2': ['s3://dp-lab-data/SCRI_Proje... \n", + "3 {'R2': ['s3://dp-lab-data/SCRI_Proje... 
\n", + "4 {'R2': ['s3://dp-lab-data/SCRI_Proje... \n", + "5 {'R2': ['s3://dp-lab-data/SCRI_Proje... \n", + "6 {'R2': ['s3://dp-lab-data/SCRI_Proje... \n", + "7 {'R2': ['s3://dp-lab-data/SCRI_Proje... \n", + "8 {'R2': ['s3://dp-lab-data/SCRI_Proje... \n", + "9 {'R2': ['s3://dp-lab-data/SCRI_Proje... \n", + "10 {'R2': ['s3://dp-lab-data/SCRI_Proje... \n", + "11 {'R2': ['s3://dp-lab-data/SCRI_Proje... \n", + "13 {'R2': ['s3://dp-lab-data/SCRI_Proje... " + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples[\"FASTQs\"] = samples[\"AWS_storage\"].apply(lambda x: get_fastqs(x, ['R2'], \"FASTQ\"))\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "929d6279-ddaf-4855-a3fa-1d8440af5d48", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ru1083_MITO\n", + "MSK_LX_1083c_T_2_MITO\n", + "Ru263_MITO\n", + "POSIE_101920_T_1_MITO\n", + "Ru1083d_MITO\n", + "Ru1250C_T_1_MITO\n", + "MSK_LX_1250b_PM_1_MITO\n", + "Ru1250D_T_1_MITO\n", + "Ru1250e_MITO\n", + "MSK_LX_1250f_MITO\n", + "Ru581D_MITO\n", + "Ru581b_T1_MITO\n", + "Ru581c-LN1_MITO\n" + ] + } + ], + "source": [ + "for sample, row in samples.iterrows():\n", + " sample = row['sample']\n", + " print(sample)\n", + " \n", + " fastqs = row['FASTQs']\n", + " fastqs = list(np.ravel(list(fastqs.values())))\n", + " \n", + " for fastq in fastqs:\n", + " file = fastq.replace('s3://', '')\n", + " bucket = file.split('/')[0]\n", + " key = file.replace(f'{bucket}/', '')\n", + "\n", + " cmd = f'aws s3api restore-object --bucket {bucket} --key {key} --restore-request '\n", + " cmd += '\\'{\"Days\":25, \"GlacierJobParameters\":{\"Tier\":\"Standard\"}}\\''\n", + " os.system(cmd)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97a8d236-7723-445c-bf9c-e83c5580946e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "df01855a-c372-42cd-b23a-ebf5a7085af8", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "234696ce-4a50-4518-837b-e4bb0f37023c", + "metadata": {}, + "source": [ + "# SEQC" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "96d1e728-dba4-42b3-8599-2a1c479d4a33", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreferenceFASTQs
Sample
IgG_As3://dp-lab-data/collaborators/lowe/...2638mouse10X_V3.19ps3://seqc-public/genomes/mm38_long_p...{'R1': ['s3://dp-lab-data/collaborat...
IgG_Cs3://dp-lab-data/collaborators/lowe/...2639mouse10X_V3.19ps3://seqc-public/genomes/mm38_long_p...{'R1': ['s3://dp-lab-data/collaborat...
IFNAR_As3://dp-lab-data/collaborators/lowe/...2640mouse10X_V3.19ps3://seqc-public/genomes/mm38_long_p...{'R1': ['s3://dp-lab-data/collaborat...
IgG_As3://dp-lab-data/collaborators/lowe/...2917mouse10X_V3.19ps3://seqc-public/genomes/mm38_long_p...{'R1': ['s3://dp-lab-data/collaborat...
IgG_Cs3://dp-lab-data/collaborators/lowe/...2918mouse10X_V3.19ps3://seqc-public/genomes/mm38_long_p...{'R1': ['s3://dp-lab-data/collaborat...
IFNAR1_As3://dp-lab-data/collaborators/lowe/...2919mouse10X_V3.19ps3://seqc-public/genomes/mm38_long_p...{'R1': ['s3://dp-lab-data/collaborat...
\n", + "
" + ], + "text/plain": [ + " AWS_storage id species sc_tech \\\n", + "Sample \n", + "IgG_A s3://dp-lab-data/collaborators/lowe/... 2638 mouse 10X_V3.1 \n", + "IgG_C s3://dp-lab-data/collaborators/lowe/... 2639 mouse 10X_V3.1 \n", + "IFNAR_A s3://dp-lab-data/collaborators/lowe/... 2640 mouse 10X_V3.1 \n", + "IgG_A s3://dp-lab-data/collaborators/lowe/... 2917 mouse 10X_V3.1 \n", + "IgG_C s3://dp-lab-data/collaborators/lowe/... 2918 mouse 10X_V3.1 \n", + "IFNAR1_A s3://dp-lab-data/collaborators/lowe/... 2919 mouse 10X_V3.1 \n", + "\n", + " project_id reference \\\n", + "Sample \n", + "IgG_A 9p s3://seqc-public/genomes/mm38_long_p... \n", + "IgG_C 9p s3://seqc-public/genomes/mm38_long_p... \n", + "IFNAR_A 9p s3://seqc-public/genomes/mm38_long_p... \n", + "IgG_A 9p s3://seqc-public/genomes/mm38_long_p... \n", + "IgG_C 9p s3://seqc-public/genomes/mm38_long_p... \n", + "IFNAR1_A 9p s3://seqc-public/genomes/mm38_long_p... \n", + "\n", + " FASTQs \n", + "Sample \n", + "IgG_A {'R1': ['s3://dp-lab-data/collaborat... \n", + "IgG_C {'R1': ['s3://dp-lab-data/collaborat... \n", + "IFNAR_A {'R1': ['s3://dp-lab-data/collaborat... \n", + "IgG_A {'R1': ['s3://dp-lab-data/collaborat... \n", + "IgG_C {'R1': ['s3://dp-lab-data/collaborat... \n", + "IFNAR1_A {'R1': ['s3://dp-lab-data/collaborat... 
" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples[\"FASTQs\"] = samples[\"AWS_storage\"].apply(lambda x: get_fastqs(x, ['R1'], \"barcode\"))\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "032216a0-eeb7-41cd-8d35-6a8953b38a0f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error 
occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n" + ] + } + ], + "source": [ + "for sample, row in samples.iterrows():\n", + " fastqs = row['FASTQs']\n", + " fastqs = list(np.ravel(list(fastqs.values())))\n", + " \n", + " for fastq in fastqs:\n", + " file = fastq.replace('s3://', '')\n", + " bucket = file.split('/')[0]\n", + " key = file.replace(f'{bucket}/', '')\n", + "\n", + " cmd = f'aws s3api restore-object --bucket {bucket} --key {key} --restore-request '\n", + " cmd += '\\'{\"Days\":25, \"GlacierJobParameters\":{\"Tier\":\"Standard\"}}\\''\n", + " os.system(cmd)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "532ea79c-9ce1-416e-b6c8-fece3af49346", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "0c2ebf81-4ad0-48aa-a917-e6ecea929b6c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreferenceFASTQs
Sample
IgG_As3://dp-lab-data/collaborators/lowe/...2638mouse10X_V3.19ps3://seqc-public/genomes/mm38_long_p...{'R2': ['s3://dp-lab-data/collaborat...
IgG_Cs3://dp-lab-data/collaborators/lowe/...2639mouse10X_V3.19ps3://seqc-public/genomes/mm38_long_p...{'R2': ['s3://dp-lab-data/collaborat...
IFNAR_As3://dp-lab-data/collaborators/lowe/...2640mouse10X_V3.19ps3://seqc-public/genomes/mm38_long_p...{'R2': ['s3://dp-lab-data/collaborat...
IgG_As3://dp-lab-data/collaborators/lowe/...2917mouse10X_V3.19ps3://seqc-public/genomes/mm38_long_p...{'R2': ['s3://dp-lab-data/collaborat...
IgG_Cs3://dp-lab-data/collaborators/lowe/...2918mouse10X_V3.19ps3://seqc-public/genomes/mm38_long_p...{'R2': ['s3://dp-lab-data/collaborat...
IFNAR1_As3://dp-lab-data/collaborators/lowe/...2919mouse10X_V3.19ps3://seqc-public/genomes/mm38_long_p...{'R2': ['s3://dp-lab-data/collaborat...
\n", + "
" + ], + "text/plain": [ + " AWS_storage id species sc_tech \\\n", + "Sample \n", + "IgG_A s3://dp-lab-data/collaborators/lowe/... 2638 mouse 10X_V3.1 \n", + "IgG_C s3://dp-lab-data/collaborators/lowe/... 2639 mouse 10X_V3.1 \n", + "IFNAR_A s3://dp-lab-data/collaborators/lowe/... 2640 mouse 10X_V3.1 \n", + "IgG_A s3://dp-lab-data/collaborators/lowe/... 2917 mouse 10X_V3.1 \n", + "IgG_C s3://dp-lab-data/collaborators/lowe/... 2918 mouse 10X_V3.1 \n", + "IFNAR1_A s3://dp-lab-data/collaborators/lowe/... 2919 mouse 10X_V3.1 \n", + "\n", + " project_id reference \\\n", + "Sample \n", + "IgG_A 9p s3://seqc-public/genomes/mm38_long_p... \n", + "IgG_C 9p s3://seqc-public/genomes/mm38_long_p... \n", + "IFNAR_A 9p s3://seqc-public/genomes/mm38_long_p... \n", + "IgG_A 9p s3://seqc-public/genomes/mm38_long_p... \n", + "IgG_C 9p s3://seqc-public/genomes/mm38_long_p... \n", + "IFNAR1_A 9p s3://seqc-public/genomes/mm38_long_p... \n", + "\n", + " FASTQs \n", + "Sample \n", + "IgG_A {'R2': ['s3://dp-lab-data/collaborat... \n", + "IgG_C {'R2': ['s3://dp-lab-data/collaborat... \n", + "IFNAR_A {'R2': ['s3://dp-lab-data/collaborat... \n", + "IgG_A {'R2': ['s3://dp-lab-data/collaborat... \n", + "IgG_C {'R2': ['s3://dp-lab-data/collaborat... \n", + "IFNAR1_A {'R2': ['s3://dp-lab-data/collaborat... 
" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples[\"FASTQs\"] = samples[\"AWS_storage\"].apply(lambda x: get_fastqs(x, ['R2'], \"genomic\"))\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "fb9a6253-e6c4-4ab4-9c71-3c9e5938b05e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error 
occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n", + "\n", + "An error occurred (RestoreAlreadyInProgress) when calling the RestoreObject operation: Object restore is already in progress\n" + ] + } + ], + "source": [ + "for sample, row in samples.iterrows():\n", + " fastqs = row['FASTQs']\n", + " fastqs = list(np.ravel(list(fastqs.values())))\n", + " \n", + " for fastq in fastqs:\n", + " file = fastq.replace('s3://', '')\n", + " bucket = file.split('/')[0]\n", + " key = file.replace(f'{bucket}/', '')\n", + "\n", + " cmd = f'aws s3api restore-object --bucket {bucket} --key {key} --restore-request '\n", + " cmd += '\\'{\"Days\":25, \"GlacierJobParameters\":{\"Tier\":\"Standard\"}}\\''\n", + " os.system(cmd)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57a9c7e1-3a88-42e7-8f28-06f48a4e3f97", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "2fd9753e-6c19-496c-a265-123121826222", + "metadata": {}, + "source": [ + "# Joe mito" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "871dd670-8c1f-4e2b-a368-98302c5c0208", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_nameaws_path
0epi_Ru581D_MITOs3://dp-lab-data/SCRI_Projects/HTA/...
2epi_Ru581b_T1_MITOs3://dp-lab-data/SCRI_Projects/HTA/...
4epi_Ru581c-LN1_MITOs3://dp-lab-data/SCRI_Projects/HTA/...
6epi_Ru1083_MITOs3://dp-lab-data/SCRI_Projects/HTA/...
8epi_MSK_LX_1083c_T_2_MITOs3://dp-lab-data/SCRI_Projects/HTA/...
10epi_POSIE_101920_T_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/...
12epi_Ru1083d_MITOs3://dp-lab-data/SCRI_Projects/HTA/...
14epi_Ru1250C_T_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/...
16epi_MSK_LX_1250b_PM_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/...
18epi_Ru1250D_T_1_MITOs3://dp-lab-data/SCRI_Projects/HTA/...
20epi_Ru1250e_MITOs3://dp-lab-data/SCRI_Projects/HTA/...
22epi_MSK_LX_1250f_MITOs3://dp-lab-data/SCRI_Projects/HTA/...
24epi_Ru263_MITOs3://dp-lab-data/SCRI_Projects/HTA/...
\n", + "
" + ], + "text/plain": [ + " sample_name aws_path\n", + "0 epi_Ru581D_MITO s3://dp-lab-data/SCRI_Projects/HTA/...\n", + "2 epi_Ru581b_T1_MITO s3://dp-lab-data/SCRI_Projects/HTA/...\n", + "4 epi_Ru581c-LN1_MITO s3://dp-lab-data/SCRI_Projects/HTA/...\n", + "6 epi_Ru1083_MITO s3://dp-lab-data/SCRI_Projects/HTA/...\n", + "8 epi_MSK_LX_1083c_T_2_MITO s3://dp-lab-data/SCRI_Projects/HTA/...\n", + "10 epi_POSIE_101920_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/...\n", + "12 epi_Ru1083d_MITO s3://dp-lab-data/SCRI_Projects/HTA/...\n", + "14 epi_Ru1250C_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/...\n", + "16 epi_MSK_LX_1250b_PM_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/...\n", + "18 epi_Ru1250D_T_1_MITO s3://dp-lab-data/SCRI_Projects/HTA/...\n", + "20 epi_Ru1250e_MITO s3://dp-lab-data/SCRI_Projects/HTA/...\n", + "22 epi_MSK_LX_1250f_MITO s3://dp-lab-data/SCRI_Projects/HTA/...\n", + "24 epi_Ru263_MITO s3://dp-lab-data/SCRI_Projects/HTA/..." + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples = pd.read_csv('joe_samples.csv')\n", + "samples = samples.loc[samples['sample_name'].str.startswith('epi')]\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "dd2e640a-9647-4726-b44d-55ac1db70447", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['sample_name', ' aws_path']" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(samples)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e9cf5684-8c83-4a76-8046-b2b86a8e23cc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581D_MITO/epi_mito-tracing-outs\n", + "0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "An error occurred (InvalidToken) when calling the RestoreObject operation: 
The provided token is malformed or otherwise invalid.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581b_T1_MITO/epi_mito-tracing-outs\n", + "2\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "An error occurred (InvalidToken) when calling the RestoreObject operation: The provided token is malformed or otherwise invalid.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581c-LN1_MITO/epi_mito-tracing-outs\n", + "4\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "An error occurred (InvalidToken) when calling the RestoreObject operation: The provided token is malformed or otherwise invalid.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1759_Ru1083_MITO/epi_mito-tracing-outs\n", + "6\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "An error occurred (InvalidToken) when calling the RestoreObject operation: The provided token is malformed or otherwise invalid.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_MSK_LX_1083c_T_2_MITO/epi_mito-tracing-outs\n", + "8\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "An error occurred (InvalidToken) when calling the RestoreObject operation: The provided token is malformed or otherwise invalid.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1761_POSIE_101920_T_1_MITO/epi_mito-tracing-outs\n", + "10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "An error occurred (InvalidToken) when calling the RestoreObject 
operation: The provided token is malformed or otherwise invalid.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1762_Ru1083d_MITO/epi_mito-tracing-outs\n", + "12\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "An error occurred (InvalidToken) when calling the RestoreObject operation: The provided token is malformed or otherwise invalid.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1763_Ru1250C_T_1_MITO/epi_mito-tracing-outs\n", + "14\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "An error occurred (InvalidToken) when calling the RestoreObject operation: The provided token is malformed or otherwise invalid.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1764_MSK_LX_1250b_PM_1_MITO/epi_mito-tracing-outs\n", + "16\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "An error occurred (InvalidToken) when calling the RestoreObject operation: The provided token is malformed or otherwise invalid.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1764_Ru1250D_T_1_MITO/epi_mito-tracing-outs\n", + "18\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "An error occurred (InvalidToken) when calling the RestoreObject operation: The provided token is malformed or otherwise invalid.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1765_Ru1250e_MITO/epi_mito-tracing-outs\n", + "20\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "An error occurred (InvalidToken) when calling the RestoreObject 
operation: The provided token is malformed or otherwise invalid.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1766_MSK_LX_1250f_MITO/epi_mito-tracing-outs\n", + "22\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "An error occurred (InvalidToken) when calling the RestoreObject operation: The provided token is malformed or otherwise invalid.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_Ru263_MITO/epi_mito-tracing-outs\n", + "24\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "An error occurred (InvalidToken) when calling the RestoreObject operation: The provided token is malformed or otherwise invalid.\n" + ] + } + ], + "source": [ + "for sample, row in samples.iterrows():\n", + " path = row[' aws_path']\n", + " path = os.path.join(path, )\n", + " print(path)\n", + " \n", + " print(sample)\n", + " file = path.replace('s3://', '')\n", + " bucket = file.split('/')[0]\n", + " key = file.replace(f'{bucket}/', '')\n", + "\n", + " cmd = f'aws s3api restore-object --bucket {bucket} --key {key} --restore-request '\n", + " cmd += '\\'{\"Days\":25, \"GlacierJobParameters\":{\"Tier\":\"Standard\"}}\\''\n", + " os.system(cmd)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e63e9ce1-af7f-4e71-be8c-3ccef391c17b", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "be0a9e07-127e-4797-90ec-6d6b0eb8560d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "An error occurred (InvalidToken) when calling the ListObjectsV2 operation: The provided token is malformed or otherwise invalid.\n" + ] + } + ], + "source": [ + "!aws s3 ls 
s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581D_MITO/epi_mito-tracing-outs/\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1246359-b8c2-4b57-be86-5fe97daf5846", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "8d3862bb-1e0b-4aff-99ef-6d54f155316d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "' s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_Ru263_MITO/epi_mito-tracing-outs'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "path" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "42939eaa-b8a1-471b-993a-15a8f54c202f", + "metadata": {}, + "outputs": [ + { + "ename": "ClientError", + "evalue": "An error occurred (InvalidToken) when calling the ListObjects operation: The provided token is malformed or otherwise invalid.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mClientError\u001b[0m Traceback (most recent call last)", + "Input \u001b[0;32mIn [15]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mget_s3_objects\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mdp-lab-data\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mSCRI_Projects/HTA/Mito_tracing/AV-1760_Ru263_MITO/epi_mito-tracing-outs/\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m \u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mper-barcode.G.vcf.tar\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/scing/scing-notebooks/notebooks/utils/utils.py:200\u001b[0m, in 
\u001b[0;36mget_s3_objects\u001b[0;34m(bucket, key, pattern, full_uri)\u001b[0m\n\u001b[1;32m 198\u001b[0m bucket_s3 \u001b[38;5;241m=\u001b[39m s3r\u001b[38;5;241m.\u001b[39mBucket(bucket)\n\u001b[1;32m 199\u001b[0m objects \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m--> 200\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m obj \u001b[38;5;129;01min\u001b[39;00m bucket_s3\u001b[38;5;241m.\u001b[39mobjects\u001b[38;5;241m.\u001b[39mfilter(Prefix\u001b[38;5;241m=\u001b[39mkey):\n\u001b[1;32m 201\u001b[0m hit \u001b[38;5;241m=\u001b[39m pattern\u001b[38;5;241m.\u001b[39msearch(obj\u001b[38;5;241m.\u001b[39mkey)\n\u001b[1;32m 202\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m hit:\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/boto3/resources/collection.py:81\u001b[0m, in \u001b[0;36mResourceCollection.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 78\u001b[0m limit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_params\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlimit\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 80\u001b[0m count \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m page \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpages():\n\u001b[1;32m 82\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m page:\n\u001b[1;32m 83\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m item\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/boto3/resources/collection.py:171\u001b[0m, in \u001b[0;36mResourceCollection.pages\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[38;5;66;03m# Now that we have a page iterator or single page of results\u001b[39;00m\n\u001b[1;32m 169\u001b[0m \u001b[38;5;66;03m# we start processing and yielding individual 
items.\u001b[39;00m\n\u001b[1;32m 170\u001b[0m count \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m--> 171\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m page \u001b[38;5;129;01min\u001b[39;00m pages:\n\u001b[1;32m 172\u001b[0m page_items \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_handler(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_parent, params, page):\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/botocore/paginate.py:264\u001b[0m, in \u001b[0;36mPageIterator.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_inject_starting_params(current_kwargs)\n\u001b[1;32m 263\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m--> 264\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_request\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcurrent_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 265\u001b[0m parsed \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_extract_parsed_response(response)\n\u001b[1;32m 266\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m first_request:\n\u001b[1;32m 267\u001b[0m \u001b[38;5;66;03m# The first request is handled differently. 
We could\u001b[39;00m\n\u001b[1;32m 268\u001b[0m \u001b[38;5;66;03m# possibly have a resume/starting token that tells us where\u001b[39;00m\n\u001b[1;32m 269\u001b[0m \u001b[38;5;66;03m# to index into the retrieved page.\u001b[39;00m\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/botocore/paginate.py:352\u001b[0m, in \u001b[0;36mPageIterator._make_request\u001b[0;34m(self, current_kwargs)\u001b[0m\n\u001b[1;32m 351\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_make_request\u001b[39m(\u001b[38;5;28mself\u001b[39m, current_kwargs):\n\u001b[0;32m--> 352\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_method\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcurrent_kwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/botocore/client.py:508\u001b[0m, in \u001b[0;36mClientCreator._create_api_method.._api_call\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 504\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[1;32m 505\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpy_operation_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m() only accepts keyword arguments.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 506\u001b[0m )\n\u001b[1;32m 507\u001b[0m \u001b[38;5;66;03m# The \"self\" in this scope is referring to the BaseClient.\u001b[39;00m\n\u001b[0;32m--> 508\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_api_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43moperation_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File 
\u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/botocore/client.py:911\u001b[0m, in \u001b[0;36mBaseClient._make_api_call\u001b[0;34m(self, operation_name, api_params)\u001b[0m\n\u001b[1;32m 909\u001b[0m error_code \u001b[38;5;241m=\u001b[39m parsed_response\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError\u001b[39m\u001b[38;5;124m\"\u001b[39m, {})\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCode\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 910\u001b[0m error_class \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexceptions\u001b[38;5;241m.\u001b[39mfrom_code(error_code)\n\u001b[0;32m--> 911\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error_class(parsed_response, operation_name)\n\u001b[1;32m 912\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 913\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parsed_response\n", + "\u001b[0;31mClientError\u001b[0m: An error occurred (InvalidToken) when calling the ListObjects operation: The provided token is malformed or otherwise invalid." 
+ ] + } + ], + "source": [ + "get_s3_objects('dp-lab-data', 'SCRI_Projects/HTA/Mito_tracing/AV-1760_Ru263_MITO/epi_mito-tracing-outs/' , 'per-barcode.G.vcf.tar')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12f89ef7-8a16-48f8-9c20-8f29b2d22645", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "80d11971-f5a9-4ee4-b88f-aade34db7b12", + "metadata": {}, + "outputs": [], + "source": [ + "# Get every FASTQ in a folder\n", + "def get_all_files(\n", + " path: str, # path to directory containing FASTQ files\n", + " # folder: str = \"\",\n", + "):\n", + " _, bucket, key, _, _ = urllib.parse.urlsplit(path)\n", + " files = get_s3_objects(\n", + " bucket, key.lstrip(\"/\"),\n", + " re.compile(f\"tar\")\n", + " )\n", + " \n", + " try:\n", + " fastqs = [os.path.join(\"s3://\", bucket, str(f)) for f in files]\n", + " except AssertionError as err:\n", + " logging.warning(\"%s\\n\\t %s\", err, path)\n", + " return\n", + " return fastqs" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "909f2351-ad6a-4322-8945-c67c5756df08", + "metadata": {}, + "outputs": [ + { + "ename": "ClientError", + "evalue": "An error occurred (InvalidToken) when calling the ListObjects operation: The provided token is malformed or otherwise invalid.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mClientError\u001b[0m Traceback (most recent call last)", + "Input \u001b[0;32mIn [25]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mget_all_files\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstrip\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n", + "Input \u001b[0;32mIn [24]\u001b[0m, in \u001b[0;36mget_all_files\u001b[0;34m(path)\u001b[0m\n\u001b[1;32m 2\u001b[0m 
\u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_all_files\u001b[39m(\n\u001b[1;32m 3\u001b[0m path: \u001b[38;5;28mstr\u001b[39m, \u001b[38;5;66;03m# path to directory containing FASTQ files\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m# folder: str = \"\",\u001b[39;00m\n\u001b[1;32m 5\u001b[0m ):\n\u001b[1;32m 6\u001b[0m _, bucket, key, _, _ \u001b[38;5;241m=\u001b[39m urllib\u001b[38;5;241m.\u001b[39mparse\u001b[38;5;241m.\u001b[39murlsplit(path)\n\u001b[0;32m----> 7\u001b[0m files \u001b[38;5;241m=\u001b[39m \u001b[43mget_s3_objects\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43mbucket\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlstrip\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43mre\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompile\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtar\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 13\u001b[0m fastqs \u001b[38;5;241m=\u001b[39m [os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124ms3://\u001b[39m\u001b[38;5;124m\"\u001b[39m, bucket, \u001b[38;5;28mstr\u001b[39m(f)) \u001b[38;5;28;01mfor\u001b[39;00m f \u001b[38;5;129;01min\u001b[39;00m files]\n", + "File \u001b[0;32m~/scing/scing-notebooks/notebooks/utils/utils.py:200\u001b[0m, in \u001b[0;36mget_s3_objects\u001b[0;34m(bucket, key, pattern, full_uri)\u001b[0m\n\u001b[1;32m 198\u001b[0m bucket_s3 \u001b[38;5;241m=\u001b[39m 
s3r\u001b[38;5;241m.\u001b[39mBucket(bucket)\n\u001b[1;32m 199\u001b[0m objects \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m--> 200\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m obj \u001b[38;5;129;01min\u001b[39;00m bucket_s3\u001b[38;5;241m.\u001b[39mobjects\u001b[38;5;241m.\u001b[39mfilter(Prefix\u001b[38;5;241m=\u001b[39mkey):\n\u001b[1;32m 201\u001b[0m hit \u001b[38;5;241m=\u001b[39m pattern\u001b[38;5;241m.\u001b[39msearch(obj\u001b[38;5;241m.\u001b[39mkey)\n\u001b[1;32m 202\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m hit:\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/boto3/resources/collection.py:81\u001b[0m, in \u001b[0;36mResourceCollection.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 78\u001b[0m limit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_params\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlimit\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 80\u001b[0m count \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m page \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpages():\n\u001b[1;32m 82\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m page:\n\u001b[1;32m 83\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m item\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/boto3/resources/collection.py:171\u001b[0m, in \u001b[0;36mResourceCollection.pages\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[38;5;66;03m# Now that we have a page iterator or single page of results\u001b[39;00m\n\u001b[1;32m 169\u001b[0m \u001b[38;5;66;03m# we start processing and yielding individual items.\u001b[39;00m\n\u001b[1;32m 170\u001b[0m count \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m--> 171\u001b[0m 
\u001b[38;5;28;01mfor\u001b[39;00m page \u001b[38;5;129;01min\u001b[39;00m pages:\n\u001b[1;32m 172\u001b[0m page_items \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_handler(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_parent, params, page):\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/botocore/paginate.py:264\u001b[0m, in \u001b[0;36mPageIterator.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_inject_starting_params(current_kwargs)\n\u001b[1;32m 263\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m--> 264\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_request\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcurrent_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 265\u001b[0m parsed \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_extract_parsed_response(response)\n\u001b[1;32m 266\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m first_request:\n\u001b[1;32m 267\u001b[0m \u001b[38;5;66;03m# The first request is handled differently. 
We could\u001b[39;00m\n\u001b[1;32m 268\u001b[0m \u001b[38;5;66;03m# possibly have a resume/starting token that tells us where\u001b[39;00m\n\u001b[1;32m 269\u001b[0m \u001b[38;5;66;03m# to index into the retrieved page.\u001b[39;00m\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/botocore/paginate.py:352\u001b[0m, in \u001b[0;36mPageIterator._make_request\u001b[0;34m(self, current_kwargs)\u001b[0m\n\u001b[1;32m 351\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_make_request\u001b[39m(\u001b[38;5;28mself\u001b[39m, current_kwargs):\n\u001b[0;32m--> 352\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_method\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcurrent_kwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/botocore/client.py:508\u001b[0m, in \u001b[0;36mClientCreator._create_api_method.._api_call\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 504\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[1;32m 505\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpy_operation_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m() only accepts keyword arguments.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 506\u001b[0m )\n\u001b[1;32m 507\u001b[0m \u001b[38;5;66;03m# The \"self\" in this scope is referring to the BaseClient.\u001b[39;00m\n\u001b[0;32m--> 508\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_api_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43moperation_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File 
\u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/botocore/client.py:911\u001b[0m, in \u001b[0;36mBaseClient._make_api_call\u001b[0;34m(self, operation_name, api_params)\u001b[0m\n\u001b[1;32m 909\u001b[0m error_code \u001b[38;5;241m=\u001b[39m parsed_response\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError\u001b[39m\u001b[38;5;124m\"\u001b[39m, {})\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCode\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 910\u001b[0m error_class \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexceptions\u001b[38;5;241m.\u001b[39mfrom_code(error_code)\n\u001b[0;32m--> 911\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error_class(parsed_response, operation_name)\n\u001b[1;32m 912\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 913\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parsed_response\n", + "\u001b[0;31mClientError\u001b[0m: An error occurred (InvalidToken) when calling the ListObjects operation: The provided token is malformed or otherwise invalid." 
+ ] + } + ], + "source": [ + "get_all_files(path.strip())" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "f4227628-ba8e-4d98-8338-42bc54d909e1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "' s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_Ru263_MITO/epi_mito-tracing-outs'" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "path" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "0ad9c23e-83ca-458e-b967-43208c2044b9", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ef8fe52-0ed3-4d87-8d61-027d7b9d41b5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "47469f8b-1ce6-49bf-bd95-72f60669954a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "An error occurred (InvalidToken) when calling the ListObjectsV2 operation: The provided token is malformed or otherwise invalid.\n" + ] + } + ], + "source": [ + "!aws s3 ls s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_Ru263_MITO/epi_mito-tracing-outs/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae5870f8-11d2-44b1-9d82-9d89c02a55a9", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70f8fa5a-5501-4251-bd41-113a7f5b2184", + "metadata": {}, + "outputs": [], + "source": [ + "path" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "aad5fa26-5bad-4c73-849f-4e2d4fe019f1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'SCRI_Projects/HTA/Mito_tracing/AV-1760_Ru263_MITO/epi_mito-tracing-outs'" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "key" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "89a4bcf0-c39f-4a58-929d-783fc619e7a7", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "e53f5671-5225-4f51-bfda-11d71eae41b2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dp-lab-data SCRI_Projects/HTA/Mito_tracing/AV-1760_Ru263_MITO/epi_mito-tracing-outs\n" + ] + } + ], + "source": [ + "path = 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_Ru263_MITO'\n", + "folder = 'epi_mito-tracing-outs'\n", + "\n", + "_, bucket, key, _, _ = urllib.parse.urlsplit(f\"{path}/{folder}\")\n", + "key = key.lstrip('/')\n", + "print(bucket, key)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "77b0a60a-430c-483a-ac11-f7870079911f", + "metadata": {}, + "outputs": [], + "source": [ + "s3r = boto3.resource(\"s3\")\n", + "bucket_s3 = s3r.Bucket(bucket)\n", + "objects = []" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "9a1c681b-2c18-4ae4-940d-1c6bb0fa17e9", + "metadata": {}, + "outputs": [ + { + "ename": "ClientError", + "evalue": "An error occurred (InvalidToken) when calling the ListObjects operation: The provided token is malformed or otherwise invalid.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mClientError\u001b[0m Traceback (most recent call last)", + "Input \u001b[0;32mIn [55]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m obj \u001b[38;5;129;01min\u001b[39;00m bucket_s3\u001b[38;5;241m.\u001b[39mobjects\u001b[38;5;241m.\u001b[39mfilter(Prefix\u001b[38;5;241m=\u001b[39mkey):\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m(obj\u001b[38;5;241m.\u001b[39mkey)\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/boto3/resources/collection.py:81\u001b[0m, in 
\u001b[0;36mResourceCollection.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 78\u001b[0m limit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_params\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlimit\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 80\u001b[0m count \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m page \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpages():\n\u001b[1;32m 82\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m page:\n\u001b[1;32m 83\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m item\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/boto3/resources/collection.py:171\u001b[0m, in \u001b[0;36mResourceCollection.pages\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[38;5;66;03m# Now that we have a page iterator or single page of results\u001b[39;00m\n\u001b[1;32m 169\u001b[0m \u001b[38;5;66;03m# we start processing and yielding individual items.\u001b[39;00m\n\u001b[1;32m 170\u001b[0m count \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m--> 171\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m page \u001b[38;5;129;01min\u001b[39;00m pages:\n\u001b[1;32m 172\u001b[0m page_items \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_handler(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_parent, params, page):\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/botocore/paginate.py:264\u001b[0m, in \u001b[0;36mPageIterator.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 262\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_inject_starting_params(current_kwargs)\n\u001b[1;32m 263\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m--> 264\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_request\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcurrent_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 265\u001b[0m parsed \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_extract_parsed_response(response)\n\u001b[1;32m 266\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m first_request:\n\u001b[1;32m 267\u001b[0m \u001b[38;5;66;03m# The first request is handled differently. We could\u001b[39;00m\n\u001b[1;32m 268\u001b[0m \u001b[38;5;66;03m# possibly have a resume/starting token that tells us where\u001b[39;00m\n\u001b[1;32m 269\u001b[0m \u001b[38;5;66;03m# to index into the retrieved page.\u001b[39;00m\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/botocore/paginate.py:352\u001b[0m, in \u001b[0;36mPageIterator._make_request\u001b[0;34m(self, current_kwargs)\u001b[0m\n\u001b[1;32m 351\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_make_request\u001b[39m(\u001b[38;5;28mself\u001b[39m, current_kwargs):\n\u001b[0;32m--> 352\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_method\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcurrent_kwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/botocore/client.py:508\u001b[0m, in \u001b[0;36mClientCreator._create_api_method.._api_call\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 504\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[1;32m 505\u001b[0m 
\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpy_operation_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m() only accepts keyword arguments.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 506\u001b[0m )\n\u001b[1;32m 507\u001b[0m \u001b[38;5;66;03m# The \"self\" in this scope is referring to the BaseClient.\u001b[39;00m\n\u001b[0;32m--> 508\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_api_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43moperation_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/botocore/client.py:911\u001b[0m, in \u001b[0;36mBaseClient._make_api_call\u001b[0;34m(self, operation_name, api_params)\u001b[0m\n\u001b[1;32m 909\u001b[0m error_code \u001b[38;5;241m=\u001b[39m parsed_response\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError\u001b[39m\u001b[38;5;124m\"\u001b[39m, {})\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCode\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 910\u001b[0m error_class \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexceptions\u001b[38;5;241m.\u001b[39mfrom_code(error_code)\n\u001b[0;32m--> 911\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error_class(parsed_response, operation_name)\n\u001b[1;32m 912\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 913\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parsed_response\n", + "\u001b[0;31mClientError\u001b[0m: An error occurred (InvalidToken) when calling the ListObjects operation: The provided token is malformed or otherwise invalid." 
+ ] + } + ], + "source": [ + "for obj in bucket_s3.objects.filter(Prefix=key):\n", + " print(obj.key)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "1765730e-613b-4bd2-a1e0-956222aa9e16", + "metadata": {}, + "outputs": [], + "source": [ + "pattern = 'tar'" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "623e503e-7115-4d59-a003-5a7e65797b5a", + "metadata": {}, + "outputs": [ + { + "ename": "ClientError", + "evalue": "An error occurred (InvalidToken) when calling the ListObjects operation: The provided token is malformed or otherwise invalid.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mClientError\u001b[0m Traceback (most recent call last)", + "Input \u001b[0;32mIn [54]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m bucket_s3 \u001b[38;5;241m=\u001b[39m s3r\u001b[38;5;241m.\u001b[39mBucket(bucket)\n\u001b[1;32m 3\u001b[0m objects \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m obj \u001b[38;5;129;01min\u001b[39;00m bucket_s3\u001b[38;5;241m.\u001b[39mobjects\u001b[38;5;241m.\u001b[39mfilter(Prefix\u001b[38;5;241m=\u001b[39mkey):\n\u001b[1;32m 5\u001b[0m hit \u001b[38;5;241m=\u001b[39m pattern\u001b[38;5;241m.\u001b[39msearch(obj\u001b[38;5;241m.\u001b[39mkey)\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/boto3/resources/collection.py:81\u001b[0m, in \u001b[0;36mResourceCollection.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 78\u001b[0m limit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_params\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlimit\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 80\u001b[0m count \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m---> 81\u001b[0m 
\u001b[38;5;28;01mfor\u001b[39;00m page \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpages():\n\u001b[1;32m 82\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m page:\n\u001b[1;32m 83\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m item\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/boto3/resources/collection.py:171\u001b[0m, in \u001b[0;36mResourceCollection.pages\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[38;5;66;03m# Now that we have a page iterator or single page of results\u001b[39;00m\n\u001b[1;32m 169\u001b[0m \u001b[38;5;66;03m# we start processing and yielding individual items.\u001b[39;00m\n\u001b[1;32m 170\u001b[0m count \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m--> 171\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m page \u001b[38;5;129;01min\u001b[39;00m pages:\n\u001b[1;32m 172\u001b[0m page_items \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_handler(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_parent, params, page):\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/botocore/paginate.py:264\u001b[0m, in \u001b[0;36mPageIterator.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_inject_starting_params(current_kwargs)\n\u001b[1;32m 263\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m--> 264\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_request\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcurrent_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 265\u001b[0m parsed \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_extract_parsed_response(response)\n\u001b[1;32m 266\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m first_request:\n\u001b[1;32m 267\u001b[0m \u001b[38;5;66;03m# The first request is handled differently. We could\u001b[39;00m\n\u001b[1;32m 268\u001b[0m \u001b[38;5;66;03m# possibly have a resume/starting token that tells us where\u001b[39;00m\n\u001b[1;32m 269\u001b[0m \u001b[38;5;66;03m# to index into the retrieved page.\u001b[39;00m\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/botocore/paginate.py:352\u001b[0m, in \u001b[0;36mPageIterator._make_request\u001b[0;34m(self, current_kwargs)\u001b[0m\n\u001b[1;32m 351\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_make_request\u001b[39m(\u001b[38;5;28mself\u001b[39m, current_kwargs):\n\u001b[0;32m--> 352\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_method\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcurrent_kwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/botocore/client.py:508\u001b[0m, in \u001b[0;36mClientCreator._create_api_method.._api_call\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 504\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[1;32m 505\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpy_operation_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m() only accepts keyword arguments.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 506\u001b[0m )\n\u001b[1;32m 507\u001b[0m \u001b[38;5;66;03m# The \"self\" in this scope is referring to the BaseClient.\u001b[39;00m\n\u001b[0;32m--> 508\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_api_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43moperation_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/botocore/client.py:911\u001b[0m, in \u001b[0;36mBaseClient._make_api_call\u001b[0;34m(self, operation_name, api_params)\u001b[0m\n\u001b[1;32m 909\u001b[0m error_code \u001b[38;5;241m=\u001b[39m parsed_response\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError\u001b[39m\u001b[38;5;124m\"\u001b[39m, {})\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCode\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 910\u001b[0m error_class \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexceptions\u001b[38;5;241m.\u001b[39mfrom_code(error_code)\n\u001b[0;32m--> 911\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error_class(parsed_response, operation_name)\n\u001b[1;32m 912\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 913\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parsed_response\n", + "\u001b[0;31mClientError\u001b[0m: An error occurred (InvalidToken) when calling the ListObjects operation: The provided token is malformed or otherwise invalid." 
+ ] + } + ], + "source": [ + "s3r = boto3.resource(\"s3\")\n", + "bucket_s3 = s3r.Bucket(bucket)\n", + "objects = []\n", + "for obj in bucket_s3.objects.filter(Prefix=key):\n", + " hit = pattern.search(obj.key)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "298bf7db-0464-47f0-bb13-caaa3666b124", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d7c18f8-7b9e-44e5-b263-349ee1fc9e3f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a58acc1f-3d89-4190-a6ca-f5767f795aeb", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "6f8ad661-caaf-4b97-8f26-29ef0341c361", + "metadata": {}, + "outputs": [], + "source": [ + "def get_s3_objects(bucket, key, pattern, full_uri=False):\n", + " s3r = boto3.resource(\"s3\")\n", + " bucket_s3 = s3r.Bucket(bucket)\n", + " objects = []\n", + " for obj in bucket_s3.objects.filter(Prefix=key):\n", + " hit = pattern.search(obj.key)\n", + " if hit:\n", + " objects.append(obj.key)\n", + " if full_uri:\n", + " objects = [f\"s3://{bucket}/{o}\" for o in objects]\n", + " return objects\n" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "e0b4a008-6286-4b6a-a094-54f88767a72e", + "metadata": {}, + "outputs": [ + { + "ename": "ClientError", + "evalue": "An error occurred (InvalidToken) when calling the ListObjects operation: The provided token is malformed or otherwise invalid.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mClientError\u001b[0m Traceback (most recent call last)", + "Input \u001b[0;32mIn [38]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m obj \u001b[38;5;129;01min\u001b[39;00m 
bucket_s3\u001b[38;5;241m.\u001b[39mobjects\u001b[38;5;241m.\u001b[39mfilter(Prefix\u001b[38;5;241m=\u001b[39mkey):\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m(obj)\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/boto3/resources/collection.py:81\u001b[0m, in \u001b[0;36mResourceCollection.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 78\u001b[0m limit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_params\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlimit\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 80\u001b[0m count \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m page \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpages():\n\u001b[1;32m 82\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m page:\n\u001b[1;32m 83\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m item\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/boto3/resources/collection.py:171\u001b[0m, in \u001b[0;36mResourceCollection.pages\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[38;5;66;03m# Now that we have a page iterator or single page of results\u001b[39;00m\n\u001b[1;32m 169\u001b[0m \u001b[38;5;66;03m# we start processing and yielding individual items.\u001b[39;00m\n\u001b[1;32m 170\u001b[0m count \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m--> 171\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m page \u001b[38;5;129;01min\u001b[39;00m pages:\n\u001b[1;32m 172\u001b[0m page_items \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_handler(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_parent, 
params, page):\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/botocore/paginate.py:264\u001b[0m, in \u001b[0;36mPageIterator.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_inject_starting_params(current_kwargs)\n\u001b[1;32m 263\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m--> 264\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_request\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcurrent_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 265\u001b[0m parsed \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_extract_parsed_response(response)\n\u001b[1;32m 266\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m first_request:\n\u001b[1;32m 267\u001b[0m \u001b[38;5;66;03m# The first request is handled differently. We could\u001b[39;00m\n\u001b[1;32m 268\u001b[0m \u001b[38;5;66;03m# possibly have a resume/starting token that tells us where\u001b[39;00m\n\u001b[1;32m 269\u001b[0m \u001b[38;5;66;03m# to index into the retrieved page.\u001b[39;00m\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/botocore/paginate.py:352\u001b[0m, in \u001b[0;36mPageIterator._make_request\u001b[0;34m(self, current_kwargs)\u001b[0m\n\u001b[1;32m 351\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_make_request\u001b[39m(\u001b[38;5;28mself\u001b[39m, current_kwargs):\n\u001b[0;32m--> 352\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_method\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcurrent_kwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/botocore/client.py:508\u001b[0m, in 
\u001b[0;36mClientCreator._create_api_method.._api_call\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 504\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[1;32m 505\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpy_operation_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m() only accepts keyword arguments.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 506\u001b[0m )\n\u001b[1;32m 507\u001b[0m \u001b[38;5;66;03m# The \"self\" in this scope is referring to the BaseClient.\u001b[39;00m\n\u001b[0;32m--> 508\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_api_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43moperation_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/scing/lib/python3.8/site-packages/botocore/client.py:911\u001b[0m, in \u001b[0;36mBaseClient._make_api_call\u001b[0;34m(self, operation_name, api_params)\u001b[0m\n\u001b[1;32m 909\u001b[0m error_code \u001b[38;5;241m=\u001b[39m parsed_response\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError\u001b[39m\u001b[38;5;124m\"\u001b[39m, {})\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCode\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 910\u001b[0m error_class \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexceptions\u001b[38;5;241m.\u001b[39mfrom_code(error_code)\n\u001b[0;32m--> 911\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error_class(parsed_response, operation_name)\n\u001b[1;32m 912\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 913\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parsed_response\n", + "\u001b[0;31mClientError\u001b[0m: An error occurred (InvalidToken) when calling the ListObjects 
operation: The provided token is malformed or otherwise invalid." + ] + } + ], + "source": [ + "for obj in bucket_s3.objects.filter(Prefix=key):\n", + " print(obj)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "100624da-1ad0-43e6-9fd4-56ccc0578978", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "f0026000-9b6d-4ae9-8896-f06c286b119d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/Ru581D_MITO epi_mito-tracing-outs\n", + "dp-lab-data SCRI_Projects/HTA/Mito_tracing/Ru581D_MITO/epi_mito-tracing-outs\n" + ] + } + ], + "source": [ + "path = samples[' aws_path'][0].replace('barcodes_rna/', '')\n", + "path = path.strip()\n", + "\n", + "folder = 'epi_mito-tracing-outs'\n", + "path = path.replace(f'{folder}', '')\n", + "path = path.strip('/')\n", + "print(path, folder)\n", + "\n", + "_, bucket, key, _, _ = urllib.parse.urlsplit(f\"{path}/{folder}\")\n", + "key = key.lstrip('/')\n", + "print(bucket, key)" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "6eb6d355-8c50-40f2-a4ef-3928b8bb68a5", + "metadata": {}, + "outputs": [], + "source": [ + "s3r = boto3.resource(\"s3\")\n", + "bucket_s3 = s3r.Bucket(bucket)\n", + "objects = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a433bac9-c493-4926-ab6b-254572415a51", + "metadata": {}, + "outputs": [], + "source": [ + "for obj in bucket_s3.objects.filter(Prefix=key):\n", + " print(obj.key)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 
4, + "nbformat_minor": 5 +} diff --git a/notebooks/Untitled.ipynb b/notebooks/Untitled.ipynb new file mode 100644 index 0000000..dbce721 --- /dev/null +++ b/notebooks/Untitled.ipynb @@ -0,0 +1,1029 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "d13e6985-95f3-45eb-8162-0e24a2a049d4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import re, subprocess, boto3, json, shlex, mysql, os, urllib, logging\n", + "import pandas as pd\n", + "import numpy as np\n", + "from s3path import S3Path\n", + "from pathlib import Path\n", + "from tqdm.notebook import tqdm\n", + "from packaging import version\n", + "\n", + "import glob\n", + "import os\n", + "\n", + "pd.set_option(\"display.max_colwidth\", 40)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6547d881-c5e0-494d-9189-0059c14e126e", + "metadata": {}, + "outputs": [], + "source": [ + "from utils.utils import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6fb891f-0934-4489-98fb-4713ab904a5d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "bf6e1460-34e1-4654-9fa5-2e82eb23902b", + "metadata": {}, + "source": [ + "# AWS setup" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7052097b-da4a-444c-aebb-1b585266c353", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: AWS_ACCESS_KEY_ID=ASIAYPY66CWMAGM24WHE\n", + "env: AWS_SECRET_ACCESS_KEY=jWUmwaWT+IQhV+71PWwfi1We6vSusB78GBwNw15e\n", + "env: 
AWS_SESSION_TOKEN=IQoJb3JpZ2luX2VjEHIaCXVzLWVhc3QtMSJIMEYCIQC5kbf1i5ib760fqEmYsdjvX/g7ogw0RaGEz2BUIy+R/wIhAOi31VVCUdy7zmAZddGoXvKRm7JtoJzwErPPPRahY4l/KvgBCNv//////////wEQAxoMNTgzNjQzNTY3NTEyIgx4r2eNH51B/0SJBpAqzAEM95AVY9oz0nVlU4hjvRWFYWEUuXzOkPwo6RagXIByGUj0SP4HWQX0kvraLFqa34JYyijiRhgoPu7xHiGOjdHuS93qSG4mSQhLDgMTa+K7ze+8I407NeMlNxiiS5KvKCVaTt6U76rBe4kGauNcMSqT7sIbpO4btLXtwI41CnqbbXKTgeJc87BwxcvWUqW8WE9D61uZ0Sv2qmvGoL+jkFOfOGw/bMSnMOtNtWtcYHOEFI0WW3i1DeTamyOoiJ6ZfYObSQ1V+SNEZNYhW64wwo+7ngY6lwEZ//N+c9HWTzOSb9crQjpV+hRMjDy1O+FLXll70i5dHZe5RxsmFfm8PYjYaz4AZQ6Tl716sEm25s1CCbTiYx3x2OeGIuj7XLbeE8lNDqDemhlAdkhXjuCUDFk/wAz+z4rwqu3PvsA0YB+Ut2jG/l+1RYzfaXBgSWxbl6c1oJuDz5ngUewaTQGDRFomu7/u8dCN6j89zovp\n", + "2021-10-07 15:31:32 agc-583643567512-us-east-1\n", + "2021-10-07 15:28:07 cdktoolkit-stagingbucket-d49u1xfb0sc4\n", + "2019-10-09 13:04:06 cf-templates-umiwbnq3566w-us-east-1\n", + "2019-10-10 12:46:54 dp-daily-reports\n", + "2017-09-26 10:15:15 dp-lab-data\n", + "2019-05-23 12:34:58 dp-lab-data-public\n", + "2018-03-01 13:29:07 dp-lab-glacier\n", + "2020-11-06 23:02:27 dp-lab-gwf-core\n", + "2021-07-15 18:14:01 dp-lab-gwf-core2\n", + "2017-09-26 10:17:18 dp-lab-home\n", + "2019-08-14 16:42:43 dp-lab-test\n", + "2019-04-25 12:35:35 elasticbeanstalk-us-east-1-583643567512\n", + "2019-12-13 15:53:57 scri-computational\n", + "2017-09-26 10:23:50 seqc-public\n" + ] + } + ], + "source": [ + "# Load aws\n", + "\n", + "%env AWS_ACCESS_KEY_ID=ASIAYPY66CWMAGM24WHE\n", + "%env AWS_SECRET_ACCESS_KEY=jWUmwaWT+IQhV+71PWwfi1We6vSusB78GBwNw15e\n", + "%env 
AWS_SESSION_TOKEN=IQoJb3JpZ2luX2VjEHIaCXVzLWVhc3QtMSJIMEYCIQC5kbf1i5ib760fqEmYsdjvX/g7ogw0RaGEz2BUIy+R/wIhAOi31VVCUdy7zmAZddGoXvKRm7JtoJzwErPPPRahY4l/KvgBCNv//////////wEQAxoMNTgzNjQzNTY3NTEyIgx4r2eNH51B/0SJBpAqzAEM95AVY9oz0nVlU4hjvRWFYWEUuXzOkPwo6RagXIByGUj0SP4HWQX0kvraLFqa34JYyijiRhgoPu7xHiGOjdHuS93qSG4mSQhLDgMTa+K7ze+8I407NeMlNxiiS5KvKCVaTt6U76rBe4kGauNcMSqT7sIbpO4btLXtwI41CnqbbXKTgeJc87BwxcvWUqW8WE9D61uZ0Sv2qmvGoL+jkFOfOGw/bMSnMOtNtWtcYHOEFI0WW3i1DeTamyOoiJ6ZfYObSQ1V+SNEZNYhW64wwo+7ngY6lwEZ//N+c9HWTzOSb9crQjpV+hRMjDy1O+FLXll70i5dHZe5RxsmFfm8PYjYaz4AZQ6Tl716sEm25s1CCbTiYx3x2OeGIuj7XLbeE8lNDqDemhlAdkhXjuCUDFk/wAz+z4rwqu3PvsA0YB+Ut2jG/l+1RYzfaXBgSWxbl6c1oJuDz5ngUewaTQGDRFomu7/u8dCN6j89zovp\n", + "!aws s3 ls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "131f63b2-9e4c-4b36-b9b2-e6a6693dd4a5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "087ae658-2d9d-4b3f-b159-ccb06f4c5626", + "metadata": { + "tags": [] + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "931b1af4-a619-4308-848b-6a55bc65072e", + "metadata": {}, + "outputs": [], + "source": [ + "# Which pipeline are you running\n", + "\n", + "prefix = \"CellRangerArc\" # Workflow to run; also .wdl filename prefix\n", + "output_dirname = \"cr-arc-results\"\n", + "\n", + "workflow_dir = glob.glob(f\"{Path.home()}/scing/bin/cellranger-arc-*\")[0]\n", + "path_to_exec = f\"{workflow_dir}/submit.sh\" # CHANGE THIS FOR SHARP\n", + "\n", + "# Locations of workflow-related directories and files\n", + "path_to_cromwell_secrets = f\"{Path.home()}/.cromwell/cromwell-secrets.json\"\n", + "db_credentials_path = f\"{Path.home()}/.config.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "aaee197b-c610-46a0-b786-9bfc6aafa07d", + "metadata": {}, + "outputs": [], + "source": [ + "# Location of docker files\n", + "common_docker_registry = \"quay.io/hisplan\"\n", + "pipeline_type = prefix # 
field in *.labels.json\n", + "comment = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "963b6641-1c6d-430f-99f4-4077b24fa1b5", + "metadata": {}, + "outputs": [], + "source": [ + "# Workflow file paths\n", + "config_dir = f\"{workflow_dir}/configs\"\n", + "path_to_options = f\"{workflow_dir}/{prefix}.options.aws.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "809159de-a318-434b-8d3d-f660573a3990", + "metadata": {}, + "outputs": [], + "source": [ + "# Set credentials based on SCRIdb CLI config file\n", + "with open(db_credentials_path) as f:\n", + " creds = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f359259-186e-4beb-92c8-bce6b6b03fd7", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "e652e114-578e-4a34-8624-bf001533af26", + "metadata": {}, + "source": [ + "# Sample information" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "b47a3ba3-ee28-4a98-afff-2b90cd10822c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreference
Sample
RB-2041_WildType_DOGMAseqs3://dp-lab-data/collaborators/sfeir...4440human10X_scATACscATAC-seq pilotGRCh38-1.1.0
RB-2041_mRB54_1003_DOGMAseqs3://dp-lab-data/collaborators/sfeir...4441human10X_scATACscATAC-seq pilotGRCh38-1.1.0
\n", + "
" + ], + "text/plain": [ + " AWS_storage id \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq s3://dp-lab-data/collaborators/sfeir... 4440 \n", + "RB-2041_mRB54_1003_DOGMAseq s3://dp-lab-data/collaborators/sfeir... 4441 \n", + "\n", + " species sc_tech project_id \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq human 10X_scATAC scATAC-seq pilot \n", + "RB-2041_mRB54_1003_DOGMAseq human 10X_scATAC scATAC-seq pilot \n", + "\n", + " reference \n", + "Sample \n", + "RB-2041_WildType_DOGMAseq GRCh38-1.1.0 \n", + "RB-2041_mRB54_1003_DOGMAseq GRCh38-1.1.0 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can supply a list of IDs or request IDs or even AWS paths\n", + "\n", + "# request_ids = ['PM-1779']\n", + "# samples = sample_scridb_info(request_ids, 'request_id', creds)\n", + "\n", + "# sample_ids = [4138]\n", + "# samples = sample_scridb_info(sample_ids, 'id', creds)\n", + "\n", + "aws_storage = ['s3://dp-lab-data/collaborators/sfeira/ScatacSeqPilot/RB-2041_mRB54_1003_DOGMAseq/',\n", + " 's3://dp-lab-data/collaborators/sfeira/ScatacSeqPilot/RB-2041_WildType_DOGMAseq/']\n", + "samples = sample_scridb_info(aws_storage, 'AWS_storage', creds)\n", + "\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "91a0a018-06ff-4b7e-92a9-46be980ca402", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['s3://dp-lab-data/SCRI_Projects/10XFixedScrnaseqPilot/TX-1886_D34M_FAfixed_scRNA']" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples['AWS_storage'].tolist()" + ] + }, + { + "cell_type": "markdown", + "id": "66be82d8-2707-44ae-b7ac-7c14e7525a0f", + "metadata": {}, + "source": [ + "IMPORTANT NOTE \n", + "\n", + "Check the name of the folder you are running. Typically the folder that is stored in the database is just the GEX. 
So if another library is generated (multiome ATAC, VDJ, hashtag, etc) then it needs to be manually changed." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b851f925-3a5a-4cec-9ef3-4bdc53eb295b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RB-2041_WildType_DOGMAseq\n", + " PRE RB-2041_WildType_DOGMAseq/\n", + "\n", + "RB-2041_mRB54_1003_DOGMAseq\n", + " PRE RB-2041_mRB54_1003_DOGMAseq/\n", + "\n" + ] + } + ], + "source": [ + "# Check the name of the folder you are running\n", + "# Especially if there are multiple libraries (i.e ATAC, TCR_VDJ, etc.)\n", + "\n", + "for sample, row in samples.iterrows():\n", + " print(sample)\n", + " path = os.path.split(row['AWS_storage'])[0] + '/'\n", + " os.system(f'aws s3 ls {path} | grep {sample}')\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5b09d8d6-46ef-403f-816b-13f34a603e9c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreferenceFASTQs
Sample
RB-2041_WildType_DOGMAseqs3://dp-lab-data/collaborators/sfeir...4440human10X_scATACscATAC-seq pilotGRCh38-1.1.0{'I1': ['s3://dp-lab-data/collaborat...
RB-2041_mRB54_1003_DOGMAseqs3://dp-lab-data/collaborators/sfeir...4441human10X_scATACscATAC-seq pilotGRCh38-1.1.0{'I1': ['s3://dp-lab-data/collaborat...
\n", + "
" + ], + "text/plain": [ + " AWS_storage id \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq s3://dp-lab-data/collaborators/sfeir... 4440 \n", + "RB-2041_mRB54_1003_DOGMAseq s3://dp-lab-data/collaborators/sfeir... 4441 \n", + "\n", + " species sc_tech project_id \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq human 10X_scATAC scATAC-seq pilot \n", + "RB-2041_mRB54_1003_DOGMAseq human 10X_scATAC scATAC-seq pilot \n", + "\n", + " reference \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq GRCh38-1.1.0 \n", + "RB-2041_mRB54_1003_DOGMAseq GRCh38-1.1.0 \n", + "\n", + " FASTQs \n", + "Sample \n", + "RB-2041_WildType_DOGMAseq {'I1': ['s3://dp-lab-data/collaborat... \n", + "RB-2041_mRB54_1003_DOGMAseq {'I1': ['s3://dp-lab-data/collaborat... " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples[\"FASTQs\"] = samples[\"AWS_storage\"].apply(lambda x: get_fastqs(x, fastq_map[prefix], \"FASTQ\"))\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2861ee5a-1a3c-4774-9eda-28ac92cb3df7", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "563dd74c-53ef-4595-8400-b28c23f52085", + "metadata": {}, + "source": [ + "IMPORTANT NOTE\n", + "\n", + "Make sure that your files are not archived. The following command will print any FASTQ file that is archived. Unarchive the files and then come back to processing the sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "569312fd-750c-4356-8a73-1feff86601e2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RB-2041_WildType_DOGMAseq\n", + "RB-2041_mRB54_1003_DOGMAseq\n" + ] + } + ], + "source": [ + "for sample, row in samples.iterrows():\n", + " print(sample)\n", + " fastqs = np.ravel(list(row['FASTQs'].values()))\n", + " dirnames = set([os.path.dirname(x) for x in fastqs])\n", + " \n", + " for dirname in dirnames:\n", + " file = dirname.replace('s3://', '')\n", + " \n", + " bucket = file.split('/')[0]\n", + " pre = file.replace(f'{bucket}/', '')\n", + " \n", + " !aws s3api list-objects-v2 --bucket $bucket --prefix $pre --query \"Contents[?StorageClass!='STANDARD'].Key\" --output text " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "534b90a7-ab25-462c-9ae4-40c4294a9fd5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "3f0f3377-1c19-4545-868f-5da05f0ea295", + "metadata": {}, + "source": [ + "IMPORTANT NOTE \n", + "\n", + "For CellRanger you need to supply an HTTPS path. So if you are using a custom genome stored on AWS, you must make the reference public ! Be sure to manually change the \"reference\" argument if it has not been updated correctly!!!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5586ef8-e219-4388-b5f1-646e8d652e1e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "b8007050-06f2-4325-ad13-68bb986b626c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AWS_storageidspeciessc_techproject_idreferenceFASTQs
Sample
RB-2041_WildType_DOGMAseqs3://dp-lab-data/collaborators/sfeir...4440human10X_scATACscATAC-seq pilothttps://cf.10xgenomics.com/supp/cell...{'I1': ['s3://dp-lab-data/collaborat...
RB-2041_mRB54_1003_DOGMAseqs3://dp-lab-data/collaborators/sfeir...4441human10X_scATACscATAC-seq pilothttps://cf.10xgenomics.com/supp/cell...{'I1': ['s3://dp-lab-data/collaborat...
\n", + "
" + ], + "text/plain": [ + " AWS_storage id \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq s3://dp-lab-data/collaborators/sfeir... 4440 \n", + "RB-2041_mRB54_1003_DOGMAseq s3://dp-lab-data/collaborators/sfeir... 4441 \n", + "\n", + " species sc_tech project_id \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq human 10X_scATAC scATAC-seq pilot \n", + "RB-2041_mRB54_1003_DOGMAseq human 10X_scATAC scATAC-seq pilot \n", + "\n", + " reference \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq https://cf.10xgenomics.com/supp/cell... \n", + "RB-2041_mRB54_1003_DOGMAseq https://cf.10xgenomics.com/supp/cell... \n", + "\n", + " FASTQs \n", + "Sample \n", + "RB-2041_WildType_DOGMAseq {'I1': ['s3://dp-lab-data/collaborat... \n", + "RB-2041_mRB54_1003_DOGMAseq {'I1': ['s3://dp-lab-data/collaborat... " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples = update_ref(samples, prefix)\n", + "\n", + "if not samples['reference'].isna().any():\n", + " samples[\"reference\"].apply(lambda x: {\n", + " \"name\": re.match(r'.*refdata-cellranger-arc-(.*).tar.gz', x)[1],\n", + " \"location\": x,\n", + " }) \n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "6b56024e-a4c1-488f-861a-17bf9c191223", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Sample\n", + "RB-2041_WildType_DOGMAseq {'name': 'human-rna-mitoblacklist', ...\n", + "RB-2041_mRB54_1003_DOGMAseq {'name': 'human-rna-mitoblacklist', ...\n", + "Name: reference, dtype: object" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "samples['reference'] = [\n", + " {\n", + " 'name' : 'human-rna-mitoblacklist',\n", + " 'location' : 's3://dp-lab-data/collaborators/sfeira/ScatacSeqPilot/human-rna-mitoblacklist/human-rna-mitoblacklist.tar.gz'\n", + " }] * len(samples)\n", + "samples['reference']" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "83ed197c-562c-459e-b113-83fee86388ff", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "5e6b5cd2-ca13-4ce8-808a-5404ede8e528", + "metadata": {}, + "source": [ + "# Generate inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "0561da32-5979-4b0e-a40e-35af26218060", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CellRangerArc.runIDCellRangerArc.gexFastqNameCellRangerArc.gexFastqFilesCellRangerArc.atacFastqNameCellRangerArc.atacFastqFilesCellRangerArc.referenceCellRangerArc.dockerRegistryCellRangerArc.sampleNameCellRangerArc.fastqFilesCellRangerArc.fastqNamesCellRangerArc.referenceGenome
Sample
RB-2041_WildType_DOGMAseqNaNNaNNaNNaNNaNNaNquay.io/hisplanRB-2041_WildType_DOGMAseq[s3://dp-lab-data/collaborators/sfei...4440_RB-2041_WildType_DOGMAseq_IGO_1...{'name': 'human-rna-mitoblacklist', ...
RB-2041_mRB54_1003_DOGMAseqNaNNaNNaNNaNNaNNaNquay.io/hisplanRB-2041_mRB54_1003_DOGMAseq[s3://dp-lab-data/collaborators/sfei...4441_RB-2041_mRB54_1003_DOGMAseq_IGO...{'name': 'human-rna-mitoblacklist', ...
\n", + "
" + ], + "text/plain": [ + " CellRangerArc.runID CellRangerArc.gexFastqName \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq NaN NaN \n", + "RB-2041_mRB54_1003_DOGMAseq NaN NaN \n", + "\n", + " CellRangerArc.gexFastqFiles \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq NaN \n", + "RB-2041_mRB54_1003_DOGMAseq NaN \n", + "\n", + " CellRangerArc.atacFastqName \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq NaN \n", + "RB-2041_mRB54_1003_DOGMAseq NaN \n", + "\n", + " CellRangerArc.atacFastqFiles \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq NaN \n", + "RB-2041_mRB54_1003_DOGMAseq NaN \n", + "\n", + " CellRangerArc.reference \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq NaN \n", + "RB-2041_mRB54_1003_DOGMAseq NaN \n", + "\n", + " CellRangerArc.dockerRegistry \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq quay.io/hisplan \n", + "RB-2041_mRB54_1003_DOGMAseq quay.io/hisplan \n", + "\n", + " CellRangerArc.sampleName \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq RB-2041_WildType_DOGMAseq \n", + "RB-2041_mRB54_1003_DOGMAseq RB-2041_mRB54_1003_DOGMAseq \n", + "\n", + " CellRangerArc.fastqFiles \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq [s3://dp-lab-data/collaborators/sfei... \n", + "RB-2041_mRB54_1003_DOGMAseq [s3://dp-lab-data/collaborators/sfei... \n", + "\n", + " CellRangerArc.fastqNames \\\n", + "Sample \n", + "RB-2041_WildType_DOGMAseq 4440_RB-2041_WildType_DOGMAseq_IGO_1... \n", + "RB-2041_mRB54_1003_DOGMAseq 4441_RB-2041_mRB54_1003_DOGMAseq_IGO... \n", + "\n", + " CellRangerArc.referenceGenome \n", + "Sample \n", + "RB-2041_WildType_DOGMAseq {'name': 'human-rna-mitoblacklist', ... \n", + "RB-2041_mRB54_1003_DOGMAseq {'name': 'human-rna-mitoblacklist', ... 
" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Standard inputs, modify fields as need be\n", + "\n", + "# Load minimum inputs and labels fields from templates\n", + "with open(f\"{config_dir}/template.inputs.json\") as f:\n", + " std_inputs_fields = list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields,)\n", + "\n", + "# Annotate inputs\n", + "inputs[f\"{prefix}.sampleName\"] = samples.index\n", + "inputs[f\"{prefix}.fastqFiles\"] = samples[\"FASTQs\"].apply(lambda x: np.ravel(list(x.values())))\n", + "inputs[f\"{prefix}.fastqNames\"] = inputs[f\"{prefix}.fastqFiles\"].apply(lambda x: get_fastqs_name(x))\n", + "inputs[f\"{prefix}.referenceGenome\"] = samples[\"reference\"] \n", + "inputs[f\"{prefix}.dockerRegistry\"] = common_docker_registry\n", + "\n", + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "4700f244-fe4b-44a5-9d1b-fbc1b2b35afb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['CellRangerArc.runID',\n", + " 'CellRangerArc.gexFastqName',\n", + " 'CellRangerArc.gexFastqFiles',\n", + " 'CellRangerArc.atacFastqName',\n", + " 'CellRangerArc.atacFastqFiles',\n", + " 'CellRangerArc.reference',\n", + " 'CellRangerArc.dockerRegistry']" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "std_inputs_fields" + ] + }, + { + "cell_type": "markdown", + "id": "2f37043a-c883-4da4-9c87-f1912b102e91", + "metadata": {}, + "source": [ + "# Generate labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81500471-4a2e-4eae-8b69-174608902709", + "metadata": {}, + "outputs": [], + "source": [ + "# Standard labels, modify fields as need be\n", + "\n", + "with open(f\"{config_dir}/template.labels.json\") as f:\n", + " std_labels_fields = 
list(json.load(f).keys())\n", + " \n", + "# Annotate all samples with workflow inputs and labels\n", + "labels = pd.DataFrame(index=samples.index, columns=std_labels_fields,)\n", + "\n", + "labels[\"pipelineType\"] = pipeline_type\n", + "labels[\"project\"] = samples['project_id']\n", + "labels[\"sample\"] = labels.index\n", + "labels[\"owner\"] = creds[\"user\"]\n", + "labels[\"destination\"] = samples['AWS_storage'] + \"/\" + output_dirname\n", + "labels[\"transfer\"] = \"-\"\n", + "labels[\"comment\"] = creds[\"user\"]\n", + "\n", + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "688fe090-52a7-493e-bf52-617b91809c5b", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "d7e3422e-a130-4b5b-ba68-f1d59921388c", + "metadata": {}, + "source": [ + "# Run samples" + ] + }, + { + "cell_type": "markdown", + "id": "2b231634-b533-4b48-afc3-f6c201bf3df7", + "metadata": {}, + "source": [ + "Look over the samples before submitting one last time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e71f5101-edb7-4c9d-ad07-408e756bd197", + "metadata": {}, + "outputs": [], + "source": [ + "inputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61801e60-ab82-4012-afcc-22ee288b22d1", + "metadata": {}, + "outputs": [], + "source": [ + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8057ea75-3c2f-40c9-b6fe-d957dd401504", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cf7b481-ec81-46ff-9452-d7d831972764", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "\n", + "assert (std_inputs_fields == list(inputs.columns)) & (inputs.notna().values.all())\n", + "assert (std_labels_fields == list(labels.columns)) & (labels.notna().values.all())\n", + "\n", + "stdouts = [] # to store all outputs\n", + "process = True\n", + "\n", + "with tqdm(inputs.index) 
as t:\n", + "\n", + " for sample_name in t:\n", + "\n", + " # Write inputs and labels to file\n", + " path_to_inputs = f\"{config_dir}/{sample_name}.inputs.json\"\n", + " with open(path_to_inputs, \"w\") as f_inputs:\n", + " json.dump(inputs.loc[sample_name].to_dict(), f_inputs, indent=4, cls=NpEncoder)\n", + "\n", + " path_to_labels = f\"{config_dir}/{sample_name}.labels.json\"\n", + " with open(path_to_labels, \"w\") as f_labels:\n", + " json.dump(labels.loc[sample_name].to_dict(), f_labels, indent=4, cls=NpEncoder)\n", + "\n", + " if process:\n", + " stdouts.append(run(\n", + " workflow_path = workflow_dir,\n", + " execp = \"submit.sh\",\n", + " secrets = path_to_cromwell_secrets,\n", + " inputs = path_to_inputs,\n", + " labels = path_to_labels,\n", + " options = path_to_options,\n", + " ))\n", + " \n", + " time.sleep(20)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1efc63e0-a3bf-42d5-83d2-3392dc6abb20", + "metadata": {}, + "outputs": [], + "source": [ + "labels['destination'].values.tolist()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/barcodes/AT-1447_Ret_R1.QC.h5ad b/notebooks/barcodes/AT-1447_Ret_R1.QC.h5ad new file mode 100644 index 0000000..b1b2614 Binary files /dev/null and b/notebooks/barcodes/AT-1447_Ret_R1.QC.h5ad differ diff --git a/notebooks/barcodes/AT-1447_Ret_R1_tag-list.csv b/notebooks/barcodes/AT-1447_Ret_R1_tag-list.csv new file mode 100644 index 0000000..ada9478 --- /dev/null +++ b/notebooks/barcodes/AT-1447_Ret_R1_tag-list.csv @@ -0,0 +1,3 @@ +AACGTTAATCACTCA,A0304,m19,0 +CGCGATATGGTCGGA,A0305,m20,0 
+AAGATGAGGTCTGTG,A0306,m21,0 \ No newline at end of file diff --git a/notebooks/barcodes/AT-1447_Ret_R2_tag-list.csv b/notebooks/barcodes/AT-1447_Ret_R2_tag-list.csv new file mode 100644 index 0000000..6e3fab8 --- /dev/null +++ b/notebooks/barcodes/AT-1447_Ret_R2_tag-list.csv @@ -0,0 +1,3 @@ +CATGCCAATAGAGCG,A0302,m5,0 +ATGAGGAATTCCTGC,A0301,m16,0 +CCGTCGTCCAAGCAT,A0303,m17,0 \ No newline at end of file diff --git a/notebooks/barcodes/AT-1716_retrieval_day1_reward2_tag-list.csv b/notebooks/barcodes/AT-1716_retrieval_day1_reward2_tag-list.csv new file mode 100644 index 0000000..f36807a --- /dev/null +++ b/notebooks/barcodes/AT-1716_retrieval_day1_reward2_tag-list.csv @@ -0,0 +1,6 @@ +ATGAGGAATTCCTGC,A0301,sample_2_m1,0 +CATGCCAATAGAGCG,A0302,sample_2_m2,0 +CCGTCGTCCAAGCAT,A0303,sample_2_m22,0 +AACGTTAATCACTCA,A0304,sample_1_m6,0 +CGCGATATGGTCGGA,A0305,sample_1_m17,0 +AAGATGAGGTCTGTG,A0306,sample_1_m13,0 diff --git a/notebooks/barcodes/AT-1727_retrieval_day_8_reward_1_tag-list.csv b/notebooks/barcodes/AT-1727_retrieval_day_8_reward_1_tag-list.csv new file mode 100644 index 0000000..c5e7dc1 --- /dev/null +++ b/notebooks/barcodes/AT-1727_retrieval_day_8_reward_1_tag-list.csv @@ -0,0 +1,3 @@ +GCAGGAGGTATCAAT,A0310,sample_1_m4,0 +GAATCGTGATTCTTC,A0311,sample_1_m7,0 +ACATGGTCAACGCTG,A0312,sample_1_m24,0 diff --git a/notebooks/barcodes/AT-1727_retrieval_day_8_reward_2_tag-list.csv b/notebooks/barcodes/AT-1727_retrieval_day_8_reward_2_tag-list.csv new file mode 100644 index 0000000..4352f28 --- /dev/null +++ b/notebooks/barcodes/AT-1727_retrieval_day_8_reward_2_tag-list.csv @@ -0,0 +1,3 @@ +AAGCTCGTTGGAAGA,A0307,sample_2_m10,0 +CGGATTCCACATCAT,A0308,sample_2_m14,0 +GTTGATCTATAACAG,A0309,sample_2_m23,0 diff --git a/notebooks/barcodes/AT-1734_retrieval_day_15_reward_1_tag-list.csv b/notebooks/barcodes/AT-1734_retrieval_day_15_reward_1_tag-list.csv new file mode 100644 index 0000000..ac42087 --- /dev/null +++ b/notebooks/barcodes/AT-1734_retrieval_day_15_reward_1_tag-list.csv 
@@ -0,0 +1,3 @@ +AAGATGAGGTCTGTG,A0306,sample_1_m20,0 +CATGCCAATAGAGCG,A0302,sample_1_m19,0 +CCGTCGTCCAAGCAT,A0303,sample_1_m25,0 diff --git a/notebooks/barcodes/AT-1734_retrieval_day_15_reward_2_tag-list.csv b/notebooks/barcodes/AT-1734_retrieval_day_15_reward_2_tag-list.csv new file mode 100644 index 0000000..7becab8 --- /dev/null +++ b/notebooks/barcodes/AT-1734_retrieval_day_15_reward_2_tag-list.csv @@ -0,0 +1,3 @@ +AACGTTAATCACTCA,A0304,sample_2_m9,0 +CGCGATATGGTCGGA,A0305,sample_2_m11,0 +ATGAGGAATTCCTGC,A0301,sample_2_m18,0 diff --git a/notebooks/barcodes/AT-1756_retrieval_day_15_reward_1_tag-list.csv b/notebooks/barcodes/AT-1756_retrieval_day_15_reward_1_tag-list.csv new file mode 100644 index 0000000..ff4a672 --- /dev/null +++ b/notebooks/barcodes/AT-1756_retrieval_day_15_reward_1_tag-list.csv @@ -0,0 +1,3 @@ +GCAGGAGGTATCAAT,A0310,reward_1_m5,0 +GAATCGTGATTCTTC,A0311,reward_1_m8,0 +ACATGGTCAACGCTG,A0312,reward_1_m15,0 diff --git a/notebooks/barcodes/AT-1756_retrieval_day_15_reward_2_tag-list.csv b/notebooks/barcodes/AT-1756_retrieval_day_15_reward_2_tag-list.csv new file mode 100644 index 0000000..465adf9 --- /dev/null +++ b/notebooks/barcodes/AT-1756_retrieval_day_15_reward_2_tag-list.csv @@ -0,0 +1,3 @@ +AAGCTCGTTGGAAGA,A0307,reward_2_m3,0 +CGGATTCCACATCAT,A0308,reward_2_m12,0 +GTTGATCTATAACAG,A0309,reward_2_m16,0 diff --git a/notebooks/barcodes/check_sharp_quality.ipynb b/notebooks/barcodes/check_sharp_quality.ipynb new file mode 100644 index 0000000..2af1f85 --- /dev/null +++ b/notebooks/barcodes/check_sharp_quality.ipynb @@ -0,0 +1,103 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "d3bf717e-1a40-4f81-a1b0-b727fb8004b0", + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'scanpy'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m 
Traceback (most recent call last)", + "Input \u001b[0;32mIn [1]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mscanpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01msc\u001b[39;00m\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'scanpy'" + ] + } + ], + "source": [ + "import scanpy as sc" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "id": "bb9dda65-cb30-4a62-87fd-2739eb66e72a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: AWS_ACCESS_KEY_ID=ASIAYPY66CWMGNQHAG7V\n", + "env: AWS_SECRET_ACCESS_KEY=Zt6lmgFmhx1Qn1/mTajuJHxORPKjMd7zHo9yxCv3\n", + "env: AWS_SESSION_TOKEN=IQoJb3JpZ2luX2VjEL///////////wEaCXVzLWVhc3QtMSJHMEUCIBOn2n7FYhkwDrJArswLQBQzsB8bEcfdUZp6ZJWuQkNlAiEA/ZQVvnqwuRjHE0/2cTzrTEwF8bS0pfOp5sd5XU9QANMq+AEIqP//////////ARADGgw1ODM2NDM1Njc1MTIiDMFsXq9WBJoyLEdVQyrMAdIJg0FT0qo+cpTVLPKnFLvyQCjzH/7ZlWLd5w1NkTkMDIR4W/d+02fgfncbF3cGwglqlO18saOqvMTOjEUvvvhihRLRzuSajiF3sqG6wsdwu6IgCyk6xdYap84Wt0uo5LshyJAma2fK31yVPTip6n6nxoCcTmsJjhvFtMRGlgiC4bIAeW2lC8lqGHPmyz//tutqj9fZeUR0qmGtr1ium7Gg39WX5IzOgHAWTbb/HSspeP0+xDVfdzOHZyA47RVNc2EIwIXoHCntzPmUEjCNjYqbBjqYAcTJn0OZLrzdFL5RsIVXPaoQMedyiuxr27Bmz4QZoB32whcGu/0osFhwL4la0v+BUXuz2zGPB+byyrGkMB249QQitouR1DIZ/mCUnEz6HvCEQwdH/yIFkeNcY6p+rV/1sOa7op3p43reWd7s9ui3mlQ4QDi63ZJG/aBmGMwmPrCMQVgW9vJR5vr+LjV4YWxqVJ1PqOiZnGf+\n", + "2021-10-07 15:31:32 agc-583643567512-us-east-1\n", + "2021-10-07 15:28:07 cdktoolkit-stagingbucket-d49u1xfb0sc4\n", + "2019-10-09 13:04:06 cf-templates-umiwbnq3566w-us-east-1\n", + "2019-10-10 12:46:54 dp-daily-reports\n", + "2017-09-26 10:15:15 dp-lab-data\n", + "2019-05-23 12:34:58 dp-lab-data-public\n", + "2018-03-01 13:29:07 dp-lab-glacier\n", + "2020-11-06 23:02:27 dp-lab-gwf-core\n", + "2021-07-15 18:14:01 dp-lab-gwf-core2\n", + "2017-09-26 10:17:18 dp-lab-home\n", + "2019-08-14 16:42:43 dp-lab-test\n", + "2019-04-25 
12:35:35 elasticbeanstalk-us-east-1-583643567512\n", + "2019-12-13 15:53:57 scri-computational\n", + "2017-09-26 10:23:50 seqc-public\n" + ] + } + ], + "source": [ + "%env AWS_ACCESS_KEY_ID=ASIAYPY66CWMGNQHAG7V\n", + "%env AWS_SECRET_ACCESS_KEY=Zt6lmgFmhx1Qn1/mTajuJHxORPKjMd7zHo9yxCv3\n", + "%env AWS_SESSION_TOKEN=IQoJb3JpZ2luX2VjEL///////////wEaCXVzLWVhc3QtMSJHMEUCIBOn2n7FYhkwDrJArswLQBQzsB8bEcfdUZp6ZJWuQkNlAiEA/ZQVvnqwuRjHE0/2cTzrTEwF8bS0pfOp5sd5XU9QANMq+AEIqP//////////ARADGgw1ODM2NDM1Njc1MTIiDMFsXq9WBJoyLEdVQyrMAdIJg0FT0qo+cpTVLPKnFLvyQCjzH/7ZlWLd5w1NkTkMDIR4W/d+02fgfncbF3cGwglqlO18saOqvMTOjEUvvvhihRLRzuSajiF3sqG6wsdwu6IgCyk6xdYap84Wt0uo5LshyJAma2fK31yVPTip6n6nxoCcTmsJjhvFtMRGlgiC4bIAeW2lC8lqGHPmyz//tutqj9fZeUR0qmGtr1ium7Gg39WX5IzOgHAWTbb/HSspeP0+xDVfdzOHZyA47RVNc2EIwIXoHCntzPmUEjCNjYqbBjqYAcTJn0OZLrzdFL5RsIVXPaoQMedyiuxr27Bmz4QZoB32whcGu/0osFhwL4la0v+BUXuz2zGPB+byyrGkMB249QQitouR1DIZ/mCUnEz6HvCEQwdH/yIFkeNcY6p+rV/1sOa7op3p43reWd7s9ui3mlQ4QDi63ZJG/aBmGMwmPrCMQVgW9vJR5vr+LjV4YWxqVJ1PqOiZnGf+\n", + "\n", + "!aws s3 ls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e770af7a-d1ea-429d-aea8-79be0d4d47d0", + "metadata": {}, + "outputs": [], + "source": [ + "!aws s3 cp s3://dp-lab-data/collaborators/priya/MemConsolidationVr/AT-1447_Ret_R1_CPL/cellplex_results/QC/AT-1447_Ret_R1.QC.h5ad AT-1447_Ret_R1.QC.h5ad " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f001dec-1aa7-4a01-9cac-49252bb559e4", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/joe_samples.csv 
b/notebooks/joe_samples.csv new file mode 100644 index 0000000..3511c61 --- /dev/null +++ b/notebooks/joe_samples.csv @@ -0,0 +1,27 @@ +sample_name, aws_path +epi_Ru581D_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581D_MITO/epi_mito-tracing-outs +non_epi_Ru581D_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581D_MITO/non_epi_mito-tracing-outs +epi_Ru581b_T1_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581b_T1_MITO/epi_mito-tracing-outs +non_epi_Ru581b_T1_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581b_T1_MITO/non_epi_mito-tracing-outs +epi_Ru581c-LN1_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581c-LN1_MITO/epi_mito-tracing-outs +non_epi_Ru581c-LN1_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581c-LN1_MITO/non_epi_mito-tracing-outs +epi_Ru1083_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1759_Ru1083_MITO/epi_mito-tracing-outs +non_epi_Ru1083_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1759_Ru1083_MITO/non_epi_mito-tracing-outs +epi_MSK_LX_1083c_T_2_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_MSK_LX_1083c_T_2_MITO/epi_mito-tracing-outs +non_epi_MSK_LX_1083c_T_2_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_MSK_LX_1083c_T_2_MITO/non_epi_mito-tracing-outs +epi_POSIE_101920_T_1_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1761_POSIE_101920_T_1_MITO/epi_mito-tracing-outs +non_epi_POSIE_101920_T_1_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1761_POSIE_101920_T_1_MITO/non_epi_mito-tracing-outs +epi_Ru1083d_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1762_Ru1083d_MITO/epi_mito-tracing-outs +non_epi_Ru1083d_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1762_Ru1083d_MITO/non_epi_mito-tracing-outs +epi_Ru1250C_T_1_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1763_Ru1250C_T_1_MITO/epi_mito-tracing-outs 
+non_epi_Ru1250C_T_1_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1763_Ru1250C_T_1_MITO/non_epi_mito-tracing-outs +epi_MSK_LX_1250b_PM_1_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1764_MSK_LX_1250b_PM_1_MITO/epi_mito-tracing-outs +non_epi_MSK_LX_1250b_PM_1_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1764_MSK_LX_1250b_PM_1_MITO/non_epi_mito-tracing-outs +epi_Ru1250D_T_1_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1764_Ru1250D_T_1_MITO/epi_mito-tracing-outs +non_epi_Ru1250D_T_1_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1764_Ru1250D_T_1_MITO/non_epi_mito-tracing-outs +epi_Ru1250e_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1765_Ru1250e_MITO/epi_mito-tracing-outs +non_epi_Ru1250e_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1765_Ru1250e_MITO/non_epi_mito-tracing-outs +epi_MSK_LX_1250f_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1766_MSK_LX_1250f_MITO/epi_mito-tracing-outs +non_epi_MSK_LX_1250f_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1766_MSK_LX_1250f_MITO/non_epi_mito-tracing-outs +epi_Ru263_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_Ru263_MITO/epi_mito-tracing-outs +non_epi_Ru263_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_Ru263_MITO/non_epi_mito-tracing-outs \ No newline at end of file diff --git a/notebooks/joe_samples_unarchive.txt b/notebooks/joe_samples_unarchive.txt new file mode 100644 index 0000000..1243e83 --- /dev/null +++ b/notebooks/joe_samples_unarchive.txt @@ -0,0 +1,14 @@ +,joe_sample_name,AWS_storage,sample +0,RU1083_LIV,s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1759_Ru1083_MITO/,Ru1083_MITO +1,RU1083_T2,s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_MSK_LX_1083c_T_2_MITO/,MSK_LX_1083c_T_2_MITO +2,RU263_PDX,s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_Ru263_MITO/,Ru263_MITO 
+3,RU1083_T1,s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1761_POSIE_101920_T_1_MITO/,POSIE_101920_T_1_MITO +4,RU1083_ST,s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1762_Ru1083d_MITO/,Ru1083d_MITO +5,RU1250_T1,s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1763_Ru1250C_T_1_MITO/,Ru1250C_T_1_MITO +6,RU1250_PL,s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1764_MSK_LX_1250b_PM_1_MITO/,MSK_LX_1250b_PM_1_MITO +7,RU1250_T2,s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1764_Ru1250D_T_1_MITO/,Ru1250D_T_1_MITO +8,RU1250_ASC1,s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1765_Ru1250e_MITO/,Ru1250e_MITO +9,RU1250_ASC2,s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1766_MSK_LX_1250f_MITO/,MSK_LX_1250f_MITO +10,RU581_LIV,s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/Ru581D_MITO/,Ru581D_MITO +11,RU581_Ta,s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581b_T1_MITO/,Ru581b_T1_MITO +13,RU581_LNa,s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581c-LN1_MITO/,Ru581c-LN1_MITO \ No newline at end of file diff --git a/notebooks/updated.txt b/notebooks/updated.txt new file mode 100644 index 0000000..a67d564 --- /dev/null +++ b/notebooks/updated.txt @@ -0,0 +1,7 @@ +Updated notebooks: + +Run_CellRangerVdj.ipynb +Run_CellRangerGex.ipynb +Run_CellRangerAtac.ipynb +Run_FastQC.ipynb +utils/utils.py \ No newline at end of file diff --git a/notebooks/utils/utils.py b/notebooks/utils/utils.py index edcbe42..72ae677 100644 --- a/notebooks/utils/utils.py +++ b/notebooks/utils/utils.py @@ -5,35 +5,13 @@ import importlib from mysql.connector import connect, Error +########## SCRIdb queries ########## -# Numpy encoder for JSON from pandas series -class NpEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, np.integer): - return int(obj) - elif isinstance(obj, np.floating): - return float(obj) - elif isinstance(obj, np.ndarray): - return obj.tolist() - else: - return super(NpEncoder, self).default(obj) - - -# 
from SCRIdb -def get_s3_objects(bucket, key, pattern, full_uri=False): - s3r = boto3.resource("s3") - bucket_s3 = s3r.Bucket(bucket) - objects = [] - for obj in bucket_s3.objects.filter(Prefix=key): - hit = pattern.search(obj.key) - if hit: - objects.append(obj.key) - if full_uri: - objects = [f"s3://{bucket}/{o}" for o in objects] - return objects - +def execute_query(query, creds): + + user = creds['user'] + password = creds['password'] -def execute_query(query, user, password): with connect( host="peer-lab-db.cggxmlwgzzpw.us-east-1.rds.amazonaws.com", database="peer_lab_db", @@ -45,52 +23,62 @@ def execute_query(query, user, password): result = cursor.fetchall() return result +def sample_scridb_info(querys, query_col, creds): + + samples = get_sample(querys, query_col, creds) + + species = [] + sc_tech = [] + proj_id = [] + reference = [] + + for query in querys: + species += get_species(query, query_col, creds) + sc_tech += get_sc_tech(query, query_col, creds) + proj_id += get_project_id(query, query_col, creds) + reference += get_reference(query, query_col, creds) + + samples['species'] = species + samples['sc_tech'] = sc_tech + samples['project_id'] = proj_id + samples['reference'] = reference + + return samples -# Get fastq file paths on S3 for each file id -# Returns dictionary from id to s3 path -# Throws exception if FASTQs don't exist for any id -def get_fastqs( - path: str, # path to directory containing FASTQ files - fastq_file_ids: list = None, # FASTQ file ids needed for this run type (e.g. I1, R1, R2, etc.) - folder: str = "", -): - fastq_map = dict() - _, bucket, key, _, _ = urllib.parse.urlsplit(f"{path}/{folder}") - # User may specify exactly which files are needed - if fastq_file_ids: - for fid in fastq_file_ids: - files = get_s3_objects( - bucket, key.lstrip("/"), - re.compile(f"_{fid}_\d{{3}}.fastq.gz$") - ) - try: - assert files, f"AssertionError: Missing `{fid}` archives!" 
- fastq_map[fid] = [os.path.join("s3://", bucket, str(f)) for f in files] - except AssertionError as err: - logging.warning("%s\n\t %s", err, path) - return - # Default: get all FASTQs - else: - files = get_s3_objects( - bucket, key.lstrip("/"), - re.compile(r"_\d{3}.fastq.gz$") - ) - fastq_map["All"] = [os.path.join("s3://", bucket, str(f)) for f in files] - - return fastq_map - - -# Extract FASTQ sample name from list of files -# Note: FASTQ name is file name up to lane id (e.g. L001, L002, etc.) -def get_fastqs_name(fastqs): - fastq_name_re = r".*/(.*)_S\d+_L\d{3}_[A-Za-z]\d_\d{3}.fastq.gz$" - fastq_names = [re.match(fastq_name_re, x)[1] for x in fastqs] - assert len(set(fastq_names)) == 1 # make sure all names are same - return fastq_names[0] +def get_sample(querys, query_col, creds): + user = creds['user'] + password = creds['password'] + + try: + table_sample_data = "peer_lab_db.sample_data" + query = f""" + SELECT Sample, AWS_storage, id + FROM {table_sample_data} + """ + + if len(querys) != 1: + query += f"WHERE {table_sample_data}.{query_col} IN {tuple(querys)}" + else: + query += f'WHERE {table_sample_data}.{query_col} = "{querys[0]}"' + + samples = execute_query(query, creds) + samples = pd.DataFrame(samples) + samples.columns = ['Sample', 'AWS_storage', 'id'] + samples = samples.set_index('Sample') + + samples['AWS_storage'] = samples['AWS_storage'].str.strip('/') + return samples + + except Error as e: + print(f"Error: {e}") -# Get species from database for given sample -def get_species(sample_id, user, password): + +def get_species(query, query_col, creds): + + user = creds['user'] + password = creds['password'] + try: table_sample_data = "peer_lab_db.sample_data" table_species = "peer_lab_db.species" @@ -102,15 +90,23 @@ def get_species(sample_id, user, password): ON {table_species}.id = {table_genome_idx}.species_id LEFT JOIN {table_sample_data} ON {table_genome_idx}.id = {table_sample_data}.genomeIndex_id - WHERE {table_sample_data}.id = 
{sample_id} + WHERE {table_sample_data}.{query_col} = "{query}" """ - result = execute_query(query, user, password)[0][0] - return result.lower() + + species = [] + results = execute_query(query, creds) + for result in results: + species.append(result[0].lower()) + return species except Error as e: print(f"Error: {e}") + +def get_sc_tech(query, query_col, creds): + + user = creds['user'] + password = creds['password'] -def get_sc_tech(sample_id, user, password): try: table_sample_data = "peer_lab_db.sample_data" table_sc_tech = "peer_lab_db.sc_tech" @@ -122,111 +118,448 @@ def get_sc_tech(sample_id, user, password): ON {table_sc_tech}.id = {table_genome_idx}.scTech_id LEFT JOIN {table_sample_data} ON {table_genome_idx}.id = {table_sample_data}.genomeIndex_id - WHERE {table_sample_data}.id = {sample_id} + WHERE {table_sample_data}.{query_col} = "{query}" """ - result = execute_query(query, user, password)[0][0] - return result + sc_tech = [] + results = execute_query(query, creds) + for result in results: + sc_tech.append(result[0]) + return sc_tech + + except Error as e: + print(f"Error: {e}") + +def get_project_id(query, query_col, creds): + try: + table_sample_data = "peer_lab_db.sample_data" + table_project_data = "peer_lab_db.project_data" + query = f""" + SELECT {table_project_data}.projectName + FROM {table_project_data} + LEFT JOIN {table_sample_data} + ON {table_project_data}.id = {table_sample_data}.projectData_id + WHERE {table_sample_data}.{query_col} = "{query}" + """ + + proj_id = [] + results = execute_query(query, creds) + for result in results: + proj_id.append(result[0]) + return proj_id + except Error as e: print(f"Error: {e}") -def get_sample_id(sample_name, user, password): +def get_reference(query, query_col, creds): + + user = creds['user'] + password = creds['password'] + try: table_sample_data = "peer_lab_db.sample_data" + table_genome_idx = "peer_lab_db.genome_index" query = f""" - SELECT {table_sample_data}.id - FROM {table_sample_data} - 
WHERE {table_sample_data}.Sample="{sample_name}" + SELECT {table_genome_idx}.gIndex + FROM {table_genome_idx} + LEFT JOIN {table_sample_data} + ON {table_genome_idx}.id = {table_sample_data}.genomeIndex_id + WHERE {table_sample_data}.{query_col} = "{query}" """ - result = execute_query(query, user, password)[0][0] - return result + + reference = [] + results = execute_query(query, creds) + for result in results: + reference.append(result[0]) + return reference + except Error as e: print(f"Error: {e}") + + +########## FASTQ map ########## + +fastq_map = { + 'CellRangerVdj': ['I1','R1','R2'], + 'Hashtag': ['R1','R2'], + 'CiteSeq': ['R1','R2'], + 'AsapSeq': ['R1','R2','R3'], + 'CellRangerATAC': ['I1','R1','R2','R3'], + 'CellRangerArc': ['I1','R1','R2','R3'], + 'CellRangerGex': ['I1','R1','R2'], + 'MitoTracing': ['R1', 'R2'], +} + + +########## AWS S3 functions ########## + +def get_s3_objects(bucket, key, pattern, full_uri=False): + s3r = boto3.resource("s3") + bucket_s3 = s3r.Bucket(bucket) + objects = [] + for obj in bucket_s3.objects.filter(Prefix=key): + hit = pattern.search(obj.key) + if hit: + objects.append(obj.key) + if full_uri: + objects = [f"s3://{bucket}/{o}" for o in objects] + return objects + +# Get fastq file paths on S3 for each file id +# Returns dictionary from id to s3 path +# Throws exception if FASTQs don't exist for any id +def get_fastqs( + path: str, # path to directory containing FASTQ files + fastq_file_ids: list, # FASTQ file ids needed for this run type (e.g. I1, R1, R2, etc.) + folder: str = "", +): + fastq_map = dict() + _, bucket, key, _, _ = urllib.parse.urlsplit(f"{path}/{folder}") + for fid in fastq_file_ids: + files = get_s3_objects( + bucket, key.lstrip("/"), + re.compile(f"_{fid}_\d{{3}}.fastq.gz$") + ) + try: + assert files, f"AssertionError: Missing `{fid}` archives!" 
+ fastq_map[fid] = [os.path.join("s3://", bucket, str(f)) for f in files] + except AssertionError as err: + logging.warning("%s\n\t %s", err, path) + return + return fastq_map + +# Get every FASTQ in a folder +def get_all_fastqs( + path: str, # path to directory containing FASTQ files + folder: str = "", +): + _, bucket, key, _, _ = urllib.parse.urlsplit(f"{path}/{folder}") + files = get_s3_objects( + bucket, key.lstrip("/"), + re.compile(f".fastq.gz$") + ) + + try: + fastqs = [os.path.join("s3://", bucket, str(f)) for f in files] + except AssertionError as err: + logging.warning("%s\n\t %s", err, path) + return + return fastqs + + +########## Reference map ########## +reference_map = {} + +reference_map['CellRangerArc'] = { + "human": "https://cf.10xgenomics.com/supp/cell-arc/refdata-cellranger-arc-GRCh38-2020-A.tar.gz", + "mouse": "https://cf.10xgenomics.com/supp/cell-arc/refdata-cellranger-arc-mm10-2020-A-2.0.0.tar.gz" +} + +reference_map['CellRangerAtac'] = { + "human":"https://cf.10xgenomics.com/supp/cell-atac/refdata-cellranger-arc-GRCh38-2020-A-2.0.0.tar.gz", + "mouse":"https://cf.10xgenomics.com/supp/cell-atac/refdata-cellranger-arc-mm10-2020-A-2.0.0.tar.gz" +} + +reference_map['CellRangerGex'] = { + "human":"https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-GRCh38-2020-A.tar.gz", + "mouse":"https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-mm10-2020-A.tar.gz" +} + +reference_map['CellRangerCellPlex'] = { + "human":"https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-GRCh38-2020-A.tar.gz", + "mouse":"https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-mm10-2020-A.tar.gz" +} + +reference_map['CellRangerVdj'] = { + "human":"GRCh38", + "mouse":"GRCm38" +} + +def update_ref(samples, prefix): + for sample, row in samples.iterrows(): + species = row['species'] + + if not row['reference']: + if species in ['human', 'mouse']: + samples.loc[sample, 'reference'] = reference_map[prefix][row['species']] + + elif prefix.startswith('CellRanger'): + if not 
row['reference'].startswith('https'): + + if not species in ['human', 'mouse']: + print(f'{species} reference not in database. Manually change "reference" field') + samples.loc[sample, 'reference'] = np.nan + + else: + samples.loc[sample, 'reference'] = reference_map[prefix][row['species']] + + return samples + + + +########## SHARP functions ########## + +# Priority of GEX data if multiple outputs are found in db +sharp_wl_priority_map = { + m: ["SEQC", "CR_GEX"] for m in ["Hashtag", "CiteSeq"] + } +# File patterns to search for in S3 for each accompanying pipeline +sharp_wl_pattern_map = { + "SEQC": "_dense.csv$", + "CR_GEX": "/filtered_feature_bc_matrix/barcodes.tsv.gz$", + "CR_ATAC": "/filtered_peak_bc_matrix/barcodes.tsv" +} +sharp_wl_method_map = { + "SEQC": "SeqcDenseCountsMatrixCsv", + "CR_GEX": "10x", + "CR_ATAC": "10x", +} +# Names of FASTQ inputs in WDL; order is same as fastq_file_ids +# TODO: Ask to change all inputs to "fastq{file_id}" or "uriFastq{file_id}" +sharp_fastq_inputs_map = { + m: ["uriFastqR1", "uriFastqR2"] for m in ["Hashtag", "CiteSeq"] +} + +# Get s3 path of existing GEX analysis files +from mysql.connector import connect, Error +def get_wl_dir(sample_id, creds): + + user = creds['user'] + password = creds['password'] -def get_project_id(sample_id, user, password): try: table_sample_data = "peer_lab_db.sample_data" - table_project_data = "peer_lab_db.project_data" + table_stats_data = "peer_lab_db.stats_data" + table_stats_data = "peer_lab_db.stats_data" + table_hashtag_lib = "peer_lab_db.hashtag_lib" + table_genome_index = "peer_lab_db.genome_index" + table_sc_tech = "peer_lab_db.sc_tech" query = f""" - SELECT {table_project_data}.projectName - FROM {table_project_data} - LEFT JOIN {table_sample_data} - ON {table_project_data}.id = {table_sample_data}.projectData_id + SELECT {table_stats_data}.analysis_storage + FROM {table_sample_data} + LEFT JOIN {table_stats_data} + ON {table_stats_data}.sampleData_id = {table_sample_data}.id + LEFT 
JOIN {table_hashtag_lib} + ON {table_hashtag_lib}.sampleData_id = {table_sample_data}.id + LEFT JOIN {table_genome_index} + ON {table_genome_index}.id = {table_hashtag_lib}.genomeIndex_id + LEFT JOIN {table_sc_tech} + ON {table_sc_tech}.id = {table_genome_index}.scTech_id WHERE {table_sample_data}.id = {sample_id} """ - result = execute_query(query, user, password)[0][0] - return result + result = execute_query(query, creds)[0][0] + if result: + return result + # As backup, get AWS storage location directly from sample_data + else: + query = f""" + SELECT AWS_storage + FROM {table_sample_data} + WHERE {table_sample_data}.id = {sample_id} + """ + result = execute_query(query, creds)[0][0] + return result except Error as e: print(f"Error: {e}") +# Get white list method and associated file +# Throws exception if no white list exists +def get_wl_params( + sample_id: str, + creds, + prefix, + wl_dir +): + + user = creds['user'] + password = creds['password'] -def get_SEQC_version(loc): - try: - cmd = f"aws s3 cp {loc}/seqc-results/seqc_log.txt -" - out = subprocess.run( - shlex.split(cmd), universal_newlines=True, capture_output=True - ).__dict__["stdout"] - version = re.match(r".*SEQC=v(\d+\.\d+\.\d+).*", out)[1] - return version - except: - return "N/A" + wl_params = dict() + # wl_dir = get_wl_dir(sample_id, creds) + wl_patterns = [sharp_wl_pattern_map[p] for p in sharp_wl_priority_map[prefix]] -def get_file_prefix(loc): try: - cmd = f"aws s3 ls {loc}/seqc-results/" - out = subprocess.run( - shlex.split(cmd), universal_newlines=True, capture_output=True - ).__dict__["stdout"] - - # Note: I'm expecting the aligned bam file to be in loc - bam_pattern = re.compile(r"(.*)_Aligned\.out\.bam$") - filename = list(filter(bam_pattern.match, out.split()))[0] - file_prefix = re.match(bam_pattern, filename)[1] - return file_prefix - except: - raise ValueError(f"BAM file not found in {loc}") - - -def get_cr_reference(sample_id, prefix, user, password): - return get_reference( - 
sample_id, "CellRanger", prefix, user, password, - ) + # Check white list file exists before loading info from database + assert wl_dir, f"Empty analysis storage for sample id {sample_id}" + _, bucket, key, _, _ = urllib.parse.urlsplit(wl_dir) + # White list file and method is first entry found on S3 + wl = pd.DataFrame( + [get_s3_objects(bucket, key.strip("/"), re.compile(p)) for p in wl_patterns], + index = sharp_wl_priority_map[prefix], + ).dropna(how="all") + try: + wl_key = wl.iloc[0,0] # if empty, missing white list file + wl_params["uri"] = os.path.join("s3://", bucket, wl_key) + wl_params["method"] = sharp_wl_method_map[wl.index[0]] + except IndexError: + logging.error( + "Path to barcodes or counts matrix of GEX data is missing!" + ) + return + + except AssertionError: + logging.warning(f"Path to GEX output results is missing for {sample_id}!") + return + return wl_params -def get_reference( + +def get_bc_params( sample_id, - pipeline, - prefix, - user, - password, + creds, ): - # Get species from database to decide reference - species = get_species(sample_id, user, password) + user = creds['user'] + password = creds['password'] + + bc_params = dict() + + # JSON of bc and UMI positions are stored in database + # First check dense matrix exists before loading JSON from database + bc_json = get_bc_json(sample_id, creds) + bc_pos = json.loads(bc_json) + bc_params["cb"] = bc_pos["cellbarcode"] + bc_params["umi"] = bc_params["cb"] + bc_pos["UMIs"] + + # Get bc sequence data from database + bcs = get_bcs(sample_id, creds) + if not bcs: + logging.warning(f"Barcodes data Empty:\n\t {db_connect.cur.statement}") + return + for bc in bcs: + try: + assert bc[0], "AssertionError: Missing sequence barcodes!" 
+ assert bc[1], "AssertionError: Missing barcode IDs" + except AssertionError as err: + logging.warning(f"{err}:\n\t {db_connect.cur.statement}") + return + + barcodes = pd.DataFrame(bcs, columns=["sequence", "code", "label", "bp_shift"]) + conjugation = barcodes["code"].str.get(0) + if conjugation.nunique() != 1: + logging.warning( + f"Sample has multiple hashtag barcode categories and will not be processed!" + ) + return + else: + bc_params["conjugation"] = conjugation.values[0] - # Map to reference locations - try: - with open("utils/genomes-data.json") as f: - genomes_data = json.load(f) - return genomes_data[pipeline][prefix][species] - except: - raise ValueError(f"Unknown Species: {species}") + if barcodes["bp_shift"].nunique() != 1: + logging.warning( + f"Sample {sample_id} has hashtag barcode categories, with bp-shift length/s " + f"{barcodes['bp_shift'].unique()}, and will not be processed!" + ) + return + else: + bc_params["bp_shift"] = int(barcodes["bp_shift"][0]) + bc_params["seq_length"] = bc_params["bp_shift"] + barcodes["sequence"].apply(len).max() + + return bc_params -def get_bc_whitelist(sample_id, user, password): - # Get version from database to decide whitelist - sc_tech = get_sc_tech(sample_id, user, password) +# Get bc sequence data from database +def get_bcs(sample_id, creds): + user = creds['user'] + password = creds['password'] - # Map to reference locations - if "V3" in sc_tech: - return "s3://seqc-public/barcodes/ten_x_v3/flat/3M-february-2018.txt" - elif "V2" in sc_tech: - return "s3://seqc-public/barcodes/ten_x_v2/flat/737K-august-2016.txt" - else: - raise ValueError(f"Unknown Technology: {sc_tech}") + try: + table_sample_data = "peer_lab_db.sample_data" + table_hashtag_barcodes = "peer_lab_db.hashtag_barcodes" + table_hashtags = "peer_lab_db.hashtags" + query = f""" + SELECT barcode_sequence, concat(substring(category, -1), barcode), + demultiplex_label, bp_shift FROM {table_hashtags} + LEFT JOIN {table_hashtag_barcodes} + ON 
{table_hashtag_barcodes}.id = {table_hashtags}.hashtagBarcodes_id + WHERE {table_hashtags}.sampleData_id = {sample_id} + """ + result = execute_query(query, creds) + return result + except Error as e: + print(f"Error: {e}") + +# Get bc and UMI positions from database stored in JSON format +def get_bc_json(sample_id, creds): + + user = creds['user'] + password = creds['password'] + + try: + table_sample_data = "peer_lab_db.sample_data" + table_stats_data = "peer_lab_db.stats_data" + table_stats_data = "peer_lab_db.stats_data" + table_hashtag_lib = "peer_lab_db.hashtag_lib" + table_genome_index = "peer_lab_db.genome_index" + table_sc_tech = "peer_lab_db.sc_tech" + query = f""" + SELECT barcodes + FROM {table_sample_data} + LEFT JOIN {table_stats_data} + ON {table_stats_data}.sampleData_id = {table_sample_data}.id + LEFT JOIN {table_hashtag_lib} + ON {table_hashtag_lib}.sampleData_id = {table_sample_data}.id + LEFT JOIN {table_genome_index} + ON {table_genome_index}.id = {table_hashtag_lib}.genomeIndex_id + LEFT JOIN {table_sc_tech} + ON {table_sc_tech}.id = {table_genome_index}.scTech_id + WHERE {table_sample_data}.id = {sample_id} + """ + result = execute_query(query, creds)[0][0] + return result + except Error as e: + print(f"Error: {e}") + +# Get S3 URIs of dense count matrix files under a path +# Returns sorted list of s3 paths whose key ends in `_dense.csv` +# Returns an empty list if no such file exists +def get_denseCountMatrix( + path: str, # S3 path (prefix) to search for dense count matrices +): + _, bucket, key, _, _ = urllib.parse.urlsplit(path) + results = get_s3_objects( + bucket, key.lstrip("/"), + re.compile(f"_dense.csv$") + ) + whitelist = [] + for result in results: + whitelist.append(os.path.join("s3://", bucket, result)) + whitelist.sort() + return whitelist + + +# Function to reformat barcode labels for Sharp +def reformat_bc_label(label): + label = label.encode('ascii', 'namereplace').decode() + label = label.replace("\\N", "").replace(" ", "_") + return label + + +########## Misc 
functions ########## + +# Extract FASTQ sample name from list of files +# Note: FASTQ name is file name up to lane id (e.g. L001, L002, etc.) +def get_fastqs_name(fastqs): + fastq_name_re = r".*/(.*)_S\d+_L\d{3}_[A-Za-z]\d_\d{3}.fastq.gz$" + fastq_names = [re.match(fastq_name_re, x)[1] for x in fastqs] + assert len(set(fastq_names)) == 1 # make sure all names are same + return fastq_names[0] +########## Run workflow ########## + +# Numpy encoder for JSON from pandas series +class NpEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + else: + return super(NpEncoder, self).default(obj) + def run( workflow_path: str, execp: str, @@ -250,56 +583,38 @@ def run( return out -# Get bc sequence data from database -def get_bcs(sample_id, user, password): - try: - table_sample_data = "peer_lab_db.sample_data" - table_hashtag_barcodes = "peer_lab_db.hashtag_barcodes" - table_hashtags = "peer_lab_db.hashtags" - query = f""" - SELECT barcode_sequence, concat(substring(category, -1), barcode), - demultiplex_label, bp_shift FROM {table_hashtags} - LEFT JOIN {table_hashtag_barcodes} - ON {table_hashtag_barcodes}.id = {table_hashtags}.hashtagBarcodes_id - WHERE {table_hashtags}.sampleData_id = {sample_id} - """ - result = execute_query(query, user, password) - return result - except Error as e: - print(f"Error: {e}") - - -# Create csv files and upload to S3 -# Note: follow CellRanger instructions for naming columns: -# https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/multi#examples -def get_cmo_files( - samples: pd.DataFrame, - user: str, - password: str, +# Get S3 URIs of `.txt` files (mito whitelists) under a path +# Returns sorted list of s3 paths whose key matches `.txt$` +# Returns an empty list if no such file exists +def get_mito_whitelist( + path: str, # S3 path (prefix) to search for whitelist files 
): - cmo_files = dict() - for name, sample in samples.iterrows(): - - # Get barcodes from database - bcs = pd.DataFrame.from_records( - get_bcs(sample['Sample_ID'], user, password), - columns=["sequence", "id", "sample_id", "bp_shift"], - ) - - # CMO map file - cmo_map = bcs[["sample_id", "id"]].copy().rename( - {"id": "cmo_ids"}, axis=1, + _, bucket, key, _, _ = urllib.parse.urlsplit(path) + results = get_s3_objects( + bucket, key.lstrip("/"), + re.compile(f".txt$") ) - cmo_map["sample_id"] = cmo_map["sample_id"].str.replace(" ","_") + whitelist = [] + for result in results: + whitelist.append(os.path.join("s3://", bucket, result)) + whitelist.sort() + return whitelist - # CMO reference file - cmo_ref = bcs[["id", "sequence"]].copy() - cmo_ref["name"] = cmo_ref["id"] - cmo_ref["read"] = "R2" - cmo_ref["pattern"] = "5P(BC)" - cmo_ref["feature_type"] = "Multiplexing Capture" - order = ["id", "name", "read", "pattern", "sequence", "feature_type"] - cmo_files[name] = (cmo_map, cmo_ref[order]) - - return cmo_files +# Get S3 URIs of files under a path that end with a given extension +# Returns sorted list of s3 paths whose key matches `.{file_end}$` +# Returns an empty list if no such file exists +def get_aws_file( + path: str, # S3 path (prefix) to search + file_end: str # Extension of the file +): + _, bucket, key, _, _ = urllib.parse.urlsplit(path) + results = get_s3_objects( + bucket, key.lstrip("/"), + re.compile(f".{file_end}$") + ) + whitelist = [] + for result in results: + whitelist.append(os.path.join("s3://", bucket, result)) + whitelist.sort() + return whitelist