diff --git a/data/tcr_epitope_binding/example_processing_and_templates.ipynb b/data/tcr_epitope_binding/example_processing_and_templates.ipynb new file mode 100644 index 000000000..59528d11f --- /dev/null +++ b/data/tcr_epitope_binding/example_processing_and_templates.ipynb @@ -0,0 +1,1486 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "27c08f91-3fa0-4800-8f6a-96a96f665cad", + "metadata": {}, + "source": [ + "# TCR-Epitope binding, Weber et al." + ] + }, + { + "cell_type": "markdown", + "id": "6ef172b9-aad2-47da-bf4c-844a2a07ee8c", + "metadata": {}, + "source": [ + "Original data repository: https://tdcommons.ai/multi_pred_tasks/tcrepitope/" + ] + }, + { + "cell_type": "markdown", + "id": "7d18c95d-2ec6-45e1-addc-54a890097b8e", + "metadata": {}, + "source": [ + "# Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "cf59e3e9-8061-4022-9eae-e978311b4155", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import yaml\n", + "from tdc.multi_pred import TCREpitopeBinding " + ] + }, + { + "cell_type": "markdown", + "id": "a6751ff9-2e3e-4d01-8395-7a5ae0c200d7", + "metadata": {}, + "source": [ + "# Data processing" + ] + }, + { + "cell_type": "markdown", + "id": "a1169ad2-e4bb-41c6-9625-6d1644c44a5b", + "metadata": {}, + "source": [ + "## Download data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "7bb8eb5e-f513-40d2-a68c-7cda1a51ad31", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fn_data_original = \"data_original.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b39a142e-ccbc-49d2-98b0-a5f9bde9fd27", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found local copy...\n", + "Loading...\n", + "Done!\n" + ] + } + ], + "source": [ + "data = TCREpitopeBinding(name = 'weber', path = './data')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": 
"26d9f62a-07f5-4113-8161-d5dfcf0bfb71", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "data.get_data().to_csv(fn_data_original, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "43873fc3-20a8-487d-a7c5-33bd58414159", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 30664\r\n", + "drwxr-xr-x 3 cody staff 96B Mar 28 01:21 \u001b[34mdata\u001b[m\u001b[m\r\n", + "-rw-r--r-- 1 cody staff 15M Mar 28 01:22 data_original.csv\r\n", + "-rw-r--r-- 1 cody staff 47K Mar 28 01:18 example_processing_and_templates.ipynb\r\n", + "-rw-r--r--@ 1 cody staff 5.3K Mar 28 01:18 transform.py\r\n" + ] + } + ], + "source": [ + "!ls -lh" + ] + }, + { + "cell_type": "markdown", + "id": "d9cda29a-a133-4f0e-992b-e77c9070ee93", + "metadata": {}, + "source": [ + "## Load original data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "77f614e7-b133-40bc-8759-2d930e4c120e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epitope_aa,epitope_smi,tcr,tcr_full,label\r\n", + "FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSVWGTGKTYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLIATANQGSEATYESGFVIDKFPISRPNLTFSTLTVSNMSPEDSSIYLCSVWGTGKTYEQYFGPGTRLTVT,1\r\n", + "FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSVWGEGRSYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLIATANQGSEATYESGFVIDKFPISRPNLTFSTLTVSNMSPEDSSIYLCSVWGEGRSYEQYFGPGTRLTVT,1\r\n", + 
"FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSATILAGVPYGEQYF,GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSATILAGVPYGEQYFGPGTRLTVT,1\r\n", + "FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSASEGTSSYEQYF,GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSASEGTSSYEQYFGPGTRLTVT,1\r\n" + ] + } + ], + "source": [ + "!head -n 5 {fn_data_original}" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8f5a0387-f9e3-4e1a-8d14-5df618195f70", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = pd.read_csv(fn_data_original, delimiter=\",\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "55b0bd63-62a0-469e-9d8a-e9ada3fe01c4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epitope_aaepitope_smitcrtcr_fulllabel
0FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSVWGTGKTYEQYFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...1
1FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSVWGEGRSYEQYFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...1
2FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSATILAGVPYGEQYFGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...1
3FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSASEGTSSYEQYFGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...1
4FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CASSFDREVTGELFFGAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL...1
\n", + "
" + ], + "text/plain": [ + " epitope_aa epitope_smi \\\n", + "0 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "1 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "2 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "3 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "4 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "\n", + " tcr tcr_full label \n", + "0 CSVWGTGKTYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... 1 \n", + "1 CSVWGEGRSYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... 1 \n", + "2 CSATILAGVPYGEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... 1 \n", + "3 CSASEGTSSYEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... 1 \n", + "4 CASSFDREVTGELFF GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL... 1 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "95158ac1-05d7-4a21-b8e4-7f720306d331", + "metadata": {}, + "source": [ + "## Add column = field names\n", + "Clean column names (`fields_clean`) and keep original names (`fields_orig`)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ec2458e5-455f-4f03-8ce9-c0d12e9ed371", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['epitope_aa', 'epitope_smi', 'tcr', 'tcr_full', 'label']" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fields_orig = df.columns.tolist()\n", + "fields_orig" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a46dd8ff-37b3-4894-8226-3bf98226dd09", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + " fields_clean = [\n", + " \"epitope_aa\",\n", + " \"epitope_smiles\",\n", + " \"tcr\",\n", + " \"tcr_full\",\n", + " \"binding\",\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": 
"785d37cb-1fb4-4a91-a923-d5a78a37f36a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df.columns = fields_clean" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1bf212cb-1653-457b-9f5d-416d4dd14b53", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epitope_aaepitope_smilestcrtcr_fullbinding
0FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSVWGTGKTYEQYFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...1
1FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSVWGEGRSYEQYFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...1
2FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSATILAGVPYGEQYFGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...1
3FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSASEGTSSYEQYFGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...1
4FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CASSFDREVTGELFFGAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL...1
\n", + "
" + ], + "text/plain": [ + " epitope_aa epitope_smiles \\\n", + "0 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "1 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "2 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "3 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "4 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "\n", + " tcr tcr_full \\\n", + "0 CSVWGTGKTYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n", + "1 CSVWGEGRSYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n", + "2 CSATILAGVPYGEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n", + "3 CSASEGTSSYEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n", + "4 CASSFDREVTGELFF GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL... \n", + "\n", + " binding \n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "1bfaeb22-26fb-4964-a71f-cae8335e5372", + "metadata": {}, + "source": [ + "## Data cleaning" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "7e746003-cb1f-434f-bba6-00f0c439c4ac", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df.epitope_aa = (\n", + " df.epitope_aa.str.strip()\n", + ") # remove leading and trailing white space characters" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "d544fa60-343e-40e1-bd0c-4750f07a7145", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "assert not df.duplicated().sum()" + ] + }, + { + "cell_type": "markdown", + "id": "bc6f52c1-e0f6-48b3-95f4-e36d9a5ecde8", + "metadata": {}, + "source": [ + "## Save to csv" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d6d5efa5-b4b4-4a25-8626-e10f3d691e83", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fn_data_csv = 
\"data_clean.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "727f8d7b-cbb6-43c7-9eab-9d4d65be6b3f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df.to_csv(fn_data_csv, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "63c8d4a4-906e-418d-be39-879365b4dfa0", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-rw-r--r-- 1 cody staff 15M Mar 28 01:23 data_clean.csv\r\n" + ] + } + ], + "source": [ + "!ls -lh {fn_data_csv}" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "a51b9001-25d7-4e0e-a607-477cfc4a9f1c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epitope_aa,epitope_smiles,tcr,tcr_full,binding\r\n", + "FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSVWGTGKTYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLIATANQGSEATYESGFVIDKFPISRPNLTFSTLTVSNMSPEDSSIYLCSVWGTGKTYEQYFGPGTRLTVT,1\r\n", + "FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSVWGEGRSYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLIATANQGSEATYESGFVIDKFPISRPNLTFSTLTVSNMSPEDSSIYLCSVWGEGRSYEQYFGPGTRLTVT,1\r\n", + "FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSATILAGVPYGEQYF,GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSATILAGVPYGEQYFGPGTRLTVT,1\r\n", + "FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSASEGTSSYEQYF,GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSASEGTSSYEQYFGPGTRLTVT,1\r\n" + ] + 
} + ], + "source": [ + "!head -n 5 {fn_data_csv}" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "1a512943-4909-4d56-867d-50c151d8d607", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epitope_aaepitope_smilestcrtcr_fullbinding
0FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSVWGTGKTYEQYFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...1
1FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSVWGEGRSYEQYFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...1
2FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSATILAGVPYGEQYFGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...1
3FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSASEGTSSYEQYFGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...1
4FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CASSFDREVTGELFFGAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL...1
\n", + "
" + ], + "text/plain": [ + " epitope_aa epitope_smiles \\\n", + "0 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "1 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "2 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "3 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "4 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "\n", + " tcr tcr_full \\\n", + "0 CSVWGTGKTYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n", + "1 CSVWGEGRSYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n", + "2 CSATILAGVPYGEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n", + "3 CSASEGTSSYEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n", + "4 CASSFDREVTGELFF GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL... \n", + "\n", + " binding \n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "f3d730ce-fce0-49df-9eb8-b917e945fa9a", + "metadata": {}, + "source": [ + "## Load from csv" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "077b0c5f-8772-4879-9317-3fa28799689b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fn_data_csv = \"data_clean.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "6eaef0e6-2115-4793-ac43-a196b25d47a0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = pd.read_csv(fn_data_csv)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "43619e7c-9c82-4ff0-ae25-403861304635", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epitope_aaepitope_smilestcrtcr_fullbinding
0FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSVWGTGKTYEQYFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...1
1FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSVWGEGRSYEQYFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...1
2FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSATILAGVPYGEQYFGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...1
3FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSASEGTSSYEQYFGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...1
4FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CASSFDREVTGELFFGAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL...1
\n", + "
" + ], + "text/plain": [ + " epitope_aa epitope_smiles \\\n", + "0 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "1 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "2 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "3 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "4 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "\n", + " tcr tcr_full \\\n", + "0 CSVWGTGKTYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n", + "1 CSVWGEGRSYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n", + "2 CSATILAGVPYGEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n", + "3 CSASEGTSSYEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n", + "4 CASSFDREVTGELFF GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL... \n", + "\n", + " binding \n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 " + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "2f06e57c-02c5-493b-af65-c8bb9ac59421", + "metadata": {}, + "source": [ + "# meta YAML" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "49771077-471d-4d71-a9a7-d6b094bbc4f3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epitope_aaepitope_smilestcrtcr_fullbinding
0FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSVWGTGKTYEQYFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...1
1FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSVWGEGRSYEQYFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...1
2FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSATILAGVPYGEQYFGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...1
3FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSASEGTSSYEQYFGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...1
4FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CASSFDREVTGELFFGAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL...1
\n", + "
" + ], + "text/plain": [ + " epitope_aa epitope_smiles \\\n", + "0 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "1 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "2 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "3 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "4 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "\n", + " tcr tcr_full \\\n", + "0 CSVWGTGKTYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n", + "1 CSVWGEGRSYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n", + "2 CSATILAGVPYGEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n", + "3 CSASEGTSSYEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n", + "4 CASSFDREVTGELFF GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL... \n", + "\n", + " binding \n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "d3890961-444e-4a26-b8fc-ed8c4e959af9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "meta = {\n", + " \"name\": \"tcr_epitope_binding\", # unique identifier, we will also use this for directory names\n", + " \"description\": \"\"\"T-cells are an integral part of the adaptive immune system, whose survival, proliferation, \n", + " activationand function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic \n", + " peptides (epitopes).A large repertoire of T-cell receptors with different specificity is needed to provide \n", + " protection against a wide range of pathogens.This new task aims to predict the binding affinity \n", + " given a pair of TCR sequence and epitope sequence.\"\"\",\n", + " \"targets\": [\n", + " {\n", + " \"id\": \"binding\", # name of the column in a tabular dataset\n", + " \"description\": \"TCR epitope binding.\", # 
description of what this column means\n", + " \"units\": \"\", # units of the values in this column (leave empty if unitless)\n", + " \"type\": \"binary classification\", # can be \"categorical\", \"ordinal\", \"continuous\"\n", + " \"names\": [ # names for the property (to sample from for building the prompts)\n", + " \"tcr binding affinity\",\n", + " \"binding affinity\",\n", + " \"binding\",\n", + " \"epitope binding affinity\",\n", + " \"epitope binding\"\n", + "\n", + " ],\n", + " },\n", + " ],\n", + " \"identifiers\": [\n", + " {\n", + " \"id\": \"epitope_smiles\", # column name\n", + " \"type\": \"SMILES\", # can be \"SMILES\", \"SELFIES\", \"IUPAC\", \"OTHER\"\n", + " \"description\": \"epitope smiles\", # description (optional, except for \"OTHER\")\n", + " },\n", + " {\n", + " \"id\": \"epitope_aa\",\n", + " \"type\": \"Other\",\n", + " \"description\": \"epitope amino acid sequence\",\n", + "\n", + " },\n", + " {\n", + " \"id\": \"tcr\",\n", + " \"type\": \"Other\",\n", + " \"description\": \"hypervariable CDR3 loop\",\n", + "\n", + " },\n", + " {\n", + " \"id\": \"tcr_full\",\n", + " \"type\": \"Other\",\n", + " \"description\": \"tcr full amino acid sequence\",\n", + "\n", + " },\n", + " ],\n", + " \"templates\": [\n", + " {\n", + " \"prompt\": \"Please answer the following question.\\nPredict the for .\",\n", + " \"completion\": \"\"\n", + "\n", + " }\n", + " ],\n", + " \"fields\": [\n", + " {\n", + " \"exp_value\": [\n", + " \"values\": [ \n", + " {\n", + " \"name\": \"exp_value\",\n", + " \"column\": \"exp_value\",\n", + " \"text\": \"binding\"\n", + " }\n", + " ]\n", + " ],\n", + " \"molecule\": [\n", + " \"values\": [\n", + " {\n", + " \"name\": \"epitope_smiles\",\n", + " \"column\": \"epitope_smiles\",\n", + " \"text\": \"SMILES\"\n", + "\n", + " },\n", + " {\n", + " \"name\": \"tcr_full\",\n", + " \"column\": \"tcr_full\",\n", + " \"text\":\n", + "\n", + " },\n", + "\n", + " ]\n", + " ],\n", + "\n", + " }\n", + " ],\n", + " \"license\": \"CC BY 
4.0\", # license under which the original dataset was published\n", + " \"links\": [ # list of relevant links (original dataset, other uses, etc.)\n", + " {\n", + " \"url\": \"https://tdcommons.ai/multi_pred_tasks/tcrepitope/\",\n", + " \"description\": \"original data set link\",\n", + " },\n", + " {\n", + " \"url\": \"https://doi.org/10.1093/bioinformatics/btab294\",\n", + " \"description\": \"corresponding publication\",\n", + " },\n", + " ],\n", + " \"benchmarks\": [\n", + " {\n", + " \"name\": \"TDC\",\n", + " \"link\": \"https://tdcommons.ai/\",\n", + " \"split_column\": \"split\",\n", + " },\n", + " ],\n", + " \"num_points\": len(df), # number of datapoints in this dataset\n", + " \"bibtex\": [\n", + " \"\"\"@article{weber2021titan,\n", + " title={TITAN: T-cell receptor specificity prediction with bimodal attention network},\n", + " author={Weber Anna,Born Janis, Martinez Maria Rodriguez},\n", + " journal={Bioinformatics},\n", + " volume={56},\n", + " number={4},\n", + " pages={i237-i234},\n", + " year={2021},\n", + " publisher={Oxford Academic}\n", + " }\"\"\",\n", + " ],\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "ec455cf0-962a-4c0d-bb3e-066e415ffd9b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def str_presenter(dumper, data):\n", + " \"\"\"configures yaml for dumping multiline strings\n", + " Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data\n", + " \"\"\"\n", + " if data.count(\"\\n\") > 0: # check for multiline string\n", + " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data, style=\"|\")\n", + " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data)\n", + "\n", + "\n", + "yaml.add_representer(str, str_presenter)\n", + "yaml.representer.SafeRepresenter.add_representer(\n", + " str, str_presenter\n", + ") # to use with safe_dump" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": 
"580bbd79-4845-4515-be94-3e4a9815d048", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fn_meta = \"meta.yaml\"" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "873fa5dd-9b60-40f5-b537-4d7a206414ea", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "with open(fn_meta, \"w\") as f:\n", + " yaml.dump(meta, f, sort_keys=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "d01686c0-6746-4fc4-b019-350270dfc26f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-rw-r--r-- 1 cody staff 1.7K Mar 28 01:23 meta.yaml\r\n" + ] + } + ], + "source": [ + "!ls -lh {fn_meta}" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "ef6063c5-7a8b-4344-bccf-a073443feebf", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "name: tcr_epitope_binding\r\n", + "description: \"T-cells are an integral part of the adaptive immune system, whose survival,\\\r\n", + " \\ proliferation, \\n activationand function are all governed by the interaction\\\r\n", + " \\ of their T-cell receptor (TCR) with immunogenic \\n peptides (epitopes).A large\\\r\n", + " \\ repertoire of T-cell receptors with different specificity is needed to provide\\\r\n", + " \\ \\n protection against a wide range of pathogens.This new task aims to predict\\\r\n", + " \\ the binding affinity \\n given a pair of TCR sequence and epitope sequence.\"\r\n", + "targets:\r\n", + "- id: binding\r\n", + " description: TCR epitope binding.\r\n", + " units: ''\r\n", + " type: binary classification\r\n", + " names:\r\n", + " - tcr binding affinity\r\n", + " - binding affinity\r\n", + " - binding\r\n", + " - epitope binding affinity\r\n", + " - epitope binding\r\n", + "identifiers:\r\n", + "- id: epitope_smiles\r\n", + " type: SMILES\r\n", + " description: epitope smiles\r\n", + "- id: epitope_aa\r\n", + " type: 
Other\r\n", + " description: epitope amino acid sequence\r\n", + "- id: tcr\r\n", + " type: Other\r\n", + " description: hypervariable CDR3 loop\r\n", + "- id: tcr_full\r\n", + " type: Other\r\n", + " description: tcr full amino acid sequence\r\n", + "license: CC BY 4.0\r\n", + "links:\r\n", + "- url: https://tdcommons.ai/multi_pred_tasks/tcrepitope/\r\n", + " description: original data set link\r\n", + "- url: https://doi.org/10.1093/bioinformatics/btab294\r\n", + " description: corresponding publication\r\n", + "benchmarks:\r\n", + "- name: TDC\r\n", + " link: https://tdcommons.ai/\r\n", + " split_column: split\r\n", + "num_points: 47182\r\n", + "bibtex:\r\n", + "- |-\r\n", + " @article{weber2021titan,\r\n", + " title={TITAN: T-cell receptor specificity prediction with bimodal attention network},\r\n", + " author={Weber Anna,Born Janis, Martinez Maria Rodriguez},\r\n", + " journal={Bioinformatics},\r\n", + " volume={56},\r\n", + " number={4},\r\n", + " pages={i237-i234},\r\n", + " year={2021},\r\n", + " publisher={Oxford Academic}\r\n", + " }\r\n" + ] + } + ], + "source": [ + "!cat {fn_meta}" + ] + }, + { + "cell_type": "markdown", + "id": "bd3f930a-638b-4bb7-a1d2-80688f2f6891", + "metadata": {}, + "source": [ + "# create transform.py" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "9aab00fd-58a8-40b0-be30-1e269e0d323b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "path_file = \"transform.py\"" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "8368bb20-8e1c-4b7d-b0e2-b39da36b5972", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting transform.py\n" + ] + } + ], + "source": [ + "%%writefile $path_file\n", + "import pandas as pd\n", + "import yaml\n", + "from tdc.multi_pred import TCREpitopeBinding \n", + "def get_and_transform_data():\n", + " # get raw data\n", + " data = TCREpitopeBinding(name = 'weber', path = './data')\n", 
+ " \n", + " split = data.get_split()\n", + " df_train=split['train']\n", + " df_valid=split['valid']\n", + " df_test=split['test']\n", + " df_train['split']=\"train\"\n", + " df_valid['split']=\"valid\"\n", + " df_test['split']=\"test\"\n", + " df=pd.concat([df_train,df_valid,df_test],axis=0)\n", + "\n", + " # NOTE: df is assembled directly from the TDC splits above; no save-and-reload round trip through a CSV is performed here\n", + "\n", + " # check if fields are the same\n", + " fields_orig = df.columns.tolist()\n", + " assert fields_orig == [\n", + " \"epitope_aa\",\n", + " \"epitope_smi\",\n", + " \"tcr\",\n", + " \"tcr_full\",\n", + " \"label\",\n", + " \"split\"\n", + " ]\n", + "\n", + " # overwrite column names = fields\n", + " fields_clean = [\n", + " \"epitope_aa\",\n", + " \"epitope_smiles\",\n", + " \"tcr\",\n", + " \"tcr_full\",\n", + " \"binding\",\n", + " \"split\"\n", + " ]\n", + " df.columns = fields_clean\n", + "\n", + " # data cleaning\n", + " df.epitope_aa = (\n", + " df.epitope_aa.str.strip()\n", + " ) # remove leading and trailing white space characters\n", + "\n", + " assert not df.duplicated().sum()\n", + "\n", + " # save to csv\n", + " fn_data_csv = \"data_clean.csv\"\n", + " df.to_csv(fn_data_csv, index=False)\n", + "\n", + " # create meta yaml\n", + " meta = {\n", + " \"name\": \"tcr_epitope_binding\", # unique identifier, we will also use this for directory names\n", + " \"description\": \"\"\"T-cells are an integral part of the adaptive immune system, whose survival, proliferation, activation\n", + " and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic peptides (epitopes).\n", + " A large repertoire of T-cell receptors with different specificity is needed to provide protection against a wide range of pathogens.\n", + " This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence.\"\"\",\n", + " \"targets\": [\n", + " {\n", + " \"id\": \"binding\", # name of the column in a tabular dataset\n", + " 
\"description\": \"TCR epitope binding.\", # description of what this column means\n", + " \"units\": \"\", # units of the values in this column (leave empty if unitless)\n", + " \"type\": \"binary classification\", # can be \"categorical\", \"ordinal\", \"continuous\"\n", + " \"names\": [ # names for the property (to sample from for building the prompts)\n", + " \"tcr binding affinity\",\n", + " \"binding affinity\",\n", + " \"binding\",\n", + " \"epitope binding affinity\",\n", + " \"epitope binding\"\n", + "\n", + " ],\n", + " },\n", + " ],\n", + " \"identifiers\": [\n", + " {\n", + " \"id\": \"epitope_smiles\", # column name\n", + " \"type\": \"SMILES\", # can be \"SMILES\", \"SELFIES\", \"IUPAC\", \"OTHER\"\n", + " \"description\": \"epitope smiles\", # description (optional, except for \"OTHER\")\n", + " },\n", + " {\n", + " \"id\": \"epitope_aa\",\n", + " \"type\": \"Other\",\n", + " \"description\": \"epitope amino acid sequence\",\n", + " \n", + " },\n", + " {\n", + " \"id\": \"tcr\",\n", + " \"type\": \"Other\",\n", + " \"description\": \"hypervariable CDR3 loop\",\n", + " \n", + " },\n", + " {\n", + " \"id\": \"tcr_full\",\n", + " \"type\": \"Other\",\n", + " \"description\": \"tcr full amino acid sequence\",\n", + " \n", + " },\n", + " ],\n", + " \"license\": \"CC BY 4.0\", # license under which the original dataset was published\n", + " \"links\": [ # list of relevant links (original dataset, other uses, etc.)\n", + " {\n", + " \"url\": \"https://tdcommons.ai/multi_pred_tasks/tcrepitope/\",\n", + " \"description\": \"original data set link\",\n", + " },\n", + " {\n", + " \"url\": \"https://doi.org/10.1093/bioinformatics/btab294\",\n", + " \"description\": \"corresponding publication\",\n", + " },\n", + " ],\n", + " \"benchmarks\": [\n", + " {\n", + " \"name\": \"TDC\",\n", + " \"link\": \"https://tdcommons.ai/\",\n", + " \"split_column\": \"split\",\n", + " },\n", + " ],\n", + " \"num_points\": len(df), # number of datapoints in this dataset\n", + " 
\"bibtex\": [\n", + " \"\"\"@article{weber2021titan,\n", + " title={TITAN: T-cell receptor specificity prediction with bimodal attention network},\n", + " author={Weber Anna,Born Janis, Martinez Maria Rodriguez},\n", + " journal={Bioinformatics},\n", + " volume={56},\n", + " number={4},\n", + " pages={i237-i234},\n", + " year={2021},\n", + " publisher={Oxford Academic}\n", + " }\"\"\",\n", + " ],\n", + " }\n", + "\n", + " def str_presenter(dumper, data):\n", + " \"\"\"configures yaml for dumping multiline strings\n", + " Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data\n", + " \"\"\"\n", + "\n", + " if data.count(\"\\n\") > 0: # check for multiline string\n", + " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data, style=\"|\")\n", + " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data)\n", + "\n", + " yaml.add_representer(str, str_presenter)\n", + " yaml.representer.SafeRepresenter.add_representer(\n", + " str, str_presenter\n", + " ) # to use with safe_dum\n", + " fn_meta = \"meta.yaml\"\n", + " with open(fn_meta, \"w\") as f:\n", + " yaml.dump(meta, f, sort_keys=False)\n", + "\n", + " print(f\"Finished processing {meta['name']} dataset!\")\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " get_and_transform_data()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "d0474f26-70f3-4655-b81a-df4ada90e7a6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found local copy...\n", + "Loading...\n", + "Done!\n", + "Finished processing tcr_epitope_binding dataset!\n" + ] + } + ], + "source": [ + "!python3 transform.py" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "953e7bee-bd5e-41d0-a2be-506e0bc97727", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 61592\r\n", + "drwxr-xr-x 3 cody staff 96B Mar 4 
10:09 \u001b[34mdata\u001b[m\u001b[m/\r\n", + "-rw-r--r-- 1 cody staff 15M Mar 4 10:10 data_clean.csv\r\n", + "-rw-r--r-- 1 cody staff 15M Mar 4 10:10 data_original.csv\r\n", + "-rw-r--r-- 1 cody staff 45K Mar 4 10:06 example_processing_and_templates.ipynb\r\n", + "-rw-r--r--@ 1 cody staff 1.5K Mar 4 10:10 meta.yaml\r\n", + "-rw-r--r--@ 1 cody staff 4.8K Mar 4 10:10 transform.py\r\n" + ] + } + ], + "source": [ + "ls -lh # fmt: skip" + ] + }, + { + "cell_type": "markdown", + "id": "0b08ed06-ba66-4f76-bde1-368ea77d1739", + "metadata": {}, + "source": [ + "# End" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/data/tcr_epitope_binding/meta.yaml b/data/tcr_epitope_binding/meta.yaml new file mode 100644 index 000000000..47b52ccd0 --- /dev/null +++ b/data/tcr_epitope_binding/meta.yaml @@ -0,0 +1,60 @@ +name: tcr_epitope_binding +description: |- + T-cells are an integral part of the adaptive immune system, whose survival, proliferation, + activation and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic + peptides (epitopes). A large repertoire of T-cell receptors with different specificity is needed to provide + protection against a wide range of pathogens. This new task aims to predict the binding affinity + given a pair of TCR sequence and epitope sequence. +targets: + - id: binding + description: TCR epitope binding.
import pandas as pd
import yaml
from tdc.multi_pred import TCREpitopeBinding

# Column layout expected after the "split" column has been appended to the
# frames returned by ``get_split()``; checked before renaming.
_FIELDS_ORIG = ["epitope_aa", "epitope_smi", "tcr", "tcr_full", "label", "split"]

# Original column name -> cleaned column name (all other columns are kept).
_FIELD_RENAMES = {"epitope_smi": "epitope_smiles", "label": "binding"}


def _str_presenter(dumper, data):
    """Represent multi-line strings in YAML block (``|``) style.

    Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
    """
    if "\n" in data:  # multi-line string
        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
    return dumper.represent_scalar("tag:yaml.org,2002:str", data)


def _build_meta(num_points):
    """Return the metadata dictionary that is dumped to ``meta.yaml``.

    Args:
        num_points: number of rows in the cleaned dataset.
    """
    # Written as explicit lines so that no source-code indentation leaks into
    # the dumped text; the missing spaces after "activation", "(epitopes)."
    # and "pathogens." are restored.
    description = (
        "T-cells are an integral part of the adaptive immune system, whose survival, proliferation,\n"
        "activation and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic\n"
        "peptides (epitopes). A large repertoire of T-cell receptors with different specificity is needed to provide\n"
        "protection against a wide range of pathogens. This new task aims to predict the binding affinity\n"
        "given a pair of TCR sequence and epitope sequence."
    )

    # NOTE(review): the page range "i237-i234" ends before it starts and the
    # volume/number look unusual -- confirm against the publication.
    bibtex = "\n".join(
        [
            "@article{weber2021titan,",
            "title={TITAN: T-cell receptor specificity prediction with bimodal attention network},",
            "author={Weber Anna,Born Janis, Martinez Maria Rodriguez},",
            "journal={Bioinformatics},",
            "volume={56},",
            "number={4},",
            "pages={i237-i234},",
            "year={2021},",
            "publisher={Oxford Academic}",
            "}",
        ]
    )

    return {
        "name": "tcr_epitope_binding",  # unique identifier, also used for directory names
        "description": description,
        "targets": [
            {
                "id": "binding",  # name of the column in a tabular dataset
                "description": "TCR epitope binding.",  # description of what this column means
                "units": "",  # units of the values in this column (leave empty if unitless)
                "type": "boolean",
                "names": [  # names for the property (to sample from for building the prompts)
                    "tcr binding affinity",
                    "binding affinity",
                    "binding",
                    "epitope binding affinity",
                    "epitope binding",
                ],
            },
        ],
        "identifiers": [
            {
                "id": "epitope_smiles",  # column name
                "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "OTHER"
                "description": "epitope smiles",  # description (optional, except for "OTHER")
            },
            {
                "id": "epitope_aa",
                "type": "Other",
                "names": ["epitope amino acid sequence"],
                "description": "epitope amino acid sequence",
            },
            {
                "id": "tcr",
                "type": "Other",
                "names": ["hypervariable CDR3 loop"],
                "description": "hypervariable CDR3 loop",
            },
            {
                "id": "tcr_full",
                "type": "Other",
                "names": ["tcr full amino acid sequence"],
                "description": "tcr full amino acid sequence",
            },
        ],
        # NOTE(review): the prompt reads "Predict the for ." and the completion
        # is empty -- placeholders appear to be missing; confirm the intended
        # template text.
        "templates": [
            {
                "prompt": "Please answer the following question.\nPredict the for .",
                "completion": "",
            }
        ],
        # The original literal wrote `"key": [ "values": [...] ]`, which is not
        # valid Python; each field is now a mapping with a "values" list.
        # NOTE(review): the cleaned table has no "exp_value" column (the target
        # column is "binding") -- confirm what "column" should reference.
        "fields": [
            {
                "exp_value": {
                    "values": [
                        {
                            "name": "exp_value",
                            "column": "exp_value",
                            "text": "binding",
                        },
                    ],
                },
                "molecule": {
                    "values": [
                        {
                            "name": "epitope_smiles",
                            "column": "epitope_smiles",
                            "text": "SMILES",
                        },
                        {
                            "name": "tcr_full",
                            "column": "tcr_full",
                            # The original left this value empty (syntax error);
                            # reuse the identifier's own name.
                            "text": "tcr full amino acid sequence",
                        },
                    ],
                },
            }
        ],
        "license": "CC BY 4.0",  # license under which the original dataset was published
        "links": [  # list of relevant links (original dataset, other uses, etc.)
            {
                "url": "https://tdcommons.ai/multi_pred_tasks/tcrepitope/",
                "description": "original data set link",
            },
            {
                "url": "https://doi.org/10.1093/bioinformatics/btab294",
                "description": "corresponding publication",
            },
        ],
        "benchmarks": [
            {
                "name": "TDC",
                "link": "https://tdcommons.ai/",
                "split_column": "split",
            },
        ],
        "num_points": num_points,  # number of datapoints in this dataset
        "bibtex": [bibtex],
    }


def get_and_transform_data():
    """Build ``data_clean.csv`` and ``meta.yaml`` for the TCR-epitope dataset.

    Loads the "weber" ``TCREpitopeBinding`` dataset (cached under ``./data``),
    concatenates its train/valid/test splits with a ``split`` column, renames
    the columns, strips whitespace from ``epitope_aa`` and writes the table and
    its metadata to the current working directory.

    Raises:
        ValueError: if the loaded columns differ from the expected layout or
            if the cleaned table contains duplicated rows.
    """
    # get raw data
    data = TCREpitopeBinding(name="weber", path="./data")
    split = data.get_split()

    # Tag every split with its name; ``assign`` returns a new frame, so the
    # frames handed out by ``get_split()`` are not modified.
    df = pd.concat(
        [split[name].assign(split=name) for name in ("train", "valid", "test")],
        axis=0,
    )

    # check if fields are the same (explicit raise: ``assert`` is removed
    # when Python runs with -O)
    fields_orig = df.columns.tolist()
    if fields_orig != _FIELDS_ORIG:
        raise ValueError(
            f"Unexpected columns {fields_orig}, expected {_FIELDS_ORIG}"
        )

    # overwrite column names = fields
    df = df.rename(columns=_FIELD_RENAMES)

    # data cleaning: remove leading and trailing white space characters
    df["epitope_aa"] = df["epitope_aa"].str.strip()

    n_duplicates = int(df.duplicated().sum())
    if n_duplicates:
        raise ValueError(f"Found {n_duplicates} duplicated rows after cleaning")

    # save to csv
    fn_data_csv = "data_clean.csv"
    df.to_csv(fn_data_csv, index=False)

    # create meta yaml
    meta = _build_meta(len(df))

    yaml.add_representer(str, _str_presenter)
    yaml.representer.SafeRepresenter.add_representer(
        str, _str_presenter
    )  # to use with safe_dump
    fn_meta = "meta.yaml"
    with open(fn_meta, "w", encoding="utf-8") as f:
        yaml.dump(meta, f, sort_keys=False)

    print(f"Finished processing {meta['name']} dataset!")


if __name__ == "__main__":
    get_and_transform_data()