From 0d43a4d3e4356436023946d0ff9e2a21f98230cb Mon Sep 17 00:00:00 2001 From: Cody Jackson Date: Sat, 4 Mar 2023 10:24:04 -0800 Subject: [PATCH 01/16] add tcr epitope binding data --- data/tcr_epitope_binding/.DS_Store | Bin 0 -> 6148 bytes .../example_processing_and_templates.ipynb | 1405 +++++++++++++++++ data/tcr_epitope_binding/meta.yaml | 44 + data/tcr_epitope_binding/transform.py | 135 ++ 4 files changed, 1584 insertions(+) create mode 100644 data/tcr_epitope_binding/.DS_Store create mode 100644 data/tcr_epitope_binding/example_processing_and_templates.ipynb create mode 100644 data/tcr_epitope_binding/meta.yaml create mode 100644 data/tcr_epitope_binding/transform.py diff --git a/data/tcr_epitope_binding/.DS_Store b/data/tcr_epitope_binding/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..2adb43894a0bae5570163f553e57de6ce1dab7fd GIT binary patch literal 6148 zcmeHKIZgvX5Ud6VMi4k8oRA2KAB-#$IOYRjj%dvyumfkCAilsCcqLRHpj|Db1kf#Y z*Bo6lyA@2&0Fe3i<_wqtn9>#TqG4=0uRgN97*P~k;}knwVU1?ljIzHUQ0@U4x7d)` zdJ7Iqju%UZRL6{|Kq@d)VBfnlt^eorAKw2%Qub1T zRNzl3Ad|(#V$NTR+B*6\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epitope_aaepitope_smitcrtcr_fulllabel
0FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSVWGTGKTYEQYFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...1
1FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSVWGEGRSYEQYFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...1
2FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSATILAGVPYGEQYFGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...1
3FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSASEGTSSYEQYFGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...1
4FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CASSFDREVTGELFFGAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL...1
\n", + "" + ], + "text/plain": [ + " epitope_aa epitope_smi \\\n", + "0 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "1 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "2 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "3 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "4 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "\n", + " tcr tcr_full label \n", + "0 CSVWGTGKTYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... 1 \n", + "1 CSVWGEGRSYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... 1 \n", + "2 CSATILAGVPYGEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... 1 \n", + "3 CSASEGTSSYEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... 1 \n", + "4 CASSFDREVTGELFF GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL... 1 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "95158ac1-05d7-4a21-b8e4-7f720306d331", + "metadata": {}, + "source": [ + "## Add column = field names\n", + "Clean column names (`fields_clean`) and keep original names (`fields_orig`)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ec2458e5-455f-4f03-8ce9-c0d12e9ed371", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['epitope_aa', 'epitope_smi', 'tcr', 'tcr_full', 'label']" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fields_orig = df.columns.tolist()\n", + "fields_orig" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a46dd8ff-37b3-4894-8226-3bf98226dd09", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + " fields_clean = [\n", + " \"epitope_aa\",\n", + " \"epitope_smiles\",\n", + " \"tcr\",\n", + " \"tcr_full\",\n", + " \"binding\",\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "785d37cb-1fb4-4a91-a923-d5a78a37f36a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df.columns = fields_clean" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1bf212cb-1653-457b-9f5d-416d4dd14b53", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epitope_aaepitope_smilestcrtcr_fullbinding
0FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSVWGTGKTYEQYFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...1
1FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSVWGEGRSYEQYFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...1
2FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSATILAGVPYGEQYFGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...1
3FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSASEGTSSYEQYFGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...1
4FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CASSFDREVTGELFFGAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL...1
\n", + "
" + ], + "text/plain": [ + " epitope_aa epitope_smiles \\\n", + "0 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "1 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "2 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "3 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "4 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "\n", + " tcr tcr_full \\\n", + "0 CSVWGTGKTYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n", + "1 CSVWGEGRSYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n", + "2 CSATILAGVPYGEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n", + "3 CSASEGTSSYEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n", + "4 CASSFDREVTGELFF GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL... \n", + "\n", + " binding \n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "1bfaeb22-26fb-4964-a71f-cae8335e5372", + "metadata": {}, + "source": [ + "## Data cleaning" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "7e746003-cb1f-434f-bba6-00f0c439c4ac", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df.epitope_aa = (\n", + " df.epitope_aa.str.strip()\n", + ") # remove leading and trailing white space characters" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "d544fa60-343e-40e1-bd0c-4750f07a7145", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "assert not df.duplicated().sum()" + ] + }, + { + "cell_type": "markdown", + "id": "bc6f52c1-e0f6-48b3-95f4-e36d9a5ecde8", + "metadata": {}, + "source": [ + "## Save to csv" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d6d5efa5-b4b4-4a25-8626-e10f3d691e83", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fn_data_csv = \"data_clean.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "727f8d7b-cbb6-43c7-9eab-9d4d65be6b3f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df.to_csv(fn_data_csv, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "63c8d4a4-906e-418d-be39-879365b4dfa0", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-rw-r--r-- 1 cody staff 15M Mar 4 10:10 data_clean.csv\r\n" + ] + } + ], + "source": [ + "!ls -lh {fn_data_csv}" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "a51b9001-25d7-4e0e-a607-477cfc4a9f1c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epitope_aa,epitope_smiles,tcr,tcr_full,binding\r\n", + "FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSVWGTGKTYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLIATANQGSEATYESGFVIDKFPISRPNLTFSTLTVSNMSPEDSSIYLCSVWGTGKTYEQYFGPGTRLTVT,1\r\n", + "FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSVWGEGRSYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLIATANQGSEATYESGFVIDKFPISRPNLTFSTLTVSNMSPEDSSIYLCSVWGEGRSYEQYFGPGTRLTVT,1\r\n", + "FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSATILAGVPYGEQYF,GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSATILAGVPYGEQYFGPGTRLTVT,1\r\n", + "FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSASEGTSSYEQYF,GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSASEGTSSYEQYFGPGTRLTVT,1\r\n" + ] + } + ], + "source": [ + "!head -n 5 {fn_data_csv}" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "1a512943-4909-4d56-867d-50c151d8d607", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epitope_aaepitope_smilestcrtcr_fullbinding
0FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSVWGTGKTYEQYFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...1
1FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSVWGEGRSYEQYFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...1
2FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSATILAGVPYGEQYFGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...1
3FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSASEGTSSYEQYFGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...1
4FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CASSFDREVTGELFFGAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL...1
\n", + "
" + ], + "text/plain": [ + " epitope_aa epitope_smiles \\\n", + "0 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "1 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "2 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "3 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "4 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "\n", + " tcr tcr_full \\\n", + "0 CSVWGTGKTYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n", + "1 CSVWGEGRSYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n", + "2 CSATILAGVPYGEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n", + "3 CSASEGTSSYEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n", + "4 CASSFDREVTGELFF GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL... \n", + "\n", + " binding \n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "f3d730ce-fce0-49df-9eb8-b917e945fa9a", + "metadata": {}, + "source": [ + "## Load from csv" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "077b0c5f-8772-4879-9317-3fa28799689b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fn_data_csv = \"data_clean.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "6eaef0e6-2115-4793-ac43-a196b25d47a0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = pd.read_csv(fn_data_csv)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "43619e7c-9c82-4ff0-ae25-403861304635", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epitope_aaepitope_smilestcrtcr_fullbinding
0FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSVWGTGKTYEQYFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...1
1FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSVWGEGRSYEQYFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...1
2FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSATILAGVPYGEQYFGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...1
3FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSASEGTSSYEQYFGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...1
4FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CASSFDREVTGELFFGAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL...1
\n", + "
" + ], + "text/plain": [ + " epitope_aa epitope_smiles \\\n", + "0 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "1 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "2 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "3 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "4 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "\n", + " tcr tcr_full \\\n", + "0 CSVWGTGKTYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n", + "1 CSVWGEGRSYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n", + "2 CSATILAGVPYGEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n", + "3 CSASEGTSSYEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n", + "4 CASSFDREVTGELFF GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL... \n", + "\n", + " binding \n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 " + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "2f06e57c-02c5-493b-af65-c8bb9ac59421", + "metadata": {}, + "source": [ + "# meta YAML" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "49771077-471d-4d71-a9a7-d6b094bbc4f3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epitope_aaepitope_smilestcrtcr_fullbinding
0FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSVWGTGKTYEQYFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...1
1FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSVWGEGRSYEQYFSAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI...1
2FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSATILAGVPYGEQYFGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...1
3FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CSASEGTSSYEQYFGAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML...1
4FLKEKGGLCC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC...CASSFDREVTGELFFGAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL...1
\n", + "
" + ], + "text/plain": [ + " epitope_aa epitope_smiles \\\n", + "0 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "1 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "2 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "3 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "4 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n", + "\n", + " tcr tcr_full \\\n", + "0 CSVWGTGKTYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n", + "1 CSVWGEGRSYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n", + "2 CSATILAGVPYGEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n", + "3 CSASEGTSSYEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n", + "4 CASSFDREVTGELFF GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL... \n", + "\n", + " binding \n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "d3890961-444e-4a26-b8fc-ed8c4e959af9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "meta = {\n", + " \"name\": \"tcr_epitope_binding\", # unique identifier, we will also use this for directory names\n", + " \"description\": \"\"\"T-cells are an integral part of the adaptive immune system, whose survival, proliferation, activation\n", + " and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic peptides (epitopes).\n", + " A large repertoire of T-cell receptors with different specificity is needed to provide protection against a wide range of pathogens.\n", + " This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence.\"\"\",\n", + " \"targets\": [\n", + " {\n", + " \"id\": \"binding\", # name of the column in a tabular dataset\n", + " \"description\": \"TCR epitope binding.\", # description of what this column means\n", + " \"units\": \"\", # units of the values in this column (leave empty if unitless)\n", + " \"type\": \"binary classification\", # can be \"categorical\", \"ordinal\", \"continuous\"\n", + " \"names\": [ # names for the property (to sample from for building the prompts)\n", + " \"tcr binding affinity\",\n", + " \"binding affinity\",\n", + " \"binding\",\n", + "\n", + " ],\n", + " },\n", + " ],\n", + " \"identifiers\": [\n", + " {\n", + " \"id\": \"epitope_smiles\", # column name\n", + " \"type\": \"SMILES\", # can be \"SMILES\", \"SELFIES\", \"IUPAC\", \"OTHER\"\n", + " \"description\": \"epitope smiles \", # description (optional, except for \"OTHER\")\n", + " },\n", + " {\n", + " \"id\": \"epitope_aa\",\n", + " \"type\": \"amino acid\",\n", + " \"description\": \"epitope amino acid sequence\",\n", + "\n", + " },\n", + " {\n", + " \"id\": \"tcr_aa\",\n", + " \"type\": \"amino acid\",\n", + " \"description\": \"tcr amino acid sequence\",\n", + "\n", + " },\n", + " ],\n", + " \"license\": \"CC BY 4.0\", # license under which the original dataset was published\n", + " \"links\": [ # list of relevant links (original dataset, other uses, etc.)\n", + " {\n", + " \"url\": \"https://tdcommons.ai/multi_pred_tasks/tcrepitope/\",\n", + " \"description\": \"original data set link\",\n", + " },\n", + " {\n", + " \"url\": \"https://doi.org/10.1093/bioinformatics/btab294\",\n", + " \"description\": \"corresponding publication\",\n", + " },\n", + " ],\n", + " \"num_points\": len(df), # number of datapoints in this dataset\n", + " \"bibtex\": [\n", + " \"\"\"@article{weber2021titan,\n", + " title={TITAN: T-cell receptor specificity prediction with bimodal attention network},\n", + " author={Weber Anna,Born Janis, Martinez Maria Rodriguez},\n", + " journal={Bioinformatics},\n", + " volume={56},\n", + " number={4},\n", + " pages={i237-i234},\n", + " year={2021},\n", + " publisher={Oxford Academic}\n", + " }\"\"\",\n", + " ],\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "ec455cf0-962a-4c0d-bb3e-066e415ffd9b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def str_presenter(dumper, data):\n", + " \"\"\"configures yaml for dumping multiline strings\n", + " Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data\n", + " \"\"\"\n", + " if data.count(\"\\n\") > 0: # check for multiline string\n", + " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data, style=\"|\")\n", + " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data)\n", + "\n", + "\n", + "yaml.add_representer(str, str_presenter)\n", + "yaml.representer.SafeRepresenter.add_representer(\n", + " str, str_presenter\n", + ") # to use with safe_dum" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "580bbd79-4845-4515-be94-3e4a9815d048", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fn_meta = \"meta.yaml\"" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "873fa5dd-9b60-40f5-b537-4d7a206414ea", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "with open(fn_meta, \"w\") as f:\n", + " yaml.dump(meta, f, sort_keys=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "d01686c0-6746-4fc4-b019-350270dfc26f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-rw-r--r--@ 1 cody staff 1.5K Mar 4 10:10 meta.yaml\r\n" + ] + } + ], + "source": [ + "!ls -lh {fn_meta}" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "ef6063c5-7a8b-4344-bccf-a073443feebf", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "name: tcr_epitope_binding\r\n", + "description: |-\r\n", + " T-cells are an integral part of the adaptive immune system, whose survival, proliferation, activation\r\n", + " and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic peptides (epitopes).\r\n", + " A large repertoire of T-cell receptors with different specificity is needed to provide protection against a wide range of pathogens.\r\n", + " This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence.\r\n", + "targets:\r\n", + "- id: binding\r\n", + " description: TCR epitope binding.\r\n", + " units: ''\r\n", + " type: binary classification\r\n", + " names:\r\n", + " - tcr binding affinity\r\n", + " - binding affinity\r\n", + " - binding\r\n", + "identifiers:\r\n", + "- id: epitope_smiles\r\n", + " type: SMILES\r\n", + " description: 'epitope smiles '\r\n", + "- id: epitope_aa\r\n", + " type: amino acid\r\n", + " description: epitope amino acid sequence\r\n", + "- id: tcr_aa\r\n", + " type: amino acid\r\n", + " description: tcr amino acid sequence\r\n", + "license: CC BY 4.0\r\n", + "links:\r\n", + "- url: https://tdcommons.ai/multi_pred_tasks/tcrepitope/\r\n", + " description: original data set link\r\n", + "- url: https://doi.org/10.1093/bioinformatics/btab294\r\n", + " description: corresponding publication\r\n", + "num_points: 47182\r\n", + "bibtex:\r\n", + "- |-\r\n", + " @article{weber2021titan,\r\n", + " title={TITAN: T-cell receptor specificity prediction with bimodal attention network},\r\n", + " author={Weber Anna,Born Janis, Martinez Maria Rodriguez},\r\n", + " journal={Bioinformatics},\r\n", + " volume={56},\r\n", + " number={4},\r\n", + " pages={i237-i234},\r\n", + " year={2021},\r\n", + " publisher={Oxford Academic}\r\n", + " }\r\n" + ] + } + ], + "source": [ + "!cat {fn_meta}" + ] + }, + { + "cell_type": "markdown", + "id": "bd3f930a-638b-4bb7-a1d2-80688f2f6891", + "metadata": {}, + "source": [ + "# create transform.py" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "9aab00fd-58a8-40b0-be30-1e269e0d323b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "path_file = \"transform.py\"" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "8368bb20-8e1c-4b7d-b0e2-b39da36b5972", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting transform.py\n" + ] + } + ], + "source": [ + "%%writefile $path_file\n", + "import pandas as pd\n", + "import yaml\n", + "from tdc.multi_pred import TCREpitopeBinding \n", + "\n", + "def get_and_transform_data():\n", + " # get raw data\n", + " data = TCREpitopeBinding(name = 'weber', path = './data')\n", + " fn_data_original = \"data_original.csv\"\n", + " data.get_data().to_csv(fn_data_original, index=False)\n", + "\n", + " # create dataframe\n", + " df = pd.read_csv(\n", + " fn_data_original,\n", + " delimiter=\",\",\n", + " ) # not necessary but ensure we can load the saved data\n", + "\n", + " # check if fields are the same\n", + " fields_orig = df.columns.tolist()\n", + " assert fields_orig == [\n", + " \"epitope_aa\",\n", + " \"epitope_smi\",\n", + " \"tcr\",\n", + " \"tcr_full\",\n", + " \"label\",\n", + " ]\n", + "\n", + " # overwrite column names = fields\n", + " fields_clean = [\n", + " \"epitope_aa\",\n", + " \"epitope_smiles\",\n", + " \"tcr\",\n", + " \"tcr_full_aa\",\n", + " \"binding\",\n", + " ]\n", + " df.columns = fields_clean\n", + "\n", + " # data cleaning\n", + " df.epitope_aa = (\n", + " df.epitope_aa.str.strip()\n", + " ) # remove leading and trailing white space characters\n", + "\n", + " assert not df.duplicated().sum()\n", + "\n", + " # save to csv\n", + " fn_data_csv = \"data_clean.csv\"\n", + " df.to_csv(fn_data_csv, index=False)\n", + "\n", + " # create meta yaml\n", + " meta = {\n", + " \"name\": \"tcr_epitope_binding\", # unique identifier, we will also use this for directory names\n", + " \"description\": \"\"\"T-cells are an integral part of the adaptive immune system, whose survival, proliferation, activation\n", + " and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic peptides (epitopes).\n", + " A large repertoire of T-cell receptors with different specificity is needed to provide protection against a wide range of pathogens.\n", + " This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence.\"\"\",\n", + " \"targets\": [\n", + " {\n", + " \"id\": \"binding\", # name of the column in a tabular dataset\n", + " \"description\": \"TCR epitope binding.\", # description of what this column means\n", + " \"units\": \"\", # units of the values in this column (leave empty if unitless)\n", + " \"type\": \"binary classification\", # can be \"categorical\", \"ordinal\", \"continuous\"\n", + " \"names\": [ # names for the property (to sample from for building the prompts)\n", + " \"tcr binding affinity\",\n", + " \"binding affinity\",\n", + " \"binding\",\n", + "\n", + " ],\n", + " },\n", + " ],\n", + " \"identifiers\": [\n", + " {\n", + " \"id\": \"epitope_smiles\", # column name\n", + " \"type\": \"SMILES\", # can be \"SMILES\", \"SELFIES\", \"IUPAC\", \"OTHER\"\n", + " \"description\": \"epitope smiles\", # description (optional, except for \"OTHER\")\n", + " },\n", + " {\n", + " \"id\": \"epitope_aa\",\n", + " \"type\": \"Other\",\n", + " \"description\": \"epitope amino acid sequence\",\n", + " \n", + " },\n", + " {\n", + " \"id\": \"tcr_full_aa\",\n", + " \"type\": \"Other\",\n", + " \"description\": \"tcr amino acid sequence\",\n", + " \n", + " },\n", + " ],\n", + " \"license\": \"CC BY 4.0\", # license under which the original dataset was published\n", + " \"links\": [ # list of relevant links (original dataset, other uses, etc.)\n", + " {\n", + " \"url\": \"https://tdcommons.ai/multi_pred_tasks/tcrepitope/\",\n", + " \"description\": \"original data set link\",\n", + " },\n", + " {\n", + " \"url\": \"https://doi.org/10.1093/bioinformatics/btab294\",\n", + " \"description\": \"corresponding publication\",\n", + " },\n", + " ],\n", + " \"num_points\": len(df), # number of datapoints in this dataset\n", + " \"bibtex\": [\n", + " \"\"\"@article{weber2021titan,\n", + " title={TITAN: T-cell receptor specificity prediction with bimodal attention network},\n", + " author={Weber Anna,Born Janis, Martinez Maria Rodriguez},\n", + " journal={Bioinformatics},\n", + " volume={56},\n", + " number={4},\n", + " pages={i237-i234},\n", + " year={2021},\n", + " publisher={Oxford Academic}\n", + " }\"\"\",\n", + " ],\n", + " }\n", + "\n", + " def str_presenter(dumper, data):\n", + " \"\"\"configures yaml for dumping multiline strings\n", + " Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data\n", + " \"\"\"\n", + "\n", + " if data.count(\"\\n\") > 0: # check for multiline string\n", + " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data, style=\"|\")\n", + " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data)\n", + "\n", + " yaml.add_representer(str, str_presenter)\n", + " yaml.representer.SafeRepresenter.add_representer(\n", + " str, str_presenter\n", + " ) # to use with safe_dum\n", + " fn_meta = \"meta.yaml\"\n", + " with open(fn_meta, \"w\") as f:\n", + " yaml.dump(meta, f, sort_keys=False)\n", + "\n", + " print(f\"Finished processing {meta['name']} dataset!\")\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " get_and_transform_data()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "d0474f26-70f3-4655-b81a-df4ada90e7a6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found local copy...\n", + "Loading...\n", + "Done!\n", + "Finished processing tcr_epitope_binding dataset!\n" + ] + } + ], + "source": [ + "!python3 transform.py" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "953e7bee-bd5e-41d0-a2be-506e0bc97727", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 61592\r\n", + "drwxr-xr-x 3 cody staff 96B Mar 4 10:09 \u001b[34mdata\u001b[m\u001b[m/\r\n", + "-rw-r--r-- 1 cody staff 15M Mar 4 10:10 data_clean.csv\r\n", + "-rw-r--r-- 1 cody staff 15M Mar 4 10:10 data_original.csv\r\n", + "-rw-r--r-- 1 cody staff 45K Mar 4 10:06 example_processing_and_templates.ipynb\r\n", + "-rw-r--r--@ 1 cody staff 1.5K Mar 4 10:10 meta.yaml\r\n", + "-rw-r--r--@ 1 cody staff 4.8K Mar 4 10:10 transform.py\r\n" + ] + } + ], + "source": [ + "ls -lh # fmt: skip" + ] + }, + { + "cell_type": "markdown", + "id": "0b08ed06-ba66-4f76-bde1-368ea77d1739", + "metadata": {}, + "source": [ + "# End" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/data/tcr_epitope_binding/meta.yaml b/data/tcr_epitope_binding/meta.yaml new file mode 100644 index 000000000..52d4bc8c9 --- /dev/null +++ b/data/tcr_epitope_binding/meta.yaml @@ -0,0 +1,44 @@ +name: tcr_epitope_binding +description: |- + T-cells are an integral part of the adaptive immune system, whose survival, proliferation, activation + and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic peptides (epitopes). + A large repertoire of T-cell receptors with different specificity is needed to provide protection against a wide range of pathogens. + This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence. +targets: +- id: binding + description: TCR epitope binding. + units: '' + type: binary classification + names: + - tcr binding affinity + - binding affinity + - binding +identifiers: +- id: epitope_smiles + type: SMILES + description: 'epitope smiles ' +- id: epitope_aa + type: amino acid + description: epitope amino acid sequence +- id: tcr_aa + type: amino acid + description: tcr amino acid sequence +license: CC BY 4.0 +links: +- url: https://tdcommons.ai/multi_pred_tasks/tcrepitope/ + description: original data set link +- url: https://doi.org/10.1093/bioinformatics/btab294 + description: corresponding publication +num_points: 47182 +bibtex: +- |- + @article{weber2021titan, + title={TITAN: T-cell receptor specificity prediction with bimodal attention network}, + author={Weber Anna,Born Janis, Martinez Maria Rodriguez}, + journal={Bioinformatics}, + volume={56}, + number={4}, + pages={i237-i234}, + year={2021}, + publisher={Oxford Academic} + } diff --git a/data/tcr_epitope_binding/transform.py b/data/tcr_epitope_binding/transform.py new file mode 100644 index 000000000..bf5f70eb5 --- /dev/null +++ b/data/tcr_epitope_binding/transform.py @@ -0,0 +1,135 @@ +import pandas as pd +import yaml +from tdc.multi_pred import TCREpitopeBinding + +def get_and_transform_data(): + # get raw data + data = TCREpitopeBinding(name = 'weber', path = './data') + fn_data_original = "data_original.csv" + data.get_data().to_csv(fn_data_original, index=False) + + # create dataframe + df = pd.read_csv( + fn_data_original, + delimiter=",", + ) # not necessary but ensure we can load the saved data + + # check if fields are the same + fields_orig = df.columns.tolist() + assert fields_orig == [ + "epitope_aa", + "epitope_smi", + "tcr", + "tcr_aa", + "label", + ] + + # overwrite column names = fields + fields_clean = [ + "epitope_aa", + "epitope_smiles", + "tcr", + "tcr_aa", + "binding", + ] + df.columns = fields_clean + + # data cleaning + df.epitope_aa = ( + df.epitope_aa.str.strip() + ) # remove leading and trailing white space characters + + assert not df.duplicated().sum() + + # save to csv + fn_data_csv = "data_clean.csv" + df.to_csv(fn_data_csv, index=False) + + # create meta yaml + meta = { + "name": "tcr_epitope_binding", # unique identifier, we will also use this for directory names + "description": """T-cells are an integral part of the adaptive immune system, whose survival, proliferation, activation + and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic peptides (epitopes). + A large repertoire of T-cell receptors with different specificity is needed to provide protection against a wide range of pathogens. + This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence.""", + "targets": [ + { + "id": "binding", # name of the column in a tabular dataset + "description": "TCR epitope binding.", # description of what this column means + "units": "", # units of the values in this column (leave empty if unitless) + "type": "binary classification", # can be "categorical", "ordinal", "continuous" + "names": [ # names for the property (to sample from for building the prompts) + "tcr binding affinity", + "binding affinity", + "binding", + + ], + }, + ], + "identifiers": [ + { + "id": "epitope_smiles", # column name + "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "OTHER" + "description": "epitope smiles", # description (optional, except for "OTHER") + }, + { + "id": "epitope_aa", + "type": "Other", + "description": "epitope amino acid sequence", + + }, + { + "id": "tcr_full_aa", + "type": "Other", + "description": "tcr amino acid sequence", + + }, + ], + "license": "CC BY 4.0", # license under which the original dataset was published + "links": [ # list of relevant links (original dataset, other uses, etc.) + { + "url": "https://tdcommons.ai/multi_pred_tasks/tcrepitope/", + "description": "original data set link", + }, + { + "url": "https://doi.org/10.1093/bioinformatics/btab294", + "description": "corresponding publication", + }, + ], + "num_points": len(df), # number of datapoints in this dataset + "bibtex": [ + """@article{weber2021titan, + title={TITAN: T-cell receptor specificity prediction with bimodal attention network}, + author={Weber Anna,Born Janis, Martinez Maria Rodriguez}, + journal={Bioinformatics}, + volume={56}, + number={4}, + pages={i237-i234}, + year={2021}, + publisher={Oxford Academic} + }""", + ], + } + + def str_presenter(dumper, data): + """configures yaml for dumping multiline strings + Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + + if data.count("\n") > 0: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, str_presenter) + yaml.representer.SafeRepresenter.add_representer( + str, str_presenter + ) # to use with safe_dum + fn_meta = "meta.yaml" + with open(fn_meta, "w") as f: + yaml.dump(meta, f, sort_keys=False) + + print(f"Finished processing {meta['name']} dataset!") + + +if __name__ == "__main__": + get_and_transform_data() From 457f3b740938cb103f1a19742a696b1cf4ad7c07 Mon Sep 17 00:00:00 2001 From: Cody Jackson Date: Sat, 4 Mar 2023 10:24:35 -0800 Subject: [PATCH 02/16] add tcr epitope binding data --- data/tcr_epitope_binding/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 data/tcr_epitope_binding/.DS_Store diff --git a/data/tcr_epitope_binding/.DS_Store b/data/tcr_epitope_binding/.DS_Store deleted file mode 100644 index 2adb43894a0bae5570163f553e57de6ce1dab7fd..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKIZgvX5Ud6VMi4k8oRA2KAB-#$IOYRjj%dvyumfkCAilsCcqLRHpj|Db1kf#Y z*Bo6lyA@2&0Fe3i<_wqtn9>#TqG4=0uRgN97*P~k;}knwVU1?ljIzHUQ0@U4x7d)` zdJ7Iqju%UZRL6{|Kq@d)VBfnlt^eorAKw2%Qub1T zRNzl3Ad|(#V$NTR+B*6 Date: Sat, 4 Mar 2023 11:16:26 -0800 Subject: [PATCH 03/16] changed identifiers and description of tcr --- .../example_processing_and_templates.ipynb | 42 ++++++++++++------- data/tcr_epitope_binding/transform.py | 12 ++++-- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/data/tcr_epitope_binding/example_processing_and_templates.ipynb b/data/tcr_epitope_binding/example_processing_and_templates.ipynb index bad9efd76..da47142c0 100644 --- a/data/tcr_epitope_binding/example_processing_and_templates.ipynb +++ b/data/tcr_epitope_binding/example_processing_and_templates.ipynb @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "id": "cf59e3e9-8061-4022-9eae-e978311b4155", "metadata": { "tags": [] @@ -56,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "id": "7bb8eb5e-f513-40d2-a68c-7cda1a51ad31", "metadata": { "tags": [] @@ -68,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "id": "b39a142e-ccbc-49d2-98b0-a5f9bde9fd27", "metadata": { "tags": [] @@ -79,7 +79,7 @@ "output_type": "stream", "text": [ "Downloading...\n", - "100%|████████████████████████████████████████████████| 16.0M/16.0M [00:02<00:00, 5.98MiB/s]\n", + "100%|███████████████████████████████████████████████████████████████████████████████████████████████| 16.0M/16.0M [00:02<00:00, 5.75MiB/s]\n", "Loading...\n", "Done!\n" ] @@ -91,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "id": "26d9f62a-07f5-4113-8161-d5dfcf0bfb71", "metadata": { "tags": [] @@ -103,7 +103,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "43873fc3-20a8-487d-a7c5-33bd58414159", "metadata": { "tags": [] @@ -114,11 +114,11 @@ "output_type": "stream", "text": [ "total 30672\r\n", - "drwxr-xr-x 3 cody staff 96B Mar 4 10:09 \u001b[34mdata\u001b[m\u001b[m\r\n", - "-rw-r--r-- 1 cody staff 15M Mar 4 10:10 data_original.csv\r\n", - "-rw-r--r-- 1 cody staff 45K Mar 4 10:06 example_processing_and_templates.ipynb\r\n", - "-rw-r--r--@ 1 cody staff 1.5K Mar 4 10:05 meta.yaml\r\n", - "-rw-r--r--@ 1 cody staff 4.8K Mar 4 10:05 transform.py\r\n" + "drwxr-xr-x 3 cody staff 96B Mar 4 11:11 \u001b[34mdata\u001b[m\u001b[m\r\n", + "-rw-r--r-- 1 cody staff 15M Mar 4 11:11 data_original.csv\r\n", + "-rw-r--r-- 1 cody staff 46K Mar 4 10:13 example_processing_and_templates.ipynb\r\n", + "-rw-r--r--@ 1 cody staff 1.5K Mar 4 10:10 meta.yaml\r\n", + "-rw-r--r--@ 1 cody staff 4.8K Mar 4 11:10 transform.py\r\n" ] } ], @@ -136,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "id": "77f614e7-b133-40bc-8759-2d930e4c120e", "metadata": { "tags": [] @@ -943,12 +943,24 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 1, "id": "d3890961-444e-4a26-b8fc-ed8c4e959af9", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'df' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 51\u001b[0m\n\u001b[1;32m 1\u001b[0m meta \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 2\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtcr_epitope_binding\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;66;03m# unique identifier, we will also use this for directory names\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\"\"\u001b[39m\u001b[38;5;124mT-cells are an integral part of the adaptive immune system, whose survival, proliferation, activation\u001b[39m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;124m and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic peptides (epitopes).\u001b[39m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;124m A large repertoire of T-cell receptors with different specificity is needed to provide protection against a wide range of pathogens.\u001b[39m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;124m This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence.\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m,\n\u001b[1;32m 7\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtargets\u001b[39m\u001b[38;5;124m\"\u001b[39m: [\n\u001b[1;32m 8\u001b[0m {\n\u001b[1;32m 9\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbinding\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;66;03m# name of the column in a tabular dataset\u001b[39;00m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTCR epitope binding.\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;66;03m# description of what this column means\u001b[39;00m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124munits\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;66;03m# units of the values in this column (leave empty if unitless)\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtype\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbinary classification\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;66;03m# can be \"categorical\", \"ordinal\", \"continuous\"\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m: [ \u001b[38;5;66;03m# names for the property (to sample from for building the prompts)\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtcr binding affinity\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 15\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbinding affinity\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 16\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbinding\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 17\u001b[0m \n\u001b[1;32m 18\u001b[0m ],\n\u001b[1;32m 19\u001b[0m },\n\u001b[1;32m 20\u001b[0m ],\n\u001b[1;32m 21\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124midentifiers\u001b[39m\u001b[38;5;124m\"\u001b[39m: [\n\u001b[1;32m 22\u001b[0m {\n\u001b[1;32m 23\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mepitope_smiles\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;66;03m# column name\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtype\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSMILES\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;66;03m# can be \"SMILES\", \"SELFIES\", \"IUPAC\", \"OTHER\"\u001b[39;00m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mepitope smiles \u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;66;03m# description (optional, except for \"OTHER\")\u001b[39;00m\n\u001b[1;32m 26\u001b[0m },\n\u001b[1;32m 27\u001b[0m {\n\u001b[1;32m 28\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mepitope_aa\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 29\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtype\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mamino acid\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 30\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mepitope amino acid sequence\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 31\u001b[0m \n\u001b[1;32m 32\u001b[0m },\n\u001b[1;32m 33\u001b[0m {\n\u001b[1;32m 34\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtcr_full\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 35\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtype\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mamino acid\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 36\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtcr amino acid sequence\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 37\u001b[0m \n\u001b[1;32m 38\u001b[0m },\n\u001b[1;32m 39\u001b[0m ],\n\u001b[1;32m 40\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlicense\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCC BY 4.0\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;66;03m# license under which the original dataset was published\u001b[39;00m\n\u001b[1;32m 41\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlinks\u001b[39m\u001b[38;5;124m\"\u001b[39m: [ \u001b[38;5;66;03m# list of relevant links (original dataset, other uses, etc.)\u001b[39;00m\n\u001b[1;32m 42\u001b[0m {\n\u001b[1;32m 43\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124murl\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhttps://tdcommons.ai/multi_pred_tasks/tcrepitope/\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 44\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moriginal data set link\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 45\u001b[0m },\n\u001b[1;32m 46\u001b[0m {\n\u001b[1;32m 47\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124murl\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhttps://doi.org/10.1093/bioinformatics/btab294\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 48\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcorresponding publication\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 49\u001b[0m },\n\u001b[1;32m 50\u001b[0m ],\n\u001b[0;32m---> 51\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_points\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;28mlen\u001b[39m(\u001b[43mdf\u001b[49m), \u001b[38;5;66;03m# number of datapoints in this dataset\u001b[39;00m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbibtex\u001b[39m\u001b[38;5;124m\"\u001b[39m: [\n\u001b[1;32m 53\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"@article{weber2021titan,\u001b[39;00m\n\u001b[1;32m 54\u001b[0m \u001b[38;5;124;03m title={TITAN: T-cell receptor specificity prediction with bimodal attention network},\u001b[39;00m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;124;03m author={Weber Anna,Born Janis, Martinez Maria Rodriguez},\u001b[39;00m\n\u001b[1;32m 56\u001b[0m \u001b[38;5;124;03m journal={Bioinformatics},\u001b[39;00m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;124;03m volume={56},\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;124;03m number={4},\u001b[39;00m\n\u001b[1;32m 59\u001b[0m \u001b[38;5;124;03m pages={i237-i234},\u001b[39;00m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;124;03m year={2021},\u001b[39;00m\n\u001b[1;32m 61\u001b[0m \u001b[38;5;124;03m publisher={Oxford Academic}\u001b[39;00m\n\u001b[1;32m 62\u001b[0m \u001b[38;5;124;03m }\"\"\"\u001b[39;00m,\n\u001b[1;32m 63\u001b[0m ],\n\u001b[1;32m 64\u001b[0m }\n", + "\u001b[0;31mNameError\u001b[0m: name 'df' is not defined" + ] + } + ], "source": [ "meta = {\n", " \"name\": \"tcr_epitope_binding\", # unique identifier, we will also use this for directory names\n", @@ -983,7 +995,7 @@ "\n", " },\n", " {\n", - " \"id\": \"tcr_aa\",\n", + " \"id\": \"tcr_full\",\n", " \"type\": \"amino acid\",\n", " \"description\": \"tcr amino acid sequence\",\n", "\n", diff --git a/data/tcr_epitope_binding/transform.py b/data/tcr_epitope_binding/transform.py index bf5f70eb5..5b1be5e26 100644 --- a/data/tcr_epitope_binding/transform.py +++ b/data/tcr_epitope_binding/transform.py @@ -29,7 +29,7 @@ def get_and_transform_data(): "epitope_aa", "epitope_smiles", "tcr", - "tcr_aa", + "tcr_full", "binding", ] df.columns = fields_clean @@ -79,9 +79,15 @@ def get_and_transform_data(): }, { - "id": "tcr_full_aa", + "id": "tcr", + "type": "Other", + "description": "hypervariable CDR3 loop", + + }, + { + "id": "tcr_full", "type": "Other", - "description": "tcr amino acid sequence", + "description": "tcr full amino acid sequence", }, ], From 853aff3109bc2d37d061c6bd16004ea96fa5febc Mon Sep 17 00:00:00 2001 From: Cody Jackson Date: Fri, 10 Mar 2023 09:41:50 -0800 Subject: [PATCH 04/16] add more synonyms to meta.yaml to include the binding site --- data/tcr_epitope_binding/transform.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/data/tcr_epitope_binding/transform.py b/data/tcr_epitope_binding/transform.py index 5b1be5e26..14d53a734 100644 --- a/data/tcr_epitope_binding/transform.py +++ b/data/tcr_epitope_binding/transform.py @@ -62,6 +62,8 @@ def get_and_transform_data(): "tcr binding affinity", "binding affinity", "binding", + "epitope binding affinity", + "epitope binding" ], }, From 47df0aa09a68d66f9f8f929fac65a232b309b692 Mon Sep 17 00:00:00 2001 From: Cody Jackson Date: Sat, 18 Mar 2023 19:48:26 -0700 Subject: [PATCH 05/16] add splits --- data/tcr_epitope_binding/.DS_Store | Bin 0 -> 6148 bytes .../example_processing_and_templates.ipynb | 99 +++++++++--------- data/tcr_epitope_binding/meta.yaml | 27 ++--- data/tcr_epitope_binding/transform.py | 23 ++-- 4 files changed, 77 insertions(+), 72 deletions(-) create mode 100644 data/tcr_epitope_binding/.DS_Store diff --git a/data/tcr_epitope_binding/.DS_Store b/data/tcr_epitope_binding/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 51\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_points\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;28mlen\u001b[39m(\u001b[43mdf\u001b[49m), \u001b[38;5;66;03m# number of datapoints in this dataset\u001b[39;00m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbibtex\u001b[39m\u001b[38;5;124m\"\u001b[39m: [\n\u001b[1;32m 53\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"@article{weber2021titan,\u001b[39;00m\n\u001b[1;32m 54\u001b[0m \u001b[38;5;124;03m title={TITAN: T-cell receptor specificity prediction with bimodal attention network},\u001b[39;00m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;124;03m author={Weber Anna,Born Janis, Martinez Maria Rodriguez},\u001b[39;00m\n\u001b[1;32m 56\u001b[0m \u001b[38;5;124;03m journal={Bioinformatics},\u001b[39;00m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;124;03m volume={56},\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;124;03m number={4},\u001b[39;00m\n\u001b[1;32m 59\u001b[0m \u001b[38;5;124;03m pages={i237-i234},\u001b[39;00m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;124;03m year={2021},\u001b[39;00m\n\u001b[1;32m 61\u001b[0m \u001b[38;5;124;03m publisher={Oxford Academic}\u001b[39;00m\n\u001b[1;32m 62\u001b[0m \u001b[38;5;124;03m }\"\"\"\u001b[39;00m,\n\u001b[1;32m 63\u001b[0m ],\n\u001b[1;32m 64\u001b[0m }\n", - "\u001b[0;31mNameError\u001b[0m: name 'df' is not defined" - ] - } - ], + "outputs": [], "source": [ "meta = {\n", " \"name\": \"tcr_epitope_binding\", # unique identifier, we will also use this for directory names\n", @@ -1012,6 +1000,7 @@ " \"description\": \"corresponding publication\",\n", " },\n", " ],\n", + " \"split col\": \"split\",\n", " \"num_points\": len(df), # number of datapoints in this dataset\n", " \"bibtex\": [\n", " \"\"\"@article{weber2021titan,\n", @@ -1089,7 +1078,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "-rw-r--r--@ 1 cody staff 1.5K Mar 4 10:10 meta.yaml\r\n" + "-rw-r--r-- 1 cody staff 1.5K Mar 18 19:44 meta.yaml\r\n" ] } ], @@ -1131,7 +1120,7 @@ "- id: epitope_aa\r\n", " type: amino acid\r\n", " description: epitope amino acid sequence\r\n", - "- id: tcr_aa\r\n", + "- id: tcr_full\r\n", " type: amino acid\r\n", " description: tcr amino acid sequence\r\n", "license: CC BY 4.0\r\n", @@ -1140,6 +1129,7 @@ " description: original data set link\r\n", "- url: https://doi.org/10.1093/bioinformatics/btab294\r\n", " description: corresponding publication\r\n", + "split col: split\r\n", "num_points: 47182\r\n", "bibtex:\r\n", "- |-\r\n", @@ -1182,7 +1172,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 31, "id": "8368bb20-8e1c-4b7d-b0e2-b39da36b5972", "metadata": { "tags": [] @@ -1198,21 +1188,21 @@ ], "source": [ "%%writefile $path_file\n", - "import pandas as pd\n", - "import yaml\n", - "from tdc.multi_pred import TCREpitopeBinding \n", - "\n", "def get_and_transform_data():\n", " # get raw data\n", " data = TCREpitopeBinding(name = 'weber', path = './data')\n", - " fn_data_original = \"data_original.csv\"\n", - " data.get_data().to_csv(fn_data_original, index=False)\n", + " \n", + " split = data.get_split()\n", + " df_train=split['train']\n", + " df_valid=split['valid']\n", + " df_test=split['test']\n", + " df_train['split']=\"train\"\n", + " df_valid['split']=\"valid\"\n", + " df_test['split']=\"test\"\n", + " df=pd.concat(df_train,df_valid,df_test,axis=0)\n", "\n", " # create dataframe\n", - " df = pd.read_csv(\n", - " fn_data_original,\n", - " delimiter=\",\",\n", - " ) # not necessary but ensure we can load the saved data\n", + " not necessary but ensure we can load the saved data\n", "\n", " # check if fields are the same\n", " fields_orig = df.columns.tolist()\n", @@ -1220,8 +1210,9 @@ " \"epitope_aa\",\n", " \"epitope_smi\",\n", " \"tcr\",\n", - " \"tcr_full\",\n", + " \"tcr_aa\",\n", " \"label\",\n", + " \"split\"\n", " ]\n", "\n", " # overwrite column names = fields\n", @@ -1229,8 +1220,9 @@ " \"epitope_aa\",\n", " \"epitope_smiles\",\n", " \"tcr\",\n", - " \"tcr_full_aa\",\n", + " \"tcr_full\",\n", " \"binding\",\n", + " \"split\"\n", " ]\n", " df.columns = fields_clean\n", "\n", @@ -1262,6 +1254,8 @@ " \"tcr binding affinity\",\n", " \"binding affinity\",\n", " \"binding\",\n", + " \"epitope binding affinity\",\n", + " \"epitope binding\"\n", "\n", " ],\n", " },\n", @@ -1279,9 +1273,15 @@ " \n", " },\n", " {\n", - " \"id\": \"tcr_full_aa\",\n", + " \"id\": \"tcr\",\n", " \"type\": \"Other\",\n", - " \"description\": \"tcr amino acid sequence\",\n", + " \"description\": \"hypervariable CDR3 loop\",\n", + " \n", + " },\n", + " {\n", + " \"id\": \"tcr_full\",\n", + " \"type\": \"Other\",\n", + " \"description\": \"tcr full amino acid sequence\",\n", " \n", " },\n", " ],\n", @@ -1296,6 +1296,7 @@ " \"description\": \"corresponding publication\",\n", " },\n", " ],\n", + " \"split col\": \"split\"\n", " \"num_points\": len(df), # number of datapoints in this dataset\n", " \"bibtex\": [\n", " \"\"\"@article{weber2021titan,\n", @@ -1347,10 +1348,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "Found local copy...\n", - "Loading...\n", - "Done!\n", - "Finished processing tcr_epitope_binding dataset!\n" + " File \"/Users/cody/chemnlp/data/tcr_epitope_binding/transform.py\", line 15\r\n", + " not necessary but ensure we can load the saved data\r\n", + " ^\r\n", + "IndentationError: unindent does not match any outer indentation level\r\n" ] } ], diff --git a/data/tcr_epitope_binding/meta.yaml b/data/tcr_epitope_binding/meta.yaml index 52d4bc8c9..8c6f5bee3 100644 --- a/data/tcr_epitope_binding/meta.yaml +++ b/data/tcr_epitope_binding/meta.yaml @@ -1,9 +1,9 @@ name: tcr_epitope_binding description: |- T-cells are an integral part of the adaptive immune system, whose survival, proliferation, activation - and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic peptides (epitopes). - A large repertoire of T-cell receptors with different specificity is needed to provide protection against a wide range of pathogens. - This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence. + and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic peptides (epitopes). + A large repertoire of T-cell receptors with different specificity is needed to provide protection against a wide range of pathogens. + This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence. targets: - id: binding description: TCR epitope binding. @@ -20,7 +20,7 @@ identifiers: - id: epitope_aa type: amino acid description: epitope amino acid sequence -- id: tcr_aa +- id: tcr_full type: amino acid description: tcr amino acid sequence license: CC BY 4.0 @@ -29,16 +29,17 @@ links: description: original data set link - url: https://doi.org/10.1093/bioinformatics/btab294 description: corresponding publication +split col: split num_points: 47182 bibtex: - |- @article{weber2021titan, - title={TITAN: T-cell receptor specificity prediction with bimodal attention network}, - author={Weber Anna,Born Janis, Martinez Maria Rodriguez}, - journal={Bioinformatics}, - volume={56}, - number={4}, - pages={i237-i234}, - year={2021}, - publisher={Oxford Academic} - } + title={TITAN: T-cell receptor specificity prediction with bimodal attention network}, + author={Weber Anna,Born Janis, Martinez Maria Rodriguez}, + journal={Bioinformatics}, + volume={56}, + number={4}, + pages={i237-i234}, + year={2021}, + publisher={Oxford Academic} + } diff --git a/data/tcr_epitope_binding/transform.py b/data/tcr_epitope_binding/transform.py index 14d53a734..78ed8d88e 100644 --- a/data/tcr_epitope_binding/transform.py +++ b/data/tcr_epitope_binding/transform.py @@ -1,18 +1,18 @@ -import pandas as pd -import yaml -from tdc.multi_pred import TCREpitopeBinding - def get_and_transform_data(): # get raw data data = TCREpitopeBinding(name = 'weber', path = './data') - fn_data_original = "data_original.csv" - data.get_data().to_csv(fn_data_original, index=False) + + split = data.get_split() + df_train=split['train'] + df_valid=split['valid'] + df_test=split['test'] + df_train['split']="train" + df_valid['split']="valid" + df_test['split']="test" + df=pd.concat(df_train,df_valid,df_test,axis=0) # create dataframe - df = pd.read_csv( - fn_data_original, - delimiter=",", - ) # not necessary but ensure we can load the saved data + not necessary but ensure we can load the saved data # check if fields are the same fields_orig = df.columns.tolist() @@ -22,6 +22,7 @@ def get_and_transform_data(): "tcr", "tcr_aa", "label", + "split" ] # overwrite column names = fields @@ -31,6 +32,7 @@ def get_and_transform_data(): "tcr", "tcr_full", "binding", + "split" ] df.columns = fields_clean @@ -104,6 +106,7 @@ def get_and_transform_data(): "description": "corresponding publication", }, ], + "split col": "split" "num_points": len(df), # number of datapoints in this dataset "bibtex": [ """@article{weber2021titan, From d6e05d33dfc32b18d6bad1a564854039c3e1c936 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 19 Mar 2023 02:48:36 +0000 Subject: [PATCH 06/16] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- data/tcr_epitope_binding/meta.yaml | 73 ++++++++++++++------------- data/tcr_epitope_binding/transform.py | 8 +-- 2 files changed, 41 insertions(+), 40 deletions(-) diff --git a/data/tcr_epitope_binding/meta.yaml b/data/tcr_epitope_binding/meta.yaml index 8c6f5bee3..4b3677e43 100644 --- a/data/tcr_epitope_binding/meta.yaml +++ b/data/tcr_epitope_binding/meta.yaml @@ -1,45 +1,46 @@ +--- name: tcr_epitope_binding description: |- - T-cells are an integral part of the adaptive immune system, whose survival, proliferation, activation - and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic peptides (epitopes). - A large repertoire of T-cell receptors with different specificity is needed to provide protection against a wide range of pathogens. - This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence. + T-cells are an integral part of the adaptive immune system, whose survival, proliferation, activation + and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic peptides (epitopes). + A large repertoire of T-cell receptors with different specificity is needed to provide protection against a wide range of pathogens. + This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence. targets: -- id: binding - description: TCR epitope binding. - units: '' - type: binary classification - names: - - tcr binding affinity - - binding affinity - - binding + - id: binding + description: TCR epitope binding. + units: '' + type: binary classification + names: + - tcr binding affinity + - binding affinity + - binding identifiers: -- id: epitope_smiles - type: SMILES - description: 'epitope smiles ' -- id: epitope_aa - type: amino acid - description: epitope amino acid sequence -- id: tcr_full - type: amino acid - description: tcr amino acid sequence + - id: epitope_smiles + type: SMILES + description: 'epitope smiles ' + - id: epitope_aa + type: amino acid + description: epitope amino acid sequence + - id: tcr_full + type: amino acid + description: tcr amino acid sequence license: CC BY 4.0 links: -- url: https://tdcommons.ai/multi_pred_tasks/tcrepitope/ - description: original data set link -- url: https://doi.org/10.1093/bioinformatics/btab294 - description: corresponding publication + - url: https://tdcommons.ai/multi_pred_tasks/tcrepitope/ + description: original data set link + - url: https://doi.org/10.1093/bioinformatics/btab294 + description: corresponding publication split col: split num_points: 47182 bibtex: -- |- - @article{weber2021titan, - title={TITAN: T-cell receptor specificity prediction with bimodal attention network}, - author={Weber Anna,Born Janis, Martinez Maria Rodriguez}, - journal={Bioinformatics}, - volume={56}, - number={4}, - pages={i237-i234}, - year={2021}, - publisher={Oxford Academic} - } + - |- + @article{weber2021titan, + title={TITAN: T-cell receptor specificity prediction with bimodal attention network}, + author={Weber Anna,Born Janis, Martinez Maria Rodriguez}, + journal={Bioinformatics}, + volume={56}, + number={4}, + pages={i237-i234}, + year={2021}, + publisher={Oxford Academic} + } diff --git a/data/tcr_epitope_binding/transform.py b/data/tcr_epitope_binding/transform.py index 78ed8d88e..2e1b594c1 100644 --- a/data/tcr_epitope_binding/transform.py +++ b/data/tcr_epitope_binding/transform.py @@ -1,7 +1,7 @@ def get_and_transform_data(): # get raw data data = TCREpitopeBinding(name = 'weber', path = './data') - + split = data.get_split() df_train=split['train'] df_valid=split['valid'] @@ -80,19 +80,19 @@ def get_and_transform_data(): "id": "epitope_aa", "type": "Other", "description": "epitope amino acid sequence", - + }, { "id": "tcr", "type": "Other", "description": "hypervariable CDR3 loop", - + }, { "id": "tcr_full", "type": "Other", "description": "tcr full amino acid sequence", - + }, ], "license": "CC BY 4.0", # license under which the original dataset was published From 54f6cd5852cabc53b6cfbb730c4ec7d5ca2a209d Mon Sep 17 00:00:00 2001 From: Cody Jackson Date: Tue, 28 Mar 2023 00:14:14 -0700 Subject: [PATCH 07/16] add benchmarks --- .../example_processing_and_templates.ipynb | 8 +++++++- data/tcr_epitope_binding/transform.py | 11 ++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/data/tcr_epitope_binding/example_processing_and_templates.ipynb b/data/tcr_epitope_binding/example_processing_and_templates.ipynb index a1f66122e..c1c8c25ec 100644 --- a/data/tcr_epitope_binding/example_processing_and_templates.ipynb +++ b/data/tcr_epitope_binding/example_processing_and_templates.ipynb @@ -1000,7 +1000,13 @@ " \"description\": \"corresponding publication\",\n", " },\n", " ],\n", - " \"split col\": \"split\",\n", + " \"benchmarks\": [\n", + " {\n", + " \"name\": \"TDC\",\n", + " \"link\": \"https://tdcommons.ai/\",\n", + " \"split_column\": \"split\",\n", + " },\n", + " ],\n", " \"num_points\": len(df), # number of datapoints in this dataset\n", " \"bibtex\": [\n", " \"\"\"@article{weber2021titan,\n", diff --git a/data/tcr_epitope_binding/transform.py b/data/tcr_epitope_binding/transform.py index 78ed8d88e..da4065e59 100644 --- a/data/tcr_epitope_binding/transform.py +++ b/data/tcr_epitope_binding/transform.py @@ -11,8 +11,7 @@ def get_and_transform_data(): df_test['split']="test" df=pd.concat(df_train,df_valid,df_test,axis=0) - # create dataframe - not necessary but ensure we can load the saved data + # create dataframenot necessary but ensure we can load the saved data # check if fields are the same fields_orig = df.columns.tolist() @@ -106,7 +105,13 @@ def get_and_transform_data(): "description": "corresponding publication", }, ], - "split col": "split" + "benchmarks": [ + { + "name": "TDC", + "link": "https://tdcommons.ai/", + "split_column": "split", + }, + ], "num_points": len(df), # number of datapoints in this dataset "bibtex": [ """@article{weber2021titan, From 60ed1d1b52691fa48e025377b253e0b161753673 Mon Sep 17 00:00:00 2001 From: Cody Jackson Date: Tue, 28 Mar 2023 00:15:15 -0700 Subject: [PATCH 08/16] remove file --- data/tcr_epitope_binding/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 data/tcr_epitope_binding/.DS_Store diff --git a/data/tcr_epitope_binding/.DS_Store b/data/tcr_epitope_binding/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 Date: Tue, 28 Mar 2023 00:24:51 -0700 Subject: [PATCH 09/16] add imports to transform.py --- data/tcr_epitope_binding/transform.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/data/tcr_epitope_binding/transform.py b/data/tcr_epitope_binding/transform.py index af303b1c4..dfa38b84a 100644 --- a/data/tcr_epitope_binding/transform.py +++ b/data/tcr_epitope_binding/transform.py @@ -1,3 +1,8 @@ + +import pandas as pd +import yaml +from tdc.multi_pred import TCREpitopeBinding + def get_and_transform_data(): # get raw data data = TCREpitopeBinding(name = 'weber', path = './data') From 4d9ed5abfb5aede6c799feb78f14be813a84f1c4 Mon Sep 17 00:00:00 2001 From: Cody Jackson Date: Tue, 28 Mar 2023 00:32:04 -0700 Subject: [PATCH 10/16] fix long lines --- data/tcr_epitope_binding/transform.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/data/tcr_epitope_binding/transform.py b/data/tcr_epitope_binding/transform.py index dfa38b84a..474c1f6ea 100644 --- a/data/tcr_epitope_binding/transform.py +++ b/data/tcr_epitope_binding/transform.py @@ -54,10 +54,11 @@ def get_and_transform_data(): # create meta yaml meta = { "name": "tcr_epitope_binding", # unique identifier, we will also use this for directory names - "description": """T-cells are an integral part of the adaptive immune system, whose survival, proliferation, activation - and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic peptides (epitopes). - A large repertoire of T-cell receptors with different specificity is needed to provide protection against a wide range of pathogens. - This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence.""", + "description": """T-cells are an integral part of the adaptive immune system, whose survival, proliferation, + activationand function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic + peptides (epitopes).A large repertoire of T-cell receptors with different specificity is needed to provide + protection against a wide range of pathogens.This new task aims to predict the binding affinity + given a pair of TCR sequence and epitope sequence.""", "targets": [ { "id": "binding", # name of the column in a tabular dataset From 5cac6a0ed345dac8d6cf8b10e0f359201fd2a44e Mon Sep 17 00:00:00 2001 From: Michael Pieler Date: Tue, 28 Mar 2023 10:08:41 +0200 Subject: [PATCH 11/16] feat: apply black changes --- data/tcr_epitope_binding/transform.py | 38 ++++++++++++--------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/data/tcr_epitope_binding/transform.py b/data/tcr_epitope_binding/transform.py index 474c1f6ea..8bb454ab9 100644 --- a/data/tcr_epitope_binding/transform.py +++ b/data/tcr_epitope_binding/transform.py @@ -1,20 +1,20 @@ - import pandas as pd import yaml -from tdc.multi_pred import TCREpitopeBinding +from tdc.multi_pred import TCREpitopeBinding + def get_and_transform_data(): # get raw data - data = TCREpitopeBinding(name = 'weber', path = './data') + data = TCREpitopeBinding(name="weber", path="./data") split = data.get_split() - df_train=split['train'] - df_valid=split['valid'] - df_test=split['test'] - df_train['split']="train" - df_valid['split']="valid" - df_test['split']="test" - df=pd.concat(df_train,df_valid,df_test,axis=0) + df_train = split["train"] + df_valid = split["valid"] + df_test = split["test"] + df_train["split"] = "train" + df_valid["split"] = "valid" + df_test["split"] = "test" + df = pd.concat(df_train, df_valid, df_test, axis=0) # create dataframenot necessary but ensure we can load the saved data @@ -26,7 +26,7 @@ def get_and_transform_data(): "tcr", "tcr_aa", "label", - "split" + "split", ] # overwrite column names = fields @@ -36,7 +36,7 @@ def get_and_transform_data(): "tcr", "tcr_full", "binding", - "split" + "split", ] df.columns = fields_clean @@ -54,10 +54,10 @@ def get_and_transform_data(): # create meta yaml meta = { "name": "tcr_epitope_binding", # unique identifier, we will also use this for directory names - "description": """T-cells are an integral part of the adaptive immune system, whose survival, proliferation, - activationand function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic - peptides (epitopes).A large repertoire of T-cell receptors with different specificity is needed to provide - protection against a wide range of pathogens.This new task aims to predict the binding affinity + "description": """T-cells are an integral part of the adaptive immune system, whose survival, proliferation, + activationand function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic + peptides (epitopes).A large repertoire of T-cell receptors with different specificity is needed to provide + protection against a wide range of pathogens.This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence.""", "targets": [ { @@ -70,8 +70,7 @@ def get_and_transform_data(): "binding affinity", "binding", "epitope binding affinity", - "epitope binding" - + "epitope binding", ], }, ], @@ -85,19 +84,16 @@ def get_and_transform_data(): "id": "epitope_aa", "type": "Other", "description": "epitope amino acid sequence", - }, { "id": "tcr", "type": "Other", "description": "hypervariable CDR3 loop", - }, { "id": "tcr_full", "type": "Other", "description": "tcr full amino acid sequence", - }, ], "license": "CC BY 4.0", # license under which the original dataset was published From a6feb33c565bf30837bcf598c7a16112a4c7110e Mon Sep 17 00:00:00 2001 From: Cody Jackson Date: Tue, 28 Mar 2023 01:29:51 -0700 Subject: [PATCH 12/16] added brackets to concat and tcr_full-> tcr_full --- .../example_processing_and_templates.ipynb | 104 +++++++++++------- data/tcr_epitope_binding/meta.yaml | 65 ++++++----- data/tcr_epitope_binding/transform.py | 23 ++-- 3 files changed, 110 insertions(+), 82 deletions(-) diff --git a/data/tcr_epitope_binding/example_processing_and_templates.ipynb b/data/tcr_epitope_binding/example_processing_and_templates.ipynb index c1c8c25ec..9303c464e 100644 --- a/data/tcr_epitope_binding/example_processing_and_templates.ipynb +++ b/data/tcr_epitope_binding/example_processing_and_templates.ipynb @@ -78,8 +78,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Downloading...\n", - "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16.0M/16.0M [00:04<00:00, 3.59MiB/s]\n", + "Found local copy...\n", "Loading...\n", "Done!\n" ] @@ -113,12 +112,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "total 30688\r\n", - "drwxr-xr-x 3 cody staff 96B Mar 18 19:44 \u001b[34mdata\u001b[m\u001b[m\r\n", - "-rw-r--r-- 1 cody staff 15M Mar 18 19:44 data_original.csv\r\n", - "-rw-r--r-- 1 cody staff 55K Mar 18 19:30 example_processing_and_templates.ipynb\r\n", - "-rw-r--r-- 1 cody staff 1.5K Mar 18 19:30 meta.yaml\r\n", - "-rw-r--r--@ 1 cody staff 5.1K Mar 18 19:42 transform.py\r\n" + "total 30664\r\n", + "drwxr-xr-x 3 cody staff 96B Mar 28 01:21 \u001b[34mdata\u001b[m\u001b[m\r\n", + "-rw-r--r-- 1 cody staff 15M Mar 28 01:22 data_original.csv\r\n", + "-rw-r--r-- 1 cody staff 47K Mar 28 01:18 example_processing_and_templates.ipynb\r\n", + "-rw-r--r--@ 1 cody staff 5.3K Mar 28 01:18 transform.py\r\n" ] } ], @@ -530,7 +528,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "-rw-r--r-- 1 cody staff 15M Mar 18 19:44 data_clean.csv\r\n" + "-rw-r--r-- 1 cody staff 15M Mar 28 01:23 data_clean.csv\r\n" ] } ], @@ -952,10 +950,11 @@ "source": [ "meta = {\n", " \"name\": \"tcr_epitope_binding\", # unique identifier, we will also use this for directory names\n", - " \"description\": \"\"\"T-cells are an integral part of the adaptive immune system, whose survival, proliferation, activation\n", - " and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic peptides (epitopes).\n", - " A large repertoire of T-cell receptors with different specificity is needed to provide protection against a wide range of pathogens.\n", - " This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence.\"\"\",\n", + " \"description\": \"\"\"T-cells are an integral part of the adaptive immune system, whose survival, proliferation, \n", + " activationand function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic \n", + " peptides (epitopes).A large repertoire of T-cell receptors with different specificity is needed to provide \n", + " protection against a wide range of pathogens.This new task aims to predict the binding affinity \n", + " given a pair of TCR sequence and epitope sequence.\"\"\",\n", " \"targets\": [\n", " {\n", " \"id\": \"binding\", # name of the column in a tabular dataset\n", @@ -966,6 +965,8 @@ " \"tcr binding affinity\",\n", " \"binding affinity\",\n", " \"binding\",\n", + " \"epitope binding affinity\",\n", + " \"epitope binding\"\n", "\n", " ],\n", " },\n", @@ -974,18 +975,24 @@ " {\n", " \"id\": \"epitope_smiles\", # column name\n", " \"type\": \"SMILES\", # can be \"SMILES\", \"SELFIES\", \"IUPAC\", \"OTHER\"\n", - " \"description\": \"epitope smiles \", # description (optional, except for \"OTHER\")\n", + " \"description\": \"epitope smiles\", # description (optional, except for \"OTHER\")\n", " },\n", " {\n", " \"id\": \"epitope_aa\",\n", - " \"type\": \"amino acid\",\n", + " \"type\": \"Other\",\n", " \"description\": \"epitope amino acid sequence\",\n", "\n", " },\n", " {\n", + " \"id\": \"tcr\",\n", + " \"type\": \"Other\",\n", + " \"description\": \"hypervariable CDR3 loop\",\n", + "\n", + " },\n", + " {\n", " \"id\": \"tcr_full\",\n", - " \"type\": \"amino acid\",\n", - " \"description\": \"tcr amino acid sequence\",\n", + " \"type\": \"Other\",\n", + " \"description\": \"tcr full amino acid sequence\",\n", "\n", " },\n", " ],\n", @@ -1084,7 +1091,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "-rw-r--r-- 1 cody staff 1.5K Mar 18 19:44 meta.yaml\r\n" + "-rw-r--r-- 1 cody staff 1.7K Mar 28 01:23 meta.yaml\r\n" ] } ], @@ -1105,11 +1112,12 @@ "output_type": "stream", "text": [ "name: tcr_epitope_binding\r\n", - "description: |-\r\n", - " T-cells are an integral part of the adaptive immune system, whose survival, proliferation, activation\r\n", - " and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic peptides (epitopes).\r\n", - " A large repertoire of T-cell receptors with different specificity is needed to provide protection against a wide range of pathogens.\r\n", - " This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence.\r\n", + "description: \"T-cells are an integral part of the adaptive immune system, whose survival,\\\r\n", + " \\ proliferation, \\n activationand function are all governed by the interaction\\\r\n", + " \\ of their T-cell receptor (TCR) with immunogenic \\n peptides (epitopes).A large\\\r\n", + " \\ repertoire of T-cell receptors with different specificity is needed to provide\\\r\n", + " \\ \\n protection against a wide range of pathogens.This new task aims to predict\\\r\n", + " \\ the binding affinity \\n given a pair of TCR sequence and epitope sequence.\"\r\n", "targets:\r\n", "- id: binding\r\n", " description: TCR epitope binding.\r\n", @@ -1119,23 +1127,31 @@ " - tcr binding affinity\r\n", " - binding affinity\r\n", " - binding\r\n", + " - epitope binding affinity\r\n", + " - epitope binding\r\n", "identifiers:\r\n", "- id: epitope_smiles\r\n", " type: SMILES\r\n", - " description: 'epitope smiles '\r\n", + " description: epitope smiles\r\n", "- id: epitope_aa\r\n", - " type: amino acid\r\n", + " type: Other\r\n", " description: epitope amino acid sequence\r\n", + "- id: tcr\r\n", + " type: Other\r\n", + " description: hypervariable CDR3 loop\r\n", "- id: tcr_full\r\n", - " type: amino acid\r\n", - " description: tcr amino acid sequence\r\n", + " type: Other\r\n", + " description: tcr full amino acid sequence\r\n", "license: CC BY 4.0\r\n", "links:\r\n", "- url: https://tdcommons.ai/multi_pred_tasks/tcrepitope/\r\n", " description: original data set link\r\n", "- url: https://doi.org/10.1093/bioinformatics/btab294\r\n", " description: corresponding publication\r\n", - "split col: split\r\n", + "benchmarks:\r\n", + "- name: TDC\r\n", + " link: https://tdcommons.ai/\r\n", + " split_column: split\r\n", "num_points: 47182\r\n", "bibtex:\r\n", "- |-\r\n", @@ -1166,7 +1182,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 38, "id": "9aab00fd-58a8-40b0-be30-1e269e0d323b", "metadata": { "tags": [] @@ -1178,7 +1194,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 41, "id": "8368bb20-8e1c-4b7d-b0e2-b39da36b5972", "metadata": { "tags": [] @@ -1194,6 +1210,9 @@ ], "source": [ "%%writefile $path_file\n", + "import pandas as pd\n", + "import yaml\n", + "from tdc.multi_pred import TCREpitopeBinding \n", "def get_and_transform_data():\n", " # get raw data\n", " data = TCREpitopeBinding(name = 'weber', path = './data')\n", @@ -1205,10 +1224,9 @@ " df_train['split']=\"train\"\n", " df_valid['split']=\"valid\"\n", " df_test['split']=\"test\"\n", - " df=pd.concat(df_train,df_valid,df_test,axis=0)\n", + " df=pd.concat([df_train,df_valid,df_test],axis=0)\n", "\n", - " # create dataframe\n", - " not necessary but ensure we can load the saved data\n", + " # create dataframenot necessary but ensure we can load the saved data\n", "\n", " # check if fields are the same\n", " fields_orig = df.columns.tolist()\n", @@ -1216,7 +1234,7 @@ " \"epitope_aa\",\n", " \"epitope_smi\",\n", " \"tcr\",\n", - " \"tcr_aa\",\n", + " \"tcr_full\",\n", " \"label\",\n", " \"split\"\n", " ]\n", @@ -1302,7 +1320,13 @@ " \"description\": \"corresponding publication\",\n", " },\n", " ],\n", - " \"split col\": \"split\"\n", + " \"benchmarks\": [\n", + " {\n", + " \"name\": \"TDC\",\n", + " \"link\": \"https://tdcommons.ai/\",\n", + " \"split_column\": \"split\",\n", + " },\n", + " ],\n", " \"num_points\": len(df), # number of datapoints in this dataset\n", " \"bibtex\": [\n", " \"\"\"@article{weber2021titan,\n", @@ -1344,7 +1368,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 42, "id": "d0474f26-70f3-4655-b81a-df4ada90e7a6", "metadata": { "tags": [] @@ -1354,10 +1378,10 @@ "name": "stdout", "output_type": "stream", "text": [ - " File \"/Users/cody/chemnlp/data/tcr_epitope_binding/transform.py\", line 15\r\n", - " not necessary but ensure we can load the saved data\r\n", - " ^\r\n", - "IndentationError: unindent does not match any outer indentation level\r\n" + "Found local copy...\n", + "Loading...\n", + "Done!\n", + "Finished processing tcr_epitope_binding dataset!\n" ] } ], diff --git a/data/tcr_epitope_binding/meta.yaml b/data/tcr_epitope_binding/meta.yaml index 4b3677e43..be1067720 100644 --- a/data/tcr_epitope_binding/meta.yaml +++ b/data/tcr_epitope_binding/meta.yaml @@ -1,40 +1,47 @@ ---- name: tcr_epitope_binding description: |- - T-cells are an integral part of the adaptive immune system, whose survival, proliferation, activation - and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic peptides (epitopes). - A large repertoire of T-cell receptors with different specificity is needed to provide protection against a wide range of pathogens. - This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence. + T-cells are an integral part of the adaptive immune system, whose survival, proliferation, activation + and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic peptides (epitopes). + A large repertoire of T-cell receptors with different specificity is needed to provide protection against a wide range of pathogens. + This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence. targets: - - id: binding - description: TCR epitope binding. - units: '' - type: binary classification - names: - - tcr binding affinity - - binding affinity - - binding +- id: binding + description: TCR epitope binding. + units: '' + type: binary classification + names: + - tcr binding affinity + - binding affinity + - binding + - epitope binding affinity + - epitope binding identifiers: - - id: epitope_smiles - type: SMILES - description: 'epitope smiles ' - - id: epitope_aa - type: amino acid - description: epitope amino acid sequence - - id: tcr_full - type: amino acid - description: tcr amino acid sequence +- id: epitope_smiles + type: SMILES + description: epitope smiles +- id: epitope_aa + type: Other + description: epitope amino acid sequence +- id: tcr + type: Other + description: hypervariable CDR3 loop +- id: tcr_full + type: Other + description: tcr full amino acid sequence license: CC BY 4.0 links: - - url: https://tdcommons.ai/multi_pred_tasks/tcrepitope/ - description: original data set link - - url: https://doi.org/10.1093/bioinformatics/btab294 - description: corresponding publication -split col: split +- url: https://tdcommons.ai/multi_pred_tasks/tcrepitope/ + description: original data set link +- url: https://doi.org/10.1093/bioinformatics/btab294 + description: corresponding publication +benchmarks: +- name: TDC + link: https://tdcommons.ai/ + split_column: split num_points: 47182 bibtex: - - |- - @article{weber2021titan, +- |- + @article{weber2021titan, title={TITAN: T-cell receptor specificity prediction with bimodal attention network}, author={Weber Anna,Born Janis, Martinez Maria Rodriguez}, journal={Bioinformatics}, diff --git a/data/tcr_epitope_binding/transform.py b/data/tcr_epitope_binding/transform.py index 474c1f6ea..690774159 100644 --- a/data/tcr_epitope_binding/transform.py +++ b/data/tcr_epitope_binding/transform.py @@ -1,12 +1,10 @@ - import pandas as pd import yaml from tdc.multi_pred import TCREpitopeBinding - def get_and_transform_data(): # get raw data data = TCREpitopeBinding(name = 'weber', path = './data') - + split = data.get_split() df_train=split['train'] df_valid=split['valid'] @@ -14,7 +12,7 @@ def get_and_transform_data(): df_train['split']="train" df_valid['split']="valid" df_test['split']="test" - df=pd.concat(df_train,df_valid,df_test,axis=0) + df=pd.concat([df_train,df_valid,df_test],axis=0) # create dataframenot necessary but ensure we can load the saved data @@ -24,7 +22,7 @@ def get_and_transform_data(): "epitope_aa", "epitope_smi", "tcr", - "tcr_aa", + "tcr_full", "label", "split" ] @@ -54,11 +52,10 @@ def get_and_transform_data(): # create meta yaml meta = { "name": "tcr_epitope_binding", # unique identifier, we will also use this for directory names - "description": """T-cells are an integral part of the adaptive immune system, whose survival, proliferation, - activationand function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic - peptides (epitopes).A large repertoire of T-cell receptors with different specificity is needed to provide - protection against a wide range of pathogens.This new task aims to predict the binding affinity - given a pair of TCR sequence and epitope sequence.""", + "description": """T-cells are an integral part of the adaptive immune system, whose survival, proliferation, activation + and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic peptides (epitopes). + A large repertoire of T-cell receptors with different specificity is needed to provide protection against a wide range of pathogens. + This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence.""", "targets": [ { "id": "binding", # name of the column in a tabular dataset @@ -85,19 +82,19 @@ def get_and_transform_data(): "id": "epitope_aa", "type": "Other", "description": "epitope amino acid sequence", - + }, { "id": "tcr", "type": "Other", "description": "hypervariable CDR3 loop", - + }, { "id": "tcr_full", "type": "Other", "description": "tcr full amino acid sequence", - + }, ], "license": "CC BY 4.0", # license under which the original dataset was published From 94ed08b13a19a3fffa157b229bbbcdd53ffb707c Mon Sep 17 00:00:00 2001 From: Michael Pieler Date: Tue, 28 Mar 2023 10:34:26 +0200 Subject: [PATCH 13/16] feat: fix pandas concat, column names, and yaml setup. --- data/tcr_epitope_binding/meta.yaml | 53 +++++++++++++++++---------- data/tcr_epitope_binding/transform.py | 15 ++++++-- 2 files changed, 46 insertions(+), 22 deletions(-) diff --git a/data/tcr_epitope_binding/meta.yaml b/data/tcr_epitope_binding/meta.yaml index 4b3677e43..e7daa32ac 100644 --- a/data/tcr_epitope_binding/meta.yaml +++ b/data/tcr_epitope_binding/meta.yaml @@ -1,46 +1,61 @@ --- name: tcr_epitope_binding description: |- - T-cells are an integral part of the adaptive immune system, whose survival, proliferation, activation - and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic peptides (epitopes). - A large repertoire of T-cell receptors with different specificity is needed to provide protection against a wide range of pathogens. - This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence. + T-cells are an integral part of the adaptive immune system, whose survival, proliferation, + activationand function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic + peptides (epitopes).A large repertoire of T-cell receptors with different specificity is needed to provide + protection against a wide range of pathogens.This new task aims to predict the binding affinity + given a pair of TCR sequence and epitope sequence. targets: - id: binding description: TCR epitope binding. units: '' - type: binary classification + type: boolean names: - tcr binding affinity - binding affinity - binding + - epitope binding affinity + - epitope binding identifiers: - id: epitope_smiles type: SMILES - description: 'epitope smiles ' + description: epitope smiles - id: epitope_aa - type: amino acid + type: Other + names: + - epitope amino acid sequence description: epitope amino acid sequence + - id: tcr + type: Other + names: + - hypervariable CDR3 loop + description: hypervariable CDR3 loop - id: tcr_full - type: amino acid - description: tcr amino acid sequence + type: Other + names: + - tcr full amino acid sequence + description: tcr full amino acid sequence license: CC BY 4.0 links: - url: https://tdcommons.ai/multi_pred_tasks/tcrepitope/ description: original data set link - url: https://doi.org/10.1093/bioinformatics/btab294 description: corresponding publication -split col: split +benchmarks: + - name: TDC + link: https://tdcommons.ai/ + split_column: split num_points: 47182 bibtex: - |- @article{weber2021titan, - title={TITAN: T-cell receptor specificity prediction with bimodal attention network}, - author={Weber Anna,Born Janis, Martinez Maria Rodriguez}, - journal={Bioinformatics}, - volume={56}, - number={4}, - pages={i237-i234}, - year={2021}, - publisher={Oxford Academic} - } + title={TITAN: T-cell receptor specificity prediction with bimodal attention network}, + author={Weber Anna,Born Janis, Martinez Maria Rodriguez}, + journal={Bioinformatics}, + volume={56}, + number={4}, + pages={i237-i234}, + year={2021}, + publisher={Oxford Academic} + } diff --git a/data/tcr_epitope_binding/transform.py b/data/tcr_epitope_binding/transform.py index 8bb454ab9..77d25067b 100644 --- a/data/tcr_epitope_binding/transform.py +++ b/data/tcr_epitope_binding/transform.py @@ -14,7 +14,7 @@ def get_and_transform_data(): df_train["split"] = "train" df_valid["split"] = "valid" df_test["split"] = "test" - df = pd.concat(df_train, df_valid, df_test, axis=0) + df = pd.concat([df_train, df_valid, df_test], axis=0) # create dataframenot necessary but ensure we can load the saved data @@ -24,7 +24,7 @@ def get_and_transform_data(): "epitope_aa", "epitope_smi", "tcr", - "tcr_aa", + "tcr_full", "label", "split", ] @@ -64,7 +64,7 @@ def get_and_transform_data(): "id": "binding", # name of the column in a tabular dataset "description": "TCR epitope binding.", # description of what this column means "units": "", # units of the values in this column (leave empty if unitless) - "type": "binary classification", # can be "categorical", "ordinal", "continuous" + "type": "boolean", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) "tcr binding affinity", "binding affinity", @@ -83,16 +83,25 @@ def get_and_transform_data(): { "id": "epitope_aa", "type": "Other", + "names": [ + "epitope amino acid sequence", + ], "description": "epitope amino acid sequence", }, { "id": "tcr", "type": "Other", + "names": [ + "hypervariable CDR3 loop", + ], "description": "hypervariable CDR3 loop", }, { "id": "tcr_full", "type": "Other", + "names": [ + "tcr full amino acid sequence", + ], "description": "tcr full amino acid sequence", }, ], From 26a79b3ce0875809e0517656d9eacf80216b13ab Mon Sep 17 00:00:00 2001 From: Cody Jackson Date: Tue, 28 Mar 2023 01:36:47 -0700 Subject: [PATCH 14/16] commit to merge --- data/tcr_epitope_binding/transform.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/tcr_epitope_binding/transform.py b/data/tcr_epitope_binding/transform.py index 8bb454ab9..e0cfd496b 100644 --- a/data/tcr_epitope_binding/transform.py +++ b/data/tcr_epitope_binding/transform.py @@ -14,7 +14,7 @@ def get_and_transform_data(): df_train["split"] = "train" df_valid["split"] = "valid" df_test["split"] = "test" - df = pd.concat(df_train, df_valid, df_test, axis=0) + df = pd.concat([df_train, df_valid, df_test], axis=0) # create dataframenot necessary but ensure we can load the saved data @@ -24,7 +24,7 @@ def get_and_transform_data(): "epitope_aa", "epitope_smi", "tcr", - "tcr_aa", + "tcr_full", "label", "split", ] From 8eecc95927d0fbefa0249a9720242dd54de002c4 Mon Sep 17 00:00:00 2001 From: Michael Pieler Date: Tue, 28 Mar 2023 10:36:22 +0200 Subject: [PATCH 15/16] feat: nice bibtex for yaml export. --- data/tcr_epitope_binding/meta.yaml | 18 +++++++++--------- data/tcr_epitope_binding/transform.py | 18 +++++++++--------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/data/tcr_epitope_binding/meta.yaml b/data/tcr_epitope_binding/meta.yaml index e7daa32ac..08ed284c4 100644 --- a/data/tcr_epitope_binding/meta.yaml +++ b/data/tcr_epitope_binding/meta.yaml @@ -50,12 +50,12 @@ num_points: 47182 bibtex: - |- @article{weber2021titan, - title={TITAN: T-cell receptor specificity prediction with bimodal attention network}, - author={Weber Anna,Born Janis, Martinez Maria Rodriguez}, - journal={Bioinformatics}, - volume={56}, - number={4}, - pages={i237-i234}, - year={2021}, - publisher={Oxford Academic} - } + title={TITAN: T-cell receptor specificity prediction with bimodal attention network}, + author={Weber Anna,Born Janis, Martinez Maria Rodriguez}, + journal={Bioinformatics}, + volume={56}, + number={4}, + pages={i237-i234}, + year={2021}, + publisher={Oxford Academic} + } diff --git a/data/tcr_epitope_binding/transform.py b/data/tcr_epitope_binding/transform.py index 77d25067b..3f6f4685d 100644 --- a/data/tcr_epitope_binding/transform.py +++ b/data/tcr_epitope_binding/transform.py @@ -126,15 +126,15 @@ def get_and_transform_data(): "num_points": len(df), # number of datapoints in this dataset "bibtex": [ """@article{weber2021titan, - title={TITAN: T-cell receptor specificity prediction with bimodal attention network}, - author={Weber Anna,Born Janis, Martinez Maria Rodriguez}, - journal={Bioinformatics}, - volume={56}, - number={4}, - pages={i237-i234}, - year={2021}, - publisher={Oxford Academic} - }""", +title={TITAN: T-cell receptor specificity prediction with bimodal attention network}, +author={Weber Anna,Born Janis, Martinez Maria Rodriguez}, +journal={Bioinformatics}, +volume={56}, +number={4}, +pages={i237-i234}, +year={2021}, +publisher={Oxford Academic} +}""", ], } From 39c674e948bda6649ed6468cdfe943088ba263c5 Mon Sep 17 00:00:00 2001 From: Cody Jackson Date: Tue, 28 Mar 2023 21:07:52 -0700 Subject: [PATCH 16/16] add templates --- .../example_processing_and_templates.ipynb | 38 ++++++++++++++++++ data/tcr_epitope_binding/transform.py | 39 +++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/data/tcr_epitope_binding/example_processing_and_templates.ipynb b/data/tcr_epitope_binding/example_processing_and_templates.ipynb index 9303c464e..59528d11f 100644 --- a/data/tcr_epitope_binding/example_processing_and_templates.ipynb +++ b/data/tcr_epitope_binding/example_processing_and_templates.ipynb @@ -996,6 +996,44 @@ "\n", " },\n", " ],\n", + " \"templates\": [\n", + " {\n", + " \"prompt\": \"Please answer the following question.\\nPredict the for .\",\n", + " \"completion\": \"\"\n", + "\n", + " }\n", + " ],\n", + " \"fields\": [\n", + " {\n", + " \"exp_value\": [\n", + " \"values\": [ \n", + " {\n", + " \"name\": \"exp_value\",\n", + " \"column\": \"exp_value\",\n", + " \"text\": \"binding\"\n", + " }\n", + " ]\n", + " ],\n", + " \"molecule\": [\n", + " \"values\": [\n", + " {\n", + " \"name\": \"epitope_smiles\",\n", + " \"column\": \"epitope_smiles\",\n", + " \"text\": \"SMILES\"\n", + "\n", + " },\n", + " {\n", + " \"name\": \"tcr_full\",\n", + " \"column\": \"tcr_full\",\n", + " \"text\":\n", + "\n", + " },\n", + "\n", + " ]\n", + " ],\n", + "\n", + " }\n", + " ],\n", " \"license\": \"CC BY 4.0\", # license under which the original dataset was published\n", " \"links\": [ # list of relevant links (original dataset, other uses, etc.)\n", " {\n", diff --git a/data/tcr_epitope_binding/transform.py b/data/tcr_epitope_binding/transform.py index 3f6f4685d..947bf53e2 100644 --- a/data/tcr_epitope_binding/transform.py +++ b/data/tcr_epitope_binding/transform.py @@ -105,6 +105,45 @@ def get_and_transform_data(): "description": "tcr full amino acid sequence", }, ], + "templates": [ + { + "prompt": "Please answer the following question.\nPredict the for .", + "completion": "" + } + ], + "fields": [ + { + "exp_value": [ + "values": [ + { + "name": "exp_value", + "column": "exp_value", + "text": "binding" + } + ] + ], + "molecule": [ + "values": [ + { + "name": "epitope_smiles", + "column": "epitope_smiles", + "text": "SMILES" + + }, + { + "name": "tcr_full", + "column": "tcr_full", + "text": + + }, + + ] + ], + + } + ], + + "license": "CC BY 4.0", # license under which the original dataset was published "links": [ # list of relevant links (original dataset, other uses, etc.) {