diff --git a/data/tcr_epitope_binding/example_processing_and_templates.ipynb b/data/tcr_epitope_binding/example_processing_and_templates.ipynb
new file mode 100644
index 000000000..59528d11f
--- /dev/null
+++ b/data/tcr_epitope_binding/example_processing_and_templates.ipynb
@@ -0,0 +1,1486 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "27c08f91-3fa0-4800-8f6a-96a96f665cad",
+ "metadata": {},
+ "source": [
+ "# TCR-Epitope binding, Weber et al."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6ef172b9-aad2-47da-bf4c-844a2a07ee8c",
+ "metadata": {},
+ "source": [
+ "Original data repository: https://tdcommons.ai/multi_pred_tasks/tcrepitope/"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7d18c95d-2ec6-45e1-addc-54a890097b8e",
+ "metadata": {},
+ "source": [
+ "# Imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "cf59e3e9-8061-4022-9eae-e978311b4155",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import yaml\n",
+ "from tdc.multi_pred import TCREpitopeBinding "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a6751ff9-2e3e-4d01-8395-7a5ae0c200d7",
+ "metadata": {},
+ "source": [
+ "# Data processing"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a1169ad2-e4bb-41c6-9625-6d1644c44a5b",
+ "metadata": {},
+ "source": [
+ "## Download data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "7bb8eb5e-f513-40d2-a68c-7cda1a51ad31",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "fn_data_original = \"data_original.csv\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "b39a142e-ccbc-49d2-98b0-a5f9bde9fd27",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Found local copy...\n",
+ "Loading...\n",
+ "Done!\n"
+ ]
+ }
+ ],
+ "source": [
+ "data = TCREpitopeBinding(name = 'weber', path = './data')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "26d9f62a-07f5-4113-8161-d5dfcf0bfb71",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "data.get_data().to_csv(fn_data_original, index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "43873fc3-20a8-487d-a7c5-33bd58414159",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "total 30664\r\n",
+ "drwxr-xr-x 3 cody staff 96B Mar 28 01:21 \u001b[34mdata\u001b[m\u001b[m\r\n",
+ "-rw-r--r-- 1 cody staff 15M Mar 28 01:22 data_original.csv\r\n",
+ "-rw-r--r-- 1 cody staff 47K Mar 28 01:18 example_processing_and_templates.ipynb\r\n",
+ "-rw-r--r--@ 1 cody staff 5.3K Mar 28 01:18 transform.py\r\n"
+ ]
+ }
+ ],
+ "source": [
+ "!ls -lh"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d9cda29a-a133-4f0e-992b-e77c9070ee93",
+ "metadata": {},
+ "source": [
+ "## Load original data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "77f614e7-b133-40bc-8759-2d930e4c120e",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "epitope_aa,epitope_smi,tcr,tcr_full,label\r\n",
+ "FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSVWGTGKTYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLIATANQGSEATYESGFVIDKFPISRPNLTFSTLTVSNMSPEDSSIYLCSVWGTGKTYEQYFGPGTRLTVT,1\r\n",
+ "FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSVWGEGRSYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLIATANQGSEATYESGFVIDKFPISRPNLTFSTLTVSNMSPEDSSIYLCSVWGEGRSYEQYFGPGTRLTVT,1\r\n",
+ "FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSATILAGVPYGEQYF,GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSATILAGVPYGEQYFGPGTRLTVT,1\r\n",
+ "FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSASEGTSSYEQYF,GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSASEGTSSYEQYFGPGTRLTVT,1\r\n"
+ ]
+ }
+ ],
+ "source": [
+ "!head -n 5 {fn_data_original}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "8f5a0387-f9e3-4e1a-8d14-5df618195f70",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "df = pd.read_csv(fn_data_original, delimiter=\",\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "55b0bd63-62a0-469e-9d8a-e9ada3fe01c4",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " epitope_aa | \n",
+ " epitope_smi | \n",
+ " tcr | \n",
+ " tcr_full | \n",
+ " label | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CSVWGTGKTYEQYF | \n",
+ " SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CSVWGEGRSYEQYF | \n",
+ " SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CSATILAGVPYGEQYF | \n",
+ " GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CSASEGTSSYEQYF | \n",
+ " GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CASSFDREVTGELFF | \n",
+ " GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " epitope_aa epitope_smi \\\n",
+ "0 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "1 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "2 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "3 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "4 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "\n",
+ " tcr tcr_full label \n",
+ "0 CSVWGTGKTYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... 1 \n",
+ "1 CSVWGEGRSYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... 1 \n",
+ "2 CSATILAGVPYGEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... 1 \n",
+ "3 CSASEGTSSYEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... 1 \n",
+ "4 CASSFDREVTGELFF GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL... 1 "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "95158ac1-05d7-4a21-b8e4-7f720306d331",
+ "metadata": {},
+ "source": [
+ "## Add column = field names\n",
+ "Clean column names (`fields_clean`) and keep original names (`fields_orig`)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "ec2458e5-455f-4f03-8ce9-c0d12e9ed371",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['epitope_aa', 'epitope_smi', 'tcr', 'tcr_full', 'label']"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "fields_orig = df.columns.tolist()\n",
+ "fields_orig"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "a46dd8ff-37b3-4894-8226-3bf98226dd09",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Cleaned column names, listed in the same order as `fields_orig`\n",
+ "# (they are assigned positionally to `df.columns` in the next cell).\n",
+ "fields_clean = [\n",
+ "    \"epitope_aa\",\n",
+ "    \"epitope_smiles\",\n",
+ "    \"tcr\",\n",
+ "    \"tcr_full\",\n",
+ "    \"binding\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "785d37cb-1fb4-4a91-a923-d5a78a37f36a",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "df.columns = fields_clean"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "1bf212cb-1653-457b-9f5d-416d4dd14b53",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " epitope_aa | \n",
+ " epitope_smiles | \n",
+ " tcr | \n",
+ " tcr_full | \n",
+ " binding | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CSVWGTGKTYEQYF | \n",
+ " SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CSVWGEGRSYEQYF | \n",
+ " SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CSATILAGVPYGEQYF | \n",
+ " GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CSASEGTSSYEQYF | \n",
+ " GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CASSFDREVTGELFF | \n",
+ " GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " epitope_aa epitope_smiles \\\n",
+ "0 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "1 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "2 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "3 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "4 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "\n",
+ " tcr tcr_full \\\n",
+ "0 CSVWGTGKTYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n",
+ "1 CSVWGEGRSYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n",
+ "2 CSATILAGVPYGEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n",
+ "3 CSASEGTSSYEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n",
+ "4 CASSFDREVTGELFF GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL... \n",
+ "\n",
+ " binding \n",
+ "0 1 \n",
+ "1 1 \n",
+ "2 1 \n",
+ "3 1 \n",
+ "4 1 "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1bfaeb22-26fb-4964-a71f-cae8335e5372",
+ "metadata": {},
+ "source": [
+ "## Data cleaning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "7e746003-cb1f-434f-bba6-00f0c439c4ac",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Trim leading and trailing whitespace from the epitope amino-acid sequences.\n",
+ "df[\"epitope_aa\"] = df[\"epitope_aa\"].str.strip()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "d544fa60-343e-40e1-bd0c-4750f07a7145",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "assert not df.duplicated().sum()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bc6f52c1-e0f6-48b3-95f4-e36d9a5ecde8",
+ "metadata": {},
+ "source": [
+ "## Save to csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "d6d5efa5-b4b4-4a25-8626-e10f3d691e83",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "fn_data_csv = \"data_clean.csv\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "727f8d7b-cbb6-43c7-9eab-9d4d65be6b3f",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "df.to_csv(fn_data_csv, index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "63c8d4a4-906e-418d-be39-879365b4dfa0",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "-rw-r--r-- 1 cody staff 15M Mar 28 01:23 data_clean.csv\r\n"
+ ]
+ }
+ ],
+ "source": [
+ "!ls -lh {fn_data_csv}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "a51b9001-25d7-4e0e-a607-477cfc4a9f1c",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "epitope_aa,epitope_smiles,tcr,tcr_full,binding\r\n",
+ "FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSVWGTGKTYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLIATANQGSEATYESGFVIDKFPISRPNLTFSTLTVSNMSPEDSSIYLCSVWGTGKTYEQYFGPGTRLTVT,1\r\n",
+ "FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSVWGEGRSYEQYF,SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLIATANQGSEATYESGFVIDKFPISRPNLTFSTLTVSNMSPEDSSIYLCSVWGEGRSYEQYFGPGTRLTVT,1\r\n",
+ "FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSATILAGVPYGEQYF,GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSATILAGVPYGEQYFGPGTRLTVT,1\r\n",
+ "FLKEKGGL,CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)Cc1ccccc1)C(=O)O,CSASEGTSSYEQYF,GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMATSNEGSKATYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYICSASEGTSSYEQYFGPGTRLTVT,1\r\n"
+ ]
+ }
+ ],
+ "source": [
+ "!head -n 5 {fn_data_csv}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "1a512943-4909-4d56-867d-50c151d8d607",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " epitope_aa | \n",
+ " epitope_smiles | \n",
+ " tcr | \n",
+ " tcr_full | \n",
+ " binding | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CSVWGTGKTYEQYF | \n",
+ " SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CSVWGEGRSYEQYF | \n",
+ " SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CSATILAGVPYGEQYF | \n",
+ " GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CSASEGTSSYEQYF | \n",
+ " GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CASSFDREVTGELFF | \n",
+ " GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " epitope_aa epitope_smiles \\\n",
+ "0 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "1 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "2 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "3 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "4 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "\n",
+ " tcr tcr_full \\\n",
+ "0 CSVWGTGKTYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n",
+ "1 CSVWGEGRSYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n",
+ "2 CSATILAGVPYGEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n",
+ "3 CSASEGTSSYEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n",
+ "4 CASSFDREVTGELFF GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL... \n",
+ "\n",
+ " binding \n",
+ "0 1 \n",
+ "1 1 \n",
+ "2 1 \n",
+ "3 1 \n",
+ "4 1 "
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f3d730ce-fce0-49df-9eb8-b917e945fa9a",
+ "metadata": {},
+ "source": [
+ "## Load from csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "077b0c5f-8772-4879-9317-3fa28799689b",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "fn_data_csv = \"data_clean.csv\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "6eaef0e6-2115-4793-ac43-a196b25d47a0",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "df = pd.read_csv(fn_data_csv)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "43619e7c-9c82-4ff0-ae25-403861304635",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " epitope_aa | \n",
+ " epitope_smiles | \n",
+ " tcr | \n",
+ " tcr_full | \n",
+ " binding | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CSVWGTGKTYEQYF | \n",
+ " SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CSVWGEGRSYEQYF | \n",
+ " SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CSATILAGVPYGEQYF | \n",
+ " GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CSASEGTSSYEQYF | \n",
+ " GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CASSFDREVTGELFF | \n",
+ " GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " epitope_aa epitope_smiles \\\n",
+ "0 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "1 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "2 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "3 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "4 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "\n",
+ " tcr tcr_full \\\n",
+ "0 CSVWGTGKTYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n",
+ "1 CSVWGEGRSYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n",
+ "2 CSATILAGVPYGEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n",
+ "3 CSASEGTSSYEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n",
+ "4 CASSFDREVTGELFF GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL... \n",
+ "\n",
+ " binding \n",
+ "0 1 \n",
+ "1 1 \n",
+ "2 1 \n",
+ "3 1 \n",
+ "4 1 "
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2f06e57c-02c5-493b-af65-c8bb9ac59421",
+ "metadata": {},
+ "source": [
+ "# meta YAML"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "49771077-471d-4d71-a9a7-d6b094bbc4f3",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " epitope_aa | \n",
+ " epitope_smiles | \n",
+ " tcr | \n",
+ " tcr_full | \n",
+ " binding | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CSVWGTGKTYEQYF | \n",
+ " SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CSVWGEGRSYEQYF | \n",
+ " SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CSATILAGVPYGEQYF | \n",
+ " GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CSASEGTSSYEQYF | \n",
+ " GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " FLKEKGGL | \n",
+ " CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... | \n",
+ " CASSFDREVTGELFF | \n",
+ " GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " epitope_aa epitope_smiles \\\n",
+ "0 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "1 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "2 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "3 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "4 FLKEKGGL CC(C)C[C@H](NC(=O)CNC(=O)CNC(=O)[C@H](CCCCN)NC... \n",
+ "\n",
+ " tcr tcr_full \\\n",
+ "0 CSVWGTGKTYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n",
+ "1 CSVWGEGRSYEQYF SAVISQKPSRDICQRGTSLTIQCQVDSQVTMMFWYRQQPGQSLTLI... \n",
+ "2 CSATILAGVPYGEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n",
+ "3 CSASEGTSSYEQYF GAVVSQHPSWVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLML... \n",
+ "4 CASSFDREVTGELFF GAGVSQTPSNKVTEKGKYVELRCDPISGHTALYWYRQSLGQGPEFL... \n",
+ "\n",
+ " binding \n",
+ "0 1 \n",
+ "1 1 \n",
+ "2 1 \n",
+ "3 1 \n",
+ "4 1 "
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "d3890961-444e-4a26-b8fc-ed8c4e959af9",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Dataset metadata; it is serialised to meta.yaml further below.\n",
+ "meta = {\n",
+ "    \"name\": \"tcr_epitope_binding\",  # unique identifier, we will also use this for directory names\n",
+ "    \"description\": \"\"\"T-cells are an integral part of the adaptive immune system, whose survival, proliferation,\n",
+ "    activation and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic\n",
+ "    peptides (epitopes). A large repertoire of T-cell receptors with different specificity is needed to provide\n",
+ "    protection against a wide range of pathogens. This new task aims to predict the binding affinity\n",
+ "    given a pair of TCR sequence and epitope sequence.\"\"\",\n",
+ "    \"targets\": [\n",
+ "        {\n",
+ "            \"id\": \"binding\",  # name of the column in a tabular dataset\n",
+ "            \"description\": \"TCR epitope binding.\",  # description of what this column means\n",
+ "            \"units\": \"\",  # units of the values in this column (leave empty if unitless)\n",
+ "            \"type\": \"binary classification\",  # can be \"categorical\", \"ordinal\", \"continuous\"\n",
+ "            \"names\": [  # names for the property (to sample from for building the prompts)\n",
+ "                \"tcr binding affinity\",\n",
+ "                \"binding affinity\",\n",
+ "                \"binding\",\n",
+ "                \"epitope binding affinity\",\n",
+ "                \"epitope binding\",\n",
+ "            ],\n",
+ "        },\n",
+ "    ],\n",
+ "    \"identifiers\": [\n",
+ "        {\n",
+ "            \"id\": \"epitope_smiles\",  # column name\n",
+ "            \"type\": \"SMILES\",  # can be \"SMILES\", \"SELFIES\", \"IUPAC\", \"OTHER\"\n",
+ "            \"description\": \"epitope smiles\",  # description (optional, except for \"OTHER\")\n",
+ "        },\n",
+ "        {\n",
+ "            \"id\": \"epitope_aa\",\n",
+ "            \"type\": \"Other\",\n",
+ "            \"description\": \"epitope amino acid sequence\",\n",
+ "        },\n",
+ "        {\n",
+ "            \"id\": \"tcr\",\n",
+ "            \"type\": \"Other\",\n",
+ "            \"description\": \"hypervariable CDR3 loop\",\n",
+ "        },\n",
+ "        {\n",
+ "            \"id\": \"tcr_full\",\n",
+ "            \"type\": \"Other\",\n",
+ "            \"description\": \"tcr full amino acid sequence\",\n",
+ "        },\n",
+ "    ],\n",
+ "    # NOTE(review): the prompt and completion below contain no placeholders for the\n",
+ "    # entries declared under \"fields\" -- they look incomplete; confirm the intended\n",
+ "    # template text before relying on them.\n",
+ "    \"templates\": [\n",
+ "        {\n",
+ "            \"prompt\": \"Please answer the following question.\\nPredict the for .\",\n",
+ "            \"completion\": \"\",\n",
+ "        }\n",
+ "    ],\n",
+ "    # Each field name maps to a dict holding the list of values that can fill it.\n",
+ "    \"fields\": [\n",
+ "        {\n",
+ "            \"exp_value\": {\n",
+ "                \"values\": [\n",
+ "                    {\n",
+ "                        \"name\": \"exp_value\",\n",
+ "                        # NOTE(review): the cleaned dataframe has no \"exp_value\" column\n",
+ "                        # (the target column is \"binding\") -- confirm the intended column.\n",
+ "                        \"column\": \"exp_value\",\n",
+ "                        \"text\": \"binding\",\n",
+ "                    }\n",
+ "                ]\n",
+ "            },\n",
+ "            \"molecule\": {\n",
+ "                \"values\": [\n",
+ "                    {\n",
+ "                        \"name\": \"epitope_smiles\",\n",
+ "                        \"column\": \"epitope_smiles\",\n",
+ "                        \"text\": \"SMILES\",\n",
+ "                    },\n",
+ "                    {\n",
+ "                        \"name\": \"tcr_full\",\n",
+ "                        \"column\": \"tcr_full\",\n",
+ "                        # The original cell left this value empty; reuse the identifier description.\n",
+ "                        \"text\": \"tcr full amino acid sequence\",\n",
+ "                    },\n",
+ "                ]\n",
+ "            },\n",
+ "        }\n",
+ "    ],\n",
+ "    \"license\": \"CC BY 4.0\",  # license under which the original dataset was published\n",
+ "    \"links\": [  # list of relevant links (original dataset, other uses, etc.)\n",
+ "        {\n",
+ "            \"url\": \"https://tdcommons.ai/multi_pred_tasks/tcrepitope/\",\n",
+ "            \"description\": \"original data set link\",\n",
+ "        },\n",
+ "        {\n",
+ "            \"url\": \"https://doi.org/10.1093/bioinformatics/btab294\",\n",
+ "            \"description\": \"corresponding publication\",\n",
+ "        },\n",
+ "    ],\n",
+ "    \"benchmarks\": [\n",
+ "        {\n",
+ "            \"name\": \"TDC\",\n",
+ "            \"link\": \"https://tdcommons.ai/\",\n",
+ "            \"split_column\": \"split\",\n",
+ "        },\n",
+ "    ],\n",
+ "    \"num_points\": len(df),  # number of datapoints in this dataset\n",
+ "    \"bibtex\": [\n",
+ "        # BibTeX requires authors to be separated by \" and \"; the page range must be ascending.\n",
+ "        \"\"\"@article{weber2021titan,\n",
+ "    title={TITAN: T-cell receptor specificity prediction with bimodal attention networks},\n",
+ "    author={Weber, Anna and Born, Jannis and Rodriguez Martinez, Maria},\n",
+ "    journal={Bioinformatics},\n",
+ "    volume={37},\n",
+ "    number={Supplement_1},\n",
+ "    pages={i237--i244},\n",
+ "    year={2021},\n",
+ "    publisher={Oxford Academic}\n",
+ "    }\"\"\",\n",
+ "    ],\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "ec455cf0-962a-4c0d-bb3e-066e415ffd9b",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "def str_presenter(dumper, data):\n",
+ "    \"\"\"Represent a ``str`` for PyYAML, requesting block style for multiline text.\n",
+ "\n",
+ "    Strings containing a newline are passed to ``represent_scalar`` with ``style=\"|\"``;\n",
+ "    all other strings are represented without an explicit style.\n",
+ "    Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data\n",
+ "    \"\"\"\n",
+ "    str_tag = \"tag:yaml.org,2002:str\"\n",
+ "    if \"\\n\" in data:  # multiline string\n",
+ "        return dumper.represent_scalar(str_tag, data, style=\"|\")\n",
+ "    return dumper.represent_scalar(str_tag, data)\n",
+ "\n",
+ "\n",
+ "# Register the presenter with yaml.dump and with the safe representer (for safe_dump).\n",
+ "yaml.add_representer(str, str_presenter)\n",
+ "yaml.representer.SafeRepresenter.add_representer(str, str_presenter)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "580bbd79-4845-4515-be94-3e4a9815d048",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "fn_meta = \"meta.yaml\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "873fa5dd-9b60-40f5-b537-4d7a206414ea",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "with open(fn_meta, \"w\") as f:\n",
+ " yaml.dump(meta, f, sort_keys=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "d01686c0-6746-4fc4-b019-350270dfc26f",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "-rw-r--r-- 1 cody staff 1.7K Mar 28 01:23 meta.yaml\r\n"
+ ]
+ }
+ ],
+ "source": [
+ "!ls -lh {fn_meta}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "ef6063c5-7a8b-4344-bccf-a073443feebf",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "name: tcr_epitope_binding\r\n",
+ "description: \"T-cells are an integral part of the adaptive immune system, whose survival,\\\r\n",
+ " \\ proliferation, \\n activationand function are all governed by the interaction\\\r\n",
+ " \\ of their T-cell receptor (TCR) with immunogenic \\n peptides (epitopes).A large\\\r\n",
+ " \\ repertoire of T-cell receptors with different specificity is needed to provide\\\r\n",
+ " \\ \\n protection against a wide range of pathogens.This new task aims to predict\\\r\n",
+ " \\ the binding affinity \\n given a pair of TCR sequence and epitope sequence.\"\r\n",
+ "targets:\r\n",
+ "- id: binding\r\n",
+ " description: TCR epitope binding.\r\n",
+ " units: ''\r\n",
+ " type: binary classification\r\n",
+ " names:\r\n",
+ " - tcr binding affinity\r\n",
+ " - binding affinity\r\n",
+ " - binding\r\n",
+ " - epitope binding affinity\r\n",
+ " - epitope binding\r\n",
+ "identifiers:\r\n",
+ "- id: epitope_smiles\r\n",
+ " type: SMILES\r\n",
+ " description: epitope smiles\r\n",
+ "- id: epitope_aa\r\n",
+ " type: Other\r\n",
+ " description: epitope amino acid sequence\r\n",
+ "- id: tcr\r\n",
+ " type: Other\r\n",
+ " description: hypervariable CDR3 loop\r\n",
+ "- id: tcr_full\r\n",
+ " type: Other\r\n",
+ " description: tcr full amino acid sequence\r\n",
+ "license: CC BY 4.0\r\n",
+ "links:\r\n",
+ "- url: https://tdcommons.ai/multi_pred_tasks/tcrepitope/\r\n",
+ " description: original data set link\r\n",
+ "- url: https://doi.org/10.1093/bioinformatics/btab294\r\n",
+ " description: corresponding publication\r\n",
+ "benchmarks:\r\n",
+ "- name: TDC\r\n",
+ " link: https://tdcommons.ai/\r\n",
+ " split_column: split\r\n",
+ "num_points: 47182\r\n",
+ "bibtex:\r\n",
+ "- |-\r\n",
+ " @article{weber2021titan,\r\n",
+ " title={TITAN: T-cell receptor specificity prediction with bimodal attention network},\r\n",
+ " author={Weber Anna,Born Janis, Martinez Maria Rodriguez},\r\n",
+ " journal={Bioinformatics},\r\n",
+ " volume={56},\r\n",
+ " number={4},\r\n",
+ " pages={i237-i234},\r\n",
+ " year={2021},\r\n",
+ " publisher={Oxford Academic}\r\n",
+ " }\r\n"
+ ]
+ }
+ ],
+ "source": [
+ "!cat {fn_meta}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bd3f930a-638b-4bb7-a1d2-80688f2f6891",
+ "metadata": {},
+ "source": [
+ "# create transform.py"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "9aab00fd-58a8-40b0-be30-1e269e0d323b",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "path_file = \"transform.py\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "8368bb20-8e1c-4b7d-b0e2-b39da36b5972",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Overwriting transform.py\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%writefile $path_file\n",
+ "import pandas as pd\n",
+ "import yaml\n",
+ "from tdc.multi_pred import TCREpitopeBinding \n",
+ "def get_and_transform_data():\n",
+ " # get raw data\n",
+ " data = TCREpitopeBinding(name = 'weber', path = './data')\n",
+ " \n",
+ " split = data.get_split()\n",
+ " df_train=split['train']\n",
+ " df_valid=split['valid']\n",
+ " df_test=split['test']\n",
+ " df_train['split']=\"train\"\n",
+ " df_valid['split']=\"valid\"\n",
+ " df_test['split']=\"test\"\n",
+ " df=pd.concat([df_train,df_valid,df_test],axis=0)\n",
+ "\n",
+ " # create dataframenot necessary but ensure we can load the saved data\n",
+ "\n",
+ " # check if fields are the same\n",
+ " fields_orig = df.columns.tolist()\n",
+ " assert fields_orig == [\n",
+ " \"epitope_aa\",\n",
+ " \"epitope_smi\",\n",
+ " \"tcr\",\n",
+ " \"tcr_full\",\n",
+ " \"label\",\n",
+ " \"split\"\n",
+ " ]\n",
+ "\n",
+ " # overwrite column names = fields\n",
+ " fields_clean = [\n",
+ " \"epitope_aa\",\n",
+ " \"epitope_smiles\",\n",
+ " \"tcr\",\n",
+ " \"tcr_full\",\n",
+ " \"binding\",\n",
+ " \"split\"\n",
+ " ]\n",
+ " df.columns = fields_clean\n",
+ "\n",
+ " # data cleaning\n",
+ " df.epitope_aa = (\n",
+ " df.epitope_aa.str.strip()\n",
+ " ) # remove leading and trailing white space characters\n",
+ "\n",
+ " assert not df.duplicated().sum()\n",
+ "\n",
+ " # save to csv\n",
+ " fn_data_csv = \"data_clean.csv\"\n",
+ " df.to_csv(fn_data_csv, index=False)\n",
+ "\n",
+ " # create meta yaml\n",
+ " meta = {\n",
+ " \"name\": \"tcr_epitope_binding\", # unique identifier, we will also use this for directory names\n",
+ " \"description\": \"\"\"T-cells are an integral part of the adaptive immune system, whose survival, proliferation, activation\n",
+ " and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic peptides (epitopes).\n",
+ " A large repertoire of T-cell receptors with different specificity is needed to provide protection against a wide range of pathogens.\n",
+ " This new task aims to predict the binding affinity given a pair of TCR sequence and epitope sequence.\"\"\",\n",
+ " \"targets\": [\n",
+ " {\n",
+ " \"id\": \"binding\", # name of the column in a tabular dataset\n",
+ " \"description\": \"TCR epitope binding.\", # description of what this column means\n",
+ " \"units\": \"\", # units of the values in this column (leave empty if unitless)\n",
+ " \"type\": \"binary classification\", # can be \"categorical\", \"ordinal\", \"continuous\"\n",
+ " \"names\": [ # names for the property (to sample from for building the prompts)\n",
+ " \"tcr binding affinity\",\n",
+ " \"binding affinity\",\n",
+ " \"binding\",\n",
+ " \"epitope binding affinity\",\n",
+ " \"epitope binding\"\n",
+ "\n",
+ " ],\n",
+ " },\n",
+ " ],\n",
+ " \"identifiers\": [\n",
+ " {\n",
+ " \"id\": \"epitope_smiles\", # column name\n",
+ " \"type\": \"SMILES\", # can be \"SMILES\", \"SELFIES\", \"IUPAC\", \"OTHER\"\n",
+ " \"description\": \"epitope smiles\", # description (optional, except for \"OTHER\")\n",
+ " },\n",
+ " {\n",
+ " \"id\": \"epitope_aa\",\n",
+ " \"type\": \"Other\",\n",
+ " \"description\": \"epitope amino acid sequence\",\n",
+ " \n",
+ " },\n",
+ " {\n",
+ " \"id\": \"tcr\",\n",
+ " \"type\": \"Other\",\n",
+ " \"description\": \"hypervariable CDR3 loop\",\n",
+ " \n",
+ " },\n",
+ " {\n",
+ " \"id\": \"tcr_full\",\n",
+ " \"type\": \"Other\",\n",
+ " \"description\": \"tcr full amino acid sequence\",\n",
+ " \n",
+ " },\n",
+ " ],\n",
+ " \"license\": \"CC BY 4.0\", # license under which the original dataset was published\n",
+ " \"links\": [ # list of relevant links (original dataset, other uses, etc.)\n",
+ " {\n",
+ " \"url\": \"https://tdcommons.ai/multi_pred_tasks/tcrepitope/\",\n",
+ " \"description\": \"original data set link\",\n",
+ " },\n",
+ " {\n",
+ " \"url\": \"https://doi.org/10.1093/bioinformatics/btab294\",\n",
+ " \"description\": \"corresponding publication\",\n",
+ " },\n",
+ " ],\n",
+ " \"benchmarks\": [\n",
+ " {\n",
+ " \"name\": \"TDC\",\n",
+ " \"link\": \"https://tdcommons.ai/\",\n",
+ " \"split_column\": \"split\",\n",
+ " },\n",
+ " ],\n",
+ " \"num_points\": len(df), # number of datapoints in this dataset\n",
+ " \"bibtex\": [\n",
+ " \"\"\"@article{weber2021titan,\n",
+ " title={TITAN: T-cell receptor specificity prediction with bimodal attention network},\n",
+ " author={Weber Anna,Born Janis, Martinez Maria Rodriguez},\n",
+ " journal={Bioinformatics},\n",
+ " volume={56},\n",
+ " number={4},\n",
+ " pages={i237-i234},\n",
+ " year={2021},\n",
+ " publisher={Oxford Academic}\n",
+ " }\"\"\",\n",
+ " ],\n",
+ " }\n",
+ "\n",
+ " def str_presenter(dumper, data):\n",
+ " \"\"\"configures yaml for dumping multiline strings\n",
+ " Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data\n",
+ " \"\"\"\n",
+ "\n",
+ " if data.count(\"\\n\") > 0: # check for multiline string\n",
+ " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data, style=\"|\")\n",
+ " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data)\n",
+ "\n",
+ " yaml.add_representer(str, str_presenter)\n",
+ " yaml.representer.SafeRepresenter.add_representer(\n",
+ " str, str_presenter\n",
+    "    )  # to use with safe_dump\n",
+ " fn_meta = \"meta.yaml\"\n",
+ " with open(fn_meta, \"w\") as f:\n",
+ " yaml.dump(meta, f, sort_keys=False)\n",
+ "\n",
+ " print(f\"Finished processing {meta['name']} dataset!\")\n",
+ "\n",
+ "\n",
+ "if __name__ == \"__main__\":\n",
+ " get_and_transform_data()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "d0474f26-70f3-4655-b81a-df4ada90e7a6",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Found local copy...\n",
+ "Loading...\n",
+ "Done!\n",
+ "Finished processing tcr_epitope_binding dataset!\n"
+ ]
+ }
+ ],
+ "source": [
+ "!python3 transform.py"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "953e7bee-bd5e-41d0-a2be-506e0bc97727",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "total 61592\r\n",
+ "drwxr-xr-x 3 cody staff 96B Mar 4 10:09 \u001b[34mdata\u001b[m\u001b[m/\r\n",
+ "-rw-r--r-- 1 cody staff 15M Mar 4 10:10 data_clean.csv\r\n",
+ "-rw-r--r-- 1 cody staff 15M Mar 4 10:10 data_original.csv\r\n",
+ "-rw-r--r-- 1 cody staff 45K Mar 4 10:06 example_processing_and_templates.ipynb\r\n",
+ "-rw-r--r--@ 1 cody staff 1.5K Mar 4 10:10 meta.yaml\r\n",
+ "-rw-r--r--@ 1 cody staff 4.8K Mar 4 10:10 transform.py\r\n"
+ ]
+ }
+ ],
+ "source": [
+ "ls -lh # fmt: skip"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0b08ed06-ba66-4f76-bde1-368ea77d1739",
+ "metadata": {},
+ "source": [
+ "# End"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/data/tcr_epitope_binding/meta.yaml b/data/tcr_epitope_binding/meta.yaml
new file mode 100644
index 000000000..47b52ccd0
--- /dev/null
+++ b/data/tcr_epitope_binding/meta.yaml
@@ -0,0 +1,60 @@
+name: tcr_epitope_binding
+description: |-
+  T-cells are an integral part of the adaptive immune system, whose survival, proliferation,
+  activation and function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic
+  peptides (epitopes). A large repertoire of T-cell receptors with different specificity is needed to provide
+  protection against a wide range of pathogens. This new task aims to predict the binding affinity
+  given a pair of TCR sequence and epitope sequence.
+targets:
+ - id: binding
+ description: TCR epitope binding.
+ units: ''
+ type: boolean
+ names:
+ - tcr binding affinity
+ - binding affinity
+ - binding
+ - epitope binding affinity
+ - epitope binding
+identifiers:
+ - id: epitope_smiles
+ type: SMILES
+ description: epitope smiles
+ - id: epitope_aa
+ type: Other
+ names:
+ - epitope amino acid sequence
+ description: epitope amino acid sequence
+ - id: tcr
+ type: Other
+ names:
+ - hypervariable CDR3 loop
+ description: hypervariable CDR3 loop
+ - id: tcr_full
+ type: Other
+ names:
+ - tcr full amino acid sequence
+ description: tcr full amino acid sequence
+license: CC BY 4.0
+links:
+ - url: https://tdcommons.ai/multi_pred_tasks/tcrepitope/
+ description: original data set link
+ - url: https://doi.org/10.1093/bioinformatics/btab294
+ description: corresponding publication
+benchmarks:
+ - name: TDC
+ link: https://tdcommons.ai/
+ split_column: split
+num_points: 47182
+bibtex:
+ - |-
+    @article{weber2021titan,
+    title={TITAN: T-cell receptor specificity prediction with bimodal attention network},
+    author={Weber, Anna and Born, Jannis and Rodriguez Martinez, Maria},
+    journal={Bioinformatics},
+    volume={37},
+    number={Supplement\_1},
+    pages={i237--i244},
+    year={2021},
+    publisher={Oxford Academic}
+    }
diff --git a/data/tcr_epitope_binding/transform.py b/data/tcr_epitope_binding/transform.py
new file mode 100644
index 000000000..947bf53e2
--- /dev/null
+++ b/data/tcr_epitope_binding/transform.py
@@ -0,0 +1,201 @@
+import pandas as pd
+import yaml
+from tdc.multi_pred import TCREpitopeBinding
+
+
+def get_and_transform_data():
+ # get raw data
+ data = TCREpitopeBinding(name="weber", path="./data")
+
+ split = data.get_split()
+ df_train = split["train"]
+ df_valid = split["valid"]
+ df_test = split["test"]
+ df_train["split"] = "train"
+ df_valid["split"] = "valid"
+ df_test["split"] = "test"
+ df = pd.concat([df_train, df_valid, df_test], axis=0)
+
+ # create dataframenot necessary but ensure we can load the saved data
+
+ # check if fields are the same
+ fields_orig = df.columns.tolist()
+ assert fields_orig == [
+ "epitope_aa",
+ "epitope_smi",
+ "tcr",
+ "tcr_full",
+ "label",
+ "split",
+ ]
+
+ # overwrite column names = fields
+ fields_clean = [
+ "epitope_aa",
+ "epitope_smiles",
+ "tcr",
+ "tcr_full",
+ "binding",
+ "split",
+ ]
+ df.columns = fields_clean
+
+ # data cleaning
+ df.epitope_aa = (
+ df.epitope_aa.str.strip()
+ ) # remove leading and trailing white space characters
+
+ assert not df.duplicated().sum()
+
+ # save to csv
+ fn_data_csv = "data_clean.csv"
+ df.to_csv(fn_data_csv, index=False)
+
+ # create meta yaml
+ meta = {
+ "name": "tcr_epitope_binding", # unique identifier, we will also use this for directory names
+ "description": """T-cells are an integral part of the adaptive immune system, whose survival, proliferation,
+ activationand function are all governed by the interaction of their T-cell receptor (TCR) with immunogenic
+ peptides (epitopes).A large repertoire of T-cell receptors with different specificity is needed to provide
+ protection against a wide range of pathogens.This new task aims to predict the binding affinity
+ given a pair of TCR sequence and epitope sequence.""",
+ "targets": [
+ {
+ "id": "binding", # name of the column in a tabular dataset
+ "description": "TCR epitope binding.", # description of what this column means
+ "units": "", # units of the values in this column (leave empty if unitless)
+ "type": "boolean", # can be "categorical", "ordinal", "continuous"
+ "names": [ # names for the property (to sample from for building the prompts)
+ "tcr binding affinity",
+ "binding affinity",
+ "binding",
+ "epitope binding affinity",
+ "epitope binding",
+ ],
+ },
+ ],
+ "identifiers": [
+ {
+ "id": "epitope_smiles", # column name
+ "type": "SMILES", # can be "SMILES", "SELFIES", "IUPAC", "OTHER"
+ "description": "epitope smiles", # description (optional, except for "OTHER")
+ },
+ {
+ "id": "epitope_aa",
+ "type": "Other",
+ "names": [
+ "epitope amino acid sequence",
+ ],
+ "description": "epitope amino acid sequence",
+ },
+ {
+ "id": "tcr",
+ "type": "Other",
+ "names": [
+ "hypervariable CDR3 loop",
+ ],
+ "description": "hypervariable CDR3 loop",
+ },
+ {
+ "id": "tcr_full",
+ "type": "Other",
+ "names": [
+ "tcr full amino acid sequence",
+ ],
+ "description": "tcr full amino acid sequence",
+ },
+ ],
+ "templates": [
+ {
+ "prompt": "Please answer the following question.\nPredict the for .",
+ "completion": ""
+ }
+ ],
+ "fields": [
+ {
+ "exp_value": [
+ "values": [
+ {
+ "name": "exp_value",
+ "column": "exp_value",
+ "text": "binding"
+ }
+ ]
+ ],
+ "molecule": [
+ "values": [
+ {
+ "name": "epitope_smiles",
+ "column": "epitope_smiles",
+ "text": "SMILES"
+
+ },
+ {
+ "name": "tcr_full",
+ "column": "tcr_full",
+ "text":
+
+ },
+
+ ]
+ ],
+
+ }
+ ],
+
+
+ "license": "CC BY 4.0", # license under which the original dataset was published
+ "links": [ # list of relevant links (original dataset, other uses, etc.)
+ {
+ "url": "https://tdcommons.ai/multi_pred_tasks/tcrepitope/",
+ "description": "original data set link",
+ },
+ {
+ "url": "https://doi.org/10.1093/bioinformatics/btab294",
+ "description": "corresponding publication",
+ },
+ ],
+ "benchmarks": [
+ {
+ "name": "TDC",
+ "link": "https://tdcommons.ai/",
+ "split_column": "split",
+ },
+ ],
+ "num_points": len(df), # number of datapoints in this dataset
+ "bibtex": [
+ """@article{weber2021titan,
+title={TITAN: T-cell receptor specificity prediction with bimodal attention network},
+author={Weber Anna,Born Janis, Martinez Maria Rodriguez},
+journal={Bioinformatics},
+volume={56},
+number={4},
+pages={i237-i234},
+year={2021},
+publisher={Oxford Academic}
+}""",
+ ],
+ }
+
+ def str_presenter(dumper, data):
+ """configures yaml for dumping multiline strings
+ Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
+ """
+
+ if data.count("\n") > 0: # check for multiline string
+ return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+ return dumper.represent_scalar("tag:yaml.org,2002:str", data)
+
+ yaml.add_representer(str, str_presenter)
+ yaml.representer.SafeRepresenter.add_representer(
+ str, str_presenter
+ ) # to use with safe_dum
+ fn_meta = "meta.yaml"
+ with open(fn_meta, "w") as f:
+ yaml.dump(meta, f, sort_keys=False)
+
+ print(f"Finished processing {meta['name']} dataset!")
+
+
+if __name__ == "__main__":  # run as a script: call the transform, which writes data_clean.csv and meta.yaml
+    get_and_transform_data()