From f8ae9fab2e66a61b1598975ab56afe6f2593c076 Mon Sep 17 00:00:00 2001 From: pskvins Date: Thu, 30 Jan 2025 15:49:44 +0900 Subject: [PATCH 1/7] Now foldseek is the default option --- src/util/arg_parser.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/util/arg_parser.rs b/src/util/arg_parser.rs index 1ea97a9..f747a5e 100644 --- a/src/util/arg_parser.rs +++ b/src/util/arg_parser.rs @@ -66,10 +66,10 @@ pub enum Commands { #[arg(short, long, default_value="false")] gpu: bool, /// Use python script instead. hidden option - #[arg(long, default_value="true", hide = true)] + #[arg(long, default_value="false", hide = true)] use_python: bool, /// Use foldseek for createdb. hidden option - #[arg(long, default_value="false", hide = true)] + #[arg(long, default_value="true", hide = true)] use_foldseek: bool, /// Use AFDB lookup for foldseek createdb. Useful for large databases #[arg(long, default_value="false")] @@ -250,10 +250,10 @@ pub enum Commands { #[arg(short, long, default_value="false")] gpu: bool, /// Use python script instead. hidden option - #[arg(long, default_value="true", hide = true)] + #[arg(long, default_value="false", hide = true)] use_python: bool, /// Use foldseek for createdb. hidden option - #[arg(long, default_value="false", hide = true)] + #[arg(long, default_value="true", hide = true)] use_foldseek: bool, /// Use AFDB lookup for foldseek createdb. Useful for large databases #[arg(long, default_value="false")] @@ -318,10 +318,10 @@ pub enum Commands { #[arg(short, long, default_value="false")] gpu: bool, /// Use python script instead. hidden option - #[arg(long, default_value="true", hide = true)] + #[arg(long, default_value="false", hide = true)] use_python: bool, /// Use foldseek for createdb. hidden option - #[arg(long, default_value="false", hide = true)] + #[arg(long, default_value="true", hide = true)] use_foldseek: bool, /// Use AFDB lookup for foldseek createdb. Useful for large databases #[arg(long, default_value="false")] From 9a14ad861e2304bbf9479b2458cb6da744bf79e0 Mon Sep 17 00:00:00 2001 From: pskvins Date: Thu, 30 Jan 2025 15:57:59 +0900 Subject: [PATCH 2/7] Updated README according to foldseek default --- README.md | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 7f7d3b0..97d7ba8 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Kim, D., Park, S., & Steinegger, M. (2024). Unicore enables scalable and accurat - [Unicore](#unicore) - [Quick Start with Conda](#quick-start-with-conda) - [GPU acceleration with CUDA](#gpu-acceleration-with-cuda) - - [GPU acceleration with Foldseek-ProstT5 (beta)](#gpu-acceleration-with-foldseek-prostt5-beta) + - [GPU acceleration with Foldseek-ProstT5](#gpu-acceleration-with-foldseek-prostt5) - [Tutorial](#tutorial) - [Manual](#manual) - [Input](#input) @@ -29,24 +29,15 @@ conda install -c bioconda unicore unicore -v ``` -### GPU acceleration with CUDA -`createdb` module can be greatly acclerated with ProstT5-GPU. -If you have a Linux machine with CUDA-compatible GPU, please install this additional package: -``` -conda install -c conda-forge pytorch-gpu -``` - -### GPU acceleration with Foldseek-ProstT5 (beta) -> Note. This feature is under development and may not work in some environments. We will provide an update after the stable release of Foldseek-ProstT5. - -Foldseek provides a GPU-compatible static binary for ProstT5 prediction (requires Linux with AVX2 support, `glibc` ≥2.29, and `nvidia-driver` ≥525.60.13)
+### GPU acceleration with Foldseek-ProstT5
+Foldseek provides a GPU-compatible static binary for ProstT5 prediction (requires Linux with AVX2 support, `glibc` ≥2.17, and `nvidia-driver` ≥525.60.13).
To use it, please install it by running the following command:
```
wget https://mmseqs.com/foldseek/foldseek-linux-gpu.tar.gz; tar xvfz foldseek-linux-gpu.tar.gz; export PATH=$(pwd)/foldseek/bin/:$PATH
```
Then, add the `--gpu` option to either the `easy-core` or `createdb` module to use the Foldseek implementation of ProstT5-GPU:
```
unicore easy-core --gpu
```
@@ -148,7 +139,10 @@ unicore createdb data db/proteome_db /path/to/prostt5/weights ``` This will create a Foldseek database in the `db` folder. -If you have foldseek installed with CUDA, you can run the ProstT5 in the module with foldseek by adding `--use-foldseek` option. +If you want to select the GPU devices, please use the `CUDA_VISIBLE_DEVICES` environment variable. + +* `CUDA_VISIBLE_DEVICES=0` to use GPU 0. +* `CUDA_VISIBLE_DEVICES=0,1` to use GPU 0 and 1. #### cluster `cluster` module takes a `createdb` output database, runs Foldseek clustering, and outputs the cluster results. @@ -220,8 +214,6 @@ unicore gene-tree --realign --threshold 30 --name /path/to/hashed/gene/names tre * [Foldseek](https://foldseek.com) (version ≥ 9) * [Foldmason](https://foldmason.foldseek.com) * [IQ-TREE](http://www.iqtree.org/) -* pytorch, transformers, sentencepiece, protobuf - - These are required for users who cannot build foldseek with CUDA. Please install them with `pip install torch transformers sentencepiece protobuf`. ### Optional requirements * [MAFFT](https://mafft.cbrc.jp/alignment/software/) * [Fasttree](http://www.microbesonline.org/fasttree/) or [RAxML](https://cme.h-its.org/exelixis/web/software/raxml/) From 8fe9599274ee5c83a276c45f93614f27c28f37d8 Mon Sep 17 00:00:00 2001 From: pskvins Date: Thu, 30 Jan 2025 16:07:36 +0900 Subject: [PATCH 3/7] Removed Rust code lines related to use_python --- src/modules/createdb.rs | 118 +++++++++------------------------------- src/util/arg_parser.rs | 32 +---------- 2 files changed, 27 insertions(+), 123 deletions(-) diff --git a/src/modules/createdb.rs b/src/modules/createdb.rs index b126265..d8dd0d6 100644 --- a/src/modules/createdb.rs +++ b/src/modules/createdb.rs @@ -26,18 +26,11 @@ pub fn run(args: &Args, bin: &var::BinaryPaths) -> Result<(), Box 3, 3 => 2, _ => var::verbosity() }).to_string(); - // Either use_foldseek or use_python must be true - if !use_foldseek && !use_python { - err::error(err::ERR_ARGPARSE, Some("Either use_foldseek or use_python must be true".to_string())); - } - // Check afdb_lookup let afdb_local = if afdb_lookup && !afdb_local.is_some() { err::error(err::ERR_ARGPARSE, Some("afdb-lookup is provided but afdb-local is not given".to_string())); @@ -135,44 +128,37 @@ pub fn run(args: &Args, bin: &var::BinaryPaths) -> Result<(), Box &bin.path, - _none => { err::error(err::ERR_BINARY_NOT_FOUND, Some("foldseek".to_string())); } - }; - - // Check if old weights exist - if Path::new(&model).join("cnn.safetensors").exists() || Path::new(&model).join(format!("model{}cnn.safetensors", SEP)).exists() { - err::error(err::ERR_GENERAL, Some("Old weight files detected from the given path. 
Please provide different path for the model weights".to_string())); - } - // Check if weights exist - if !Path::new(&model).join("prostt5-f16.gguf").exists() { - // Download the model - std::fs::create_dir_all(format!("{}{}tmp", model, SEP))?; - let mut cmd = std::process::Command::new(foldseek_path); - let mut cmd = cmd - .arg("databases").arg("ProstT5").arg(&model).arg(format!("{}{}tmp", model, SEP)).arg("--threads").arg(threads.to_string()); - cmd::run(&mut cmd); - } + // Use foldseek to create the database + let foldseek_path = match &bin.get("foldseek") { + Some(bin) => &bin.path, + _none => { err::error(err::ERR_BINARY_NOT_FOUND, Some("foldseek".to_string())); } + }; - // Run foldseek createdb + // Check if old weights exist + if Path::new(&model).join("cnn.safetensors").exists() || Path::new(&model).join(format!("model{}cnn.safetensors", SEP)).exists() { + err::error(err::ERR_GENERAL, Some("Old weight files detected from the given path. Please provide different path for the model weights".to_string())); + } + // Check if weights exist + if !Path::new(&model).join("prostt5-f16.gguf").exists() { + // Download the model + std::fs::create_dir_all(format!("{}{}tmp", model, SEP))?; let mut cmd = std::process::Command::new(foldseek_path); - let cmd = cmd - .arg("createdb").arg(&combined_aa).arg(&output) - .arg("--prostt5-model").arg(&model) - .arg("--threads").arg(threads.to_string()); - let mut cmd = if gpu { - cmd.arg("--gpu").arg("1") - } else { cmd }; + let mut cmd = cmd + .arg("databases").arg("ProstT5").arg(&model).arg(format!("{}{}tmp", model, SEP)).arg("--threads").arg(threads.to_string()); cmd::run(&mut cmd); - } else if use_python { - let _ = _run_python(&combined_aa, &curr_dir, &parent, &output, &model, keep, bin, threads.to_string()); - } else { - err::error(err::ERR_GENERAL, Some("Either use_foldseek or use_python must be true".to_string())); } + // Run foldseek createdb + let mut cmd = std::process::Command::new(foldseek_path); + let cmd = cmd + .arg("createdb").arg(&combined_aa).arg(&output) + .arg("--prostt5-model").arg(&model) + .arg("--threads").arg(threads.to_string()); + let mut cmd = if gpu { + cmd.arg("--gpu").arg("1") + } else { cmd }; + cmd::run(&mut cmd); + if afdb_lookup { let foldseek_path = match &bin.get("foldseek") { Some(bin) => &bin.path, @@ -221,57 +207,5 @@ pub fn run(args: &Args, bin: &var::BinaryPaths) -> Result<(), Box Result<(), Box> { - let input_3di = format!("{}{}{}{}combined_3di.fasta", curr_dir, SEP, parent, SEP); - let inter_prob = format!("{}{}{}{}output_probabilities.csv", curr_dir, SEP, parent, SEP); - let output_3di = format!("{}{}{}_ss", curr_dir, SEP, output); - let foldseek_verbosity = (match var::verbosity() { 4 => 3, 3 => 2, _ => var::verbosity() }).to_string(); - - // Run python script - let mut cmd = std::process::Command::new("python"); - let mut cmd = cmd - .arg(var::locate_encoder_py()) - .arg("-i").arg(&combined_aa) - .arg("-o").arg(&input_3di) - .arg("--model").arg(&model) - .arg("--half").arg("0") - .arg("--threads").arg(threads); - cmd::run(&mut cmd); - - // Build foldseek db - let foldseek_path = match &bin.get("foldseek") { - Some(bin) => &bin.path, - _none => { err::error(err::ERR_BINARY_NOT_FOUND, Some("foldseek".to_string())); } - }; - let mut cmd = std::process::Command::new(foldseek_path); - let mut cmd = cmd - .arg("base:createdb").arg(&combined_aa).arg(&output) - .arg("--shuffle").arg("0") - .arg("-v").arg(foldseek_verbosity.as_str()); - - cmd::run(&mut cmd); - - // Build foldseek 3di db - let mut cmd = 
std::process::Command::new(foldseek_path); - let mut cmd = cmd - .arg("base:createdb").arg(&input_3di).arg(&output_3di) - .arg("--shuffle").arg("0") - .arg("-v").arg(foldseek_verbosity.as_str()); - cmd::run(&mut cmd); - - // Delete intermediate files - if !keep { - // std::fs::remove_file(mapping_file)?; - // std::fs::remove_file(combined_aa)?; - std::fs::remove_file(input_3di)?; - std::fs::remove_file(inter_prob)?; - } - - // // Write the checkpoint file - // chkpnt::write_checkpoint(&format!("{}/createdb.chk", parent), "1")?; - Ok(()) } diff --git a/src/util/arg_parser.rs b/src/util/arg_parser.rs index f747a5e..d763fcd 100644 --- a/src/util/arg_parser.rs +++ b/src/util/arg_parser.rs @@ -65,12 +65,6 @@ pub enum Commands { /// Use GPU for foldseek createdb #[arg(short, long, default_value="false")] gpu: bool, - /// Use python script instead. hidden option - #[arg(long, default_value="false", hide = true)] - use_python: bool, - /// Use foldseek for createdb. hidden option - #[arg(long, default_value="true", hide = true)] - use_foldseek: bool, /// Use AFDB lookup for foldseek createdb. Useful for large databases #[arg(long, default_value="false")] afdb_lookup: bool, @@ -249,12 +243,6 @@ pub enum Commands { /// Use GPU for foldseek createdb #[arg(short, long, default_value="false")] gpu: bool, - /// Use python script instead. hidden option - #[arg(long, default_value="false", hide = true)] - use_python: bool, - /// Use foldseek for createdb. hidden option - #[arg(long, default_value="true", hide = true)] - use_foldseek: bool, /// Use AFDB lookup for foldseek createdb. Useful for large databases #[arg(long, default_value="false")] afdb_lookup: bool, @@ -317,12 +305,6 @@ pub enum Commands { /// Use GPU for foldseek createdb #[arg(short, long, default_value="false")] gpu: bool, - /// Use python script instead. hidden option - #[arg(long, default_value="false", hide = true)] - use_python: bool, - /// Use foldseek for createdb. hidden option - #[arg(long, default_value="true", hide = true)] - use_foldseek: bool, /// Use AFDB lookup for foldseek createdb. Useful for large databases #[arg(long, default_value="false")] afdb_lookup: bool, @@ -376,8 +358,6 @@ pub struct Args { pub createdb_overwrite: Option, pub createdb_max_len: Option>, pub createdb_gpu: Option, - pub createdb_use_python: Option, - pub createdb_use_foldseek: Option, pub createdb_afdb_lookup: Option, pub createdb_afdb_local: Option>, @@ -480,16 +460,6 @@ impl Args { Some(EasyCore { gpu, .. }) => Some(*gpu), Some(EasySearch { gpu, .. }) => Some(*gpu), _ => None, }; - let createdb_use_python = match &args.command { - Some(Createdb { use_python, .. }) => Some(*use_python), - Some(EasyCore { use_python, .. }) => Some(*use_python), - Some(EasySearch { use_python, .. }) => Some(*use_python), _ => None, - }; - let createdb_use_foldseek = match &args.command { - Some(Createdb { use_foldseek, .. }) => Some(*use_foldseek), - Some(EasyCore { use_foldseek, .. }) => Some(*use_foldseek), - Some(EasySearch { use_foldseek, .. }) => Some(*use_foldseek), _ => None, - }; let createdb_afdb_lookup = match &args.command { Some(Createdb { afdb_lookup, .. }) => Some(*afdb_lookup), Some(EasyCore { afdb_lookup, .. 
}) => Some(*afdb_lookup), @@ -641,7 +611,7 @@ impl Args { Args { command: args.command, version: args.version, threads, verbosity, - createdb_input, createdb_output, createdb_model, createdb_keep, createdb_overwrite, createdb_max_len, createdb_gpu, createdb_use_python, createdb_use_foldseek, createdb_afdb_lookup, createdb_afdb_local, + createdb_input, createdb_output, createdb_model, createdb_keep, createdb_overwrite, createdb_max_len, createdb_gpu, createdb_afdb_lookup, createdb_afdb_local, profile_input_db, profile_input_tsv, profile_output, profile_threshold, profile_print_copiness, search_input, search_target, search_output, search_tmp, search_keep_aln_db, search_search_options, cluster_input, cluster_output, cluster_tmp, cluster_keep_cluster_db, cluster_cluster_options, From 0635bedce426f0645a3902549f6868b18f92dca2 Mon Sep 17 00:00:00 2001 From: pskvins Date: Thu, 30 Jan 2025 16:08:55 +0900 Subject: [PATCH 4/7] removed python script --- src/py/predict_3Di_encoderOnly.py | 442 ------------------------------ 1 file changed, 442 deletions(-) delete mode 100644 src/py/predict_3Di_encoderOnly.py diff --git a/src/py/predict_3Di_encoderOnly.py b/src/py/predict_3Di_encoderOnly.py deleted file mode 100644 index 30544cc..0000000 --- a/src/py/predict_3Di_encoderOnly.py +++ /dev/null @@ -1,442 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Fri Jun 16 14:27:44 2023 - -@author: mheinzinger -""" - -import argparse -import time -from pathlib import Path - -from urllib import request -import shutil - -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from transformers import T5EncoderModel, T5Tokenizer -from tqdm import tqdm - -class bcolors: - HEADER = '\033[95m' - OKBLUE = '\033[94m' - OKCYAN = '\033[96m' - OKGREEN = '\033[92m' - WARNING = '\033[93m' - FAIL = '\033[91m' - ENDC = '\033[0m' - BOLD = '\033[1m' - UNDERLINE = '\033[4m' - - -device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') -print("Using device: {}".format(device)) - - -# Convolutional neural network (two convolutional layers) -class CNN(nn.Module): - def __init__(self): - super(CNN, self).__init__() - - self.classifier = nn.Sequential( - nn.Conv2d(1024, 32, kernel_size=(7, 1), padding=(3, 0)), # 7x32 - nn.ReLU(), - nn.Dropout(0.0), - nn.Conv2d(32, 20, kernel_size=(7, 1), padding=(3, 0)) - ) - - def forward(self, x): - """ - L = protein length - B = batch-size - F = number of features (1024 for embeddings) - N = number of classes (20 for 3Di) - """ - x = x.permute(0, 2, 1).unsqueeze( - dim=-1) # IN: X = (B x L x F); OUT: (B x F x L, 1) - Yhat = self.classifier(x) # OUT: Yhat_consurf = (B x N x L x 1) - Yhat = Yhat.squeeze(dim=-1) # IN: (B x N x L x 1); OUT: ( B x N x L ) - return Yhat - - -def get_T5_model(model_dir): - print("Loading T5 from: {}".format(model_dir)) - model = T5EncoderModel.from_pretrained( - "Rostlab/ProstT5_fp16", cache_dir=model_dir).to(device) - model = model.eval() - vocab = T5Tokenizer.from_pretrained( - "Rostlab/ProstT5_fp16", do_lower_case=False, cache_dir=model_dir, legacy=True) - return model, vocab - - -def read_fasta(fasta_path, split_char, id_field): - ''' - Reads in fasta file containing multiple sequences. - Returns dictionary of holding multiple sequences or only single - sequence, depending on input file. 
- ''' - - sequences = dict() - with open(fasta_path, 'r') as fasta_f: - for line in fasta_f: - # get uniprot ID from header and create new entry - if line.startswith('>'): - uniprot_id = line.replace( - '>', '').strip().split(split_char)[id_field] - # replace tokens that are mis-interpreted when loading h5 - # uniprot_id = uniprot_id.replace("/", "_").replace(".", "_") - sequences[uniprot_id] = '' - else: - s = ''.join(line.split()).replace("-", "") - - if s.islower(): # sanity check to avoid mix-up of 3Di and AA input - print("The input file was in lower-case which indicates 3Di-input." + - "This predictor only operates on amino-acid-input (upper-case)." + - "Exiting now ..." - ) - return None - else: - sequences[uniprot_id] += s - return sequences - - -def write_probs(predictions, out_path, probs_name="output_probabilities.csv"): - out_path = out_path.parent / probs_name - with open(out_path, 'w+') as out_f: - out_f.write('\n'.join( - ["{},{}".format(seq_id, prob) - for seq_id, (N, prob) in predictions.items() - ] - )) - print(f"Finished writing probabilities to {out_path}") - return None - - -def write_predictions(predictions, out_path): - ss_mapping = { - 0: "A", - 1: "C", - 2: "D", - 3: "E", - 4: "F", - 5: "G", - 6: "H", - 7: "I", - 8: "K", - 9: "L", - 10: "M", - 11: "N", - 12: "P", - 13: "Q", - 14: "R", - 15: "S", - 16: "T", - 17: "V", - 18: "W", - 19: "Y" - } - - with open(out_path, 'w+') as out_f: - out_f.write('\n'.join( - [">{}\n{}".format( - seq_id, "".join(list(map(lambda yhat: ss_mapping[int(yhat)], yhats)))) - for seq_id, (yhats, _) in predictions.items() - ] - )) - print(f"Finished writing results to {out_path}") - return None - - -def toCPU(tensor): - if len(tensor.shape) > 1: - return tensor.detach().cpu().squeeze(dim=-1).numpy() - else: - return tensor.detach().cpu().numpy() - - -def download_file(url, local_path): - if not local_path.parent.is_dir(): - local_path.parent.mkdir() - - print("Downloading: {}".format(url)) - req = request.Request(url, headers={ - 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' - }) - - with request.urlopen(req) as response, open(local_path, 'wb') as outfile: - shutil.copyfileobj(response, outfile) - return None - - -def load_predictor(weights_link="https://github.com/mheinzinger/ProstT5/raw/main/cnn_chkpnt/model.pt"): - model = CNN() - checkpoint_p = Path.cwd() / "cnn_chkpnt" / "model.pt" - # if no pre-trained model is available, yet --> download it - if not checkpoint_p.exists(): - download_file(weights_link, checkpoint_p) - - # Torch load will map back to device from state, which often is GPU:0. 
- # to overcome, need to explicitly map to active device - global device - - #state = torch.load(checkpoint_p, map_location=device) - state = torch.load(checkpoint_p, map_location=device, weights_only=True) - - model.load_state_dict(state["state_dict"]) - - model = model.eval() - model = model.to(device) - - return model - - -def get_embeddings(seq_path, out_path, model_dir, split_char, id_field, half_precision, output_probs, probs_name, - max_residues=4000, max_seq_len=1000, max_batch=500): - - seq_dict = dict() - predictions = dict() - - # Read in fasta - seq_dict = read_fasta(seq_path, split_char, id_field) - prefix = "" - - model, vocab = get_T5_model(model_dir) - predictor = load_predictor() - - if half_precision: - model.half() - predictor.half() - print("Using models in half-precision.") - else: - model.to(torch.float32) - predictor.to(torch.float32) - print("Using models in full-precision.") - - print('########################################') - print('Example sequence: {}\n{}'.format(next(iter( - seq_dict.keys())), next(iter(seq_dict.values())))) - print('########################################') - print('Total number of sequences: {}'.format(len(seq_dict))) - - avg_length = sum([len(seq) for _, seq in seq_dict.items()]) / len(seq_dict) - n_long = sum([1 for _, seq in seq_dict.items() if len(seq) > max_seq_len]) - seq_dict_names = list(seq_dict.keys()) - # sort sequences by length to trigger OOM at the beginning - seq_dict = sorted(seq_dict.items(), key=lambda kv: len( - seq_dict[kv[0]]), reverse=True) - - print("Average sequence length: {}".format(avg_length)) - print("Number of sequences >{}: {}".format(max_seq_len, n_long)) - - start = time.time() - batch = list() - standard_aa = "ACDEFGHIKLMNPQRSTVWY" - standard_aa_dict = {aa: aa for aa in standard_aa} - count = 0 - seq_idx = 0 - for (pdb_id, seq) in tqdm(seq_dict): - seq_idx += 1 - # replace the non-standard amino acids with 'X' - seq = ''.join([standard_aa_dict.get(aa, 'X') for aa in seq]) - seq_len = len(seq) - seq = prefix + ' ' + ' '.join(list(seq)) - batch.append((pdb_id, seq, seq_len)) - - # count residues in current batch and add the last sequence length to - # avoid that batches with (n_res_batch > max_residues) get processed - n_res_batch = sum([s_len for _, _, s_len in batch]) + seq_len - - if len(batch) >= max_batch or n_res_batch >= max_residues or seq_idx == len(seq_dict) or seq_len > max_seq_len: - count += len(batch) - pdb_ids, seqs, seq_lens = zip(*batch) - batch = list() - - token_encoding = vocab.batch_encode_plus(seqs, - add_special_tokens=True, - padding="longest", - return_tensors='pt' - ).to(device) - try: - with torch.no_grad(): - embedding_repr = model(token_encoding.input_ids, - attention_mask=token_encoding.attention_mask - ) - except RuntimeError: - print("RuntimeError during embedding for {} (L={})".format( - pdb_id, seq_len) - ) - continue - - # ProtT5 appends a special tokens at the end of each sequence - # Mask this also out during inference while taking into account the prefix - for idx, s_len in enumerate(seq_lens): - token_encoding.attention_mask[idx, s_len+1] = 0 - - # extract last hidden states (=embeddings) - residue_embedding = embedding_repr.last_hidden_state.detach() - # mask out padded elements in the attention output (can be non-zero) for further processing/prediction - residue_embedding = residue_embedding * \ - token_encoding.attention_mask.unsqueeze(dim=-1) - # slice off embedding of special token prepended before to each sequence - residue_embedding = residue_embedding[:, 1:] - 
- # IN: X = (B x L x F) - OUT: ( B x N x L ) - prediction = predictor(residue_embedding) - if output_probs: # compute max probabilities per token/residue if requested - probabilities = toCPU(torch.max( - F.softmax(prediction, dim=1), dim=1, keepdim=True)[0]) - - prediction = toCPU(torch.max(prediction, dim=1, keepdim=True)[ - 1]).astype(np.byte) - - # batch-size x seq_len x embedding_dim - # extra token is added at the end of the seq - for batch_idx, identifier in enumerate(pdb_ids): - s_len = seq_lens[batch_idx] - # slice off padding and special token appended to the end of the sequence - pred = prediction[batch_idx, :, 0:s_len].squeeze() - if output_probs: # average over per-residue max.-probabilities - prob = int( 100* np.mean(probabilities[batch_idx, :, 0:s_len])) - predictions[identifier] = (pred, prob) - else: - predictions[identifier] = (pred, None) - assert s_len == len(predictions[identifier][0]), print( - f"Length mismatch for {identifier}: is:{len(predictions[identifier])} vs should:{s_len}") - if len(predictions) == 1: - print( - f"Example: predicted for protein {identifier} with length {s_len}: {predictions[identifier]}") - # print(f"Batch complete - total {count}") - - - end = time.time() - print('\n############# STATS #############') - print('Total number of predictions: {}'.format(len(predictions))) - print('Total time: {:.2f}[s]; time/prot: {:.4f}[s]; avg. len= {:.2f}'.format( - end-start, (end-start)/len(predictions), avg_length)) - print("Writing results now to disk ...") - - # Sort the prediction as the input fasta file only if the name exists - predictions = {seq_name: predictions[seq_name] - for seq_name in seq_dict_names if seq_name in predictions} - - write_predictions(predictions, out_path) - if output_probs: - write_probs(predictions, out_path, probs_name=probs_name) - - return True - - -def create_arg_parser(): - """"Creates and returns the ArgumentParser object.""" - - # Instantiate the parser - parser = argparse.ArgumentParser(description=( - 'predict_3Di_encoderOnly.py translates amino acid sequences to 3Di sequences. ' + - 'Example: python predict_3Di_encoderOnly.py --input /path/to/some_AA_sequences.fasta --output /path/to/some_3Di_sequences.fasta --model /path/to/tmp/checkpoint/dir') - ) - - # Required positional argument - parser.add_argument('-i', '--input', required=True, type=str, - help='A path to a fasta-formatted text file containing protein sequence(s).') - - # Required positional argument - parser.add_argument('-o', '--output', required=True, type=str, - help='A path for saving the 3Di translations in FASTA format.') - - # Required positional argument - parser.add_argument('--model', required=True, type=str, - help='A path to a directory for saving the checkpoint of the pre-trained model.') - - # Optional argument - parser.add_argument('--split_char', type=str, - default='!', - help='The character for splitting the FASTA header in order to retrieve ' + - "the protein identifier. Should be used in conjunction with --id." + - "Default: '!' ") - - # Optional argument - parser.add_argument('--id', type=int, - default=0, - help='The index for the uniprot identifier field after splitting the ' + - "FASTA header after each symbole in ['|', '#', ':', ' ']." + - 'Default: 0') - - parser.add_argument('--half', type=int, - default=1, - help="Whether to use half_precision or not. Default: 1 (half-precision)") - - parser.add_argument('--output_probs', type=int, - default=1, - help="Whether to output probabilities/reliability. 
Default: 1 (output them).") - - parser.add_argument('--probs_name', type=str, - default="output_probabilities.csv", - help="Name of the file to store the probabilities. Default: output_probabilities.csv") - - parser.add_argument('--threads', type=int, - default=1, - help="Number of threads to use for prediction. Default: 1") - - return parser - - -def main(): - parser = create_arg_parser() - args = parser.parse_args() - - seq_path = Path(args.input) # path to input FASTAS - out_path = Path(args.output) # path where predictions should be written to - model_dir = args.model # path/repo_link to checkpoint - - # Check if seq_path is in fasta format - if not seq_path.is_file(): - print(f"{bcolors.FAIL}{seq_path} is not a file{bcolors.ENDC}") - exit(1) - - # Check if seq_path is empty - if seq_path.stat().st_size == 0: - print(f"{bcolors.FAIL}{seq_path} is empty{bcolors.ENDC}") - exit(1) - - with open(seq_path, 'r') as seq_f: - for line in seq_f: - # Skip all the lines starts with '#' - while line.startswith('#'): - continue - if line.startswith('>'): - break - else: - print(f"{bcolors.FAIL}{seq_path} does not seem to be in FASTA format (doesn't start with '>')\nPlease check your input files. Only files in fasta/fastq[.gz|bz2] are supported{bcolors.ENDC}") - exit(1) - - if out_path.is_file(): - print("Output file is already existing and will be overwritten ...") - - split_char = args.split_char - id_field = args.id - - half_precision = False if int(args.half) == 0 else True - assert not (half_precision and device == torch.device("cpu")), print( - "Running fp16 on CPU is not supported, yet") - - output_probs = False if int(args.output_probs) == 0 else True - - torch.set_num_threads(args.threads) # Set number of threads for torch - - get_embeddings( - seq_path, - out_path, - model_dir, - split_char, - id_field, - half_precision, - output_probs, - args.probs_name - ) - - -if __name__ == '__main__': - main() From f730e3a6a41aa49b099620acf7bd2a1d4c419091 Mon Sep 17 00:00:00 2001 From: pskvins Date: Thu, 30 Jan 2025 16:11:09 +0900 Subject: [PATCH 5/7] Removed python-related lines --- src/envs/variables.rs | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/envs/variables.rs b/src/envs/variables.rs index 577a668..af83a35 100644 --- a/src/envs/variables.rs +++ b/src/envs/variables.rs @@ -74,15 +74,6 @@ pub fn locate_path_cfg() -> String { err::error(err::ERR_GENERAL, Some("Could not locate path.cfg".to_string())); } } -pub fn locate_encoder_py() -> String { - if File::open(format!("{}{}etc{}predict_3Di_encoderOnly.py", parent_dir(), SEP, SEP)).is_ok() { - format!("{}{}etc{}predict_3Di_encoderOnly.py", parent_dir(), SEP, SEP) - } else if File::open(format!("{}{}src{}py{}predict_3Di_encoderOnly.py", src_parent_dir(), SEP, SEP, SEP)).is_ok() { - format!("{}{}src{}py{}predict_3Di_encoderOnly.py", src_parent_dir(), SEP, SEP, SEP) - } else { - err::error(err::ERR_GENERAL, Some("Could not locate path.cfg".to_string())); - } -} // binary paths const VALID_BINARY: [&str; 7] = [ From ab0c43feb50e26b68ccaf0ac66b6d256fc1d83af Mon Sep 17 00:00:00 2001 From: Daniel DW Kim Date: Wed, 5 Feb 2025 15:40:00 +0900 Subject: [PATCH 6/7] update readme to indicate compiled foldseek binary --- README.md | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 97d7ba8..e6694cc 100644 --- a/README.md +++ b/README.md @@ -30,12 +30,13 @@ unicore -v ``` ### GPU acceleration with Foldseek-ProstT5 -Foldseek provides a GPU-compatible static binary for ProstT5 
prediction (requires Linux with AVX2 support, `glibc` ≥2.17, and `nvidia-driver` ≥525.60.13).
-To use it, please install it by running the following command:
-```
-wget https://mmseqs.com/foldseek/foldseek-linux-gpu.tar.gz; tar xvfz foldseek-linux-gpu.tar.gz; export PATH=$(pwd)/foldseek/bin/:$PATH
-```
-Then, add the `--gpu` option to either the `easy-core` or `createdb` module to use the Foldseek implementation of ProstT5-GPU:
+Foldseek features GPU acceleration for ProstT5 prediction under the following requirements:
+ * Turing or newer NVIDIA GPU
+ * `foldseek` ≥10
+ * `glibc` ≥2.17
+ * `nvidia-driver` ≥525.60.13
+
+Apply the `--gpu` option to either the `easy-core` or `createdb` module to use it, e.g.:
```
unicore easy-core --gpu
```
@@ -52,7 +53,7 @@ unzip unicore_example.zip
If you cloned the repository, you can find the example dataset in the `example/data` folder.

### Download ProstT5 weights
-You need to first download the ProstT5 weights to run the `createdb` module.
+You can download the ProstT5 weights required to run the `createdb` module in advance.
```
foldseek databases ProstT5 weights tmp
```
@@ -133,8 +134,6 @@ This module runs much faster with GPU. Please install `cuda` for GPU acceleratio
To run the module, please use the following command:
```
-// Download ProstT5 weights as below if you haven't already
-// foldseek databases ProstT5 /path/to/prostt5/weights tmp
unicore createdb data db/proteome_db /path/to/prostt5/weights
```
This will create a Foldseek database in the `db` folder.
@@ -211,7 +210,7 @@ unicore gene-tree --realign --threshold 30 --name /path/to/hashed/gene/names tre
## Build from Source
### Minimum requirements
* [Cargo](https://www.rust-lang.org/tools/install) (Rust)
-* [Foldseek](https://foldseek.com) (version ≥ 9)
+* [Foldseek](https://foldseek.com) (version ≥ 10)
* [Foldmason](https://foldmason.foldseek.com)
* [IQ-TREE](http://www.iqtree.org/)
### Optional requirements
@@ -232,5 +231,5 @@ With these tools installed, you can install and run `unicore` by:
git clone https://github.com/steineggerlab/unicore.git
cd unicore
cargo build --release
-bin/unicore help
+bin/unicore -v
```

From cd2534d8bfe6e916a93191f091ed6165a4c2a959 Mon Sep 17 00:00:00 2001
From: Daniel DW Kim
Date: Wed, 5 Feb 2025 15:42:22 +0900
Subject: [PATCH 7/7] fix table of contents

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index e6694cc..3580fbe 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,6 @@ Kim, D., Park, S., & Steinegger, M. (2024). Unicore enables scalable and accurat
## Table of Contents
- [Unicore](#unicore)
  - [Quick Start with Conda](#quick-start-with-conda)
-  - [GPU acceleration with CUDA](#gpu-acceleration-with-cuda)
  - [GPU acceleration with Foldseek-ProstT5](#gpu-acceleration-with-foldseek-prostt5)
  - [Tutorial](#tutorial)
  - [Manual](#manual)
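
Taken together, patches 1/7 through 5/7 collapse `createdb` onto a single foldseek-backed ProstT5 path: the hidden `--use-python`/`--use-foldseek` toggles are first flipped and then removed along with the Python encoder script. The sketch below illustrates the clap surface the series converges on. It is a minimal stand-in, assuming clap 4 with the `derive` feature; `Cli` and `Commands` here are illustrative types, not the actual `src/util/arg_parser.rs`, which defines many more options and subcommands. The `#[arg(...)]` attributes are copied from the patterns visible in the diff.

```rust
// Minimal sketch of the post-series argument surface (illustrative only).
// Assumes clap = { version = "4", features = ["derive"] } in Cargo.toml.
use clap::{Parser, Subcommand};

#[derive(Parser)]
struct Cli {
    #[command(subcommand)]
    command: Option<Commands>,
}

#[derive(Subcommand)]
enum Commands {
    /// Create a foldseek database from input proteomes
    Createdb {
        /// Use GPU for foldseek createdb
        #[arg(short, long, default_value = "false")]
        gpu: bool,
        /// Use AFDB lookup for foldseek createdb. Useful for large databases
        #[arg(long, default_value = "false")]
        afdb_lookup: bool,
        // After patch 3/7 there is no hidden `use_python`/`use_foldseek`
        // field left here; foldseek is simply the only code path.
    },
}

fn main() {
    if let Some(Commands::Createdb { gpu, afdb_lookup }) = Cli::parse().command {
        println!("gpu: {gpu}, afdb_lookup: {afdb_lookup}");
    }
}
```

In other words, `unicore createdb --gpu` now selects the foldseek ProstT5-GPU implementation directly, with no extra backend flag required.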