From f8ae9fab2e66a61b1598975ab56afe6f2593c076 Mon Sep 17 00:00:00 2001 From: pskvins Date: Thu, 30 Jan 2025 15:49:44 +0900 Subject: [PATCH 1/7] Now foldseek is the default option --- src/util/arg_parser.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/util/arg_parser.rs b/src/util/arg_parser.rs index 1ea97a9..f747a5e 100644 --- a/src/util/arg_parser.rs +++ b/src/util/arg_parser.rs @@ -66,10 +66,10 @@ pub enum Commands { #[arg(short, long, default_value="false")] gpu: bool, /// Use python script instead. hidden option - #[arg(long, default_value="true", hide = true)] + #[arg(long, default_value="false", hide = true)] use_python: bool, /// Use foldseek for createdb. hidden option - #[arg(long, default_value="false", hide = true)] + #[arg(long, default_value="true", hide = true)] use_foldseek: bool, /// Use AFDB lookup for foldseek createdb. Useful for large databases #[arg(long, default_value="false")] @@ -250,10 +250,10 @@ pub enum Commands { #[arg(short, long, default_value="false")] gpu: bool, /// Use python script instead. hidden option - #[arg(long, default_value="true", hide = true)] + #[arg(long, default_value="false", hide = true)] use_python: bool, /// Use foldseek for createdb. hidden option - #[arg(long, default_value="false", hide = true)] + #[arg(long, default_value="true", hide = true)] use_foldseek: bool, /// Use AFDB lookup for foldseek createdb. Useful for large databases #[arg(long, default_value="false")] @@ -318,10 +318,10 @@ pub enum Commands { #[arg(short, long, default_value="false")] gpu: bool, /// Use python script instead. hidden option - #[arg(long, default_value="true", hide = true)] + #[arg(long, default_value="false", hide = true)] use_python: bool, /// Use foldseek for createdb. hidden option - #[arg(long, default_value="false", hide = true)] + #[arg(long, default_value="true", hide = true)] use_foldseek: bool, /// Use AFDB lookup for foldseek createdb. Useful for large databases #[arg(long, default_value="false")] From 9a14ad861e2304bbf9479b2458cb6da744bf79e0 Mon Sep 17 00:00:00 2001 From: pskvins Date: Thu, 30 Jan 2025 15:57:59 +0900 Subject: [PATCH 2/7] Updated README according to foldseek default --- README.md | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 7f7d3b0..97d7ba8 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Kim, D., Park, S., & Steinegger, M. (2024). Unicore enables scalable and accurat - [Unicore](#unicore) - [Quick Start with Conda](#quick-start-with-conda) - [GPU acceleration with CUDA](#gpu-acceleration-with-cuda) - - [GPU acceleration with Foldseek-ProstT5 (beta)](#gpu-acceleration-with-foldseek-prostt5-beta) + - [GPU acceleration with Foldseek-ProstT5](#gpu-acceleration-with-foldseek-prostt5) - [Tutorial](#tutorial) - [Manual](#manual) - [Input](#input) @@ -29,24 +29,15 @@ conda install -c bioconda unicore unicore -v ``` -### GPU acceleration with CUDA -`createdb` module can be greatly acclerated with ProstT5-GPU. -If you have a Linux machine with CUDA-compatible GPU, please install this additional package: -``` -conda install -c conda-forge pytorch-gpu -``` - -### GPU acceleration with Foldseek-ProstT5 (beta) -> Note. This feature is under development and may not work in some environments. We will provide an update after the stable release of Foldseek-ProstT5. - -Foldseek provides a GPU-compatible static binary for ProstT5 prediction (requires Linux with AVX2 support, `glibc` ≥2.29, and `nvidia-driver` ≥525.60.13)
+### GPU acceleration with Foldseek-ProstT5
+Foldseek provides a GPU-compatible static binary for ProstT5 prediction (requires Linux with AVX2 support, `glibc` ≥2.17, and `nvidia-driver` ≥525.60.13).
To use it, please install it by running the following command:
```
wget https://mmseqs.com/foldseek/foldseek-linux-gpu.tar.gz; tar xvfz foldseek-linux-gpu.tar.gz; export PATH=$(pwd)/foldseek/bin/:$PATH
```
Then, add the `--gpu` option to either the `easy-core` or `createdb` module to use the Foldseek implementation of ProstT5-GPU:
```
unicore easy-core --gpu
```
@@ -148,7 +139,10 @@ unicore createdb data db/proteome_db /path/to/prostt5/weights ``` This will create a Foldseek database in the `db` folder. -If you have foldseek installed with CUDA, you can run the ProstT5 in the module with foldseek by adding `--use-foldseek` option. +If you want to select the GPU devices, please use the `CUDA_VISIBLE_DEVICES` environment variable. + +* `CUDA_VISIBLE_DEVICES=0` to use GPU 0. +* `CUDA_VISIBLE_DEVICES=0,1` to use GPU 0 and 1. #### cluster `cluster` module takes a `createdb` output database, runs Foldseek clustering, and outputs the cluster results. @@ -220,8 +214,6 @@ unicore gene-tree --realign --threshold 30 --name /path/to/hashed/gene/names tre * [Foldseek](https://foldseek.com) (version ≥ 9) * [Foldmason](https://foldmason.foldseek.com) * [IQ-TREE](http://www.iqtree.org/) -* pytorch, transformers, sentencepiece, protobuf - - These are required for users who cannot build foldseek with CUDA. Please install them with `pip install torch transformers sentencepiece protobuf`. ### Optional requirements * [MAFFT](https://mafft.cbrc.jp/alignment/software/) * [Fasttree](http://www.microbesonline.org/fasttree/) or [RAxML](https://cme.h-its.org/exelixis/web/software/raxml/) From 8fe9599274ee5c83a276c45f93614f27c28f37d8 Mon Sep 17 00:00:00 2001 From: pskvins Date: Thu, 30 Jan 2025 16:07:36 +0900 Subject: [PATCH 3/7] Removed Rust code lines related to use_python --- src/modules/createdb.rs | 118 +++++++++------------------------------- src/util/arg_parser.rs | 32 +---------- 2 files changed, 27 insertions(+), 123 deletions(-) diff --git a/src/modules/createdb.rs b/src/modules/createdb.rs index b126265..d8dd0d6 100644 --- a/src/modules/createdb.rs +++ b/src/modules/createdb.rs @@ -26,18 +26,11 @@ pub fn run(args: &Args, bin: &var::BinaryPaths) -> Result<(), Box 3, 3 => 2, _ => var::verbosity() }).to_string(); - // Either use_foldseek or use_python must be true - if !use_foldseek && !use_python { - err::error(err::ERR_ARGPARSE, Some("Either use_foldseek or use_python must be true".to_string())); - } - // Check afdb_lookup let afdb_local = if afdb_lookup && !afdb_local.is_some() { err::error(err::ERR_ARGPARSE, Some("afdb-lookup is provided but afdb-local is not given".to_string())); @@ -135,44 +128,37 @@ pub fn run(args: &Args, bin: &var::BinaryPaths) -> Result<(), Box &bin.path, - _none => { err::error(err::ERR_BINARY_NOT_FOUND, Some("foldseek".to_string())); } - }; - - // Check if old weights exist - if Path::new(&model).join("cnn.safetensors").exists() || Path::new(&model).join(format!("model{}cnn.safetensors", SEP)).exists() { - err::error(err::ERR_GENERAL, Some("Old weight files detected from the given path. 
Please provide different path for the model weights".to_string())); - } - // Check if weights exist - if !Path::new(&model).join("prostt5-f16.gguf").exists() { - // Download the model - std::fs::create_dir_all(format!("{}{}tmp", model, SEP))?; - let mut cmd = std::process::Command::new(foldseek_path); - let mut cmd = cmd - .arg("databases").arg("ProstT5").arg(&model).arg(format!("{}{}tmp", model, SEP)).arg("--threads").arg(threads.to_string()); - cmd::run(&mut cmd); - } + // Use foldseek to create the database + let foldseek_path = match &bin.get("foldseek") { + Some(bin) => &bin.path, + _none => { err::error(err::ERR_BINARY_NOT_FOUND, Some("foldseek".to_string())); } + }; - // Run foldseek createdb + // Check if old weights exist + if Path::new(&model).join("cnn.safetensors").exists() || Path::new(&model).join(format!("model{}cnn.safetensors", SEP)).exists() { + err::error(err::ERR_GENERAL, Some("Old weight files detected from the given path. Please provide different path for the model weights".to_string())); + } + // Check if weights exist + if !Path::new(&model).join("prostt5-f16.gguf").exists() { + // Download the model + std::fs::create_dir_all(format!("{}{}tmp", model, SEP))?; let mut cmd = std::process::Command::new(foldseek_path); - let cmd = cmd - .arg("createdb").arg(&combined_aa).arg(&output) - .arg("--prostt5-model").arg(&model) - .arg("--threads").arg(threads.to_string()); - let mut cmd = if gpu { - cmd.arg("--gpu").arg("1") - } else { cmd }; + let mut cmd = cmd + .arg("databases").arg("ProstT5").arg(&model).arg(format!("{}{}tmp", model, SEP)).arg("--threads").arg(threads.to_string()); cmd::run(&mut cmd); - } else if use_python { - let _ = _run_python(&combined_aa, &curr_dir, &parent, &output, &model, keep, bin, threads.to_string()); - } else { - err::error(err::ERR_GENERAL, Some("Either use_foldseek or use_python must be true".to_string())); } + // Run foldseek createdb + let mut cmd = std::process::Command::new(foldseek_path); + let cmd = cmd + .arg("createdb").arg(&combined_aa).arg(&output) + .arg("--prostt5-model").arg(&model) + .arg("--threads").arg(threads.to_string()); + let mut cmd = if gpu { + cmd.arg("--gpu").arg("1") + } else { cmd }; + cmd::run(&mut cmd); + if afdb_lookup { let foldseek_path = match &bin.get("foldseek") { Some(bin) => &bin.path, @@ -221,57 +207,5 @@ pub fn run(args: &Args, bin: &var::BinaryPaths) -> Result<(), Box Result<(), Box> { - let input_3di = format!("{}{}{}{}combined_3di.fasta", curr_dir, SEP, parent, SEP); - let inter_prob = format!("{}{}{}{}output_probabilities.csv", curr_dir, SEP, parent, SEP); - let output_3di = format!("{}{}{}_ss", curr_dir, SEP, output); - let foldseek_verbosity = (match var::verbosity() { 4 => 3, 3 => 2, _ => var::verbosity() }).to_string(); - - // Run python script - let mut cmd = std::process::Command::new("python"); - let mut cmd = cmd - .arg(var::locate_encoder_py()) - .arg("-i").arg(&combined_aa) - .arg("-o").arg(&input_3di) - .arg("--model").arg(&model) - .arg("--half").arg("0") - .arg("--threads").arg(threads); - cmd::run(&mut cmd); - - // Build foldseek db - let foldseek_path = match &bin.get("foldseek") { - Some(bin) => &bin.path, - _none => { err::error(err::ERR_BINARY_NOT_FOUND, Some("foldseek".to_string())); } - }; - let mut cmd = std::process::Command::new(foldseek_path); - let mut cmd = cmd - .arg("base:createdb").arg(&combined_aa).arg(&output) - .arg("--shuffle").arg("0") - .arg("-v").arg(foldseek_verbosity.as_str()); - - cmd::run(&mut cmd); - - // Build foldseek 3di db - let mut cmd = 
std::process::Command::new(foldseek_path); - let mut cmd = cmd - .arg("base:createdb").arg(&input_3di).arg(&output_3di) - .arg("--shuffle").arg("0") - .arg("-v").arg(foldseek_verbosity.as_str()); - cmd::run(&mut cmd); - - // Delete intermediate files - if !keep { - // std::fs::remove_file(mapping_file)?; - // std::fs::remove_file(combined_aa)?; - std::fs::remove_file(input_3di)?; - std::fs::remove_file(inter_prob)?; - } - - // // Write the checkpoint file - // chkpnt::write_checkpoint(&format!("{}/createdb.chk", parent), "1")?; - Ok(()) } diff --git a/src/util/arg_parser.rs b/src/util/arg_parser.rs index f747a5e..d763fcd 100644 --- a/src/util/arg_parser.rs +++ b/src/util/arg_parser.rs @@ -65,12 +65,6 @@ pub enum Commands { /// Use GPU for foldseek createdb #[arg(short, long, default_value="false")] gpu: bool, - /// Use python script instead. hidden option - #[arg(long, default_value="false", hide = true)] - use_python: bool, - /// Use foldseek for createdb. hidden option - #[arg(long, default_value="true", hide = true)] - use_foldseek: bool, /// Use AFDB lookup for foldseek createdb. Useful for large databases #[arg(long, default_value="false")] afdb_lookup: bool, @@ -249,12 +243,6 @@ pub enum Commands { /// Use GPU for foldseek createdb #[arg(short, long, default_value="false")] gpu: bool, - /// Use python script instead. hidden option - #[arg(long, default_value="false", hide = true)] - use_python: bool, - /// Use foldseek for createdb. hidden option - #[arg(long, default_value="true", hide = true)] - use_foldseek: bool, /// Use AFDB lookup for foldseek createdb. Useful for large databases #[arg(long, default_value="false")] afdb_lookup: bool, @@ -317,12 +305,6 @@ pub enum Commands { /// Use GPU for foldseek createdb #[arg(short, long, default_value="false")] gpu: bool, - /// Use python script instead. hidden option - #[arg(long, default_value="false", hide = true)] - use_python: bool, - /// Use foldseek for createdb. hidden option - #[arg(long, default_value="true", hide = true)] - use_foldseek: bool, /// Use AFDB lookup for foldseek createdb. Useful for large databases #[arg(long, default_value="false")] afdb_lookup: bool, @@ -376,8 +358,6 @@ pub struct Args { pub createdb_overwrite: Option, pub createdb_max_len: Option>, pub createdb_gpu: Option, - pub createdb_use_python: Option, - pub createdb_use_foldseek: Option, pub createdb_afdb_lookup: Option, pub createdb_afdb_local: Option>, @@ -480,16 +460,6 @@ impl Args { Some(EasyCore { gpu, .. }) => Some(*gpu), Some(EasySearch { gpu, .. }) => Some(*gpu), _ => None, }; - let createdb_use_python = match &args.command { - Some(Createdb { use_python, .. }) => Some(*use_python), - Some(EasyCore { use_python, .. }) => Some(*use_python), - Some(EasySearch { use_python, .. }) => Some(*use_python), _ => None, - }; - let createdb_use_foldseek = match &args.command { - Some(Createdb { use_foldseek, .. }) => Some(*use_foldseek), - Some(EasyCore { use_foldseek, .. }) => Some(*use_foldseek), - Some(EasySearch { use_foldseek, .. }) => Some(*use_foldseek), _ => None, - }; let createdb_afdb_lookup = match &args.command { Some(Createdb { afdb_lookup, .. }) => Some(*afdb_lookup), Some(EasyCore { afdb_lookup, .. 
}) => Some(*afdb_lookup), @@ -641,7 +611,7 @@ impl Args { Args { command: args.command, version: args.version, threads, verbosity, - createdb_input, createdb_output, createdb_model, createdb_keep, createdb_overwrite, createdb_max_len, createdb_gpu, createdb_use_python, createdb_use_foldseek, createdb_afdb_lookup, createdb_afdb_local, + createdb_input, createdb_output, createdb_model, createdb_keep, createdb_overwrite, createdb_max_len, createdb_gpu, createdb_afdb_lookup, createdb_afdb_local, profile_input_db, profile_input_tsv, profile_output, profile_threshold, profile_print_copiness, search_input, search_target, search_output, search_tmp, search_keep_aln_db, search_search_options, cluster_input, cluster_output, cluster_tmp, cluster_keep_cluster_db, cluster_cluster_options, From 0635bedce426f0645a3902549f6868b18f92dca2 Mon Sep 17 00:00:00 2001 From: pskvins Date: Thu, 30 Jan 2025 16:08:55 +0900 Subject: [PATCH 4/7] removed python script --- src/py/predict_3Di_encoderOnly.py | 442 ------------------------------ 1 file changed, 442 deletions(-) delete mode 100644 src/py/predict_3Di_encoderOnly.py diff --git a/src/py/predict_3Di_encoderOnly.py b/src/py/predict_3Di_encoderOnly.py deleted file mode 100644 index 30544cc..0000000 --- a/src/py/predict_3Di_encoderOnly.py +++ /dev/null @@ -1,442 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Fri Jun 16 14:27:44 2023 - -@author: mheinzinger -""" - -import argparse -import time -from pathlib import Path - -from urllib import request -import shutil - -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from transformers import T5EncoderModel, T5Tokenizer -from tqdm import tqdm - -class bcolors: - HEADER = '\033[95m' - OKBLUE = '\033[94m' - OKCYAN = '\033[96m' - OKGREEN = '\033[92m' - WARNING = '\033[93m' - FAIL = '\033[91m' - ENDC = '\033[0m' - BOLD = '\033[1m' - UNDERLINE = '\033[4m' - - -device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') -print("Using device: {}".format(device)) - - -# Convolutional neural network (two convolutional layers) -class CNN(nn.Module): - def __init__(self): - super(CNN, self).__init__() - - self.classifier = nn.Sequential( - nn.Conv2d(1024, 32, kernel_size=(7, 1), padding=(3, 0)), # 7x32 - nn.ReLU(), - nn.Dropout(0.0), - nn.Conv2d(32, 20, kernel_size=(7, 1), padding=(3, 0)) - ) - - def forward(self, x): - """ - L = protein length - B = batch-size - F = number of features (1024 for embeddings) - N = number of classes (20 for 3Di) - """ - x = x.permute(0, 2, 1).unsqueeze( - dim=-1) # IN: X = (B x L x F); OUT: (B x F x L, 1) - Yhat = self.classifier(x) # OUT: Yhat_consurf = (B x N x L x 1) - Yhat = Yhat.squeeze(dim=-1) # IN: (B x N x L x 1); OUT: ( B x N x L ) - return Yhat - - -def get_T5_model(model_dir): - print("Loading T5 from: {}".format(model_dir)) - model = T5EncoderModel.from_pretrained( - "Rostlab/ProstT5_fp16", cache_dir=model_dir).to(device) - model = model.eval() - vocab = T5Tokenizer.from_pretrained( - "Rostlab/ProstT5_fp16", do_lower_case=False, cache_dir=model_dir, legacy=True) - return model, vocab - - -def read_fasta(fasta_path, split_char, id_field): - ''' - Reads in fasta file containing multiple sequences. - Returns dictionary of holding multiple sequences or only single - sequence, depending on input file. 
- ''' - - sequences = dict() - with open(fasta_path, 'r') as fasta_f: - for line in fasta_f: - # get uniprot ID from header and create new entry - if line.startswith('>'): - uniprot_id = line.replace( - '>', '').strip().split(split_char)[id_field] - # replace tokens that are mis-interpreted when loading h5 - # uniprot_id = uniprot_id.replace("/", "_").replace(".", "_") - sequences[uniprot_id] = '' - else: - s = ''.join(line.split()).replace("-", "") - - if s.islower(): # sanity check to avoid mix-up of 3Di and AA input - print("The input file was in lower-case which indicates 3Di-input." + - "This predictor only operates on amino-acid-input (upper-case)." + - "Exiting now ..." - ) - return None - else: - sequences[uniprot_id] += s - return sequences - - -def write_probs(predictions, out_path, probs_name="output_probabilities.csv"): - out_path = out_path.parent / probs_name - with open(out_path, 'w+') as out_f: - out_f.write('\n'.join( - ["{},{}".format(seq_id, prob) - for seq_id, (N, prob) in predictions.items() - ] - )) - print(f"Finished writing probabilities to {out_path}") - return None - - -def write_predictions(predictions, out_path): - ss_mapping = { - 0: "A", - 1: "C", - 2: "D", - 3: "E", - 4: "F", - 5: "G", - 6: "H", - 7: "I", - 8: "K", - 9: "L", - 10: "M", - 11: "N", - 12: "P", - 13: "Q", - 14: "R", - 15: "S", - 16: "T", - 17: "V", - 18: "W", - 19: "Y" - } - - with open(out_path, 'w+') as out_f: - out_f.write('\n'.join( - [">{}\n{}".format( - seq_id, "".join(list(map(lambda yhat: ss_mapping[int(yhat)], yhats)))) - for seq_id, (yhats, _) in predictions.items() - ] - )) - print(f"Finished writing results to {out_path}") - return None - - -def toCPU(tensor): - if len(tensor.shape) > 1: - return tensor.detach().cpu().squeeze(dim=-1).numpy() - else: - return tensor.detach().cpu().numpy() - - -def download_file(url, local_path): - if not local_path.parent.is_dir(): - local_path.parent.mkdir() - - print("Downloading: {}".format(url)) - req = request.Request(url, headers={ - 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' - }) - - with request.urlopen(req) as response, open(local_path, 'wb') as outfile: - shutil.copyfileobj(response, outfile) - return None - - -def load_predictor(weights_link="https://github.com/mheinzinger/ProstT5/raw/main/cnn_chkpnt/model.pt"): - model = CNN() - checkpoint_p = Path.cwd() / "cnn_chkpnt" / "model.pt" - # if no pre-trained model is available, yet --> download it - if not checkpoint_p.exists(): - download_file(weights_link, checkpoint_p) - - # Torch load will map back to device from state, which often is GPU:0. 
- # to overcome, need to explicitly map to active device - global device - - #state = torch.load(checkpoint_p, map_location=device) - state = torch.load(checkpoint_p, map_location=device, weights_only=True) - - model.load_state_dict(state["state_dict"]) - - model = model.eval() - model = model.to(device) - - return model - - -def get_embeddings(seq_path, out_path, model_dir, split_char, id_field, half_precision, output_probs, probs_name, - max_residues=4000, max_seq_len=1000, max_batch=500): - - seq_dict = dict() - predictions = dict() - - # Read in fasta - seq_dict = read_fasta(seq_path, split_char, id_field) - prefix = "" - - model, vocab = get_T5_model(model_dir) - predictor = load_predictor() - - if half_precision: - model.half() - predictor.half() - print("Using models in half-precision.") - else: - model.to(torch.float32) - predictor.to(torch.float32) - print("Using models in full-precision.") - - print('########################################') - print('Example sequence: {}\n{}'.format(next(iter( - seq_dict.keys())), next(iter(seq_dict.values())))) - print('########################################') - print('Total number of sequences: {}'.format(len(seq_dict))) - - avg_length = sum([len(seq) for _, seq in seq_dict.items()]) / len(seq_dict) - n_long = sum([1 for _, seq in seq_dict.items() if len(seq) > max_seq_len]) - seq_dict_names = list(seq_dict.keys()) - # sort sequences by length to trigger OOM at the beginning - seq_dict = sorted(seq_dict.items(), key=lambda kv: len( - seq_dict[kv[0]]), reverse=True) - - print("Average sequence length: {}".format(avg_length)) - print("Number of sequences >{}: {}".format(max_seq_len, n_long)) - - start = time.time() - batch = list() - standard_aa = "ACDEFGHIKLMNPQRSTVWY" - standard_aa_dict = {aa: aa for aa in standard_aa} - count = 0 - seq_idx = 0 - for (pdb_id, seq) in tqdm(seq_dict): - seq_idx += 1 - # replace the non-standard amino acids with 'X' - seq = ''.join([standard_aa_dict.get(aa, 'X') for aa in seq]) - seq_len = len(seq) - seq = prefix + ' ' + ' '.join(list(seq)) - batch.append((pdb_id, seq, seq_len)) - - # count residues in current batch and add the last sequence length to - # avoid that batches with (n_res_batch > max_residues) get processed - n_res_batch = sum([s_len for _, _, s_len in batch]) + seq_len - - if len(batch) >= max_batch or n_res_batch >= max_residues or seq_idx == len(seq_dict) or seq_len > max_seq_len: - count += len(batch) - pdb_ids, seqs, seq_lens = zip(*batch) - batch = list() - - token_encoding = vocab.batch_encode_plus(seqs, - add_special_tokens=True, - padding="longest", - return_tensors='pt' - ).to(device) - try: - with torch.no_grad(): - embedding_repr = model(token_encoding.input_ids, - attention_mask=token_encoding.attention_mask - ) - except RuntimeError: - print("RuntimeError during embedding for {} (L={})".format( - pdb_id, seq_len) - ) - continue - - # ProtT5 appends a special tokens at the end of each sequence - # Mask this also out during inference while taking into account the prefix - for idx, s_len in enumerate(seq_lens): - token_encoding.attention_mask[idx, s_len+1] = 0 - - # extract last hidden states (=embeddings) - residue_embedding = embedding_repr.last_hidden_state.detach() - # mask out padded elements in the attention output (can be non-zero) for further processing/prediction - residue_embedding = residue_embedding * \ - token_encoding.attention_mask.unsqueeze(dim=-1) - # slice off embedding of special token prepended before to each sequence - residue_embedding = residue_embedding[:, 1:] - 
- # IN: X = (B x L x F) - OUT: ( B x N x L ) - prediction = predictor(residue_embedding) - if output_probs: # compute max probabilities per token/residue if requested - probabilities = toCPU(torch.max( - F.softmax(prediction, dim=1), dim=1, keepdim=True)[0]) - - prediction = toCPU(torch.max(prediction, dim=1, keepdim=True)[ - 1]).astype(np.byte) - - # batch-size x seq_len x embedding_dim - # extra token is added at the end of the seq - for batch_idx, identifier in enumerate(pdb_ids): - s_len = seq_lens[batch_idx] - # slice off padding and special token appended to the end of the sequence - pred = prediction[batch_idx, :, 0:s_len].squeeze() - if output_probs: # average over per-residue max.-probabilities - prob = int( 100* np.mean(probabilities[batch_idx, :, 0:s_len])) - predictions[identifier] = (pred, prob) - else: - predictions[identifier] = (pred, None) - assert s_len == len(predictions[identifier][0]), print( - f"Length mismatch for {identifier}: is:{len(predictions[identifier])} vs should:{s_len}") - if len(predictions) == 1: - print( - f"Example: predicted for protein {identifier} with length {s_len}: {predictions[identifier]}") - # print(f"Batch complete - total {count}") - - - end = time.time() - print('\n############# STATS #############') - print('Total number of predictions: {}'.format(len(predictions))) - print('Total time: {:.2f}[s]; time/prot: {:.4f}[s]; avg. len= {:.2f}'.format( - end-start, (end-start)/len(predictions), avg_length)) - print("Writing results now to disk ...") - - # Sort the prediction as the input fasta file only if the name exists - predictions = {seq_name: predictions[seq_name] - for seq_name in seq_dict_names if seq_name in predictions} - - write_predictions(predictions, out_path) - if output_probs: - write_probs(predictions, out_path, probs_name=probs_name) - - return True - - -def create_arg_parser(): - """"Creates and returns the ArgumentParser object.""" - - # Instantiate the parser - parser = argparse.ArgumentParser(description=( - 'predict_3Di_encoderOnly.py translates amino acid sequences to 3Di sequences. ' + - 'Example: python predict_3Di_encoderOnly.py --input /path/to/some_AA_sequences.fasta --output /path/to/some_3Di_sequences.fasta --model /path/to/tmp/checkpoint/dir') - ) - - # Required positional argument - parser.add_argument('-i', '--input', required=True, type=str, - help='A path to a fasta-formatted text file containing protein sequence(s).') - - # Required positional argument - parser.add_argument('-o', '--output', required=True, type=str, - help='A path for saving the 3Di translations in FASTA format.') - - # Required positional argument - parser.add_argument('--model', required=True, type=str, - help='A path to a directory for saving the checkpoint of the pre-trained model.') - - # Optional argument - parser.add_argument('--split_char', type=str, - default='!', - help='The character for splitting the FASTA header in order to retrieve ' + - "the protein identifier. Should be used in conjunction with --id." + - "Default: '!' ") - - # Optional argument - parser.add_argument('--id', type=int, - default=0, - help='The index for the uniprot identifier field after splitting the ' + - "FASTA header after each symbole in ['|', '#', ':', ' ']." + - 'Default: 0') - - parser.add_argument('--half', type=int, - default=1, - help="Whether to use half_precision or not. Default: 1 (half-precision)") - - parser.add_argument('--output_probs', type=int, - default=1, - help="Whether to output probabilities/reliability. 
Default: 1 (output them).") - - parser.add_argument('--probs_name', type=str, - default="output_probabilities.csv", - help="Name of the file to store the probabilities. Default: output_probabilities.csv") - - parser.add_argument('--threads', type=int, - default=1, - help="Number of threads to use for prediction. Default: 1") - - return parser - - -def main(): - parser = create_arg_parser() - args = parser.parse_args() - - seq_path = Path(args.input) # path to input FASTAS - out_path = Path(args.output) # path where predictions should be written to - model_dir = args.model # path/repo_link to checkpoint - - # Check if seq_path is in fasta format - if not seq_path.is_file(): - print(f"{bcolors.FAIL}{seq_path} is not a file{bcolors.ENDC}") - exit(1) - - # Check if seq_path is empty - if seq_path.stat().st_size == 0: - print(f"{bcolors.FAIL}{seq_path} is empty{bcolors.ENDC}") - exit(1) - - with open(seq_path, 'r') as seq_f: - for line in seq_f: - # Skip all the lines starts with '#' - while line.startswith('#'): - continue - if line.startswith('>'): - break - else: - print(f"{bcolors.FAIL}{seq_path} does not seem to be in FASTA format (doesn't start with '>')\nPlease check your input files. Only files in fasta/fastq[.gz|bz2] are supported{bcolors.ENDC}") - exit(1) - - if out_path.is_file(): - print("Output file is already existing and will be overwritten ...") - - split_char = args.split_char - id_field = args.id - - half_precision = False if int(args.half) == 0 else True - assert not (half_precision and device == torch.device("cpu")), print( - "Running fp16 on CPU is not supported, yet") - - output_probs = False if int(args.output_probs) == 0 else True - - torch.set_num_threads(args.threads) # Set number of threads for torch - - get_embeddings( - seq_path, - out_path, - model_dir, - split_char, - id_field, - half_precision, - output_probs, - args.probs_name - ) - - -if __name__ == '__main__': - main() From f730e3a6a41aa49b099620acf7bd2a1d4c419091 Mon Sep 17 00:00:00 2001 From: pskvins Date: Thu, 30 Jan 2025 16:11:09 +0900 Subject: [PATCH 5/7] Removed python-related lines --- src/envs/variables.rs | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/envs/variables.rs b/src/envs/variables.rs index 577a668..af83a35 100644 --- a/src/envs/variables.rs +++ b/src/envs/variables.rs @@ -74,15 +74,6 @@ pub fn locate_path_cfg() -> String { err::error(err::ERR_GENERAL, Some("Could not locate path.cfg".to_string())); } } -pub fn locate_encoder_py() -> String { - if File::open(format!("{}{}etc{}predict_3Di_encoderOnly.py", parent_dir(), SEP, SEP)).is_ok() { - format!("{}{}etc{}predict_3Di_encoderOnly.py", parent_dir(), SEP, SEP) - } else if File::open(format!("{}{}src{}py{}predict_3Di_encoderOnly.py", src_parent_dir(), SEP, SEP, SEP)).is_ok() { - format!("{}{}src{}py{}predict_3Di_encoderOnly.py", src_parent_dir(), SEP, SEP, SEP) - } else { - err::error(err::ERR_GENERAL, Some("Could not locate path.cfg".to_string())); - } -} // binary paths const VALID_BINARY: [&str; 7] = [ From ab0c43feb50e26b68ccaf0ac66b6d256fc1d83af Mon Sep 17 00:00:00 2001 From: Daniel DW Kim Date: Wed, 5 Feb 2025 15:40:00 +0900 Subject: [PATCH 6/7] update readme to indicate compiled foldseek binary --- README.md | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 97d7ba8..e6694cc 100644 --- a/README.md +++ b/README.md @@ -30,12 +30,13 @@ unicore -v ``` ### GPU acceleration with Foldseek-ProstT5 -Foldseek provides a GPU-compatible static binary for ProstT5 
prediction (requires Linux with AVX2 support, `glibc` ≥2.17, and `nvidia-driver` ≥525.60.13).
-To use it, please install it by running the following command:
-```
-wget https://mmseqs.com/foldseek/foldseek-linux-gpu.tar.gz; tar xvfz foldseek-linux-gpu.tar.gz; export PATH=$(pwd)/foldseek/bin/:$PATH
-```
-Then, add the `--gpu` option to either the `easy-core` or `createdb` module to use the Foldseek implementation of ProstT5-GPU:
+Foldseek features GPU acceleration for ProstT5 prediction under the following requirements:
+ * Turing or newer NVIDIA GPU
+ * `foldseek` ≥10
+ * `glibc` ≥2.17
+ * `nvidia-driver` ≥525.60.13
+
+Apply the `--gpu` option to either the `easy-core` or `createdb` module to use it, e.g.:
```
unicore easy-core --gpu
```
@@ -52,7 +53,7 @@ unzip unicore_example.zip
If you cloned the repository, you can find the example dataset in the `example/data` folder.

### Download ProstT5 weights
-You need to first download the ProstT5 weights to run the `createdb` module.
+You can download the ProstT5 weights required to run the `createdb` module in advance.
```
foldseek databases ProstT5 weights tmp
```
@@ -133,8 +134,6 @@ This module runs much faster with GPU. Please install `cuda` for GPU acceleratio
To run the module, please use the following command:
```
-// Download ProstT5 weights as below if you haven't already
-// foldseek databases ProstT5 /path/to/prostt5/weights tmp
unicore createdb data db/proteome_db /path/to/prostt5/weights
```
This will create a Foldseek database in the `db` folder.
@@ -211,7 +210,7 @@ unicore gene-tree --realign --threshold 30 --name /path/to/hashed/gene/names tre
## Build from Source
### Minimum requirements
* [Cargo](https://www.rust-lang.org/tools/install) (Rust)
-* [Foldseek](https://foldseek.com) (version ≥ 9)
+* [Foldseek](https://foldseek.com) (version ≥ 10)
* [Foldmason](https://foldmason.foldseek.com)
* [IQ-TREE](http://www.iqtree.org/)
### Optional requirements
@@ -232,5 +231,5 @@ With these tools installed, you can install and run `unicore` by:
git clone https://github.com/steineggerlab/unicore.git
cd unicore
cargo build --release
-bin/unicore help
+bin/unicore -v
```

From cd2534d8bfe6e916a93191f091ed6165a4c2a959 Mon Sep 17 00:00:00 2001
From: Daniel DW Kim
Date: Wed, 5 Feb 2025 15:42:22 +0900
Subject: [PATCH 7/7] fix table of contents

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index e6694cc..3580fbe 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,6 @@ Kim, D., Park, S., & Steinegger, M. (2024). Unicore enables scalable and accurat
## Table of Contents
- [Unicore](#unicore)
  - [Quick Start with Conda](#quick-start-with-conda)
-  - [GPU acceleration with CUDA](#gpu-acceleration-with-cuda)
  - [GPU acceleration with Foldseek-ProstT5](#gpu-acceleration-with-foldseek-prostt5)
  - [Tutorial](#tutorial)
  - [Manual](#manual)
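
Taken together, patches 1/7 through 5/7 collapse `createdb` onto a single foldseek-backed ProstT5 path: the hidden `--use-python`/`--use-foldseek` toggles are first flipped and then removed along with the Python encoder script. The sketch below illustrates the clap surface the series converges on. It is a minimal stand-in, assuming clap 4 with the `derive` feature; `Cli` and `Commands` here are illustrative types, not the actual `src/util/arg_parser.rs`, which defines many more options and subcommands. The `#[arg(...)]` attributes are copied from the patterns visible in the diff.

```rust
// Minimal sketch of the post-series argument surface (illustrative only).
// Assumes clap = { version = "4", features = ["derive"] } in Cargo.toml.
use clap::{Parser, Subcommand};

#[derive(Parser)]
struct Cli {
    #[command(subcommand)]
    command: Option<Commands>,
}

#[derive(Subcommand)]
enum Commands {
    /// Create a foldseek database from input proteomes
    Createdb {
        /// Use GPU for foldseek createdb
        #[arg(short, long, default_value = "false")]
        gpu: bool,
        /// Use AFDB lookup for foldseek createdb. Useful for large databases
        #[arg(long, default_value = "false")]
        afdb_lookup: bool,
        // After patch 3/7 there is no hidden `use_python`/`use_foldseek`
        // field left here; foldseek is simply the only code path.
    },
}

fn main() {
    if let Some(Commands::Createdb { gpu, afdb_lookup }) = Cli::parse().command {
        println!("gpu: {gpu}, afdb_lookup: {afdb_lookup}");
    }
}
```

In other words, `unicore createdb --gpu` now selects the foldseek ProstT5-GPU implementation directly, with no extra backend flag required.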