diff --git a/README.md b/README.md
index 7f7d3b0..3580fbe 100644
--- a/README.md
+++ b/README.md
@@ -11,8 +11,7 @@ Kim, D., Park, S., & Steinegger, M. (2024). Unicore enables scalable and accurat
## Table of Contents
- [Unicore](#unicore)
- [Quick Start with Conda](#quick-start-with-conda)
- - [GPU acceleration with CUDA](#gpu-acceleration-with-cuda)
- - [GPU acceleration with Foldseek-ProstT5 (beta)](#gpu-acceleration-with-foldseek-prostt5-beta)
+ - [GPU acceleration with Foldseek-ProstT5](#gpu-acceleration-with-foldseek-prostt5)
- [Tutorial](#tutorial)
- [Manual](#manual)
- [Input](#input)
@@ -29,24 +28,16 @@ conda install -c bioconda unicore
unicore -v
```
-### GPU acceleration with CUDA
-`createdb` module can be greatly acclerated with ProstT5-GPU.
-If you have a Linux machine with CUDA-compatible GPU, please install this additional package:
-```
-conda install -c conda-forge pytorch-gpu
-```
+### GPU acceleration with Foldseek-ProstT5
+Foldseek features GPU acceleration for ProstT5 prediction under the following requirements:
+ * Turing or newer NVIDIA GPU
+ * `foldseek` ≥10
+ * `glibc` ≥2.17
+ * `nvidia-driver` ≥525.60.13
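+
+You can quickly check these requirements with `nvidia-smi` (a minimal sketch; it assumes the NVIDIA driver is installed and `foldseek` is on your `PATH`):
+```
+nvidia-smi --query-gpu=name,driver_version --format=csv
+foldseek version
+```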
-### GPU acceleration with Foldseek-ProstT5 (beta)
-> Note. This feature is under development and may not work in some environments. We will provide an update after the stable release of Foldseek-ProstT5.
-
-Foldseek provides a GPU-compatible static binary for ProstT5 prediction (requires Linux with AVX2 support, `glibc` ≥2.29, and `nvidia-driver` ≥525.60.13)
-To use it, please install it by running the following command:
-```
-wget https://mmseqs.com/foldseek/foldseek-linux-gpu.tar.gz; tar xvfz foldseek-linux-gpu.tar.gz; export PATH=$(pwd)/foldseek/bin/:$PATH
+Add the `--gpu` option to either the `easy-core` or `createdb` module to use it, e.g.:
```
-Then, add `--use-foldseek` and `--gpu` options to either `easy-core` or `createdb` module to use Foldseek implementation of ProstT5-GPU:
-```
-unicore easy-core --use-foldseek --gpu
+unicore easy-core --gpu
```
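+
+A complete invocation also needs the module's positional arguments. As a sketch, assuming the order `<INPUT> <OUTPUT> <MODEL> <TMP>` (check `unicore easy-core --help` for the exact usage):
+```
+unicore easy-core --gpu data results /path/to/prostt5/weights tmp
+```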
@@ -61,7 +52,7 @@ unzip unicore_example.zip
If you cloned the repository, you can find the example dataset in the `example/data` folder.
### Download ProstT5 weights
-You need to first download the ProstT5 weights to run the `createdb` module.
+You can download the ProstT5 weights required by the `createdb` module in advance; if they are missing, `createdb` will download them on its first run.
```
foldseek databases ProstT5 weights tmp
```
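+
+If the download succeeded, the weights directory should contain the GGUF model file that `createdb` looks for (file name taken from this repository's weight check):
+```
+ls weights/prostt5-f16.gguf
+```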
@@ -142,13 +133,14 @@ This module runs much faster with GPU. Please install `cuda` for GPU acceleration.
To run the module, please use the following command:
```
-// Download ProstT5 weights as below if you haven't already
-// foldseek databases ProstT5 /path/to/prostt5/weights tmp
unicore createdb data db/proteome_db /path/to/prostt5/weights
```
This will create a Foldseek database in the `db` folder.
-If you have foldseek installed with CUDA, you can run the ProstT5 in the module with foldseek by adding `--use-foldseek` option.
+If you want to select specific GPU devices, set the `CUDA_VISIBLE_DEVICES` environment variable:
+
+* `CUDA_VISIBLE_DEVICES=0` to use GPU 0.
+* `CUDA_VISIBLE_DEVICES=0,1` to use GPUs 0 and 1.
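+
+For example, to run `createdb` on GPU 0 only (paths as in the example above):
+```
+CUDA_VISIBLE_DEVICES=0 unicore createdb --gpu data db/proteome_db /path/to/prostt5/weights
+```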
#### cluster
`cluster` module takes a `createdb` output database, runs Foldseek clustering, and outputs the cluster results.
@@ -217,11 +209,9 @@ unicore gene-tree --realign --threshold 30 --name /path/to/hashed/gene/names tre
## Build from Source
### Minimum requirements
* [Cargo](https://www.rust-lang.org/tools/install) (Rust)
-* [Foldseek](https://foldseek.com) (version ≥ 9)
+* [Foldseek](https://foldseek.com) (version ≥ 10)
* [Foldmason](https://foldmason.foldseek.com)
* [IQ-TREE](http://www.iqtree.org/)
-* pytorch, transformers, sentencepiece, protobuf
- - These are required for users who cannot build foldseek with CUDA. Please install them with `pip install torch transformers sentencepiece protobuf`.
### Optional requirements
* [MAFFT](https://mafft.cbrc.jp/alignment/software/)
* [Fasttree](http://www.microbesonline.org/fasttree/) or [RAxML](https://cme.h-its.org/exelixis/web/software/raxml/)
@@ -240,5 +230,5 @@ With these tools installed, you can install and run `unicore` by:
git clone https://github.com/steineggerlab/unicore.git
cd unicore
cargo build --release
-bin/unicore help
+bin/unicore -v
```
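+
+Optionally, add the binary to your `PATH` so `unicore` can be invoked from anywhere:
+```
+export PATH=$(pwd)/bin:$PATH
+```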
diff --git a/src/envs/variables.rs b/src/envs/variables.rs
index 45809cc..849a335 100644
--- a/src/envs/variables.rs
+++ b/src/envs/variables.rs
@@ -74,15 +74,6 @@ pub fn locate_path_cfg() -> String {
err::error(err::ERR_GENERAL, Some("Could not locate path.cfg".to_string()));
}
}
-pub fn locate_encoder_py() -> String {
- if File::open(format!("{}{}etc{}predict_3Di_encoderOnly.py", parent_dir(), SEP, SEP)).is_ok() {
- format!("{}{}etc{}predict_3Di_encoderOnly.py", parent_dir(), SEP, SEP)
- } else if File::open(format!("{}{}src{}py{}predict_3Di_encoderOnly.py", src_parent_dir(), SEP, SEP, SEP)).is_ok() {
- format!("{}{}src{}py{}predict_3Di_encoderOnly.py", src_parent_dir(), SEP, SEP, SEP)
- } else {
- err::error(err::ERR_GENERAL, Some("Could not locate path.cfg".to_string()));
- }
-}
// binary paths
pub const VALID_BINARY: [&str; 8] = [
diff --git a/src/modules/createdb.rs b/src/modules/createdb.rs
index b126265..d8dd0d6 100644
--- a/src/modules/createdb.rs
+++ b/src/modules/createdb.rs
@@ -26,18 +26,11 @@ pub fn run(args: &Args, bin: &var::BinaryPaths) -> Result<(), Box<dyn Error>> {
let foldseek_verbosity = (match var::verbosity() { 4 => 3, 3 => 2, _ => var::verbosity() }).to_string();
- // Either use_foldseek or use_python must be true
- if !use_foldseek && !use_python {
- err::error(err::ERR_ARGPARSE, Some("Either use_foldseek or use_python must be true".to_string()));
- }
-
// Check afdb_lookup
let afdb_local = if afdb_lookup && !afdb_local.is_some() {
err::error(err::ERR_ARGPARSE, Some("afdb-lookup is provided but afdb-local is not given".to_string()));
@@ -135,44 +128,37 @@ pub fn run(args: &Args, bin: &var::BinaryPaths) -> Result<(), Box<dyn Error>> {
- // Use foldseek to create the database
- if use_foldseek {
- let foldseek_path = match &bin.get("foldseek") {
- Some(bin) => &bin.path,
- _none => { err::error(err::ERR_BINARY_NOT_FOUND, Some("foldseek".to_string())); }
- };
-
- // Check if old weights exist
- if Path::new(&model).join("cnn.safetensors").exists() || Path::new(&model).join(format!("model{}cnn.safetensors", SEP)).exists() {
- err::error(err::ERR_GENERAL, Some("Old weight files detected from the given path. Please provide different path for the model weights".to_string()));
- }
- // Check if weights exist
- if !Path::new(&model).join("prostt5-f16.gguf").exists() {
- // Download the model
- std::fs::create_dir_all(format!("{}{}tmp", model, SEP))?;
- let mut cmd = std::process::Command::new(foldseek_path);
- let mut cmd = cmd
- .arg("databases").arg("ProstT5").arg(&model).arg(format!("{}{}tmp", model, SEP)).arg("--threads").arg(threads.to_string());
- cmd::run(&mut cmd);
- }
+ // Use foldseek to create the database
+ let foldseek_path = match &bin.get("foldseek") {
+ Some(bin) => &bin.path,
+ _none => { err::error(err::ERR_BINARY_NOT_FOUND, Some("foldseek".to_string())); }
+ };
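+ // foldseek ≥10 bundles ProstT5 (GGUF) inference, so no separate Python/PyTorch setup is needed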
- // Run foldseek createdb
+ // Check if old weights exist
+ if Path::new(&model).join("cnn.safetensors").exists() || Path::new(&model).join(format!("model{}cnn.safetensors", SEP)).exists() {
+ err::error(err::ERR_GENERAL, Some("Old weight files detected from the given path. Please provide different path for the model weights".to_string()));
+ }
+ // Check if weights exist
+ if !Path::new(&model).join("prostt5-f16.gguf").exists() {
+ // Download the model
+ std::fs::create_dir_all(format!("{}{}tmp", model, SEP))?;
let mut cmd = std::process::Command::new(foldseek_path);
- let cmd = cmd
- .arg("createdb").arg(&combined_aa).arg(&output)
- .arg("--prostt5-model").arg(&model)
- .arg("--threads").arg(threads.to_string());
- let mut cmd = if gpu {
- cmd.arg("--gpu").arg("1")
- } else { cmd };
+ let mut cmd = cmd
+ .arg("databases").arg("ProstT5").arg(&model).arg(format!("{}{}tmp", model, SEP)).arg("--threads").arg(threads.to_string());
cmd::run(&mut cmd);
- } else if use_python {
- let _ = _run_python(&combined_aa, &curr_dir, &parent, &output, &model, keep, bin, threads.to_string());
- } else {
- err::error(err::ERR_GENERAL, Some("Either use_foldseek or use_python must be true".to_string()));
}
+ // Run foldseek createdb
+ let mut cmd = std::process::Command::new(foldseek_path);
+ let cmd = cmd
+ .arg("createdb").arg(&combined_aa).arg(&output)
+ .arg("--prostt5-model").arg(&model)
+ .arg("--threads").arg(threads.to_string());
+ let mut cmd = if gpu {
+ cmd.arg("--gpu").arg("1")
+ } else { cmd };
+ cmd::run(&mut cmd);
+
if afdb_lookup {
let foldseek_path = match &bin.get("foldseek") {
Some(bin) => &bin.path,
@@ -221,57 +207,5 @@ pub fn run(args: &Args, bin: &var::BinaryPaths) -> Result<(), Box<dyn Error>> {
-fn _run_python(combined_aa: &String, curr_dir: &str, parent: &str, output: &String, model: &String, keep: bool, bin: &var::BinaryPaths, threads: String) -> Result<(), Box<dyn Error>> {
- let input_3di = format!("{}{}{}{}combined_3di.fasta", curr_dir, SEP, parent, SEP);
- let inter_prob = format!("{}{}{}{}output_probabilities.csv", curr_dir, SEP, parent, SEP);
- let output_3di = format!("{}{}{}_ss", curr_dir, SEP, output);
- let foldseek_verbosity = (match var::verbosity() { 4 => 3, 3 => 2, _ => var::verbosity() }).to_string();
-
- // Run python script
- let mut cmd = std::process::Command::new("python");
- let mut cmd = cmd
- .arg(var::locate_encoder_py())
- .arg("-i").arg(&combined_aa)
- .arg("-o").arg(&input_3di)
- .arg("--model").arg(&model)
- .arg("--half").arg("0")
- .arg("--threads").arg(threads);
- cmd::run(&mut cmd);
-
- // Build foldseek db
- let foldseek_path = match &bin.get("foldseek") {
- Some(bin) => &bin.path,
- _none => { err::error(err::ERR_BINARY_NOT_FOUND, Some("foldseek".to_string())); }
- };
- let mut cmd = std::process::Command::new(foldseek_path);
- let mut cmd = cmd
- .arg("base:createdb").arg(&combined_aa).arg(&output)
- .arg("--shuffle").arg("0")
- .arg("-v").arg(foldseek_verbosity.as_str());
-
- cmd::run(&mut cmd);
-
- // Build foldseek 3di db
- let mut cmd = std::process::Command::new(foldseek_path);
- let mut cmd = cmd
- .arg("base:createdb").arg(&input_3di).arg(&output_3di)
- .arg("--shuffle").arg("0")
- .arg("-v").arg(foldseek_verbosity.as_str());
- cmd::run(&mut cmd);
-
- // Delete intermediate files
- if !keep {
- // std::fs::remove_file(mapping_file)?;
- // std::fs::remove_file(combined_aa)?;
- std::fs::remove_file(input_3di)?;
- std::fs::remove_file(inter_prob)?;
- }
-
- // // Write the checkpoint file
- // chkpnt::write_checkpoint(&format!("{}/createdb.chk", parent), "1")?;
-
Ok(())
}
diff --git a/src/py/predict_3Di_encoderOnly.py b/src/py/predict_3Di_encoderOnly.py
deleted file mode 100644
index 30544cc..0000000
--- a/src/py/predict_3Di_encoderOnly.py
+++ /dev/null
@@ -1,442 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Fri Jun 16 14:27:44 2023
-
-@author: mheinzinger
-"""
-
-import argparse
-import time
-from pathlib import Path
-
-from urllib import request
-import shutil
-
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from transformers import T5EncoderModel, T5Tokenizer
-from tqdm import tqdm
-
-class bcolors:
- HEADER = '\033[95m'
- OKBLUE = '\033[94m'
- OKCYAN = '\033[96m'
- OKGREEN = '\033[92m'
- WARNING = '\033[93m'
- FAIL = '\033[91m'
- ENDC = '\033[0m'
- BOLD = '\033[1m'
- UNDERLINE = '\033[4m'
-
-
-device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
-print("Using device: {}".format(device))
-
-
-# Convolutional neural network (two convolutional layers)
-class CNN(nn.Module):
- def __init__(self):
- super(CNN, self).__init__()
-
- self.classifier = nn.Sequential(
- nn.Conv2d(1024, 32, kernel_size=(7, 1), padding=(3, 0)), # 7x32
- nn.ReLU(),
- nn.Dropout(0.0),
- nn.Conv2d(32, 20, kernel_size=(7, 1), padding=(3, 0))
- )
-
- def forward(self, x):
- """
- L = protein length
- B = batch-size
- F = number of features (1024 for embeddings)
- N = number of classes (20 for 3Di)
- """
- x = x.permute(0, 2, 1).unsqueeze(
- dim=-1) # IN: X = (B x L x F); OUT: (B x F x L, 1)
- Yhat = self.classifier(x) # OUT: Yhat_consurf = (B x N x L x 1)
- Yhat = Yhat.squeeze(dim=-1) # IN: (B x N x L x 1); OUT: ( B x N x L )
- return Yhat
-
-
-def get_T5_model(model_dir):
- print("Loading T5 from: {}".format(model_dir))
- model = T5EncoderModel.from_pretrained(
- "Rostlab/ProstT5_fp16", cache_dir=model_dir).to(device)
- model = model.eval()
- vocab = T5Tokenizer.from_pretrained(
- "Rostlab/ProstT5_fp16", do_lower_case=False, cache_dir=model_dir, legacy=True)
- return model, vocab
-
-
-def read_fasta(fasta_path, split_char, id_field):
- '''
- Reads in fasta file containing multiple sequences.
- Returns dictionary of holding multiple sequences or only single
- sequence, depending on input file.
- '''
-
- sequences = dict()
- with open(fasta_path, 'r') as fasta_f:
- for line in fasta_f:
- # get uniprot ID from header and create new entry
- if line.startswith('>'):
- uniprot_id = line.replace(
- '>', '').strip().split(split_char)[id_field]
- # replace tokens that are mis-interpreted when loading h5
- # uniprot_id = uniprot_id.replace("/", "_").replace(".", "_")
- sequences[uniprot_id] = ''
- else:
- s = ''.join(line.split()).replace("-", "")
-
- if s.islower(): # sanity check to avoid mix-up of 3Di and AA input
- print("The input file was in lower-case which indicates 3Di-input." +
- "This predictor only operates on amino-acid-input (upper-case)." +
- "Exiting now ..."
- )
- return None
- else:
- sequences[uniprot_id] += s
- return sequences
-
-
-def write_probs(predictions, out_path, probs_name="output_probabilities.csv"):
- out_path = out_path.parent / probs_name
- with open(out_path, 'w+') as out_f:
- out_f.write('\n'.join(
- ["{},{}".format(seq_id, prob)
- for seq_id, (N, prob) in predictions.items()
- ]
- ))
- print(f"Finished writing probabilities to {out_path}")
- return None
-
-
-def write_predictions(predictions, out_path):
- ss_mapping = {
- 0: "A",
- 1: "C",
- 2: "D",
- 3: "E",
- 4: "F",
- 5: "G",
- 6: "H",
- 7: "I",
- 8: "K",
- 9: "L",
- 10: "M",
- 11: "N",
- 12: "P",
- 13: "Q",
- 14: "R",
- 15: "S",
- 16: "T",
- 17: "V",
- 18: "W",
- 19: "Y"
- }
-
- with open(out_path, 'w+') as out_f:
- out_f.write('\n'.join(
- [">{}\n{}".format(
- seq_id, "".join(list(map(lambda yhat: ss_mapping[int(yhat)], yhats))))
- for seq_id, (yhats, _) in predictions.items()
- ]
- ))
- print(f"Finished writing results to {out_path}")
- return None
-
-
-def toCPU(tensor):
- if len(tensor.shape) > 1:
- return tensor.detach().cpu().squeeze(dim=-1).numpy()
- else:
- return tensor.detach().cpu().numpy()
-
-
-def download_file(url, local_path):
- if not local_path.parent.is_dir():
- local_path.parent.mkdir()
-
- print("Downloading: {}".format(url))
- req = request.Request(url, headers={
- 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
- })
-
- with request.urlopen(req) as response, open(local_path, 'wb') as outfile:
- shutil.copyfileobj(response, outfile)
- return None
-
-
-def load_predictor(weights_link="https://github.com/mheinzinger/ProstT5/raw/main/cnn_chkpnt/model.pt"):
- model = CNN()
- checkpoint_p = Path.cwd() / "cnn_chkpnt" / "model.pt"
- # if no pre-trained model is available, yet --> download it
- if not checkpoint_p.exists():
- download_file(weights_link, checkpoint_p)
-
- # Torch load will map back to device from state, which often is GPU:0.
- # to overcome, need to explicitly map to active device
- global device
-
- #state = torch.load(checkpoint_p, map_location=device)
- state = torch.load(checkpoint_p, map_location=device, weights_only=True)
-
- model.load_state_dict(state["state_dict"])
-
- model = model.eval()
- model = model.to(device)
-
- return model
-
-
-def get_embeddings(seq_path, out_path, model_dir, split_char, id_field, half_precision, output_probs, probs_name,
- max_residues=4000, max_seq_len=1000, max_batch=500):
-
- seq_dict = dict()
- predictions = dict()
-
- # Read in fasta
- seq_dict = read_fasta(seq_path, split_char, id_field)
- prefix = ""
-
- model, vocab = get_T5_model(model_dir)
- predictor = load_predictor()
-
- if half_precision:
- model.half()
- predictor.half()
- print("Using models in half-precision.")
- else:
- model.to(torch.float32)
- predictor.to(torch.float32)
- print("Using models in full-precision.")
-
- print('########################################')
- print('Example sequence: {}\n{}'.format(next(iter(
- seq_dict.keys())), next(iter(seq_dict.values()))))
- print('########################################')
- print('Total number of sequences: {}'.format(len(seq_dict)))
-
- avg_length = sum([len(seq) for _, seq in seq_dict.items()]) / len(seq_dict)
- n_long = sum([1 for _, seq in seq_dict.items() if len(seq) > max_seq_len])
- seq_dict_names = list(seq_dict.keys())
- # sort sequences by length to trigger OOM at the beginning
- seq_dict = sorted(seq_dict.items(), key=lambda kv: len(
- seq_dict[kv[0]]), reverse=True)
-
- print("Average sequence length: {}".format(avg_length))
- print("Number of sequences >{}: {}".format(max_seq_len, n_long))
-
- start = time.time()
- batch = list()
- standard_aa = "ACDEFGHIKLMNPQRSTVWY"
- standard_aa_dict = {aa: aa for aa in standard_aa}
- count = 0
- seq_idx = 0
- for (pdb_id, seq) in tqdm(seq_dict):
- seq_idx += 1
- # replace the non-standard amino acids with 'X'
- seq = ''.join([standard_aa_dict.get(aa, 'X') for aa in seq])
- seq_len = len(seq)
- seq = prefix + ' ' + ' '.join(list(seq))
- batch.append((pdb_id, seq, seq_len))
-
- # count residues in current batch and add the last sequence length to
- # avoid that batches with (n_res_batch > max_residues) get processed
- n_res_batch = sum([s_len for _, _, s_len in batch]) + seq_len
-
- if len(batch) >= max_batch or n_res_batch >= max_residues or seq_idx == len(seq_dict) or seq_len > max_seq_len:
- count += len(batch)
- pdb_ids, seqs, seq_lens = zip(*batch)
- batch = list()
-
- token_encoding = vocab.batch_encode_plus(seqs,
- add_special_tokens=True,
- padding="longest",
- return_tensors='pt'
- ).to(device)
- try:
- with torch.no_grad():
- embedding_repr = model(token_encoding.input_ids,
- attention_mask=token_encoding.attention_mask
- )
- except RuntimeError:
- print("RuntimeError during embedding for {} (L={})".format(
- pdb_id, seq_len)
- )
- continue
-
- # ProtT5 appends a special tokens at the end of each sequence
- # Mask this also out during inference while taking into account the prefix
- for idx, s_len in enumerate(seq_lens):
- token_encoding.attention_mask[idx, s_len+1] = 0
-
- # extract last hidden states (=embeddings)
- residue_embedding = embedding_repr.last_hidden_state.detach()
- # mask out padded elements in the attention output (can be non-zero) for further processing/prediction
- residue_embedding = residue_embedding * \
- token_encoding.attention_mask.unsqueeze(dim=-1)
- # slice off embedding of special token prepended before to each sequence
- residue_embedding = residue_embedding[:, 1:]
-
- # IN: X = (B x L x F) - OUT: ( B x N x L )
- prediction = predictor(residue_embedding)
- if output_probs: # compute max probabilities per token/residue if requested
- probabilities = toCPU(torch.max(
- F.softmax(prediction, dim=1), dim=1, keepdim=True)[0])
-
- prediction = toCPU(torch.max(prediction, dim=1, keepdim=True)[
- 1]).astype(np.byte)
-
- # batch-size x seq_len x embedding_dim
- # extra token is added at the end of the seq
- for batch_idx, identifier in enumerate(pdb_ids):
- s_len = seq_lens[batch_idx]
- # slice off padding and special token appended to the end of the sequence
- pred = prediction[batch_idx, :, 0:s_len].squeeze()
- if output_probs: # average over per-residue max.-probabilities
- prob = int( 100* np.mean(probabilities[batch_idx, :, 0:s_len]))
- predictions[identifier] = (pred, prob)
- else:
- predictions[identifier] = (pred, None)
- assert s_len == len(predictions[identifier][0]), print(
- f"Length mismatch for {identifier}: is:{len(predictions[identifier])} vs should:{s_len}")
- if len(predictions) == 1:
- print(
- f"Example: predicted for protein {identifier} with length {s_len}: {predictions[identifier]}")
- # print(f"Batch complete - total {count}")
-
-
- end = time.time()
- print('\n############# STATS #############')
- print('Total number of predictions: {}'.format(len(predictions)))
- print('Total time: {:.2f}[s]; time/prot: {:.4f}[s]; avg. len= {:.2f}'.format(
- end-start, (end-start)/len(predictions), avg_length))
- print("Writing results now to disk ...")
-
- # Sort the prediction as the input fasta file only if the name exists
- predictions = {seq_name: predictions[seq_name]
- for seq_name in seq_dict_names if seq_name in predictions}
-
- write_predictions(predictions, out_path)
- if output_probs:
- write_probs(predictions, out_path, probs_name=probs_name)
-
- return True
-
-
-def create_arg_parser():
- """"Creates and returns the ArgumentParser object."""
-
- # Instantiate the parser
- parser = argparse.ArgumentParser(description=(
- 'predict_3Di_encoderOnly.py translates amino acid sequences to 3Di sequences. ' +
- 'Example: python predict_3Di_encoderOnly.py --input /path/to/some_AA_sequences.fasta --output /path/to/some_3Di_sequences.fasta --model /path/to/tmp/checkpoint/dir')
- )
-
- # Required positional argument
- parser.add_argument('-i', '--input', required=True, type=str,
- help='A path to a fasta-formatted text file containing protein sequence(s).')
-
- # Required positional argument
- parser.add_argument('-o', '--output', required=True, type=str,
- help='A path for saving the 3Di translations in FASTA format.')
-
- # Required positional argument
- parser.add_argument('--model', required=True, type=str,
- help='A path to a directory for saving the checkpoint of the pre-trained model.')
-
- # Optional argument
- parser.add_argument('--split_char', type=str,
- default='!',
- help='The character for splitting the FASTA header in order to retrieve ' +
- "the protein identifier. Should be used in conjunction with --id." +
- "Default: '!' ")
-
- # Optional argument
- parser.add_argument('--id', type=int,
- default=0,
- help='The index for the uniprot identifier field after splitting the ' +
- "FASTA header after each symbole in ['|', '#', ':', ' ']." +
- 'Default: 0')
-
- parser.add_argument('--half', type=int,
- default=1,
- help="Whether to use half_precision or not. Default: 1 (half-precision)")
-
- parser.add_argument('--output_probs', type=int,
- default=1,
- help="Whether to output probabilities/reliability. Default: 1 (output them).")
-
- parser.add_argument('--probs_name', type=str,
- default="output_probabilities.csv",
- help="Name of the file to store the probabilities. Default: output_probabilities.csv")
-
- parser.add_argument('--threads', type=int,
- default=1,
- help="Number of threads to use for prediction. Default: 1")
-
- return parser
-
-
-def main():
- parser = create_arg_parser()
- args = parser.parse_args()
-
- seq_path = Path(args.input) # path to input FASTAS
- out_path = Path(args.output) # path where predictions should be written to
- model_dir = args.model # path/repo_link to checkpoint
-
- # Check if seq_path is in fasta format
- if not seq_path.is_file():
- print(f"{bcolors.FAIL}{seq_path} is not a file{bcolors.ENDC}")
- exit(1)
-
- # Check if seq_path is empty
- if seq_path.stat().st_size == 0:
- print(f"{bcolors.FAIL}{seq_path} is empty{bcolors.ENDC}")
- exit(1)
-
- with open(seq_path, 'r') as seq_f:
- for line in seq_f:
- # Skip all the lines starts with '#'
- while line.startswith('#'):
- continue
- if line.startswith('>'):
- break
- else:
- print(f"{bcolors.FAIL}{seq_path} does not seem to be in FASTA format (doesn't start with '>')\nPlease check your input files. Only files in fasta/fastq[.gz|bz2] are supported{bcolors.ENDC}")
- exit(1)
-
- if out_path.is_file():
- print("Output file is already existing and will be overwritten ...")
-
- split_char = args.split_char
- id_field = args.id
-
- half_precision = False if int(args.half) == 0 else True
- assert not (half_precision and device == torch.device("cpu")), print(
- "Running fp16 on CPU is not supported, yet")
-
- output_probs = False if int(args.output_probs) == 0 else True
-
- torch.set_num_threads(args.threads) # Set number of threads for torch
-
- get_embeddings(
- seq_path,
- out_path,
- model_dir,
- split_char,
- id_field,
- half_precision,
- output_probs,
- args.probs_name
- )
-
-
-if __name__ == '__main__':
- main()
diff --git a/src/util/arg_parser.rs b/src/util/arg_parser.rs
index 512d9a9..d61cfcb 100644
--- a/src/util/arg_parser.rs
+++ b/src/util/arg_parser.rs
@@ -69,12 +69,6 @@ pub enum Commands {
/// Use GPU for foldseek createdb
#[arg(short, long, default_value="false")]
gpu: bool,
- /// Use python script instead. hidden option
- #[arg(long, default_value="true", hide = true)]
- use_python: bool,
- /// Use foldseek for createdb. hidden option
- #[arg(long, default_value="false", hide = true)]
- use_foldseek: bool,
/// Use AFDB lookup for foldseek createdb. Useful for large databases
#[arg(long, default_value="false")]
afdb_lookup: bool,
@@ -137,12 +131,6 @@ pub enum Commands {
/// Use GPU for foldseek createdb
#[arg(short, long, default_value="false")]
gpu: bool,
- /// Use python script instead. hidden option
- #[arg(long, default_value="true", hide = true)]
- use_python: bool,
- /// Use foldseek for createdb. hidden option
- #[arg(long, default_value="false", hide = true)]
- use_foldseek: bool,
/// Use AFDB lookup for foldseek createdb. Useful for large databases
#[arg(long, default_value="false")]
afdb_lookup: bool,
@@ -201,12 +189,6 @@ pub enum Commands {
/// Use GPU for foldseek createdb
#[arg(short, long, default_value="false")]
gpu: bool,
- /// Use python script instead. hidden option
- #[arg(long, default_value="true", hide = true)]
- use_python: bool,
- /// Use foldseek for createdb. hidden option
- #[arg(long, default_value="false", hide = true)]
- use_foldseek: bool,
/// Use AFDB lookup for foldseek createdb. Useful for large databases
#[arg(long, default_value="false")]
afdb_lookup: bool,
@@ -412,8 +394,6 @@ pub struct Args {
pub createdb_overwrite: Option<bool>,
pub createdb_max_len: Option<Option<usize>>,
pub createdb_gpu: Option<bool>,
- pub createdb_use_python: Option<bool>,
- pub createdb_use_foldseek: Option<bool>,
pub createdb_afdb_lookup: Option<bool>,
pub createdb_afdb_local: Option<Option<String>>,
@@ -527,16 +507,6 @@ impl Args {
Some(EasyCore { gpu, .. }) => Some(*gpu),
Some(EasySearch { gpu, .. }) => Some(*gpu), _ => None,
};
- let createdb_use_python = match &args.command {
- Some(Createdb { use_python, .. }) => Some(*use_python),
- Some(EasyCore { use_python, .. }) => Some(*use_python),
- Some(EasySearch { use_python, .. }) => Some(*use_python), _ => None,
- };
- let createdb_use_foldseek = match &args.command {
- Some(Createdb { use_foldseek, .. }) => Some(*use_foldseek),
- Some(EasyCore { use_foldseek, .. }) => Some(*use_foldseek),
- Some(EasySearch { use_foldseek, .. }) => Some(*use_foldseek), _ => None,
- };
let createdb_afdb_lookup = match &args.command {
Some(Createdb { afdb_lookup, .. }) => Some(*afdb_lookup),
Some(EasyCore { afdb_lookup, .. }) => Some(*afdb_lookup),
@@ -716,7 +686,7 @@ impl Args {
Args {
command: args.command, version: args.version, threads, verbosity,
- createdb_input, createdb_output, createdb_model, createdb_keep, createdb_overwrite, createdb_max_len, createdb_gpu, createdb_use_python, createdb_use_foldseek, createdb_afdb_lookup, createdb_afdb_local,
+ createdb_input, createdb_output, createdb_model, createdb_keep, createdb_overwrite, createdb_max_len, createdb_gpu, createdb_afdb_lookup, createdb_afdb_local,
profile_input_db, profile_input_tsv, profile_output, profile_threshold, profile_print_copiness,
search_input, search_target, search_output, search_tmp, search_keep_aln_db, search_search_options,
cluster_input, cluster_output, cluster_tmp, cluster_keep_cluster_db, cluster_cluster_options,