# Argument validation at the top of prediction_to_mmcif(), followed by the
# PDB intermediate that is later converted to mmCIF.  Only the statements
# visible in this hunk are shown; the rest of the function is unchanged.

# The check must fire when the binary is *missing*.  `maxit_binary` may be
# None (the previous assert tested for that explicitly), and
# os.path.isfile(None) raises TypeError, so test truthiness first.
if not (maxit_binary and os.path.isfile(maxit_binary)):
    raise FileNotFoundError(
        f'maxit_binary: {maxit_binary} not exists. '
        f'link: https://sw-tools.rcsb.org/apps/MAXIT/source.html')

if not mmcif_path.endswith('.cif'):
    raise ValueError(f'mmcif_path should endswith .cif; got {mmcif_path}')

# Swap only the trailing suffix: str.replace() would also rewrite a '.cif'
# that happens to occur earlier in the path (e.g. in a directory name).
pdb_path = mmcif_path[:-len('.cif')] + '.pdb'
pdb_path = prediction_to_pdb(pred_atom_pos, FeatsDict, pdb_path)
# File paths
# `???` is OmegaConf's marker for a mandatory value: reading the key without
# overriding it (e.g. `input=job.json output=out/`) raises
# MissingMandatoryValue instead of passing None further down the pipeline.

input: ???  # Corresponds to --input_json, required field
output: ???  # Corresponds to --output_dir, required field


# Binary tool paths, leave them as null to find proper ones under PATH or conda bin path
bin:
  jackhmmer: null  # Corresponds to --jackhmmer_binary_path
  hhblits: null  # Corresponds to --hhblits_binary_path
  hhsearch: null  # Corresponds to --hhsearch_binary_path
  kalign: null  # Corresponds to --kalign_binary_path
  hmmsearch: null  # Corresponds to --hmmsearch_binary_path
  hmmbuild: null  # Corresponds to --hmmbuild_binary_path
  nhmmer: null  # Corresponds to --nhmmer_binary_path
  obabel: null  # Resolved in inference.main() and exported as the OBABEL_BIN environment variable

# Database paths
db:
  uniprot: /mnt/db/uniprot/uniprot.fasta  # Corresponds to --uniprot_database_path, required field
  pdb_seqres: /mnt/db/pdb_seqres/pdb_seqres.txt  # Corresponds to --pdb_seqres_database_path, required field
  uniref90: /mnt/db/uniref90/uniref90.fasta  # Corresponds to --uniref90_database_path, required field
  mgnify: /mnt/db/mgnify/mgy_clusters.fa  # Corresponds to --mgnify_database_path, required field
  bfd: /mnt/db/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt  # Corresponds to --bfd_database_path
  small_bfd: null  # Corresponds to --small_bfd_database_path
  uniclust30: /mnt/db/uniref30_uc30/UniRef30_2022_02/UniRef30_2022_02  # Corresponds to --uniclust30_database_path
  rfam: /mnt/db/helixfold/rna/Rfam-14.9_rep_seq.fasta  # Corresponds to --rfam_database_path, required field
  ccd_preprocessed: /mnt/db/ccd/ccd_preprocessed_etkdg.pkl.gz  # Corresponds to --ccd_preprocessed_path, required field

# Template and PDB information
template:
  mmcif_dir: /mnt/db/pdb_mmcif/mmcif_files  # Corresponds to --template_mmcif_dir, required field
  max_date: '2023-03-15'  # Corresponds to --max_template_date, required field
  obsolete_pdbs: /mnt/db/pdb_mmcif/obsolete.dat  # Corresponds to --obsolete_pdbs_path, required field

# Preset configuration
preset:
  preset: full_dbs  # Corresponds to --preset, choices=['reduced_dbs', 'full_dbs']

# Other configurations
other:
  # NOTE(review): this default is a path on one developer's machine. It is
  # kept as-is because main() still forwards cfg.other.maxit_binary verbatim
  # to save_result(); consider null (PATH lookup) once that call uses the
  # resolved path.
  maxit_binary: /mnt/data/yinying/software/maxit/maxit-v11.100-prod-src/bin/maxit  # Corresponds to --maxit_binary
  no_msa_templ_feats: false  # Corresponds to --no_msa_templ_feats
} -OBABEL_BIN = os.getenv('OBABEL_BIN') -if not os.path.exists(OBABEL_BIN): - raise FileNotFoundError(f'Cannot find obabel binary at {OBABEL_BIN}.') + def read_json(path): @@ -144,6 +142,11 @@ def smiles_toMol_obabel(smiles): """ generate mol from smiles using obabel; """ + + OBABEL_BIN = os.getenv('OBABEL_BIN') + if not (OBABEL_BIN and os.path.isfile(OBABEL_BIN)): + raise FileNotFoundError(f'Cannot find obabel binary at {OBABEL_BIN}.') + with tempfile.NamedTemporaryFile(suffix=".mol2") as temp_file: print(f"[OBABEL] Temporary file created: {temp_file.name}") obabel_cmd = f"{OBABEL_BIN} -:'{smiles}' -omol2 -O{temp_file.name} --gen3d" diff --git a/apps/protein_folding/helixfold3/infer_scripts/tools/mmcif_writer.py b/apps/protein_folding/helixfold3/helixfold/infer_scripts/tools/mmcif_writer.py similarity index 100% rename from apps/protein_folding/helixfold3/infer_scripts/tools/mmcif_writer.py rename to apps/protein_folding/helixfold3/helixfold/infer_scripts/tools/mmcif_writer.py diff --git a/apps/protein_folding/helixfold3/inference.py b/apps/protein_folding/helixfold3/helixfold/inference.py similarity index 64% rename from apps/protein_folding/helixfold3/inference.py rename to apps/protein_folding/helixfold3/helixfold/inference.py index 51cf6ec6..b3fbf745 100644 --- a/apps/protein_folding/helixfold3/inference.py +++ b/apps/protein_folding/helixfold3/helixfold/inference.py @@ -16,7 +16,6 @@ import re import os import copy -import argparse import random import paddle import json @@ -25,6 +24,11 @@ import shutil import logging import numpy as np +import shutil + +from omegaconf import DictConfig +import hydra + from helixfold.common import all_atom_pdb_save from helixfold.model import config, utils from helixfold.data import pipeline_parallel as pipeline @@ -34,12 +38,14 @@ from helixfold.data.utils import atom_level_keys, map_to_continuous_indices from helixfold.data.tools import hmmsearch from helixfold.data import templates -from utils.utils import 
def resolve_bin_path(cfg_path: str, default_binary_name: str)-> str:
    """Return a usable location for an external binary.

    Resolution order:
      1. ``cfg_path`` itself when it names an existing regular file;
      2. ``cfg_path`` looked up on ``PATH`` (covers a configured bare command
         name such as ``hmmsearch``);
      3. ``default_binary_name`` looked up on ``PATH``.

    Args:
        cfg_path: Value taken from the configuration. May be ``None`` or an
            empty string when the entry is left unset.
        default_binary_name: Command name to search for on ``PATH`` when the
            configured value cannot be used.

    Returns:
        The path of the binary to run.

    Raises:
        FileNotFoundError: If none of the lookups above finds the binary.
    """
    if cfg_path:
        if os.path.isfile(cfg_path):
            return cfg_path

        # Not a file as written: it may still be a command name on PATH.
        if found := shutil.which(cfg_path):
            logging.warning(f'Using resolved {cfg_path}: {found}')
            return found

        # Say explicitly that the configured value is being ignored, so a
        # typo in the config does not silently select another binary.
        logging.warning(
            f'Configured path for {default_binary_name} is not usable: {cfg_path}; '
            f'falling back to PATH lookup.')

    if found := shutil.which(default_binary_name):
        logging.warning(f'Using resolved {default_binary_name}: {found}')
        return found

    raise FileNotFoundError(f"Could not find a proper binary path for {default_binary_name}: {cfg_path}.")
+ max_template_date=cfg.template.max_date, max_hits=MAX_TEMPLATE_HITS, - kalign_binary_path=args.kalign_binary_path, + kalign_binary_path=resolve_bin_path(cfg.bin.kalign, 'kalign'), release_dates_path=None, - obsolete_pdbs_path=args.obsolete_pdbs_path) + obsolete_pdbs_path=cfg.template.obsolete_pdbs) monomer_data_pipeline = pipeline.DataPipeline( - jackhmmer_binary_path=args.jackhmmer_binary_path, - hhblits_binary_path=args.hhblits_binary_path, - hhsearch_binary_path=args.hhsearch_binary_path, - uniref90_database_path=args.uniref90_database_path, - mgnify_database_path=args.mgnify_database_path, - bfd_database_path=args.bfd_database_path, - uniclust30_database_path=args.uniclust30_database_path, - small_bfd_database_path=args.small_bfd_database_path , + jackhmmer_binary_path=resolve_bin_path(cfg.bin.jackhmmer, 'jackhmmer'), + hhblits_binary_path=resolve_bin_path(cfg.bin.hhblits, 'hhblits'), + hhsearch_binary_path=resolve_bin_path(cfg.bin.hhsearch, 'hhsearch'), + uniref90_database_path=cfg.db.uniref90, + mgnify_database_path=cfg.db.mgnify, + bfd_database_path=cfg.db.bfd, + uniclust30_database_path=cfg.db.uniclust30, + small_bfd_database_path=cfg.db.small_bfd, template_searcher=template_searcher, template_featurizer=template_featurizer, - use_small_bfd=args.use_small_bfd, + use_small_bfd=cfg.use_small_bfd, use_precomputed_msas=use_precomputed_msas) prot_data_pipeline = pipeline_multimer.DataPipeline( monomer_data_pipeline=monomer_data_pipeline, - jackhmmer_binary_path=args.jackhmmer_binary_path, - uniprot_database_path=args.uniprot_database_path, + jackhmmer_binary_path=resolve_bin_path(cfg.bin.jackhmmer, 'jackhmmer'), + uniprot_database_path=cfg.db.uniprot, use_precomputed_msas=use_precomputed_msas) rna_monomer_data_pipeline = pipeline_rna.RNADataPipeline( - hmmer_binary_path=args.nhmmer_binary_path, - rfam_database_path=args.rfam_database_path, + hmmer_binary_path=resolve_bin_path(cfg.bin.nhmmer, 'nhmmer'), + rfam_database_path=cfg.db.rfam, 
@paddle.no_grad()
def eval(cfg: DictConfig, model:RunModel, batch):
    """Run one forward pass of ``model`` on ``batch`` and return its output.

    The call is wrapped in ``paddle.no_grad()`` and ``model.eval()`` is
    invoked before anything else.

    ``cfg.precision`` must be ``'bf16'`` or ``'fp32'``. The forward pass runs
    inside ``paddle.amp.auto_cast`` with dtype ``bfloat16`` (level
    ``cfg.amp_level``, custom lists from ``get_custom_amp_list()``) when the
    precision is ``'bf16'`` or when ``cfg.bf16_infer`` is truthy; otherwise
    the model is called directly.

    Raises:
        ValueError: If ``cfg.precision`` is neither ``'bf16'`` nor ``'fp32'``.
    """
    model.eval()

    requested = cfg.precision
    if requested not in ('bf16', 'fp32'):
        raise ValueError("Please choose precision from bf16 and fp32!")

    # bf16_infer can turn autocast on even when precision is 'fp32'.
    run_autocast = requested == "bf16" or cfg.bf16_infer

    if not run_autocast:
        output = model(batch, compute_loss=False)
    else:
        black_list, white_list = get_custom_amp_list()
        autocast = paddle.amp.auto_cast(
            enable=True,
            custom_white_list=white_list,
            custom_black_list=black_list,
            level=cfg.amp_level,
            dtype='bfloat16',
        )
        with autocast:
            output = model(batch, compute_loss=False)

    logger.info("Inference Succeeds...\n")
    return output
# Statements from the body of main(cfg): external tools, RNG seed, database
# preset and model construction.  `cfg` is the Hydra DictConfig.  The weight
# loading that follows crosses a hunk boundary and is left unchanged.

# Resolve maxit from the config entry, falling back to a PATH lookup.
maxit_binary = resolve_bin_path(cfg.other.maxit_binary, 'maxit')
# NOTE(review): save_result() further down in main() is still given
# cfg.other.maxit_binary rather than this resolved path - confirm which one
# is intended.

# RCSBROOT must point at the MAXIT installation root, i.e. the directory that
# contains bin/, not at bin/ itself.
# NOTE(review): based on the MAXIT README; confirm against the build in use.
maxit_root = os.path.dirname(os.path.dirname(os.path.abspath(maxit_binary)))
os.environ['RCSBROOT'] = maxit_root

# preprocess.smiles_toMol_obabel() reads the Open Babel binary location from
# the OBABEL_BIN environment variable.
obabel_bin = resolve_bin_path(cfg.bin.obabel, 'obabel')
os.environ['OBABEL_BIN'] = obabel_bin

### Set seed for reproducibility
seed = cfg.seed
if seed is None:
    seed = np.random.randint(10000000)
else:
    logger.warning('Seed is only used for reproduction')
init_seed(seed)

use_small_bfd = cfg.preset.preset == 'reduced_dbs'
cfg.use_small_bfd = use_small_bfd

# Explicit validation instead of `assert`, which is stripped under `python -O`.
required_dbs = ('small_bfd',) if use_small_bfd else ('bfd', 'uniclust30')
missing_dbs = [name for name in required_dbs if cfg.db[name] is None]
if missing_dbs:
    raise ValueError(
        f"preset '{cfg.preset.preset}' requires these db entries to be set: "
        f"{', '.join(missing_dbs)}")

logger.info('Getting MSA/Template Pipelines...')
# `args` no longer exists after the argparse -> Hydra migration; passing it
# raised NameError before any pipeline was built.
msa_templ_data_pipeline_dict = get_msa_templates_pipeline(cfg)

### Create model
# cfg.job_id selects the model configuration (it replaces --model_name).
model_config = config.model_config(cfg.job_id)
model = RunModel(model_config)
NotImplementedError("bf16 O2 is not supported yet.") print(f"============ Data Loading ============") - job_base = pathlib.Path(args.input_json).stem - output_dir_base = pathlib.Path(args.output_dir).joinpath(job_base) + job_base = pathlib.Path(cfg.input).stem + output_dir_base = pathlib.Path(cfg.output).joinpath(job_base) msa_output_dir = output_dir_base.joinpath('msas') msa_output_dir.mkdir(parents=True, exist_ok=True) features_pkl = output_dir_base.joinpath('final_features.pkl') - feature_dict = feature_processing_aa.process_input_json( - all_entitys, - ccd_preprocessed_path=args.ccd_preprocessed_path, - msa_templ_data_pipeline_dict=msa_templ_data_pipeline_dict, - msa_output_dir=msa_output_dir) + if features_pkl.exists(): + with open(features_pkl, 'rb') as f: + feature_dict = pickle.load(f) + else: + feature_dict = feature_processing_aa.process_input_json( + all_entitys, + ccd_preprocessed_path=cfg.db.ccd_preprocessed, + msa_templ_data_pipeline_dict=msa_templ_data_pipeline_dict, + msa_output_dir=msa_output_dir) - # save features - with open(features_pkl, 'wb') as f: - pickle.dump(feature_dict, f, protocol=4) + # save features + with open(features_pkl, 'wb') as f: + pickle.dump(feature_dict, f, protocol=4) feature_dict['feat'] = batch_convert(feature_dict['feat'], add_batch=True) feature_dict['label'] = batch_convert(feature_dict['label'], add_batch=True) print(f"============ Start Inference ============") - infer_times = args.infer_times - if args.diff_batch_size > 0: - model_config.model.heads.diffusion_module.test_diff_batch_size = args.diff_batch_size + infer_times = cfg.infer_times + if cfg.diff_batch_size > 0: + model_config.model.heads.diffusion_module.test_diff_batch_size = cfg.diff_batch_size diff_batch_size = model_config.model.heads.diffusion_module.test_diff_batch_size logger.info(f'Inference {infer_times} Times...') - logger.info(f" diffusion batch size {diff_batch_size}...\n") + logger.info(f"Diffusion batch size {diff_batch_size}...\n") 
all_pred_path = [] for infer_id in range(infer_times): logger.info(f'Start {infer_id}-th inference...\n') - prediction = eval(args, model, feature_dict) + prediction = eval(cfg, model, feature_dict) # save result prediction = split_prediction(prediction, diff_batch_size) @@ -530,7 +556,7 @@ def main(args): feature_dict=feature_dict, prediction=prediction[rank_id], output_dir=output_dir, - maxit_bin=args.maxit_binary) + maxit_bin=cfg.other.maxit_binary) all_pred_path.append(output_dir) # final ranking @@ -538,100 +564,5 @@ def main(args): ranking_all_predictions(all_pred_path) print(f'============ Inference finished ! ============') - if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument("--bf16_infer", action='store_true', default=False) - parser.add_argument("--seed", type=int, default=None, help="set seed for reproduce experiment results, None is do not set seed") - parser.add_argument("--logging_level", type=str, default="DEBUG", help="NOTSET, DEBUG, INFO, WARNING, ERROR, CRITICAL") - parser.add_argument("--model_name", type=str, help='used to choose model config') - parser.add_argument("--init_model", type=str, default='') - parser.add_argument("--precision", type=str, choices=['fp32', 'bf16'], default='fp32') - parser.add_argument("--amp_level", type=str, default='O1') - parser.add_argument("--infer_times", type=int, default=1) - parser.add_argument("--diff_batch_size", type=int, default=-1) - parser.add_argument('--input_json', type=str, - default=None, required=True, - help='Paths to json file, each containing ' - 'entity information including sequence, smiles or CCD, copies etc.') - parser.add_argument('--output_dir', type=str, - default=None, required=True, - help='Path to a directory that will store results.') - parser.add_argument('--ccd_preprocessed_path', type=str, - default=None, required=True, - help='Path to CCD preprocessed files.') - parser.add_argument('--jackhmmer_binary_path', type=str, - 
default='/usr/bin/jackhmmer', - help='Path to the JackHMMER executable.') - parser.add_argument('--hhblits_binary_path', type=str, - default='/usr/bin/hhblits', - help='Path to the HHblits executable.') - parser.add_argument('--hhsearch_binary_path', type=str, - default='/usr/bin/hhsearch', - help='Path to the HHsearch executable.') - parser.add_argument('--kalign_binary_path', type=str, - default='/usr/bin/kalign', - help='Path to the Kalign executable.') - parser.add_argument('--hmmsearch_binary_path', type=str, - default='/usr/bin/hmmsearch', - help='Path to the hmmsearch executable.') - parser.add_argument('--hmmbuild_binary_path', type=str, - default='/usr/bin/hmmbuild', - help='Path to the hmmbuild executable.') - - # binary path of the tool for RNA MSA searching - parser.add_argument('--nhmmer_binary_path', type=str, - default='/usr/bin/nhmmer', - help='Path to the nhmmer executable.') - - parser.add_argument('--uniprot_database_path', type=str, - default=None, required=True, - help='Path to the Uniprot database for use ' - 'by JackHMMER.') - parser.add_argument('--pdb_seqres_database_path', type=str, - default=None, required=True, - help='Path to the PDB ' - 'seqres database for use by hmmsearch.') - parser.add_argument('--uniref90_database_path', type=str, - default=None, required=True, - help='Path to the Uniref90 database for use ' - 'by JackHMMER.') - parser.add_argument('--mgnify_database_path', type=str, - default=None, required=True, - help='Path to the MGnify database for use by ' - 'JackHMMER.') - parser.add_argument('--bfd_database_path', type=str, default=None, - help='Path to the BFD database for use by HHblits.') - parser.add_argument('--small_bfd_database_path', type=str, default=None, - help='Path to the small version of BFD used ' - 'with the "reduced_dbs" preset.') - parser.add_argument('--uniclust30_database_path', type=str, default=None, - help='Path to the Uniclust30 database for use ' - 'by HHblits.') - # RNA MSA searching databases - 
parser.add_argument('--rfam_database_path', type=str, - default=None, required=True, - help='Path to the Rfam database for RNA MSA searching.') - parser.add_argument('--template_mmcif_dir', type=str, - default=None, required=True, - help='Path to a directory with template mmCIF ' - 'structures, each named .cif') - parser.add_argument('--max_template_date', type=str, - default=None, required=True, - help='Maximum template release date to consider. ' - 'Important if folding historical test sets.') - parser.add_argument('--obsolete_pdbs_path', type=str, - default=None, required=True, - help='Path to file containing a mapping from ' - 'obsolete PDB IDs to the PDB IDs of their ' - 'replacements.') - parser.add_argument('--preset', - default='full_dbs', required=False, - choices=['reduced_dbs', 'full_dbs'], - help='Choose preset model configuration - ' - 'no ensembling and smaller genetic database ' - 'config (reduced_dbs), no ensembling and full ' - 'genetic database config (full_dbs)') - parser.add_argument('--maxit_binary', type=str, default=None) - args = parser.parse_args() - main(args) + main() diff --git a/apps/protein_folding/helixfold3/utils/__init__.py b/apps/protein_folding/helixfold3/helixfold/utils/__init__.py similarity index 100% rename from apps/protein_folding/helixfold3/utils/__init__.py rename to apps/protein_folding/helixfold3/helixfold/utils/__init__.py diff --git a/apps/protein_folding/helixfold3/utils/misc.py b/apps/protein_folding/helixfold3/helixfold/utils/misc.py similarity index 100% rename from apps/protein_folding/helixfold3/utils/misc.py rename to apps/protein_folding/helixfold3/helixfold/utils/misc.py diff --git a/apps/protein_folding/helixfold3/utils/model.py b/apps/protein_folding/helixfold3/helixfold/utils/model.py similarity index 100% rename from apps/protein_folding/helixfold3/utils/model.py rename to apps/protein_folding/helixfold3/helixfold/utils/model.py diff --git a/apps/protein_folding/helixfold3/utils/utils.py 
# Poetry packaging metadata for the `helixfold` package.
[build-system]
requires = ["poetry-core>=1.0.0,<2.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "helixfold"
version = "3.0.0"
description = "Code for helixfold v3"
# NOTE(review): placeholder author entry - replace with real "Name <email>" values.
authors = ["Name "]

readme = "README.md"
license = "MIT"
repository = "https://github.com/PaddlePaddle/PaddleHelix/blob/dev/apps/protein_folding/helixfold3"
# NOTE(review): verify these strings against the official PyPI trove classifier list.
classifiers = [
    "Topic :: Scientific/Engineering :: Biochemistry",
    "Topic :: Scientific/Engineering :: Protein Engineering"
]


# NOTE(review): the second entry looks redundant with the package include above - confirm.
packages = [
    { include = "helixfold" },
    { include = "helixfold/*.py" },
]


# PaddlePaddle is not listed here; setup_env.sh installs the GPU wheel separately.
[tool.poetry.dependencies]
python = "^3.8"

absl-py = "0.13.0"
biopython = "1.79"
chex = "0.0.7"
dm-haiku = "0.0.4"
dm-tree = "0.1.6"
docker = "5.0.0"
immutabledict = "2.0.0"
jax = "0.2.14"
ml-collections = "0.1.0"
pandas = "1.3.4"
scipy = "1.9.0"
rdkit-pypi = "2022.9.5"
posebusters = "*"
# Imported by helixfold/inference.py for the configuration-driven entry point.
hydra-core= "^1.3.2"
omegaconf = "^2.3.0"



# Console command `helixfold` runs the @hydra.main-decorated main() in helixfold/inference.py.
[tool.poetry.scripts]
helixfold = 'helixfold.inference:main'
#!/bin/bash
# Create the `helixfold` conda environment and install HelixFold3 into it.
#
# Stop at the first failing command: without this, a failed `conda create`
# would let every later install land in whatever environment is active.
set -eo pipefail

ENV_NAME='helixfold'
# CUDA toolkit version installed below. It has to match the PaddlePaddle
# wheel at the bottom of this script (built for cuda11.7 / cudnn8.4.1).
CUDA=11.7

# follow https://developer.nvidia.com/cuda-downloads to install cuda and cudatoolkit

# Install py env
conda create -n "${ENV_NAME}" -y -c conda-forge pip python=3.9
# Load conda's shell functions so `conda activate` works in a non-interactive script.
source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate "${ENV_NAME}"
conda install -y cudnn=8.4.1 "cudatoolkit=${CUDA}" nccl=2.14.3 -c conda-forge -c nvidia

conda install -y -c bioconda hmmer==3.3.2 kalign2==2.04 hhsuite==3.3.0
conda install -y -c conda-forge openbabel

# Use `python -m pip` throughout so the environment's interpreter is the one installing.
python -m pip install --upgrade 'pip<24'
python -m pip install . --no-cache-dir

python -m pip install https://paddle-wheel.bj.bcebos.com/2.5.1/linux/linux-gpu-cuda11.7-cudnn8.4.1-mkl-gcc8.2-avx/paddlepaddle_gpu-2.5.1.post117-cp39-cp39-linux_x86_64.whl