From 720f818a7512d4831749943e7555b256ddae4478 Mon Sep 17 00:00:00 2001
From: "Daniel L. Parton"
Date: Mon, 3 Aug 2015 17:56:53 -0400
Subject: [PATCH 01/10] Created and document CLI for residue renumbering

---
 docs/cli_docs.rst                           | 23 ++++++++++++++++
 ensembler/cli.py                            |  2 +-
 ensembler/cli_commands/__init__.py          |  4 ++-
 ensembler/cli_commands/general.py           |  1 +
 ensembler/cli_commands/refine_explicit.py   |  1 +
 ensembler/cli_commands/renumber_residues.py | 30 +++++++++++++++++++++
 ensembler/initproject.py                    |  1 +
 ensembler/tools/renumber_residues.py        |  2 ++
 8 files changed, 62 insertions(+), 2 deletions(-)
 create mode 100644 ensembler/cli_commands/renumber_residues.py

diff --git a/docs/cli_docs.rst b/docs/cli_docs.rst
index 72d225c..ba80c57 100644
--- a/docs/cli_docs.rst
+++ b/docs/cli_docs.rst
@@ -41,3 +41,26 @@
 PDB-format coordinate files in the directory ``templates/structures-resolved``.
 Each structure should be named XXX.pdb, where XXX matches the identifier in the fasta file.
 The residues in the coordinate files should also match the sequences in the fasta file.
+
+Additional Tools
+================
+
+Ensembler includes a ``tools`` submodule, which allows the user to conduct
+various useful tasks which are not considered core pipeline functions. The
+use-cases for many of these tools are quite specific, so they may not be
+applicable to every project, and should also be used with caution.
+
+Residue renumbering according to UniProt sequence coordinates
+-------------------------------------------------------------
+
+::
+
+    $ ensembler renumber_residues --target EGFR_HUMAN_D0
+
+The given target ID must begin with a UniProt mnemonic, e.g. "EGFR_HUMAN".
+This will output two files in the ``models/[target_id]`` directory:
+``topol-renumbered-implicit.pdb`` and ``topol-renumbered-explicit.pdb``.
+The coordinates are simply copied from the first example found for each of
+``refined-implicit.pdb.gz`` and ``refined-explicit.pdb.gz``. The residue
+numbers are renumbered according to the canonical isoform sequence coordinates
+in the UniProt entry.
diff --git a/ensembler/cli.py b/ensembler/cli.py
index 76e15b6..49dce8d 100644
--- a/ensembler/cli.py
+++ b/ensembler/cli.py
@@ -42,4 +42,4 @@ def main():
     if not command_dispatched and args['--help']:
         print('\n'.join([ensembler.cli_commands.general.helpstring_header,
                          ensembler.cli_commands.general.ensembler_helpstring]))
-    pass
\ No newline at end of file
+    pass
diff --git a/ensembler/cli_commands/__init__.py b/ensembler/cli_commands/__init__.py
index 1ebf6a4..e121d43 100644
--- a/ensembler/cli_commands/__init__.py
+++ b/ensembler/cli_commands/__init__.py
@@ -12,6 +12,7 @@
     'refine_explicit',
     'package_models',
     'quickmodel',
+    'renumber_residues',
 ]
 
 from . import general
@@ -27,4 +28,5 @@
 from . import solvate
 from . import refine_explicit
 from . import package_models
-from . import quickmodel
\ No newline at end of file
+from . import quickmodel
+from . import renumber_residues
\ No newline at end of file
diff --git a/ensembler/cli_commands/general.py b/ensembler/cli_commands/general.py
index 7724c41..01fd8c4 100644
--- a/ensembler/cli_commands/general.py
+++ b/ensembler/cli_commands/general.py
@@ -43,6 +43,7 @@
       [--template_pdbids ] [--template_chainids ]
      [--template_uniprot_query ] [--template_seqid_cutoff ] [--no-loopmodel]
      [--package_for_fah] [--nfahclones ] [--structure_dirs ]
+  ensembler renumber_residues [-h | --help] [--target ] [-v | --verbose]
 
 Commands:
   init                          Initialize a new Ensembler project
diff --git a/ensembler/cli_commands/refine_explicit.py b/ensembler/cli_commands/refine_explicit.py
index 5143ae5..2aeeeb6 100644
--- a/ensembler/cli_commands/refine_explicit.py
+++ b/ensembler/cli_commands/refine_explicit.py
@@ -75,6 +75,7 @@
 helpstring = '\n\n'.join([helpstring_header, '\n\n'.join(helpstring_unique_options), '\n\n'.join(helpstring_nonunique_options)])
 docopt_helpstring = '\n\n'.join(helpstring_unique_options)
 
+
 def dispatch(args):
     if args['--targetsfile']:
         with open(args['--targetsfile'], 'r') as targetsfile:
diff --git a/ensembler/cli_commands/renumber_residues.py b/ensembler/cli_commands/renumber_residues.py
new file mode 100644
index 0000000..2019c54
--- /dev/null
+++ b/ensembler/cli_commands/renumber_residues.py
@@ -0,0 +1,30 @@
+from ensembler.tools.renumber_residues import RenumberResidues
+
+helpstring_header = """Renumber residues using the canonical UniProt sequence coordinates.
+Target IDs must start with the UniProt mnemonic, e.g. 'ABL1_HUMAN'
+"""
+
+helpstring_unique_options = [
+    """\
+  --target   ID for target to work, e.g. 'ABL1_HUMAN_D0'""",
+]
+
+helpstring_nonunique_options = [
+    """\
+  -v --verbose """,
+]
+
+helpstring = '\n\n'.join([helpstring_header, '\n\n'.join(helpstring_unique_options), '\n\n'.join(helpstring_nonunique_options)])
+docopt_helpstring = '\n\n'.join(helpstring_unique_options)
+
+
+def dispatch(args):
+    if args['--verbose']:
+        log_level = 'debug'
+    else:
+        log_level = 'info'
+
+    RenumberResidues(
+        targetid=args['--target'],
+        log_level=log_level
+    )
diff --git a/ensembler/initproject.py b/ensembler/initproject.py
index 013f2ab..c11080b 100644
--- a/ensembler/initproject.py
+++ b/ensembler/initproject.py
@@ -11,6 +11,7 @@
 from Bio.SeqRecord import SeqRecord
 
 import ensembler
+import ensembler.version
 import ensembler.targetexplorer
 import ensembler.uniprot
 import ensembler.pdb
diff --git a/ensembler/tools/renumber_residues.py b/ensembler/tools/renumber_residues.py
index a4c9aed..b624ffe 100644
--- a/ensembler/tools/renumber_residues.py
+++ b/ensembler/tools/renumber_residues.py
@@ -19,6 +19,8 @@ def __init__(self, targetid, project_dir='.', log_level=None):
         set_loglevel(log_level)
         self.targetid = targetid
         self.models_target_dir = os.path.join(default_project_dirnames.models, self.targetid)
+        if not os.path.exists(self.models_target_dir):
+            raise Exception('Model "{}" not found'.format(self.targetid))
         self.project_dir = project_dir
         self.uniprot_mnemonic = '_'.join(self.targetid.split('_')[0:2])
         self._get_models()

From 59aa9c7d07bd3754dec41915f446699ae218b3b1 Mon Sep 17 00:00:00 2001
From: "Daniel L. Parton"
Date: Mon, 3 Aug 2015 23:11:33 -0400
Subject: [PATCH 02/10] Refactored package_for_fah

---
 ensembler/cli_commands/__init__.py       |   2 +-
 ensembler/cli_commands/package_models.py |   2 +-
 ensembler/packaging.py                   | 551 ++++++++++++-----------
 ensembler/tests/integrationtest_utils.py |   4 +-
 ensembler/tests/test_packaging.py        |  47 ++
 ensembler/utils.py                       |  16 +-
 setup.py                                 |  11 +-
 7 files changed, 356 insertions(+), 277 deletions(-)
 create mode 100644 ensembler/tests/test_packaging.py

diff --git a/ensembler/cli_commands/__init__.py b/ensembler/cli_commands/__init__.py
index e121d43..54b9806 100644
--- a/ensembler/cli_commands/__init__.py
+++ b/ensembler/cli_commands/__init__.py
@@ -29,4 +29,4 @@
 from . import refine_explicit
 from . import package_models
 from . import quickmodel
-from . import renumber_residues
\ No newline at end of file
+from . import renumber_residues
diff --git a/ensembler/cli_commands/package_models.py b/ensembler/cli_commands/package_models.py
index 7e03132..1fe3c8d 100644
--- a/ensembler/cli_commands/package_models.py
+++ b/ensembler/cli_commands/package_models.py
@@ -109,5 +109,5 @@ def dispatch(args):
         template_seqid_cutoff=template_seqid_cutoff,
         nclones=n_fah_clones,
         archive=archive,
-        verbose=args['--verbose'],
+        loglevel=loglevel,
     )
\ No newline at end of file
diff --git a/ensembler/packaging.py b/ensembler/packaging.py
index 2e44c09..7d51d5a 100644
--- a/ensembler/packaging.py
+++ b/ensembler/packaging.py
@@ -1,16 +1,23 @@
 import os
 import subprocess
-import numpy as np
-import ensembler
-from ensembler.core import mpistate, logger
+from ensembler.core import mpistate, logger, default_project_dirnames
+from ensembler.core import get_targets_and_templates, select_templates_by_seqid_cutoff
+from ensembler.utils import set_loglevel, read_file_contents_gz_or_not
+from ensembler.refinement import auto_select_openmm_platform
 import simtk.unit as unit
 import simtk.openmm as openmm
+
+fah_projects_dir = os.path.join(default_project_dirnames.packaged_models, 'fah-projects')
+
 
 def package_for_fah(process_only_these_targets=None,
-                    process_only_these_templates=None, template_seqid_cutoff=None,
-                    verbose=False, nclones=1, archive=False):
-    '''Create the input files and directory structure necessary to start a Folding@Home project.
+                    process_only_these_templates=None,
+                    template_seqid_cutoff=None,
+                    nclones=1, archive=False,
+                    openmm_platform=None,
+                    loglevel=None):
+    """
+    Create the input files and directory structure necessary to start a Folding@Home project.
 
     MPI-enabled.
 
     Parameters
     ----------
     archive : Bool
         A .tgz compressed archive will be created for each individual RUN directory.
- ''' - models_dir = ensembler.core.default_project_dirnames.models - packaged_models_dir = ensembler.core.default_project_dirnames.packaged_models - projects_dir = os.path.join(packaged_models_dir, 'fah-projects') + """ + set_loglevel(loglevel) + if mpistate.rank == 0: - if not os.path.exists(projects_dir): - os.mkdir(projects_dir) + if not os.path.exists(fah_projects_dir): + os.mkdir(fah_projects_dir) mpistate.comm.Barrier() - targets, templates_resolved_seq = ensembler.core.get_targets_and_templates() - - if process_only_these_templates: - selected_template_indices = [i for i, seq in enumerate(templates_resolved_seq) if seq.id in process_only_these_templates] - else: - selected_template_indices = range(len(templates_resolved_seq)) - - def generateRun(run): - """ - Build Folding@Home RUN and CLONE subdirectories from (possibly compressed) OpenMM serialized XML files. - - ARGUMENTS - - run (int) - run index - """ - - if verbose: print("Building RUN %d" % run) - - try: - import os, shutil - import gzip - - # Determine directory and pathnames. - rundir = os.path.join(project_dir, 'RUN%d' % run) - template_filename = os.path.join(rundir, 'template.txt') - seqid_filename = os.path.join(rundir, 'sequence-identity.txt') - system_filename = os.path.join(rundir, 'system.xml') - integrator_filename = os.path.join(rundir, 'integrator.xml') - protein_structure_filename = os.path.join(rundir, 'protein.pdb') - system_structure_filename = os.path.join(rundir, 'system.pdb') - final_state_filename = os.path.join(rundir, 'state%d.xml' % (nclones - 1)) - protein_structure_gz_filename_source = os.path.join(source_dir, 'implicit-refined.pdb.gz') - system_structure_gz_filename_source = os.path.join(source_dir, 'explicit-refined.pdb.gz') - - # Return if this directory has already been set up. - if os.path.exists(rundir): - if os.path.exists(template_filename)\ - and os.path.exists(seqid_filename)\ - and os.path.exists(system_filename)\ - and os.path.exists(integrator_filename)\ - and os.path.exists(protein_structure_filename)\ - and os.path.exists(system_structure_filename)\ - and os.path.exists(final_state_filename): - return - else: - # Construct run directory if it does not exist. - if not os.path.exists(rundir): - os.makedirs(rundir) - - # Write template information. - [filepath, template_name] = os.path.split(source_dir) - with open(template_filename, 'w') as outfile: - outfile.write(template_name + '\n') - - # Write the protein and system structure pdbs - with gzip.open(protein_structure_gz_filename_source) as protein_structure_file_source: - with open(protein_structure_filename, 'w') as protein_structure_file: - protein_structure_file.write(protein_structure_file_source.read()) - - with gzip.open(system_structure_gz_filename_source) as system_structure_file_source: - with open(system_structure_filename, 'w') as system_structure_file: - system_structure_file.write(system_structure_file_source.read()) - - # Read system, integrator, and state. 
- def readFileContents(filename): - fullpath = os.path.join(source_dir, filename) - - if os.path.exists(fullpath): - infile = open(fullpath, 'r') - elif os.path.exists(fullpath+'.gz'): - infile = gzip.open(fullpath+'.gz', 'r') - else: - import ipdb; ipdb.set_trace() - raise IOError('File %s not found' % filename) - - contents = infile.read() - infile.close() - return contents - - def writeFileContents(filepath, contents): - with open(filepath, 'w') as outfile: - outfile.write(contents) - - system = openmm.XmlSerializer.deserialize(readFileContents('explicit-system.xml')) - state = openmm.XmlSerializer.deserialize(readFileContents('explicit-state.xml')) - - # Substitute default box vectors. - box_vectors = state.getPeriodicBoxVectors() - system.setDefaultPeriodicBoxVectors(*box_vectors) - - # Write sequence identity. - contents = readFileContents('sequence-identity.txt') - writeFileContents(seqid_filename, contents) - - # Integrator settings. - constraint_tolerance = 1.0e-5 - timestep = 2.0 * unit.femtoseconds - collision_rate = 1.0 / unit.picosecond - temperature = 300.0 * unit.kelvin - - # Create new integrator to use. - integrator = openmm.LangevinIntegrator(temperature, collision_rate, timestep) - - # TODO: Make sure MonteCarloBarostat temperature matches set temperature. - - # Serialize System. - writeFileContents(system_filename, openmm.XmlSerializer.serialize(system)) - - # Serialize Integrator - writeFileContents(integrator_filename, openmm.XmlSerializer.serialize(integrator)) - - # Create Context so we can randomize velocities. - platform = openmm.Platform.getPlatformByName('Reference') - context = openmm.Context(system, integrator, platform) - context.setPositions(state.getPositions()) - context.setVelocities(state.getVelocities()) - box_vectors = state.getPeriodicBoxVectors() - context.setPeriodicBoxVectors(*box_vectors) - - # Create clones with different random initial velocities. - for clone_index in range(nclones): - state_filename = os.path.join(rundir, 'state%d.xml' % clone_index) - if os.path.exists(state_filename): - continue - context.setVelocitiesToTemperature(temperature) - state = context.getState(getPositions=True, getVelocities=True, getForces=True, getEnergy=True, getParameters=True, enforcePeriodicBox=True) - writeFileContents(state_filename, openmm.XmlSerializer.serialize(state)) - - # Clean up. - del context, integrator, state, system - - except Exception as e: - import traceback - print(traceback.format_exc()) - print(str(e)) - - return - - - def archiveRun(): - archive_filename = os.path.join(project_dir, 'RUN%d.tgz' % run_index) - run_dir = os.path.join(project_dir, 'RUN%d' % run_index) - subprocess.call(['tar', 'zcf', archive_filename, run_dir]) + targets, templates_resolved_seq = get_targets_and_templates() + if not openmm_platform: + openmm_platform = auto_select_openmm_platform() for target in targets: + if process_only_these_targets and (target.id not in process_only_these_targets): + continue - # Process only specified targets if directed. 
- if process_only_these_targets and (target.id not in process_only_these_targets): continue + target_project_dir = os.path.join(fah_projects_dir, target.id) - models_target_dir = os.path.join(models_dir, target.id) - if mpistate.rank == 0: - if not os.path.exists(models_target_dir): continue + models_target_dir = os.path.join(default_project_dirnames.models, target.id) + if not os.path.exists(models_target_dir): + continue mpistate.comm.Barrier() + sorted_valid_templates = [] + if mpistate.rank == 0: - print("-------------------------------------------------------------------------") - print("Building FAH OpenMM project for target %s" % target.id) - print("-------------------------------------------------------------------------") - - # ======== - # Build a list of valid templates - # ======== - - # Process all templates. - if verbose: print("Building list of valid templates...") - valid_templates = list() - - if template_seqid_cutoff: - process_only_these_templates = ensembler.core.select_templates_by_seqid_cutoff(target.id, seqid_cutoff=template_seqid_cutoff) - selected_template_indices = [i for i, seq in enumerate(templates_resolved_seq) if seq.id in process_only_these_templates] - - ntemplates_selected = len(selected_template_indices) - - for template_index in range(mpistate.rank, ntemplates_selected, mpistate.size): - template = templates_resolved_seq[selected_template_indices[template_index]] - # Check to make sure all files needed are present. - is_valid = True - filenames = ['explicit-system.xml', 'explicit-state.xml', 'explicit-integrator.xml'] - for filename in filenames: - fullpath = os.path.join(models_target_dir, template.id, filename) - if not (os.path.exists(fullpath) or os.path.exists(fullpath+'.gz')): - is_valid = False - # Exclude those that are not unique by clustering. - unique_by_clustering = os.path.exists(os.path.join(models_target_dir, template.id, 'unique_by_clustering')) - if not unique_by_clustering: - is_valid = False - - # Append if valid. 
- if is_valid: - valid_templates.append(template) - - nvalid = len(valid_templates) - if verbose: print("%d valid unique initial starting conditions found" % nvalid) - - # ======== - # Sort by sequence identity - # ======== - - if verbose: print("Sorting templates in order of decreasing sequence identity...") - sequence_identities = np.zeros([nvalid], np.float32) - for (template_index, template) in enumerate(valid_templates): - filename = os.path.join(models_target_dir, template.id, 'sequence-identity.txt') - with open(filename, 'r') as infile: - contents = infile.readline().strip() - sequence_identity = float(contents) - sequence_identities[template_index] = sequence_identity - sorted_indices = np.argsort(-sequence_identities) - valid_templates = [ valid_templates[index] for index in sorted_indices ] - if verbose: - print("Sorted") - print(sequence_identities[sorted_indices]) - - # ======== - # Create project directory - # ======== - - project_dir = os.path.join(projects_dir, target.id) - if mpistate.rank == 0: - if not os.path.exists(project_dir): - os.makedirs(project_dir) + logger.info('-------------------------------------------------------------------------') + logger.info('Building FAH OpenMM project for target {}'.format(target.id)) + logger.info('-------------------------------------------------------------------------') - mpistate.comm.Barrier() + valid_templates = get_valid_templates_for_target( + target, + templates_resolved_seq, + process_only_these_templates, + template_seqid_cutoff + ) - # ======== - # Build runs in parallel - # ======== + sorted_valid_templates = sort_valid_templates_by_seqid( + target, + valid_templates + ) - if verbose: print("Building RUNs in parallel...") - for run_index in range(mpistate.rank, len(valid_templates), mpistate.size): - print("-------------------------------------------------------------------------") - print("Building RUN for template %s" % valid_templates[run_index].id) - print("-------------------------------------------------------------------------") + create_target_project_dir(target) + + sorted_valid_templates = mpistate.comm.bcast(sorted_valid_templates, root=0) + + logger.debug("Building RUNs in parallel...") + + for run_index in range(mpistate.rank, len(sorted_valid_templates), mpistate.size): + logger.info('-------------------------------------------------------------------------') + logger.info('Building RUN for template {}'.format(sorted_valid_templates[run_index].id)) + logger.info('-------------------------------------------------------------------------') + + template = sorted_valid_templates[run_index] + + source_dir = os.path.join(models_target_dir, template.id) + generate_fah_run( + target_project_dir, + template, + source_dir, + run_index, + nclones, + openmm_platform, + ) - source_dir = os.path.join(models_target_dir, valid_templates[run_index].id) - generateRun(run_index) if archive: - archiveRun() - - # TODO - get this working - - # if mpistate.rank == 0: - # - # # ======== - # # Metadata - # # ======== - # - # import sys - # import yaml - # import ensembler.version - # import simtk.openmm.version - # datestamp = ensembler.core.get_utcnow_formatted() - # - # meta_filepath = os.path.join(models_target_dir, 'meta.yaml') - # with open(meta_filepath) as meta_file: - # metadata = yaml.load(meta_file, Loader=ensembler.core.YamlLoader) - # - # metadata['package_for_fah'] = { - # 'target_id': target.id, - # 'datestamp': datestamp, - # 'python_version': sys.version.split('|')[0].strip(), - # 'python_full_version': 
ensembler.core.literal_str(sys.version), - # 'ensembler_version': ensembler.version.short_version, - # 'ensembler_commit': ensembler.version.git_revision, - # 'biopython_version': Bio.__version__, - # 'openmm_version': simtk.openmm.version.short_version, - # 'openmm_commit': simtk.openmm.version.git_revision - # } - # - # meta_filepath = os.path.join(project_dir, 'meta.yaml') - # metadata = ensembler.core.ProjectMetadata(metadata) - # metadata.write(meta_filepath) + archive_fah_run(target, run_index) mpistate.comm.Barrier() if mpistate.rank == 0: print('Done.') +filenames_necessary_for_fah_packaging = [ + 'unique_by_clustering', + 'explicit-system.xml', + 'explicit-state.xml', + 'explicit-integrator.xml', +] + + +def get_valid_templates_for_target(target, + templates_resolved_seq, + process_only_these_templates, + template_seqid_cutoff + ): + logger.debug("Building list of valid templates...") + models_target_dir = os.path.join(default_project_dirnames.models, target.id) + if template_seqid_cutoff: + selected_templates = select_templates_by_seqid_cutoff( + target.id, seqid_cutoff=template_seqid_cutoff + ) + elif process_only_these_templates: + selected_templates = [ + seq_obj for seq_obj in templates_resolved_seq + if seq_obj.id in process_only_these_templates + ] + else: + selected_templates = templates_resolved_seq + + valid_templates = [] + + for template in selected_templates: + # Check to make sure all files needed are present. + for filename in filenames_necessary_for_fah_packaging: + fullpath = os.path.join(models_target_dir, template.id, filename) + if not (os.path.exists(fullpath) or os.path.exists(fullpath+'.gz')): + continue + valid_templates.append(template) + + logger.debug('{} valid unique initial starting conditions found'.format(len(valid_templates))) + + return valid_templates + + +def sort_valid_templates_by_seqid(target, valid_templates): + logger.debug("Sorting templates in order of decreasing sequence identity...") + models_target_dir = os.path.join(default_project_dirnames.models, target.id) + + seqids = [] + + for template in valid_templates: + seqids.append(get_seqid_for_model(models_target_dir, template)) + + sorted_valid_templates_and_seqids = sorted( + zip(valid_templates, seqids), + reverse=True, + key=lambda x: x[1] + ) + + sorted_valid_templates = zip(*sorted_valid_templates_and_seqids)[0] + return sorted_valid_templates + + +def get_seqid_for_model(models_target_dir, template): + seqid_filename = os.path.join(models_target_dir, template.id, 'sequence-identity.txt') + with open(seqid_filename, 'r') as infile: + seqid = float(infile.readline().strip()) + return seqid + + +def create_target_project_dir(target): + target_project_dir = os.path.join(fah_projects_dir, target.id) + if not os.path.exists(target_project_dir): + os.makedirs(target_project_dir) + + +def generate_fah_run(target_project_dir, + template, + source_dir, + run_index, + nclones, + openmm_platform, + ): + """ + Build Folding@Home RUN and CLONE subdirectories from (possibly compressed) OpenMM serialized XML files. + + ARGUMENTS + + run (int) - run index + """ + logger.debug("Building RUN %d" % run_index) + + try: + # Determine directory and pathnames. 
+ run_dir = os.path.join(target_project_dir, 'RUN%d' % run_index) + run_template_id_filepath = os.path.join(run_dir, 'template.txt') + run_seqid_filepath = os.path.join(run_dir, 'sequence-identity.txt') + run_system_filepath = os.path.join(run_dir, 'system.xml') + run_integrator_filepath = os.path.join(run_dir, 'integrator.xml') + run_protein_structure_filepath = os.path.join(run_dir, 'protein.pdb') + run_system_structure_filepath = os.path.join(run_dir, 'system.pdb') + run_final_state_filepath = os.path.join(run_dir, 'state%d.xml' % (nclones - 1)) + source_seqid_filepath = os.path.join(source_dir, 'sequence-identity.txt') + source_protein_structure_filepath = os.path.join(source_dir, 'implicit-refined.pdb.gz') + source_system_structure_filepath = os.path.join(source_dir, 'explicit-refined.pdb.gz') + source_openmm_system_filepath = os.path.join(source_dir, 'explicit-system.xml') + source_openmm_state_filepath = os.path.join(source_dir, 'explicit-state.xml') + + # Return if this directory has already been set up. + if os.path.exists(run_dir): + if ( + os.path.exists(run_template_id_filepath) + and os.path.exists(run_seqid_filepath) + and os.path.exists(run_system_filepath) + and os.path.exists(run_integrator_filepath) + and os.path.exists(run_protein_structure_filepath) + and os.path.exists(run_system_structure_filepath) + and os.path.exists(run_final_state_filepath) + ): + return + else: + # Construct run directory if it does not exist. + if not os.path.exists(run_dir): + os.makedirs(run_dir) + + # Write template ID + with open(run_template_id_filepath, 'w') as outfile: + outfile.write(template.id + '\n') + + # Write the protein and system structure pdbs + + with open(run_protein_structure_filepath, 'w') as protein_structure_file: + protein_structure_file.write( + read_file_contents_gz_or_not(source_protein_structure_filepath) + ) + + with open(run_system_structure_filepath, 'w') as system_structure_file: + system_structure_file.write( + read_file_contents_gz_or_not(source_system_structure_filepath) + ) + + system = openmm.XmlSerializer.deserialize( + read_file_contents_gz_or_not(source_openmm_system_filepath) + ) + state = openmm.XmlSerializer.deserialize( + read_file_contents_gz_or_not(source_openmm_state_filepath) + ) + + # Substitute default box vectors. + box_vectors = state.getPeriodicBoxVectors() + system.setDefaultPeriodicBoxVectors(*box_vectors) + + # Write sequence identity. + with open(run_seqid_filepath, 'w') as run_seqid_file: + run_seqid_file.write(read_file_contents_gz_or_not(source_seqid_filepath)) + + # Integrator settings. + constraint_tolerance = 1.0e-5 + timestep = 2.0 * unit.femtoseconds + collision_rate = 1.0 / unit.picosecond + temperature = 300.0 * unit.kelvin + + # Create new integrator to use. + integrator = openmm.LangevinIntegrator(temperature, collision_rate, timestep) + + # TODO: Make sure MonteCarloBarostat temperature matches set temperature. + + # Serialize System. + with open(run_system_filepath, 'w') as run_system_file: + run_system_file.write(openmm.XmlSerializer.serialize(system)) + + # Serialize Integrator + with open(run_integrator_filepath, 'w') as run_integrator_file: + run_integrator_file.write(openmm.XmlSerializer.serialize(integrator)) + + # Create Context so we can randomize velocities. 
+ platform = openmm.Platform.getPlatformByName(openmm_platform) + context = openmm.Context(system, integrator, platform) + context.setPositions(state.getPositions()) + context.setVelocities(state.getVelocities()) + box_vectors = state.getPeriodicBoxVectors() + context.setPeriodicBoxVectors(*box_vectors) + + # Create clones with different random initial velocities. + for clone_index in range(nclones): + state_filename = os.path.join(run_dir, 'state%d.xml' % clone_index) + if os.path.exists(state_filename): + continue + context.setVelocitiesToTemperature(temperature) + state = context.getState( + getPositions=True, + getVelocities=True, + getForces=True, + getEnergy=True, + getParameters=True, + enforcePeriodicBox=True + ) + with open(state_filename, 'w') as state_file: + state_file.write(openmm.XmlSerializer.serialize(state)) + + except Exception as e: + import traceback + print(traceback.format_exc()) + print(str(e)) + + return + + +def archive_fah_run(target, run_index): + project_target_dir = os.path.join(fah_projects_dir, target.id) + archive_filename = os.path.join(project_target_dir, 'RUN%d.tgz' % run_index) + run_dir = os.path.join(project_target_dir, 'RUN%d' % run_index) + subprocess.call(['tar', 'zcf', archive_filename, run_dir]) + + def package_for_transfer(process_only_these_targets=None): - raise Exception('Not implemented yet.') \ No newline at end of file + raise Exception('Not implemented yet.') diff --git a/ensembler/tests/integrationtest_utils.py b/ensembler/tests/integrationtest_utils.py index 2e93e66..2fc49b7 100644 --- a/ensembler/tests/integrationtest_utils.py +++ b/ensembler/tests/integrationtest_utils.py @@ -37,7 +37,9 @@ def init(self): def targets(self): self.init() distutils.dir_util.copy_tree( - get_installed_resource_filename(os.path.join('example_project', default_project_dirnames.targets)), + get_installed_resource_filename( + os.path.join('example_project', default_project_dirnames.targets) + ), os.path.join(self.project_dir, default_project_dirnames.targets) ) diff --git a/ensembler/tests/test_packaging.py b/ensembler/tests/test_packaging.py new file mode 100644 index 0000000..1fc54c3 --- /dev/null +++ b/ensembler/tests/test_packaging.py @@ -0,0 +1,47 @@ +import os +from nose.plugins.attrib import attr +from ensembler.packaging import package_for_fah +from ensembler.core import default_project_dirnames +from ensembler.tests.integrationtest_utils import integrationtest_context + + +@attr('unit') +def test_package_for_fah(): + with integrationtest_context(set_up_project_stage='refined_explicit'): + package_for_fah( + process_only_these_targets=['EGFR_HUMAN_D0'], + process_only_these_templates=[ + 'KC1D_HUMAN_D0_4HNF_A', + 'KC1D_HUMAN_D0_4KB8_D' + ] + ) + packaged_project_base_path = os.path.join( + default_project_dirnames.packaged_models, + 'fah-projects', + 'EGFR_HUMAN_D0' + ) + assert os.path.exists(packaged_project_base_path) + assert os.path.exists(os.path.join( + packaged_project_base_path, + 'RUN0' + )) + assert os.path.exists(os.path.join( + packaged_project_base_path, + 'RUN1' + )) + run_filenames = [ + 'template.txt', + 'system.pdb', + 'protein.pdb', + 'sequence-identity.txt', + 'system.xml', + 'integrator.xml', + 'state0.xml', + ] + for run_id in range(2): + for run_filename in run_filenames: + assert os.path.exists(os.path.join( + packaged_project_base_path, + 'RUN{}'.format(run_id), + run_filename + )) diff --git a/ensembler/utils.py b/ensembler/utils.py index 226f2cb..e50fbfa 100644 --- a/ensembler/utils.py +++ b/ensembler/utils.py @@ -1,5 +1,6 @@ 
import contextlib import os +import gzip import logging import functools import shutil @@ -114,4 +115,17 @@ def wrapper(*args, **kwargs): def set_arg_with_default(arg, default_arg): if arg is None: arg = default_arg - return arg \ No newline at end of file + return arg + + +def read_file_contents_gz_or_not(base_filepath): + if os.path.exists(base_filepath): + with open(base_filepath) as infile: + contents = infile.read() + elif os.path.exists(base_filepath+'.gz'): + with gzip.open(base_filepath+'.gz') as infile: + contents = infile.read() + else: + raise IOError('File {} not found'.format(base_filepath)) + + return contents diff --git a/setup.py b/setup.py index 333eb48..87ca3c6 100644 --- a/setup.py +++ b/setup.py @@ -4,17 +4,20 @@ ########################## VERSION = "1.0.3" -ISRELEASED = True +ISRELEASED = False __version__ = VERSION ########################## + def read_readme(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() + ########################## # Function for determining current git commit ########################## + def git_version(): # Return the git revision as a string # copied from numpy setup.py @@ -41,11 +44,14 @@ def _minimal_ext_cmd(cmd): return GIT_REVISION + ########################## # Function for writing version.py (this will be copied to the install directory) ########################## ensembler_version_filepath = 'ensembler/version.py' + + def write_version_py(filename=ensembler_version_filepath): cnt = """# THIS FILE IS GENERATED FROM ENSEMBLER SETUP.PY short_version = '%(version)s' @@ -77,10 +83,12 @@ def write_version_py(filename=ensembler_version_filepath): finally: a.close() + ########################## # Find package data ########################## + def find_package_data(): package_data = [] basepath = os.path.join('ensembler', 'tests') @@ -96,6 +104,7 @@ def find_package_data(): package_data.append(filepath) return package_data + ########################## # Setup ########################## From 3c41111d3f570b86de4200b056595dcb27856e7d Mon Sep 17 00:00:00 2001 From: "Daniel L. Parton" Date: Mon, 3 Aug 2015 23:17:11 -0400 Subject: [PATCH 03/10] Refactored package_for_fah --- ensembler/packaging.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ensembler/packaging.py b/ensembler/packaging.py index 7d51d5a..f4c3eb8 100644 --- a/ensembler/packaging.py +++ b/ensembler/packaging.py @@ -102,6 +102,7 @@ def package_for_fah(process_only_these_targets=None, filenames_necessary_for_fah_packaging = [ 'unique_by_clustering', + 'sequence-identity.txt', 'explicit-system.xml', 'explicit-state.xml', 'explicit-integrator.xml', From e7cfc47140df40ab00b9022427c5fbe4fa88d5f2 Mon Sep 17 00:00:00 2001 From: "Daniel L. Parton" Date: Mon, 3 Aug 2015 23:22:29 -0400 Subject: [PATCH 04/10] Refactored package_for_fah --- ensembler/packaging.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/ensembler/packaging.py b/ensembler/packaging.py index f4c3eb8..284675d 100644 --- a/ensembler/packaging.py +++ b/ensembler/packaging.py @@ -132,11 +132,17 @@ def get_valid_templates_for_target(target, for template in selected_templates: # Check to make sure all files needed are present. 
+            not_valid = False
         for filename in filenames_necessary_for_fah_packaging:
             fullpath = os.path.join(models_target_dir, template.id, filename)
             if not (os.path.exists(fullpath) or os.path.exists(fullpath+'.gz')):
-                continue
-        valid_templates.append(template)
+                not_valid = True
+                break
+
+        if not_valid:
+            continue
+        else:
+            valid_templates.append(template)
 
     logger.debug('{} valid unique initial starting conditions found'.format(len(valid_templates)))

From f4378b9c04b1821878f6fe55f3c6ed8506f2d3cd Mon Sep 17 00:00:00 2001
From: "Daniel L. Parton"
Date: Tue, 4 Aug 2015 18:11:26 -0400
Subject: [PATCH 05/10] package_for_fah now uses renumbered topologies if available, and also now only outputs a single system and integrator file for each target

---
 ensembler/cli_commands/general.py        |   2 +-
 ensembler/cli_commands/package_models.py |   6 +-
 ensembler/packaging.py                   | 153 +++++++++++++++++------
 ensembler/tests/integrationtest_utils.py |   2 +-
 ensembler/tests/test_packaging.py        |  14 ++-
 ensembler/tools/renumber_residues.py     |   4 +-
 ensembler/utils.py                       |  10 +-
 7 files changed, 140 insertions(+), 51 deletions(-)

diff --git a/ensembler/cli_commands/general.py b/ensembler/cli_commands/general.py
index 01fd8c4..cbf2922 100644
--- a/ensembler/cli_commands/general.py
+++ b/ensembler/cli_commands/general.py
@@ -36,7 +36,7 @@
      [--api_params ] [-v | --verbose]
   ensembler package_models [-h | --help] [--package_for ] [--targets ] [--targetsfile ] [--templates