diff --git a/scripts/ReadMe.md b/scripts/ReadMe.md index 023632f..2cb3a00 100644 --- a/scripts/ReadMe.md +++ b/scripts/ReadMe.md @@ -48,6 +48,10 @@ This folder contains scripts submitted by users or CCDC scientists for anyone to - Calculates the surface charge for a given structure and surface terminations. Runs both from CMD and Mercury. +## Refcodes With Properties + +- A script for generating refcode lists with specific properties from an easy-to-read control file. + ## Tips A section for top tips in using the repository and GitHub. diff --git a/scripts/refcodes_with_properties/ReadMe.md b/scripts/refcodes_with_properties/ReadMe.md new file mode 100644 index 0000000..dd97202 --- /dev/null +++ b/scripts/refcodes_with_properties/ReadMe.md @@ -0,0 +1,62 @@ +# Refcode List Generator + +## Summary + +A script that allows you to create refcode lists (or CSV files of properties for a refcode list) for simple properties. +The advantage of the script is that the control is via an easy to read file so you can keep an interprettable record of +how a test set was generated in research. You can also then reproduce the list, or indeed run it on a new database and +update it with the same conditions. + +### Relevance + +We want research to be FAIR (Findable, Attributable, Interoperable and Reproducible) - this script means we can create a +simple decscription of the test set used that any researcher could then reproduce from the script and the description. + +## Requirements + +- Tested with CSD Python API version 3.9 on Linux and Windows +- ccdc.io +- ccdc.search + +## Licensing Requirements + +- CSD-Core + +## Instructions on Running + +### Linux command line + +- load the CSD Python API Miniconda environment +- create a text control file with the various control lines specified +- call Python to read the script and specify necessary arguments + +~~~bash +python refcodes_with_properties.py --help +~~~ + +The above will print an extended help message that describes the registered + +You can run the script with an Example file. Results are printed by default and can be redirected to be saved in an +output file, e.g. + +~~~ +python refcodes_with_properties.py -c example_control_file.txt -o mylist.gcd +~~~ + +This will generate a GCD file that can be used in other work. + +### Windows CSD Python API + +- launch a CMD window +- Use the installed version of the CSD Python API, for example C:\Users\ + \CCDC\ccdc-software\csd-python-api assuming the CCDC tools are installed in the ususal place do this + +~~~bat +C:\Users\\CCDC\ccdc-software\csd-python-api\run_python_api.bat refcodes_with_properties.py --help +~~~ + +## Author + +_Jason C.Cole_ 2025 + +> For feedback or to report any issues please contact [support@ccdc.cam.ac.uk](mailto:support@ccdc.cam.ac.uk) diff --git a/scripts/refcodes_with_properties/entry_property_calculator.py b/scripts/refcodes_with_properties/entry_property_calculator.py new file mode 100644 index 0000000..588a17e --- /dev/null +++ b/scripts/refcodes_with_properties/entry_property_calculator.py @@ -0,0 +1,452 @@ +# +# This script can be used for any purpose without limitation subject to the +# conditions at http://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx +# +# This permission notice and the following statement of attribution must be +# included in all copies or substantial portions of this script. +# +# 2025-03-14: created by Jason C. Cole, The Cambridge Crystallographic Data Centre + + +''' +Utility classes for filtering CSD entries based on a property control file +''' + + +_filter_classes = {} + + +def register(cls): + ''' Register a filter class to use in the script. + :param cls: the class to register. + ''' + if cls.name() in _filter_classes: + raise ValueError(f"a class with the name {cls.name()} is already registered. Use a different name") + + _filter_classes[cls.name()] = cls + + +def filter(name): + return _filter_classes[name] + + +def helptext(): + ''' Get help text + ''' + txt = "" + for name in _filter_classes.keys(): + cls = _filter_classes[name] + txt = txt + " %s -> %s," % (name, cls.helptext()) + return txt[:-1] + + +class _Filter(object): + + @staticmethod + def name(): + raise NotImplementedError # override this + + @staticmethod + def helptext(): + raise NotImplementedError # override this + + @staticmethod + def argument_pair(): + raise NotImplementedError # override this + + +class _ComparativeFilter(_Filter): + def __init__(self, args): + value = False + if args.strip() == '1' or args.strip().lower() == 'true': + value = True + + self.expected_value = value + + def value(self, theobject): + raise NotImplementedError # override this + + def __call__(self, theobject): + value = self.value(theobject) + return value == self.expected_value + + +class _RangeFilter(_Filter): + def __init__(self, args): + parts = [p.strip() for p in args.split()] + self.minimum = float(parts[0]) + self.maximum = float(parts[1]) + + def value(self, theobject): + raise NotImplementedError # override this + + def __call__(self, theobject): + value = self.value(theobject) + return value >= self.minimum and value <= self.maximum + + +class AllowedAtomicNumbersFilter(_Filter): + def __init__(self, args): + self.allowed_atomic_numbers = [int(atomic_number) for atomic_number in args.strip().split()] + + @staticmethod + def name(): + return "allowed atomic numbers" + + @staticmethod + def helptext(): + return "specify a set of atomic numbers (space separated) that the structure can have (and no others)" + + def __call__(self, entry): + try: + molecule = entry.crystal.molecule + return len([x for x in molecule.atoms if x.atomic_number in self.allowed_atomic_numbers]) == len(molecule.atoms) + except TypeError: + return False + + +register(AllowedAtomicNumbersFilter) + + +class MustContainAtomicNumbersFilter(_Filter): + def __init__(self, args): + self.must_have_atomic_numbers = [int(atomic_number) for atomic_number in args.strip().split()] + + @staticmethod + def name(): + return "must have atomic numbers" + + @staticmethod + def helptext(): + return "specify a set of atomic numbers (space separated) that the structure must have" + + def __call__(self, entry): + try: + molecule = entry.crystal.molecule + + contains = set() + for x in molecule.atoms: + contains.add(x.atomic_number) + + for x in self.must_have_atomic_numbers: + if x not in contains: + return False + + return True + except TypeError: + return False + + +register(MustContainAtomicNumbersFilter) + + +class OrganicFilter(_ComparativeFilter): + def __init__(self, args): + super().__init__(args) + + @staticmethod + def name(): + return "organic" + + @staticmethod + def helptext(): + return "organic entries or not" + + def value(self, entry): + return entry.is_organic + + +register(OrganicFilter) + + +class PolymericFilter(_ComparativeFilter): + def __init__(self, args): + super().__init__(args) + + @staticmethod + def name(): + return "polymeric" + + @staticmethod + def helptext(): + return "polymeric entries or not" + + def value(self, entry): + return entry.is_polymeric + + +register(PolymericFilter) + + +class AllHaveSitesFilter(_ComparativeFilter): + def __init__(self, args): + super().__init__(args) + + @staticmethod + def name(): + return "all atoms have sites" + + @staticmethod + def helptext(): + return "whether all atoms have to have sites" + + def value(self, entry): + try: + return entry.crystal.molecule.all_atoms_have_sites + except TypeError: + return False + + +register(AllHaveSitesFilter) + + +class DisorderedFilter(_ComparativeFilter): + def __init__(self, args): + super().__init__(args) + + @staticmethod + def name(): + return "disordered" + + @staticmethod + def helptext(): + return "disordered entries or not" + + def value(self, entry): + return entry.has_disorder + + +register(DisorderedFilter) + + +class AtomicWeightFilter(_RangeFilter): + def __init__(self, args): + super().__init__(args) + + @staticmethod + def name(): + return "atomic weight" + + @staticmethod + def helptext(): + return "specify a range of atomic weight (for the whole structure - not individual molecules)" + + def value(self, entry): + try: + molecule = entry.crystal.molecule + return molecule.molecular_weight + except TypeError: + return 0.0 + + +register(AtomicWeightFilter) + + +class AtomCountFilter(_RangeFilter): + def __init__(self, args): + super().__init__(args) + + @staticmethod + def name(): + return "atom count" + + @staticmethod + def helptext(): + return "specify a range of atom counts (for the whole structure - not individual molecules)" + + def value(self, entry): + try: + molecule = entry.crystal.molecule + return len(molecule.atoms) + except TypeError: + return 0 + + +register(AtomCountFilter) + + +class RotatableBondFilter(_RangeFilter): + def __init__(self, args): + super().__init__(args) + + @staticmethod + def name(): + return "rotatable bond count" + + @staticmethod + def helptext(): + return "specify the number of rotatable bonds (for the whole structure - not individual molecules)" + + def value(self, entry): + try: + molecule = entry.crystal.molecule + return sum(x.is_rotatable for x in molecule.bonds) + except TypeError: + return 0 + + +register(RotatableBondFilter) + + +class DonorCountFilter(_RangeFilter): + def __init__(self, args): + super().__init__(args) + + @staticmethod + def name(): + return "donor count" + + @staticmethod + def helptext(): + return "specify a donor atom count range (for the whole structure - not individual molecules)" + + def value(self, entry): + try: + molecule = entry.crystal.molecule + return len([x for x in molecule.atoms if x.is_donor]) + except TypeError: + return 0 + + +register(DonorCountFilter) + + +class AcceptorCountFilter(_RangeFilter): + def __init__(self, args): + super().__init__(args) + + @staticmethod + def name(): + return "acceptor count" + + @staticmethod + def helptext(): + return "specify an acceptor atom count range (for the whole structure - not individual molecules)" + + def value(self, entry): + try: + molecule = entry.crystal.molecule + return len([x for x in molecule.atoms if x.is_acceptor]) + except TypeError: + return 0 + + +register(AcceptorCountFilter) + + +class ComponentCountFilter(_RangeFilter): + def __init__(self, args): + super().__init__(args) + + @staticmethod + def name(): + return "component range" + + @staticmethod + def helptext(): + return "specify a component count range for the whole structure" + + def value(self, entry): + try: + return len(entry.crystal.molecule.components) + except TypeError: + return 0 + + +register(ComponentCountFilter) + + +class ZPrimeFilter(_RangeFilter): + def __init__(self, args): + super().__init__(args) + + @staticmethod + def name(): + return "zprime range" + + @staticmethod + def helptext(): + return "specify a z-prime range" + + def value(self, entry): + return entry.crystal.z_prime + + +register(ZPrimeFilter) + + +class RfactorFilter(_RangeFilter): + def __init__(self, args): + super().__init__(args) + + @staticmethod + def name(): + return "rfactor range" + + @staticmethod + def helptext(): + return "specify r-factor range (in %%)" + + def value(self, entry): + return entry.r_factor + + +register(RfactorFilter) + + +class SpacegroupNumberFilter(_RangeFilter): + def __init__(self, args): + super().__init__(args) + + @staticmethod + def name(): + return "spacegroup number range" + + @staticmethod + def helptext(): + return "specify spacegroup number range" + + def value(self, entry): + return entry.crystal.spacegroup_number_and_setting[0] + + +register(SpacegroupNumberFilter) + + +class FilterEvaluation(object): + def __init__(self): + self._methods = [] + + def add_filter(self, method): + self._methods.append(method) + + def evaluate(self, entry): + for method in self._methods: + try: + if not method(entry): + return False + except TypeError: + return False + + return True + + def values(self, entry): + values = {} + for method in self._methods: + if hasattr(method, "value"): + try: + values[method.name()] = method.value(entry) + except NotImplementedError: + pass + return values + + +def parse_control_file(lines): + evaluator = FilterEvaluation() + for line in lines: + if len(line) > 0 and line[0] != '#': + parts = line.split(":") + if len(parts) > 1: + cls = _filter_classes[parts[0].strip()] + evaluator.add_filter(cls(parts[1])) + return evaluator diff --git a/scripts/refcodes_with_properties/example_control_file.txt b/scripts/refcodes_with_properties/example_control_file.txt new file mode 100644 index 0000000..a511b2d --- /dev/null +++ b/scripts/refcodes_with_properties/example_control_file.txt @@ -0,0 +1,19 @@ +# An example control file - this will find all organic structures +# with up to 100 atoms, Z' = 1, only 1 component that isnt disordered and +# has a low R-Factor +# +# only include organic structures as output +organic : 1 +# number of atoms to allow through +atom count : 0 100 +# Ensure Z-prime is one +zprime range : 0.99 1.01 +# Ensure only one component in the structure +component range : 0 1 +# Dont include disordered structures +disordered : 0 +# Specify an R-factor range +rfactor range : 0.1 5 + + + diff --git a/scripts/refcodes_with_properties/more_elaborate_control.txt b/scripts/refcodes_with_properties/more_elaborate_control.txt new file mode 100644 index 0000000..3590d33 --- /dev/null +++ b/scripts/refcodes_with_properties/more_elaborate_control.txt @@ -0,0 +1,29 @@ +# An example control file +# +# +# only include organic structures as output +organic : 1 +# specify a range of donors +donor count : 0 10 +# specify a range of acceptors +acceptor count : 5 5 +# rotatable bond count range +rotatable bond count : 3 7 +# number of atoms to allow through +atom count : 0 100 +# only include structures containing Hydrogen, Carbon, Nitrogen or Oxygen and nothing else +allowed atomic numbers : 1 6 7 8 +# only include structures containing all of these elements (i.e.) Hydrogen, Carbon, Nitrogen or Oxygen +must have atomic numbers : 1 6 7 8 +# Ensure Z-prime is one +zprime range : 0.99 1.01 +# Ensure only one component in the structure +component range : 0 1 +# Dont include disordered structures +disordered : 0 +# Specify an R-factor range +rfactor range : 0.1 5 +# atomic weight +atomic weight : 0.0 1000.0 + + diff --git a/scripts/refcodes_with_properties/refcodes_with_properties.py b/scripts/refcodes_with_properties/refcodes_with_properties.py new file mode 100644 index 0000000..e98b7ea --- /dev/null +++ b/scripts/refcodes_with_properties/refcodes_with_properties.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python +# +# This script can be used for any purpose without limitation subject to the +# conditions at http://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx +# +# This permission notice and the following statement of attribution must be +# included in all copies or substantial portions of this script. +# +# 2025-03-14: created by Jason C. Cole, The Cambridge Crystallographic Data Centre + +''' +Filter a refcode list to the subset that have the desired properties +''' + +######################################################################### + +import argparse +import csv +import sys + +from ccdc import io + +import entry_property_calculator + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-r', '--refcode_file', help='input file containing the list of refcodes', default=None) + parser.add_argument('-d', '--database_file', help='input file containing the list of refcodes', default=None) + parser.add_argument('-c', '--control_file', help='configuration file containing the desired properties\n\n %s' % ( + entry_property_calculator.helptext())) + parser.add_argument('-v', '--get_values', action="store_true", + help='calculate and print descriptor values where possible rather than filter\n\n %s' % ( + entry_property_calculator.helptext())) + parser.add_argument('-o', '--output_file', default=None, + help='output CSV file for results\n\n %s' % (entry_property_calculator.helptext())) + + args = parser.parse_args() + + refcode_file = args.refcode_file + database_file = args.database_file + control_file = args.control_file + print_values = args.get_values + + outfile = sys.stdout + if args.output_file is not None: + outfile = open(args.output_file, 'wb') + + filterer = entry_property_calculator.parse_control_file(open(control_file, "r").readlines()) + + reader = None + if refcode_file: + reader = io.EntryReader(refcode_file, format='identifiers') + elif database_file: + reader = io.EntryReader(database_file) + else: + reader = io.EntryReader('CSD') + + if args.get_values: + + csvwriter = None + for entry in reader: + values = filterer.values(entry) + if csvwriter == None: + fieldnames = ["identifier"] + values.keys() + csvwriter = csv.DictWriter(outfile, fieldnames=fieldnames) + csvwriter.writeheader() + values["identifier"] = entry.identifier + csvwriter.writerow(values) + + else: + for entry in reader: + if filterer.evaluate(entry): + outfile.write(entry.identifier + "\n") diff --git a/scripts/refcodes_with_properties/test_entry_property_calculator.py b/scripts/refcodes_with_properties/test_entry_property_calculator.py new file mode 100644 index 0000000..c89488e --- /dev/null +++ b/scripts/refcodes_with_properties/test_entry_property_calculator.py @@ -0,0 +1,153 @@ +import unittest + +from ccdc.io import EntryReader + +from entry_property_calculator import parse_control_file + + +class TestFiltering(unittest.TestCase): + + def setUp(self): + + self.reader = EntryReader('CSD') + self.aabhtz = self.reader.entry("AABHTZ") + self.aacani_ten = self.reader.entry("AACANI10") + self.aadamc = self.reader.entry("AADAMC") + self.aadrib = self.reader.entry("AADRIB") + self.abadis = self.reader.entry("ABADIS") + + def test_organic_filter(self): + + test_file = """ +organic : 1 +""" + lines = test_file.split('\n') + evaluator = parse_control_file(lines) + + self.assertTrue(evaluator.evaluate(self.aabhtz)) + + self.assertFalse(evaluator.evaluate(self.aacani_ten)) + + def test_component_filter(self): + test_file = """ +component range : 0 1 +""" + lines = test_file.split('\n') + evaluator = parse_control_file(lines) + + self.assertTrue(evaluator.evaluate(self.aabhtz)) + + self.assertFalse(evaluator.evaluate(self.aacani_ten)) + + def test_donor_count_filter(self): + test_file = """ +donor count : 2 2 +""" + lines = test_file.split('\n') + evaluator = parse_control_file(lines) + + self.assertFalse(evaluator.evaluate(self.aabhtz)) + + self.assertTrue(evaluator.evaluate(self.aadamc)) + + test_file = """ +donor count : 0 3 +""" + lines = test_file.split('\n') + evaluator = parse_control_file(lines) + + self.assertTrue(evaluator.evaluate(self.aabhtz)) + self.assertTrue(evaluator.evaluate(self.aadamc)) + + def test_acceptor_count_filter(self): + test_file = """ +acceptor count : 7 7 +""" + lines = test_file.split('\n') + evaluator = parse_control_file(lines) + + # regards Cl as an acceptor ... + self.assertTrue(evaluator.evaluate(self.aabhtz)) + + self.assertTrue(evaluator.evaluate(self.aacani_ten)) + + def test_zprime(self): + test_file = """ +zprime range : 0.99 1.01 +""" + lines = test_file.split('\n') + evaluator = parse_control_file(lines) + self.assertTrue(evaluator.evaluate(self.aabhtz)) + self.assertFalse(evaluator.evaluate(self.aadrib)) + + def test_atomic_numbers(self): + test_file = """ +allowed atomic numbers : 1 6 7 8 +must have atomic numbers : 1 6 7 8 +""" + lines = test_file.split('\n') + evaluator = parse_control_file(lines) + self.assertFalse(evaluator.evaluate(self.aabhtz)) + self.assertFalse(evaluator.evaluate(self.aadrib)) + + test_file = """ +must have atomic numbers : 1 6 7 8 +""" + lines = test_file.split('\n') + evaluator = parse_control_file(lines) + self.assertTrue(evaluator.evaluate(self.aabhtz)) + self.assertFalse(evaluator.evaluate(self.aadrib)) + + def test_rotatable_bond_count(self): + test_file = """ +rotatable bond count : 0 4 +""" + lines = test_file.split('\n') + evaluator = parse_control_file(lines) + self.assertTrue(evaluator.evaluate(self.abadis)) + + def test_multiple(self): + test_file = """ + +# An example control file +# +# +# only include organic structures as output +organic : 1 +# specify a range of donors +donor count : 0 10 +# specify a range of acceptors +acceptor count : 5 5 +# rotatable bond count range +rotatable bond count : 3 7 +# number of atoms to allow through +atom count : 0 100 +# only include structures containing Hydrogen, Carbon, Nitrogen or Oxygen and nothing else +allowed atomic numbers : 1 6 7 8 +# only include structures containing all of these elements (i.e.) Hydrogen, Carbon, Nitrogen or Oxygen +must have atomic numbers : 1 6 7 8 +# Ensure Z-prime is one +zprime range : 0.99 1.01 +# Ensure only one component in the structure +component range : 2 2 +# Dont include disordered structures +disordered : 0 +# Specify an R-factor range +rfactor range : 0.1 5 +# + + +""" + + lines = test_file.split('\n') + evaluator = parse_control_file(lines) + hits = [] + + test_entries = ['AABHTZ', 'ABAQEB', 'ABELEY', 'ADAQOM', 'ADARAA', 'ADARAZ', 'ADUWIG', 'AFEREK'] + for id in test_entries: + e = self.reader.entry(id) + + if evaluator.evaluate(e): + hits.append(e.identifier) + + self.assertEqual(['ABAQEB', 'ABELEY', 'ADAQOM', 'ADUWIG', 'AFEREK'], hits)