Generalized input format logic

Now doesn't need `mofgen`, just `openbabel`. Can still handle cif- to-raspa without `openbabel`.
danieleongari · Jul 30, 2013 · 19289de · 19289de
1 parent d971724
commit 19289de
Show file tree

Hide file tree

Showing 4 changed files with 321 additions and 17 deletions.
diff --git a/README.md b/README.md
@@ -85,5 +85,5 @@ import EQeq
 # Load a file. In practice, this can come from any source, such as a database
 with open("IRMOF-1.cif") as in_file:
     data = in_file.read()
-charges = EQeq.run(data, output_type="list", method="ewald")
+charges = EQeq.run(data, output_type="object", method="ewald")
 ```
diff --git a/eqeq.py b/eqeq.py
@@ -10,9 +10,11 @@
 import sys
 
 try:
-    from mofgen import format_converter
+    import format_converter
 except:
-    sys.stderr.write("Mofgen not found. JSON format conversion unsupported.\n")
+    sys.stderr.write("Openbabel not found. Format conversion unsupported.\n"
+                     "Only use cif files for output, and cif/mol/pdb/car/files"
+                     " for output.")
 
 eqeq = cdll.LoadLibrary("/usr/lib/libeqeq.so")
 eqeq.run.argtypes = (c_char_p, c_char_p, c_double, c_float, c_int, c_char_p,
@@ -32,13 +34,13 @@ def run(structure, input_type="cif", output_type="cif", l=1.2, h_i0=-2.0,
 
     Args:
         structure: Either a filename or data encoding a chemical.
-        input_type: (Optional) Specifies input type. Options are "cif" and
-            "json", where the latter can also take Python objects.
+        input_type: (Optional) Specifies input type. Can be anything supported
+            by openbabel, as well as "json"
         output_type: (Optional) Specifies the output type. Currently, options
             are "cif", "mol", "pdb", "car", "json", "object", and "files". The
-            first four return chemical data formats, "object" returns a Python
-            object, "json" is that object serialized, and "files" saves files
-            of all possible output types.
+            first four return modified chemical data formats, "object" returns
+            a Python object, "json" is that object serialized, and "files"
+            saves files of all possible output types.
         l: (Optional) Lambda, the dielectric screening parameter.
         h_i0: (Optional) The electron affinity of hydrogen.
         charge_precision: (Optional) Number of decimals to use for charges.
@@ -60,16 +62,14 @@ def run(structure, input_type="cif", output_type="cif", l=1.2, h_i0=-2.0,
         output type is set to "files"
     """
     # Error handling on string params. Should spare users some annoyance.
-    i, o, m = input_type.lower(), output_type.lower(), method.lower()
-    if i not in ["cif", "json"]:
-        raise NotImplementedError("Input format '%s' is not supported!" % i)
+    o, m = output_type.lower(), method.lower()
     if o not in ["cif", "pdb", "car", "mol", "json", "object", "files"]:
         raise NotImplementedError("Output format '%s' is not supported!" % o)
     if m not in ["direct", "nonperiodic", "ewald"]:
         raise NotImplementedError("Method '%s' is not supported!" % m)
-    # If linked to mofgen, use it to handle json interconversion externally
-    if input_type == "json":
-        structure = format_converter.convert(structure, "json", "cif")
+    # If linked to openbabel, use it to handle json interconversion externally
+    if input_type != "cif":
+        structure = format_converter.convert(structure, input_type, "cif")
     # Calls libeqeq.so's run method, returning a string of data
     result = eqeq.run(structure, output_type, l, h_i0, charge_precision,
                       method, m_r, m_k, eta, ionization_data_path,
@@ -91,8 +91,8 @@ def run(structure, input_type="cif", output_type="cif", l=1.2, h_i0=-2.0,
     parser.add_argument("input", type=str, help="An input cif file. Can be "
                         "either a filepath or the input data itself")
     parser.add_argument("--input-type", type=str, default="cif",
-                        help="Specifies input type. Options are 'cif' and"
-                        "'json'")
+                        help="Specifies input type. Can be anything supported "
+                        "by openbabel and 'json'")
     parser.add_argument("--output-type", type=str, default="files",
                         help="Specifies the output type. Currently, options "
                         "are 'cif', 'mol', 'pdb', 'car', 'json', and 'files'. "
@@ -126,7 +126,14 @@ def run(structure, input_type="cif", output_type="cif", l=1.2, h_i0=-2.0,
                         "'chargecenters.dat'")
     args = parser.parse_args()
 
-    output = run(args.input, output_type=args.output_type, l=args.l,
+    try:
+        with open(args.input) as in_file:
+            args.input = in_file.read()
+    except IOError:
+        pass
+
+    output = run(args.input, input_type=args.input_type,
+                 output_type=args.output_type, l=args.l,
                  h_i0=args.hi0, charge_precision=args.charge_precision,
                  method=args.method, m_r=args.mr, m_k=args.mk, eta=args.eta,
                  ionization_data_path=args.ionization_data_path,

diff --git a/format_converter.py b/format_converter.py
@@ -0,0 +1,213 @@
+"""
+Methods to interconvert between json and other (cif, mol, smi, etc.) files
+"""
+
+import pybel
+ob = pybel.ob
+import numpy as np
+from numpy.linalg import norm, inv
+
+import json_formatter as json
+
+table = ob.OBElementTable()
+
+
+def convert(data, in_format, out_format, pretty=False):
+    """Converts between two inputted chemical formats.
+
+    Args:
+        data: A string representing the chemical file to be converted. If the
+            `in_format` is "json", this can also be a Python object
+        in_format: The format of the `data` string. Can be "json" or any format
+            recognized by Open Babel
+        out_format: The format to convert to. Can be "json" or any format
+            recognized by Open Babel
+        pretty: (Optional) If True and `out_format` is "json", will pretty-
+            print the output for human readability
+    Returns:
+        A string representing the inputted `data` in the specified `out_format`
+    """
+
+    # Decide on a json formatter depending on desired prettiness
+    dumps = json.dumps if pretty else json.compress
+
+    # If it's a json string, load it
+    if in_format == "json" and isinstance(data, basestring):
+        data = json.loads(data)
+
+    # A little "hack" to format inputted json
+    if in_format == "json" and out_format == "json":
+        return json.dumps(data)
+
+    # These are converted manually to retain crystallographic information
+    if in_format == "json" and out_format == "cif":
+        return json_to_cif(data)
+
+    # These use the open babel library to interconvert, with additions for json
+    mol = (json_to_pybel(data) if in_format == "json" else
+           pybel.readstring(in_format.encode("ascii"),
+                            "".join(i for i in data if ord(i) < 128)
+                            .encode("ascii")))
+
+    # Infer structure in cases where the input format has no specification
+    if not mol.OBMol.HasNonZeroCoords():
+        mol.make3D()
+    mol.OBMol.Center()
+
+    # EQeq takes a specific cif format that openbabel does not output.
+    # This manually overrides that.
+    if out_format == "cif":
+        return json_to_cif(pybel_to_json(mol))
+
+    if out_format == "object":
+        return pybel_to_json(mol)
+    elif out_format == "json":
+        return dumps(pybel_to_json(mol))
+    else:
+        return mol.write(out_format)
+
+
+def json_to_cif(mof):
+    """Converts python mof data structure to a cif file.
+
+    Args:
+        mof: A Python data structure analogous to `structures.Mof`
+    Returns:
+        A cif file containing atom and periodic connection data
+    """
+    va, vb, vc = [np.array(v) for v in mof["periodic_connections"]]
+    a, b, c = [norm(i) for i in [va, vb, vc]]
+    alpha, beta, gamma = [get_angle(i, j) * 180 / np.pi
+                          for i, j in [[vb, vc], [va, vc], [vb, va]]]
+
+    # Headers and unit cell information (note the space groups are wrong)\n"
+    cif = ("data_functionalizedCrystal\n"
+           " _audit_creation_method\t'MofGen v2!'\n"
+           "_symmetry_space_group_name_H-M\t'P1'\n"
+           "_symmetry_Int_Tables_number\t1\n"
+           "_symmetry_cell_setting\ttriclinic\n"
+           "loop_\n"
+           "_symmetry_equiv_pos_as_xyz\n"
+           "   x,y,z\n"
+           "_cell_length_a\t%f\n"
+           "_cell_length_b\t%f\n"
+           "_cell_length_c\t%f\n"
+           "_cell_angle_alpha\t%f\n"
+           "_cell_angle_beta\t%f\n"
+           "_cell_angle_gamma\t%f\n"
+           "loop_\n"
+           "_atom_site_label\n"
+           "_atom_site_type_symbol\n"
+           "_atom_site_fract_x\n"
+           "_atom_site_fract_y\n"
+           "_atom_site_fract_z\n") % (a, b, c, alpha, beta, gamma)
+
+    # Adding atom positional data in fractional coordinates
+    if "building_blocks" in mof:
+        mof["atoms"] = [atom for bb in mof["building_blocks"]
+                        for atom in bb["atoms"]]
+    cif += "\n".join("%s%d\t%s\t%.5f\t%.5f\t%.5f"
+                     % tuple([atom["element"], i, atom["element"]] +
+                     cartesian_to_fractional(atom["location"], va, vb, vc))
+                     for i, atom in enumerate(mof["atoms"]))
+    return cif
+
+
+def get_angle(first_vector, second_vector):
+    """Returns the angle, in radians, between two vectors."""
+    return np.arccos(np.clip(np.dot(first_vector, second_vector) /
+                    (norm(first_vector) * norm(second_vector)), -1, 1))
+
+
+def cartesian_to_fractional(location, va, vb, vc):
+    """Convert cartesian to fractional coordinates using unit cell vectors.
+
+    Args:
+        location: The cartesian coordinates to be converted
+        va, vb, vc: The three crystal vectors of the structure
+    Returns:
+        A three-element list containing the fractional coordinates
+    """
+    # `cartesian` = matrix(`vectors`) * `fractional`. Invert to solve.
+    frac = inv(np.matrix([va, vb, vc]).transpose()).dot(np.array(location))
+    # This constrains the fractional coordinates to [0, 1]
+    return [f % 1 for f in np.nditer(frac)]
+
+
+def json_to_pybel(data, center=True):
+    """Converts python data structure to pybel.Molecule.
+
+    This will infer bond data if not specified.
+
+    Args:
+        data: The loaded json data of a molecule, as a Python object
+        center: (Optional) Centers the coordinates of the outputted molecule
+    Returns:
+        An instance of `pybel.Molecule`
+    """
+    obmol = ob.OBMol()
+    obmol.BeginModify()
+    for atom in data["atoms"]:
+        obatom = obmol.NewAtom()
+        obatom.SetAtomicNum(table.GetAtomicNum(str(atom["element"])))
+        obatom.SetVector(*atom["location"])
+        if "charge" in atom:
+            obatom.SetPartialCharge(atom["charge"])
+    # If there is no bond data, try to infer them
+    if "bonds" not in data or not data["bonds"]:
+        obmol.ConnectTheDots()
+        obmol.PerceiveBondOrders()
+    # Otherwise, use the bonds in the data set
+    else:
+        for bond in data["bonds"]:
+            if "atoms" not in bond:
+                continue
+            obmol.AddBond(bond["atoms"][0] + 1, bond["atoms"][1] + 1,
+                          bond["order"])
+    obmol.EndModify()
+    if center:
+        obmol.Center()
+    return pybel.Molecule(obmol)
+
+
+def pybel_to_json(molecule):
+    """Converts a pybel molecule to json.
+
+    Args:
+        molecule: An instance of `pybel.Molecule`
+    Returns:
+        A Python dictionary containing atom and bond data
+    """
+    # Save atom element type and 3D location.
+    atoms = [{"element": table.GetSymbol(atom.atomicnum),
+              "location": atom.coords}
+             for atom in molecule.atoms]
+    # Saves partial charge data, if exists
+    for atom, mol_atom in zip(atoms, molecule.atoms):
+        if hasattr(mol_atom, "partialcharge") and mol_atom.partialcharge:
+            atom["charge"] = mol_atom.partialcharge
+    # Save number of bonds and indices of endpoint atoms
+    bonds = [{"atoms": [b.GetBeginAtom().GetIndex(),
+                        b.GetEndAtom().GetIndex()],
+              "order": b.GetBondOrder()}
+             for b in ob.OBMolBondIter(molecule.OBMol)]
+    output = {"atoms": atoms, "bonds": bonds}
+
+    # If there's unit cell data, save it to the json output
+    if hasattr(molecule, "unitcell"):
+        uc = molecule.unitcell
+        output["periodic_connections"] = [[v.GetX(), v.GetY(), v.GetZ()]
+                                          for v in uc.GetCellVectors()]
+    return output
+
+
+if __name__ == "__main__":
+    # Lazy converter to test this out
+    import sys
+    in_data, in_format, out_format = sys.argv[1:]
+    try:
+        with open(in_data) as in_file:
+            data = in_file.read()
+    except IOError:
+        data = in_data
+    print convert(data, in_format, out_format, pretty=True)
diff --git a/json_formatter.py b/json_formatter.py
@@ -0,0 +1,84 @@
+"""
+Some small edits to json output.
+ * Float decimals are truncated to three digits
+ * [x, y, z] vectors are displayed on one line
+ * Converts numpy arrays to lists and defined objects to dictionaries
+ * Removes internal parent references in objects
+ * Atoms and bonds are on one line each (looks more like other chem formats)
+"""
+
+from itertools import takewhile
+import re
+import json
+from json import encoder
+encoder.FLOAT_REPR = lambda o: format(o, '.6f')
+
+import numpy as np
+
+load = json.load
+loads = json.loads
+
+def compress(obj):
+    """Outputs json without whitespace + object handling."""
+    return json.dumps(obj, sort_keys=True, separators=(",", ":"),
+                      cls=CustomEncoder)
+
+
+def dumps(obj):
+    """Outputs json with formatting edits + object handling."""
+    return json.dumps(obj, indent=4, sort_keys=True, cls=CustomEncoder)
+
+
+class CustomEncoder(json.JSONEncoder):
+    def default(self, obj):
+        """Fired when an unserializable object is hit."""
+        if hasattr(obj, '__dict__'):
+            d = obj.__dict__.copy()
+            d.pop("parent_block", None)
+            return d
+        elif isinstance(obj, np.ndarray):
+            return obj.copy().tolist()
+        else:
+            raise TypeError(("Object of type %s with value of %s is not JSON "
+                            "serializable") % (type(obj), repr(obj)))
+
+    def encode(self, obj):
+        """Fired for every object."""
+        if hasattr(obj, "history"):
+            del obj.history
+        s = super(CustomEncoder, self).encode(obj)
+        # If uncompressed, postprocess for formatting
+        if len(s.splitlines()) > 1:
+            s = self.postprocess(s)
+        return s
+
+    def postprocess(self, json_string):
+        """Display float lists as one line in json. Useful for vectors."""
+
+        # As a general rule, all three-element float lists are on one line
+        json_string = re.sub("\[\s*([-+]?\d*\.?\d*), \s*([-+]?\d*\.?\d*),"
+                             "\s*([-+]?\d*\.?\d*)\s*\]", r"[ \1, \2, \3 ]",
+                             json_string)
+
+        # This further compresses atoms and bonds to be on one line
+        is_compressing = False
+        compressed = []
+        spaces = 0
+        for row in json_string.split("\n"):
+            if is_compressing:
+                if row.strip() == "{":
+                    compressed.append(row.rstrip())
+                elif row.rstrip() == " " * spaces + "],":
+                    compressed.append(row.rstrip())
+                    is_compressing = False
+                else:
+                    compressed[-1] += " " + row.strip()
+            else:
+                compressed.append(row.rstrip())
+                if any(a in row for a in ["atoms", "bonds"]):
+                    # Fix to handle issues that arise with empty lists
+                    if "[]" in row:
+                        continue
+                    spaces = sum(1 for _ in takewhile(str.isspace, row))
+                    is_compressing = True
+        return "\n".join(compressed)