Skip to content

Commit

Permalink
Generalized input format logic
Browse files Browse the repository at this point in the history
Now doesn't need `mofgen`, just `openbabel`. Can still handle cif-
to-raspa without `openbabel`.
  • Loading branch information
patrickfuller committed Jul 30, 2013
1 parent d971724 commit 19289de
Show file tree
Hide file tree
Showing 4 changed files with 321 additions and 17 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,5 +85,5 @@ import EQeq
# Load a file. In practice, this can come from any source, such as a database
with open("IRMOF-1.cif") as in_file:
data = in_file.read()
charges = EQeq.run(data, output_type="list", method="ewald")
charges = EQeq.run(data, output_type="object", method="ewald")
```
39 changes: 23 additions & 16 deletions eqeq.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@
import sys

try:
from mofgen import format_converter
import format_converter
except:
sys.stderr.write("Mofgen not found. JSON format conversion unsupported.\n")
sys.stderr.write("Openbabel not found. Format conversion unsupported.\n"
"Only use cif files for output, and cif/mol/pdb/car/files"
" for output.")

eqeq = cdll.LoadLibrary("/usr/lib/libeqeq.so")
eqeq.run.argtypes = (c_char_p, c_char_p, c_double, c_float, c_int, c_char_p,
Expand All @@ -32,13 +34,13 @@ def run(structure, input_type="cif", output_type="cif", l=1.2, h_i0=-2.0,
Args:
structure: Either a filename or data encoding a chemical.
input_type: (Optional) Specifies input type. Options are "cif" and
"json", where the latter can also take Python objects.
input_type: (Optional) Specifies input type. Can be anything supported
by openbabel, as well as "json"
output_type: (Optional) Specifies the output type. Currently, options
are "cif", "mol", "pdb", "car", "json", "object", and "files". The
first four return chemical data formats, "object" returns a Python
object, "json" is that object serialized, and "files" saves files
of all possible output types.
first four return modified chemical data formats, "object" returns
a Python object, "json" is that object serialized, and "files"
saves files of all possible output types.
l: (Optional) Lambda, the dielectric screening parameter.
h_i0: (Optional) The electron affinity of hydrogen.
charge_precision: (Optional) Number of decimals to use for charges.
Expand All @@ -60,16 +62,14 @@ def run(structure, input_type="cif", output_type="cif", l=1.2, h_i0=-2.0,
output type is set to "files"
"""
# Error handling on string params. Should spare users some annoyance.
i, o, m = input_type.lower(), output_type.lower(), method.lower()
if i not in ["cif", "json"]:
raise NotImplementedError("Input format '%s' is not supported!" % i)
o, m = output_type.lower(), method.lower()
if o not in ["cif", "pdb", "car", "mol", "json", "object", "files"]:
raise NotImplementedError("Output format '%s' is not supported!" % o)
if m not in ["direct", "nonperiodic", "ewald"]:
raise NotImplementedError("Method '%s' is not supported!" % m)
# If linked to mofgen, use it to handle json interconversion externally
if input_type == "json":
structure = format_converter.convert(structure, "json", "cif")
# If linked to openbabel, use it to handle json interconversion externally
if input_type != "cif":
structure = format_converter.convert(structure, input_type, "cif")
# Calls libeqeq.so's run method, returning a string of data
result = eqeq.run(structure, output_type, l, h_i0, charge_precision,
method, m_r, m_k, eta, ionization_data_path,
Expand All @@ -91,8 +91,8 @@ def run(structure, input_type="cif", output_type="cif", l=1.2, h_i0=-2.0,
parser.add_argument("input", type=str, help="An input cif file. Can be "
"either a filepath or the input data itself")
parser.add_argument("--input-type", type=str, default="cif",
help="Specifies input type. Options are 'cif' and"
"'json'")
help="Specifies input type. Can be anything supported "
"by openbabel and 'json'")
parser.add_argument("--output-type", type=str, default="files",
help="Specifies the output type. Currently, options "
"are 'cif', 'mol', 'pdb', 'car', 'json', and 'files'. "
Expand Down Expand Up @@ -126,7 +126,14 @@ def run(structure, input_type="cif", output_type="cif", l=1.2, h_i0=-2.0,
"'chargecenters.dat'")
args = parser.parse_args()

output = run(args.input, output_type=args.output_type, l=args.l,
try:
with open(args.input) as in_file:
args.input = in_file.read()
except IOError:
pass

output = run(args.input, input_type=args.input_type,
output_type=args.output_type, l=args.l,
h_i0=args.hi0, charge_precision=args.charge_precision,
method=args.method, m_r=args.mr, m_k=args.mk, eta=args.eta,
ionization_data_path=args.ionization_data_path,
Expand Down
213 changes: 213 additions & 0 deletions format_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
"""
Methods to interconvert between json and other (cif, mol, smi, etc.) files
"""

import pybel
ob = pybel.ob
import numpy as np
from numpy.linalg import norm, inv

import json_formatter as json

table = ob.OBElementTable()


def convert(data, in_format, out_format, pretty=False):
"""Converts between two inputted chemical formats.
Args:
data: A string representing the chemical file to be converted. If the
`in_format` is "json", this can also be a Python object
in_format: The format of the `data` string. Can be "json" or any format
recognized by Open Babel
out_format: The format to convert to. Can be "json" or any format
recognized by Open Babel
pretty: (Optional) If True and `out_format` is "json", will pretty-
print the output for human readability
Returns:
A string representing the inputted `data` in the specified `out_format`
"""

# Decide on a json formatter depending on desired prettiness
dumps = json.dumps if pretty else json.compress

# If it's a json string, load it
if in_format == "json" and isinstance(data, basestring):
data = json.loads(data)

# A little "hack" to format inputted json
if in_format == "json" and out_format == "json":
return json.dumps(data)

# These are converted manually to retain crystallographic information
if in_format == "json" and out_format == "cif":
return json_to_cif(data)

# These use the open babel library to interconvert, with additions for json
mol = (json_to_pybel(data) if in_format == "json" else
pybel.readstring(in_format.encode("ascii"),
"".join(i for i in data if ord(i) < 128)
.encode("ascii")))

# Infer structure in cases where the input format has no specification
if not mol.OBMol.HasNonZeroCoords():
mol.make3D()
mol.OBMol.Center()

# EQeq takes a specific cif format that openbabel does not output.
# This manually overrides that.
if out_format == "cif":
return json_to_cif(pybel_to_json(mol))

if out_format == "object":
return pybel_to_json(mol)
elif out_format == "json":
return dumps(pybel_to_json(mol))
else:
return mol.write(out_format)


def json_to_cif(mof):
"""Converts python mof data structure to a cif file.
Args:
mof: A Python data structure analogous to `structures.Mof`
Returns:
A cif file containing atom and periodic connection data
"""
va, vb, vc = [np.array(v) for v in mof["periodic_connections"]]
a, b, c = [norm(i) for i in [va, vb, vc]]
alpha, beta, gamma = [get_angle(i, j) * 180 / np.pi
for i, j in [[vb, vc], [va, vc], [vb, va]]]

# Headers and unit cell information (note the space groups are wrong)\n"
cif = ("data_functionalizedCrystal\n"
" _audit_creation_method\t'MofGen v2!'\n"
"_symmetry_space_group_name_H-M\t'P1'\n"
"_symmetry_Int_Tables_number\t1\n"
"_symmetry_cell_setting\ttriclinic\n"
"loop_\n"
"_symmetry_equiv_pos_as_xyz\n"
" x,y,z\n"
"_cell_length_a\t%f\n"
"_cell_length_b\t%f\n"
"_cell_length_c\t%f\n"
"_cell_angle_alpha\t%f\n"
"_cell_angle_beta\t%f\n"
"_cell_angle_gamma\t%f\n"
"loop_\n"
"_atom_site_label\n"
"_atom_site_type_symbol\n"
"_atom_site_fract_x\n"
"_atom_site_fract_y\n"
"_atom_site_fract_z\n") % (a, b, c, alpha, beta, gamma)

# Adding atom positional data in fractional coordinates
if "building_blocks" in mof:
mof["atoms"] = [atom for bb in mof["building_blocks"]
for atom in bb["atoms"]]
cif += "\n".join("%s%d\t%s\t%.5f\t%.5f\t%.5f"
% tuple([atom["element"], i, atom["element"]] +
cartesian_to_fractional(atom["location"], va, vb, vc))
for i, atom in enumerate(mof["atoms"]))
return cif


def get_angle(first_vector, second_vector):
"""Returns the angle, in radians, between two vectors."""
return np.arccos(np.clip(np.dot(first_vector, second_vector) /
(norm(first_vector) * norm(second_vector)), -1, 1))


def cartesian_to_fractional(location, va, vb, vc):
"""Convert cartesian to fractional coordinates using unit cell vectors.
Args:
location: The cartesian coordinates to be converted
va, vb, vc: The three crystal vectors of the structure
Returns:
A three-element list containing the fractional coordinates
"""
# `cartesian` = matrix(`vectors`) * `fractional`. Invert to solve.
frac = inv(np.matrix([va, vb, vc]).transpose()).dot(np.array(location))
# This constrains the fractional coordinates to [0, 1]
return [f % 1 for f in np.nditer(frac)]


def json_to_pybel(data, center=True):
"""Converts python data structure to pybel.Molecule.
This will infer bond data if not specified.
Args:
data: The loaded json data of a molecule, as a Python object
center: (Optional) Centers the coordinates of the outputted molecule
Returns:
An instance of `pybel.Molecule`
"""
obmol = ob.OBMol()
obmol.BeginModify()
for atom in data["atoms"]:
obatom = obmol.NewAtom()
obatom.SetAtomicNum(table.GetAtomicNum(str(atom["element"])))
obatom.SetVector(*atom["location"])
if "charge" in atom:
obatom.SetPartialCharge(atom["charge"])
# If there is no bond data, try to infer them
if "bonds" not in data or not data["bonds"]:
obmol.ConnectTheDots()
obmol.PerceiveBondOrders()
# Otherwise, use the bonds in the data set
else:
for bond in data["bonds"]:
if "atoms" not in bond:
continue
obmol.AddBond(bond["atoms"][0] + 1, bond["atoms"][1] + 1,
bond["order"])
obmol.EndModify()
if center:
obmol.Center()
return pybel.Molecule(obmol)


def pybel_to_json(molecule):
"""Converts a pybel molecule to json.
Args:
molecule: An instance of `pybel.Molecule`
Returns:
A Python dictionary containing atom and bond data
"""
# Save atom element type and 3D location.
atoms = [{"element": table.GetSymbol(atom.atomicnum),
"location": atom.coords}
for atom in molecule.atoms]
# Saves partial charge data, if exists
for atom, mol_atom in zip(atoms, molecule.atoms):
if hasattr(mol_atom, "partialcharge") and mol_atom.partialcharge:
atom["charge"] = mol_atom.partialcharge
# Save number of bonds and indices of endpoint atoms
bonds = [{"atoms": [b.GetBeginAtom().GetIndex(),
b.GetEndAtom().GetIndex()],
"order": b.GetBondOrder()}
for b in ob.OBMolBondIter(molecule.OBMol)]
output = {"atoms": atoms, "bonds": bonds}

# If there's unit cell data, save it to the json output
if hasattr(molecule, "unitcell"):
uc = molecule.unitcell
output["periodic_connections"] = [[v.GetX(), v.GetY(), v.GetZ()]
for v in uc.GetCellVectors()]
return output


if __name__ == "__main__":
# Lazy converter to test this out
import sys
in_data, in_format, out_format = sys.argv[1:]
try:
with open(in_data) as in_file:
data = in_file.read()
except IOError:
data = in_data
print convert(data, in_format, out_format, pretty=True)
84 changes: 84 additions & 0 deletions json_formatter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""
Some small edits to json output.
* Float decimals are truncated to three digits
* [x, y, z] vectors are displayed on one line
* Converts numpy arrays to lists and defined objects to dictionaries
* Removes internal parent references in objects
* Atoms and bonds are on one line each (looks more like other chem formats)
"""

from itertools import takewhile
import re
import json
from json import encoder
encoder.FLOAT_REPR = lambda o: format(o, '.6f')

import numpy as np

load = json.load
loads = json.loads

def compress(obj):
"""Outputs json without whitespace + object handling."""
return json.dumps(obj, sort_keys=True, separators=(",", ":"),
cls=CustomEncoder)


def dumps(obj):
"""Outputs json with formatting edits + object handling."""
return json.dumps(obj, indent=4, sort_keys=True, cls=CustomEncoder)


class CustomEncoder(json.JSONEncoder):
def default(self, obj):
"""Fired when an unserializable object is hit."""
if hasattr(obj, '__dict__'):
d = obj.__dict__.copy()
d.pop("parent_block", None)
return d
elif isinstance(obj, np.ndarray):
return obj.copy().tolist()
else:
raise TypeError(("Object of type %s with value of %s is not JSON "
"serializable") % (type(obj), repr(obj)))

def encode(self, obj):
"""Fired for every object."""
if hasattr(obj, "history"):
del obj.history
s = super(CustomEncoder, self).encode(obj)
# If uncompressed, postprocess for formatting
if len(s.splitlines()) > 1:
s = self.postprocess(s)
return s

def postprocess(self, json_string):
"""Display float lists as one line in json. Useful for vectors."""

# As a general rule, all three-element float lists are on one line
json_string = re.sub("\[\s*([-+]?\d*\.?\d*), \s*([-+]?\d*\.?\d*),"
"\s*([-+]?\d*\.?\d*)\s*\]", r"[ \1, \2, \3 ]",
json_string)

# This further compresses atoms and bonds to be on one line
is_compressing = False
compressed = []
spaces = 0
for row in json_string.split("\n"):
if is_compressing:
if row.strip() == "{":
compressed.append(row.rstrip())
elif row.rstrip() == " " * spaces + "],":
compressed.append(row.rstrip())
is_compressing = False
else:
compressed[-1] += " " + row.strip()
else:
compressed.append(row.rstrip())
if any(a in row for a in ["atoms", "bonds"]):
# Fix to handle issues that arise with empty lists
if "[]" in row:
continue
spaces = sum(1 for _ in takewhile(str.isspace, row))
is_compressing = True
return "\n".join(compressed)

0 comments on commit 19289de

Please sign in to comment.