hackingmaterials · ardunn · May 13, 2020 · Jul 8, 2020 · Jul 8, 2020 · Jul 28, 2020
diff --git a/automatminer_dev/config.py b/automatminer_dev/config.py
@@ -29,6 +29,7 @@
     "target": "log10(K_VRH)",
     "problem_type": AMM_REG_NAME,
     "clf_pos_label": None,
+    "unit": None,
 }
 
 LOG_GVRH = {
@@ -37,6 +38,7 @@
     "target": "log10(G_VRH)",
     "problem_type": AMM_REG_NAME,
     "clf_pos_label": None,
+    "unit": None,
 }
 
 DIELECTRIC = {
@@ -45,6 +47,7 @@
     "target": "n",
     "problem_type": AMM_REG_NAME,
     "clf_pos_label": None,
+    "unit": None,
 }
 
 JDFT2D = {
@@ -53,6 +56,7 @@
     "target": "exfoliation_en",
     "problem_type": AMM_REG_NAME,
     "clf_pos_label": None,
+    "unit": "meV/atom"
 }
 
 MP_GAP = {
@@ -61,6 +65,7 @@
     "target": "gap pbe",
     "problem_type": AMM_REG_NAME,
     "clf_pos_label": None,
+    "unit": "eV"
 }
 
 MP_IS_METAL = {
@@ -69,6 +74,7 @@
     "target": "is_metal",
     "problem_type": AMM_CLF_NAME,
     "clf_pos_label": True,
+    "unit": None
 }
 
 MP_E_FORM = {
@@ -77,6 +83,7 @@
     "target": "e_form",
     "problem_type": AMM_REG_NAME,
     "clf_pos_label": None,
+    "unit": "eV/atom"
 }
 
 PEROVSKITES = {
@@ -85,6 +92,7 @@
     "target": "e_form",
     "problem_type": AMM_REG_NAME,
     "clf_pos_label": None,
+    "unit": "eV"
 }
 
 GLASS = {
@@ -93,6 +101,7 @@
     "target": "gfa",
     "problem_type": AMM_CLF_NAME,
     "clf_pos_label": True,
+    "unit": None
 }
 
 EXPT_IS_METAL = {
@@ -101,6 +110,7 @@
     "target": "is_metal",
     "problem_type": AMM_CLF_NAME,
     "clf_pos_label": True,
+    "unit": None
 }
 
 EXPT_GAP = {
@@ -109,6 +119,7 @@
     "target": "gap expt",
     "problem_type": AMM_REG_NAME,
     "clf_pos_label": None,
+    "unit": "eV"
 }
 
 PHONONS = {
@@ -117,6 +128,7 @@
     "target": "last phdos peak",
     "problem_type": AMM_REG_NAME,
     "clf_pos_label": None,
+    "unit": "cm^-1"
 }
 
 STEELS = {
@@ -125,6 +137,7 @@
     "target": "yield strength",
     "problem_type": AMM_REG_NAME,
     "clf_pos_label": None,
+    "unit": "MPa"
 }
 
 BENCHMARK_DEBUG_SET = [JDFT2D, PHONONS, EXPT_IS_METAL, STEELS]
@@ -143,3 +156,17 @@
     STEELS,
     PHONONS,
 ]
+
+HAS_STRUCTURE = [  # NOTE(review): presumably the datasets whose input column is "structure" rather than "composition" - confirm
+    LOG_KVRH,
+    LOG_GVRH,
+    DIELECTRIC,
+    JDFT2D,
+    MP_GAP,
+    MP_IS_METAL,
+    MP_E_FORM,
+    PEROVSKITES,
+    PHONONS
+]
+
+BENCHMARK_DICT = {ds["name"]: ds for ds in BENCHMARK_FULL_SET}  # dataset config dicts keyed by their "name" entry
diff --git a/automatminer_dev/matbench/dataset_creation/__init__.py b/automatminer_dev/matbench/dataset_creation/__init__.py
diff --git a/automatminer_dev/matbench/castelli.py → ...dev/matbench/dataset_creation/castelli.py b/automatminer_dev/matbench/castelli.py → ...dev/matbench/dataset_creation/castelli.py
diff --git a/automatminer_dev/matbench/dielectric.py → ...v/matbench/dataset_creation/dielectric.py b/automatminer_dev/matbench/dielectric.py → ...v/matbench/dataset_creation/dielectric.py
diff --git a/automatminer_dev/matbench/expt_gap.py → ...dev/matbench/dataset_creation/expt_gap.py b/automatminer_dev/matbench/expt_gap.py → ...dev/matbench/dataset_creation/expt_gap.py
@@ -73,6 +73,10 @@
 df_new = df_new.reset_index(drop=True)
 
 
+# NOTE: manually change the entry GaAs0.1P0.9G1128 to its correct composition,
+# GaAs0.1P0.9 (source: Solid Solutions in Semiconducting Systems. Handbook,
+# M., Nauka 1978, 200 p.; data gathered from http://bg.imet-db.ru).
+
 store_dataframe_as_json(df_new, "expt_gap.json.gz", compression="gz")
 
 print(df_new)
diff --git a/automatminer_dev/matbench/expt_is_metal.py → ...atbench/dataset_creation/expt_is_metal.py b/automatminer_dev/matbench/expt_is_metal.py → ...atbench/dataset_creation/expt_is_metal.py
@@ -74,6 +74,10 @@
 
 df_new["is_metal"] = df_new["is_metal"] == 1
 
+# NOTE: manually change the entry GaAs0.1P0.9G1128 to its correct composition,
+# GaAs0.1P0.9 (source: Solid Solutions in Semiconducting Systems. Handbook,
+# M., Nauka 1978, 200 p.; data gathered from http://bg.imet-db.ru).
+
 store_dataframe_as_json(df_new, "expt_is_metal.json.gz", compression="gz")
 
 print(df_new)

diff --git a/automatminer_dev/matbench/glass.py → ...er_dev/matbench/dataset_creation/glass.py b/automatminer_dev/matbench/glass.py → ...er_dev/matbench/dataset_creation/glass.py
diff --git a/automatminer_dev/matbench/jdft2d.py → ...r_dev/matbench/dataset_creation/jdft2d.py b/automatminer_dev/matbench/jdft2d.py → ...r_dev/matbench/dataset_creation/jdft2d.py
diff --git a/automatminer_dev/matbench/mp_eform.py → ...dev/matbench/dataset_creation/mp_eform.py b/automatminer_dev/matbench/mp_eform.py → ...dev/matbench/dataset_creation/mp_eform.py
diff --git a/automatminer_dev/matbench/mp_elasticity.py → ...atbench/dataset_creation/mp_elasticity.py b/automatminer_dev/matbench/mp_elasticity.py → ...atbench/dataset_creation/mp_elasticity.py
diff --git a/automatminer_dev/matbench/mp_gaps.py → ..._dev/matbench/dataset_creation/mp_gaps.py b/automatminer_dev/matbench/mp_gaps.py → ..._dev/matbench/dataset_creation/mp_gaps.py
diff --git a/automatminer_dev/matbench/phonons.py → ..._dev/matbench/dataset_creation/phonons.py b/automatminer_dev/matbench/phonons.py → ..._dev/matbench/dataset_creation/phonons.py
diff --git a/automatminer_dev/matbench/steels.py → ...r_dev/matbench/dataset_creation/steels.py b/automatminer_dev/matbench/steels.py → ...r_dev/matbench/dataset_creation/steels.py
@@ -8,6 +8,8 @@
 from matminer.datasets.dataset_retrieval import load_dataset
 
 
+
+# Note the units are in MPa, NOT GPa
 if __name__ == "__main__":
     df = load_dataset("steel_strength")
     df = df[["formula", "yield strength"]]

diff --git a/automatminer_dev/matbench/docs/__init__.py b/automatminer_dev/matbench/docs/__init__.py
diff --git a/automatminer_dev/matbench/get_info.py → ...atbench/docs/formatting_matbench_table.py b/automatminer_dev/matbench/get_info.py → ...atbench/docs/formatting_matbench_table.py
@@ -1,4 +1,12 @@
 from matminer.datasets.dataset_retrieval import load_dataset, get_available_datasets, get_all_dataset_info
+
+
+'''
+
+Helper function to format matbench documentation page.
+'''
+
+
 datasets = get_available_datasets(print_format=None)
 
 for dataset in datasets:

diff --git a/automatminer_dev/matbench/mpcontribs/__init__.py b/automatminer_dev/matbench/mpcontribs/__init__.py
diff --git a/automatminer_dev/matbench/mpcontribs/upload.py b/automatminer_dev/matbench/mpcontribs/upload.py
@@ -0,0 +1,219 @@
+import wget, json, os, math
+from string import capwords
+from pybtex.database import parse_string
+import pybtex.errors
+from mpcontribs.client import Client
+from pymatgen import MPRester, Structure
+import tqdm
+import pprint
+
+# from matminer.datasets.dataset_retrieval import (
+#     get_all_dataset_info,
+#     get_available_datasets,
+#     load_dataset,
+# )
+
+from matminer.datasets import load_dataset
+
+from automatminer_dev.config import DIELECTRIC, JDFT2D, PEROVSKITES, STEELS, BENCHMARK_FULL_SET, BENCHMARK_DICT, HAS_STRUCTURE
+
+
+# Non-strict mode: pybtex reports bibtex problems as warnings, not exceptions.
+pybtex.errors.set_strict_mode(False)
+api_key = os.environ["MPCONTRIBS_API_KEY"]  # raises KeyError when unset
+client = Client(api_key, host='ml-api.materialsproject.cloud')
+mprester = MPRester()
+
+# client.get_project("matbench_steels").pretty()
+
+# Download matminer's dataset metadata on first run; reuse the local copy after.
+fn = 'dataset_metadata.json'
+if not os.path.exists(fn):
+    wget.download(f'https://raw.githubusercontent.com/hackingmaterials/matminer/master/matminer/datasets/{fn}')
+with open(fn, 'r') as metadata_file:  # context manager closes the handle
+    metadata = json.load(metadata_file)
+metadata = {k: d for k, d in metadata.items() if "matbench" in k}
+
+
+# Creating new projects
+#######################
+# todo: might not have access to add new projects
+# for name, info in metadata.items():
+#
+#     print(f"Uploading {name}")
+#
+#     columns = {}
+#     for col, text in info['columns'].items():
+#         k = col.replace('_', '|').replace('-', '|').replace('(', ' ').replace(
+#             ')', '')
+#         columns[k] = text
+#
+#     project = {
+#         'is_public': True,
+#         'owner': '[email protected]',
+#         "name": name,
+#         'title': name,  # TODO update and set long_title
+#         'authors': 'A. Dunn, A. Jain',
+#         'description': info['description'],
+#         'other': {
+#             'columns': columns,
+#             'entries': info['num_entries']
+#         },
+#         'references': []
+#     }
+#
+#     for ref in info['bibtex_refs']:
+#
+#         if name == "matbench_phonons":
+#             ref = ref.replace(
+#                 "petretto_dwaraknath_miranda_winston_giantomassi_rignanese_van setten_gonze_persson_hautier_2018",
+#                 "petretto2018")
+#
+#         bib = parse_string(ref, 'bibtex')
+#         for key, entry in bib.entries.items():
+#             key_is_doi = key.startswith('doi:')
+#             url = 'https://doi.org/' + key.split(':', 1)[
+#                 -1] if key_is_doi else entry.fields.get('url')
+#             k = 'Zhuo2018' if key_is_doi else capwords(key.replace('_', ''))
+#             if k.startswith('C2'):
+#                 k = 'Castelli2012'
+#             elif k.startswith('Landolt'):
+#                 k = 'LB1997'
+#             elif k == 'Citrine':
+#                 url = 'https://www.citrination.com'
+#
+#             if len(k) > 8:
+#                 k = k[:4] + k[-4:]
+#             project['references'].append({"label": k, "url": url})
+#
+#     try:
+#         print(client.projects.create_entry(project=project).result())
+#     except Exception as ex:
+#         print(
+#             ex)  # TODO should use get_entry to check existence -> use update_entry if project exists
+
+
+
+
+
+# Map of canonical yet non-mpcontribs-compatible target names to compatible (unicode, no punctuation) target names
+target_map = {
+    "yield strength": "σᵧ",
+    "log10(K_VRH)": "log₁₀Kᵛʳʰ",
+    "log10(G_VRH)": "log₁₀Gᵛʳʰ",
+    "n": "𝑛",
+    "exfoliation_en": "Eˣ",
+    "gap pbe": "Eᵍ",
+    "is_metal": "metallic",
+    "e_form": "Eᶠ",
+    "gfa": "glass",
+    "gap expt": "Eᵍ",
+    "last phdos peak": "ωᵐᵃˣ",
+}
+
+
+# # Getting project-level metadata in order
+# #########################################
+#
+# # Add warning to mpcontribs since the results will be stored out of order.
+# # Also, fix columns for new mpcontribs deployment
+# for name, info in metadata.items():
+#     mb_shortname = name.replace("matbench_", "")
+#
+#     description = info["description"] + f" If you are viewing this on MPContribs-ML interactively, please ensure the order of the identifiers is sequential (mb-{mb_shortname}-0001, mb-{mb_shortname}-0002, etc.) before benchmarking."
+#     if "For benchmarking" not in description:
+#         print(name, description)
+#
+#     has_structure = mb_shortname in [ds["name"] for ds in HAS_STRUCTURE]
+#     primitive_key = "structure" if has_structure else "composition"
+#     target = BENCHMARK_DICT[mb_shortname]["target"]
+#
+#     print(client.projects.update_entry(
+#         pk=name,
+#         project={
+#             "description": description,
+#             'other.columns': {
+#                 target_map[target]: metadata[name]["columns"][target],
+#                 primitive_key: metadata[name]["columns"][primitive_key]
+#             }
+#         }).result())
+
+
+
+
+# Entering all contributions to projects
+########################################
+
+
+# steels.........X
+# log_kvrh.......
+# log_gvrh.......
+# dielectric.....
+# jdft2d.........X
+# expt_gap.......X
+# expt_is_metal..X
+# phonons........
+# mp_is_metal....
+# mp_gap.........
+# glass..........X
+# mp_e_form......
+# perovskites....
+
+
+for ds in ["dielectric", "phonons", "mp_gap", "mp_is_metal", "perovskites", "mp_e_form"]:
+    # Upload one matbench dataset to MPContribs, one contribution per row.
+    ds_config = BENCHMARK_DICT[ds]
+
+    name = "matbench_" + ds_config["name"]
+    print(f"Loading {name}")
+    df = load_dataset(name)
+    target = ds_config["target"]
+    # Suffix appended to every target value; empty when the config has no unit.
+    unit = f" {ds_config['unit']}" if ds_config["unit"] else ""
+
+    # print(f"Updating 'other' column entries of {name} with unicode.")
+    # print(client.projects.update_entry(pk=name, project={
+    #     'other.columns': {
+    #         target_map[target]: metadata[name]["columns"][target],
+    #         "structure": metadata[name]["columns"]["structure"]
+    #         # "composition": metadata[name]["columns"]["composition"]
+    #     }
+    # }).result())
+
+    # print(f"Deleting contributions of {name}")
+    # client.delete_contributions(name)
+
+    print(f"Assembling and uploading contributions for {name}")
+    # NOTE(review): hard-coded scratch path; adjust before running elsewhere.
+    structure_filename = "/Users/ardunn/Downloads/outfile.cif"
+    contributions = []
+    n_entries = df.shape[0]
+
+    # Zero-pad identifiers to the digit count of the largest index. Counting
+    # digits of the string form avoids float error in math.log, e.g.
+    # math.floor(math.log(1000, 10)) + 1 evaluates to 3 instead of 4.
+    id_n_zeros = len(str(n_entries))
+    for i, row in tqdm.tqdm(enumerate(df.iterrows()), total=n_entries):
+        entry = row[1]
+        contrib = {'project': name, 'is_public': True}
+
+        if "structure" in entry.index:
+            # CIF round trip; NOTE(review): presumably normalizes it - confirm.
+            s = entry.loc["structure"]
+            s.to("cif", structure_filename)
+            s = Structure.from_file(structure_filename)
+            c = s.composition.get_integer_formula_and_factor()[0]
+            contrib["structures"] = [s]
+        else:
+            c = entry["composition"]
+
+        # Identifiers are 1-based and sequential within the dataset.
+        id_number = f"{i+1:0{id_n_zeros}d}"
+        identifier = f"mb-{ds_config['name']}-{id_number}"
+        contrib["identifier"] = identifier
+
+        contrib["data"] = {target_map[target]: f"{entry.loc[target]}{unit}"}
+        contrib["formula"] = c
+        contributions.append(contrib)
+
+    client.submit_contributions(contributions, per_page=10)
diff --git a/docs/_sources/datasets.rst.txt b/docs/_sources/datasets.rst.txt
@@ -195,7 +195,7 @@ procedures, etc.) on a dataset with :code:`matminer.datasets.get_all_dataset_inf
     Description: Matbench v0.1 dataset for predicting steel yield strengths from chemical composition alone. Retrieved from Citrine informatics. Deduplicated.
     Columns:
         composition: Chemical formula.
-        yield strength: Target variable. Experimentally measured steel yield strengths, in GPa.
+        yield strength: Target variable. Experimentally measured steel yield strengths, in MPa.
     Num Entries: 312
     Reference: https://citrination.com/datasets/153092/
     Bibtex citations: ['@misc{Citrine Informatics,\ntitle = {Mechanical properties of some steels},\nhowpublished = {\\url{https://citrination.com/datasets/153092/},\n}']