Skip to content

Update pmg requirement + docs #325

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions automatminer_dev/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
"target": "log10(K_VRH)",
"problem_type": AMM_REG_NAME,
"clf_pos_label": None,
"unit": None,
}

LOG_GVRH = {
Expand All @@ -37,6 +38,7 @@
"target": "log10(G_VRH)",
"problem_type": AMM_REG_NAME,
"clf_pos_label": None,
"unit": None,
}

DIELECTRIC = {
Expand All @@ -45,6 +47,7 @@
"target": "n",
"problem_type": AMM_REG_NAME,
"clf_pos_label": None,
"unit": None,
}

JDFT2D = {
Expand All @@ -53,6 +56,7 @@
"target": "exfoliation_en",
"problem_type": AMM_REG_NAME,
"clf_pos_label": None,
"unit": "meV/atom"
}

MP_GAP = {
Expand All @@ -61,6 +65,7 @@
"target": "gap pbe",
"problem_type": AMM_REG_NAME,
"clf_pos_label": None,
"unit": "eV"
}

MP_IS_METAL = {
Expand All @@ -69,6 +74,7 @@
"target": "is_metal",
"problem_type": AMM_CLF_NAME,
"clf_pos_label": True,
"unit": None
}

MP_E_FORM = {
Expand All @@ -77,6 +83,7 @@
"target": "e_form",
"problem_type": AMM_REG_NAME,
"clf_pos_label": None,
"unit": "eV/atom"
}

PEROVSKITES = {
Expand All @@ -85,6 +92,7 @@
"target": "e_form",
"problem_type": AMM_REG_NAME,
"clf_pos_label": None,
"unit": "eV"
}

GLASS = {
Expand All @@ -93,6 +101,7 @@
"target": "gfa",
"problem_type": AMM_CLF_NAME,
"clf_pos_label": True,
"unit": None
}

EXPT_IS_METAL = {
Expand All @@ -101,6 +110,7 @@
"target": "is_metal",
"problem_type": AMM_CLF_NAME,
"clf_pos_label": True,
"unit": None
}

EXPT_GAP = {
Expand All @@ -109,6 +119,7 @@
"target": "gap expt",
"problem_type": AMM_REG_NAME,
"clf_pos_label": None,
"unit": "eV"
}

PHONONS = {
Expand All @@ -117,6 +128,7 @@
"target": "last phdos peak",
"problem_type": AMM_REG_NAME,
"clf_pos_label": None,
"unit": "cm^-1"
}

STEELS = {
Expand All @@ -125,6 +137,7 @@
"target": "yield strength",
"problem_type": AMM_REG_NAME,
"clf_pos_label": None,
"unit": "MPa"
}

BENCHMARK_DEBUG_SET = [JDFT2D, PHONONS, EXPT_IS_METAL, STEELS]
Expand All @@ -143,3 +156,17 @@
STEELS,
PHONONS,
]

# Subset of the benchmark dataset configs grouped under "has structure".
# NOTE(review): judging by the name, these are presumably the datasets whose
# entries carry a crystal structure rather than only a composition -- confirm
# against the dataset definitions.
HAS_STRUCTURE = [
LOG_KVRH,
LOG_GVRH,
DIELECTRIC,
JDFT2D,
MP_GAP,
MP_IS_METAL,
MP_E_FORM,
PEROVSKITES,
PHONONS
]

# Lookup table keyed by each config's "name" entry, mapping to the full config
# dict; built from every config in BENCHMARK_FULL_SET.
BENCHMARK_DICT = {ds["name"]: ds for ds in BENCHMARK_FULL_SET}
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@
df_new = df_new.reset_index(drop=True)


# you need to manually change GaAs0.1P0.9G1128 to its correct composition, which
# is GaAs0.1P0.9 from Solid Solutions in Semiconducting Systems.Handbook,
# M., Nauka 1978, 200 p. and was gathered from http://bg.imet-db.ru

store_dataframe_as_json(df_new, "expt_gap.json.gz", compression="gz")

print(df_new)
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@

df_new["is_metal"] = df_new["is_metal"] == 1

# you need to manually change GaAs0.1P0.9G1128 to its correct composition, which
# is GaAs0.1P0.9 from Solid Solutions in Semiconducting Systems.Handbook,
# M., Nauka 1978, 200 p. and was gathered from http://bg.imet-db.ru

store_dataframe_as_json(df_new, "expt_is_metal.json.gz", compression="gz")

print(df_new)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from matminer.datasets.dataset_retrieval import load_dataset



# Note the units are in MPa, NOT GPa
if __name__ == "__main__":
df = load_dataset("steel_strength")
df = df[["formula", "yield strength"]]
Expand Down
Empty file.
Original file line number Diff line number Diff line change
@@ -1,4 +1,12 @@
from matminer.datasets.dataset_retrieval import load_dataset, get_available_datasets, get_all_dataset_info


'''

Helper function to format matbench documentation page.
'''


datasets = get_available_datasets(print_format=None)

for dataset in datasets:
Expand Down
Empty file.
219 changes: 219 additions & 0 deletions automatminer_dev/matbench/mpcontribs/upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
import wget, json, os, math
from string import capwords
from pybtex.database import parse_string
import pybtex.errors
from mpcontribs.client import Client
from pymatgen import MPRester, Structure
import tqdm
import pprint

# from matminer.datasets.dataset_retrieval import (
# get_all_dataset_info,
# get_available_datasets,
# load_dataset,
# )

from matminer.datasets import load_dataset

from automatminer_dev.config import DIELECTRIC, JDFT2D, PEROVSKITES, STEELS, BENCHMARK_FULL_SET, BENCHMARK_DICT, HAS_STRUCTURE


# Run pybtex in non-strict mode.
# NOTE(review): presumably so that imperfect BibTeX in the dataset metadata is
# tolerated instead of raising -- confirm against the pybtex documentation.
pybtex.errors.set_strict_mode(False)

# The MPContribs API key must be set in the environment; if it is missing,
# this line raises KeyError.
api_key = os.environ["MPCONTRIBS_API_KEY"]
client = Client(api_key, host='ml-api.materialsproject.cloud')
mprester = MPRester()


# client.get_project("matbench_steels").pretty()


# Download matminer's dataset metadata only if no local copy exists yet, then
# load it from disk.
fn = 'dataset_metadata.json'
if not os.path.exists(fn):
    wget.download(f'https://raw.githubusercontent.com/hackingmaterials/matminer/master/matminer/datasets/{fn}')
# A context manager closes the file handle deterministically; JSON is read as
# UTF-8 explicitly so the result does not depend on the platform default.
with open(fn, 'r', encoding='utf-8') as metadata_file:
    metadata = json.load(metadata_file)
# Keep only the entries whose key mentions "matbench".
metadata = {k: d for k, d in metadata.items() if "matbench" in k}



# Creating new projects
#######################
# todo: might not have access to add new projects
# for name, info in metadata.items():
#
# print(f"Uploading {name}")
#
# columns = {}
# for col, text in info['columns'].items():
# k = col.replace('_', '|').replace('-', '|').replace('(', ' ').replace(
# ')', '')
# columns[k] = text
#
# project = {
# 'is_public': True,
# 'owner': '[email protected]',
# "name": name,
# 'title': name, # TODO update and set long_title
# 'authors': 'A. Dunn, A. Jain',
# 'description': info['description'],
# 'other': {
# 'columns': columns,
# 'entries': info['num_entries']
# },
# 'references': []
# }
#
# for ref in info['bibtex_refs']:
#
# if name == "matbench_phonons":
# ref = ref.replace(
# "petretto_dwaraknath_miranda_winston_giantomassi_rignanese_van setten_gonze_persson_hautier_2018",
# "petretto2018")
#
# bib = parse_string(ref, 'bibtex')
# for key, entry in bib.entries.items():
# key_is_doi = key.startswith('doi:')
# url = 'https://doi.org/' + key.split(':', 1)[
# -1] if key_is_doi else entry.fields.get('url')
# k = 'Zhuo2018' if key_is_doi else capwords(key.replace('_', ''))
# if k.startswith('C2'):
# k = 'Castelli2012'
# elif k.startswith('Landolt'):
# k = 'LB1997'
# elif k == 'Citrine':
# url = 'https://www.citrination.com'
#
# if len(k) > 8:
# k = k[:4] + k[-4:]
# project['references'].append({"label": k, "url": url})
#
# try:
# print(client.projects.create_entry(project=project).result())
# except Exception as ex:
# print(
# ex) # TODO should use get_entry to check existence -> use update_entry if project exists





# Translation table from the canonical target names, which are not
# mpcontribs-compatible, to compatible target names (unicode, no punctuation).
# Listed as (canonical name, mpcontribs name) pairs.
target_map = dict(
    (
        ("yield strength", "σᵧ"),
        ("log10(K_VRH)", "log₁₀Kᵛʳʰ"),
        ("log10(G_VRH)", "log₁₀Gᵛʳʰ"),
        ("n", "𝑛"),
        ("exfoliation_en", "Eˣ"),
        ("gap pbe", "Eᵍ"),
        ("is_metal", "metallic"),
        ("e_form", "Eᶠ"),
        ("gfa", "glass"),
        ("gap expt", "Eᵍ"),
        ("last phdos peak", "ωᵐᵃˣ"),
    )
)


# # Getting project-level metadata in order
# #########################################
#
# # Add warning to mpcontribs since the results will be stored out of order.
# # Also, fix columns for new mpcontribs deployment
# for name, info in metadata.items():
# mb_shortname = name.replace("matbench_", "")
#
# description = info["description"] + f" If you are viewing this on MPContribs-ML interactively, please ensure the order of the identifiers is sequential (mb-{mb_shortname}-0001, mb-{mb_shortname}-0002, etc.) before benchmarking."
# if "For benchmarking" not in description:
# print(name, description)
#
# has_structure = mb_shortname in [ds["name"] for ds in HAS_STRUCTURE]
# primitive_key = "structure" if has_structure else "composition"
# target = BENCHMARK_DICT[mb_shortname]["target"]
#
# print(client.projects.update_entry(
# pk=name,
# project={
# "description": description,
# 'other.columns': {
# target_map[target]: metadata[name]["columns"][target],
# primitive_key: metadata[name]["columns"][primitive_key]
# }
# }).result())




# Entering all contributions to projects
########################################


# steels.........X
# log_kvrh.......
# log_gvrh.......
# dielectric.....
# jdft2d.........X
# expt_gap.......X
# expt_is_metal..X
# phonons........
# mp_is_metal....
# mp_gap.........
# glass..........X
# mp_e_form......
# perovskites....


# Script section: for each selected matbench dataset, build one MPContribs
# contribution per dataframe row and submit them all for that dataset.
import tempfile

# Scratch CIF file used for the structure round-trip below. It lives in the
# system temp directory so the script does not depend on one user's home
# folder. The ".cif" suffix is kept; presumably the reader infers the format
# from the file name -- confirm against the pymatgen documentation.
structure_filename = os.path.join(tempfile.gettempdir(), "outfile.cif")

for ds in ["dielectric", "phonons", "mp_gap", "mp_is_metal", "perovskites", "mp_e_form"]:

    ds_config = BENCHMARK_DICT[ds]

    name = "matbench_" + ds_config["name"]
    print(f"Loading {name}")
    df = load_dataset(name)
    target = ds_config["target"]
    # Unit suffix appended to every value; empty when the config has no unit.
    unit = f" {ds_config['unit']}" if ds_config["unit"] else ""

    # print(f"Updating 'other' column entries of {name} with unicode.")
    # print(client.projects.update_entry(pk=name, project={
    #     'other.columns': {
    #         target_map[target]: metadata[name]["columns"][target],
    #         "structure": metadata[name]["columns"]["structure"]
    #         # "composition": metadata[name]["columns"]["composition"]
    #     }
    # }).result())

    # print(f"Deleting contributions of {name}")
    # client.delete_contributions(name)

    print(f"Assembling and uploading contributions for {name}")
    contributions = []
    n_entries = df.shape[0]

    # Zero-padding width for identifiers = number of decimal digits in the
    # row count. Counting characters avoids the floating-point error of
    # floor(log(n, 10)) + 1, which under-counts for exact powers of ten
    # (math.log(1000, 10) == 2.9999999999999996) and raises for n == 0.
    id_n_zeros = len(str(n_entries))
    data_key = target_map[target]

    # tqdm wraps the row iterator directly and is given the total, so the
    # progress bar can show completion; identifiers are numbered from 1.
    for i, (_, entry) in enumerate(tqdm.tqdm(df.iterrows(), total=n_entries), start=1):
        contrib = {'project': name, 'is_public': True}

        if "structure" in entry.index:
            # Round-trip the structure through a CIF file on disk and use the
            # re-read object for both the formula and the contribution.
            s = entry.loc["structure"]
            s.to("cif", structure_filename)
            s = Structure.from_file(structure_filename)
            c = s.composition.get_integer_formula_and_factor()[0]
            contrib["structures"] = [s]
        else:
            c = entry["composition"]

        id_number = f"{i:0{id_n_zeros}d}"
        identifier = f"mb-{ds_config['name']}-{id_number}"
        contrib["identifier"] = identifier

        contrib["data"] = {data_key: f"{entry.loc[target]}{unit}"}
        contrib["formula"] = c
        contributions.append(contrib)

    # One submission per dataset, after all of its rows are assembled.
    client.submit_contributions(contributions, per_page=10)
2 changes: 1 addition & 1 deletion docs/_sources/datasets.rst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ procedures, etc.) on a dataset with :code:`matminer.datasets.get_all_dataset_inf
Description: Matbench v0.1 dataset for predicting steel yield strengths from chemical composition alone. Retrieved from Citrine informatics. Deduplicated.
Columns:
composition: Chemical formula.
yield strength: Target variable. Experimentally measured steel yield strengths, in GPa.
yield strength: Target variable. Experimentally measured steel yield strengths, in MPa.
Num Entries: 312
Reference: https://citrination.com/datasets/153092/
Bibtex citations: ['@misc{Citrine Informatics,\ntitle = {Mechanical properties of some steels},\nhowpublished = {\\url{https://citrination.com/datasets/153092/},\n}']
Expand Down
Loading