From 690f783630a3c6ff9598a36f8fea127776026e90 Mon Sep 17 00:00:00 2001 From: Xingjian Zhang Date: Sun, 30 Apr 2023 14:02:58 -0400 Subject: [PATCH 1/4] Add versioning in save methods --- gli/io.py | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/gli/io.py b/gli/io.py index fc294831..9416ac13 100644 --- a/gli/io.py +++ b/gli/io.py @@ -5,6 +5,7 @@ import warnings import numpy as np from scipy.sparse import isspmatrix, spmatrix, coo_matrix +import time from gli.utils import save_data @@ -247,6 +248,10 @@ def _attr_to_metadata_dict(key_to_loc, prefix, a): return metadata +def _get_version(): + """Get the current time as the version.""" + return time.strftime("%Y%m%d%H%M%S") + def save_homograph( name: str, edge: np.ndarray, @@ -455,6 +460,9 @@ def save_homograph( metadata["citation"] = citation metadata["is_heterogeneous"] = False + metadata["version"] = _get_version() + print("The graph metadata is saved to", os.path.join(save_dir, "metadata.json")) + print("Version:", metadata["version"]) if citation == "": warnings.warn("The citation is empty.") @@ -907,6 +915,9 @@ def save_heterograph( graph_dict[attr.name] = _attr_to_metadata_dict(key_to_loc, "Graph", attr) metadata["data"]["Graph"] = graph_dict + metadata["version"] = _get_version() + print("The graph metadata is saved to", os.path.join(save_dir, "metadata.json")) + print("Version:", metadata["version"]) if citation == "": warnings.warn("The citation is empty.") @@ -983,7 +994,8 @@ def _save_task_reg_or_cls(task_type, test_ratio=0.1, num_samples=None, task_id=1, - save_dir="."): + save_dir=".", + latest_supported_ver=None): """Save the information of a regression or classification task into task json and data files. :param task_type: The type of the task. It should be either @@ -1041,6 +1053,9 @@ def _save_task_reg_or_cls(task_type, :param save_dir: The directory to save the task json and data files. Default: ".". :type save_dir: str + :param latest_supported_ver: The latest supported version of the + metadata.json file. + :type latest_supported_ver: str :raises ValueError: If `task_type` is not "NodeRegression" or "NodeClassification". @@ -1077,6 +1092,9 @@ def _save_task_reg_or_cls(task_type, test_set, train_ratio, val_ratio, test_ratio, num_samples) + if latest_supported_ver is None: + raise ValueError("The latest supported version of the metadata.json " + "file must be provided.") # Task-dependent checks. if task_type in ("NodeClassification", "NodeRegression"): @@ -1096,7 +1114,8 @@ def _save_task_reg_or_cls(task_type, "description": description, "type": task_type, "feature": feature, - "target": target + "target": target, + "latest_supported_ver": latest_supported_ver, } if num_classes is not None: task_dict["num_classes"] = num_classes @@ -1141,7 +1160,8 @@ def save_task_node_regression(name, test_ratio=0.1, num_samples=None, task_id=1, - save_dir="."): + save_dir=".", + latest_supported_ver=None): """Save the node regression task information into task json and data files. :param name: The name of the dataset. @@ -1200,6 +1220,8 @@ def save_task_node_regression(name, :param save_dir: The directory to save the task json and data files. Default: ".". :type save_dir: str + :param latest_supported_ver: The latest supported version of the metadata. + :type latest_supported_ver: str :raises ValueError: If `task_type` is not "NodeRegression" or "NodeClassification". @@ -1285,7 +1307,8 @@ def save_task_node_regression(name, test_ratio=test_ratio, num_samples=num_samples, task_id=task_id, - save_dir=save_dir) + save_dir=save_dir, + latest_supported_ver=latest_supported_ver) def save_task_node_classification(name, @@ -1301,7 +1324,8 @@ def save_task_node_classification(name, test_ratio=0.1, num_samples=None, task_id=1, - save_dir="."): + save_dir=".", + latest_supported_ver=None): """Save the node classification task information into task json and data files. :param name: The name of the dataset. @@ -1450,4 +1474,5 @@ def save_task_node_classification(name, test_ratio=test_ratio, num_samples=num_samples, task_id=task_id, - save_dir=save_dir) + save_dir=save_dir, + latest_supported_ver=latest_supported_ver) From 6cfb244ef26ba8caf00b707e2c87995936672c6c Mon Sep 17 00:00:00 2001 From: Xingjian Zhang Date: Sun, 30 Apr 2023 14:08:05 -0400 Subject: [PATCH 2/4] Fix style --- gli/io.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/gli/io.py b/gli/io.py index 9416ac13..740d125c 100644 --- a/gli/io.py +++ b/gli/io.py @@ -166,8 +166,7 @@ def save_graph( description, cite, save_dir) # verify the inputs are dict for heterograph if not isinstance(edge, dict): - raise TypeError( - "The input edge must be a dictionary for heterograph.") + raise TypeError("The input edge must be a dictionary for heterograph.") if num_nodes is not None and not isinstance(num_nodes, dict): raise TypeError( "The input num_nodes must be a dictionary for heterograph.") @@ -252,6 +251,7 @@ def _get_version(): """Get the current time as the version.""" return time.strftime("%Y%m%d%H%M%S") + def save_homograph( name: str, edge: np.ndarray, @@ -461,7 +461,8 @@ def save_homograph( metadata["citation"] = citation metadata["is_heterogeneous"] = False metadata["version"] = _get_version() - print("The graph metadata is saved to", os.path.join(save_dir, "metadata.json")) + print("The graph metadata is saved to", + os.path.join(save_dir, "metadata.json")) print("Version:", metadata["version"]) if citation == "": @@ -916,7 +917,8 @@ def save_heterograph( attr) metadata["data"]["Graph"] = graph_dict metadata["version"] = _get_version() - print("The graph metadata is saved to", os.path.join(save_dir, "metadata.json")) + print("The graph metadata is saved to", + os.path.join(save_dir, "metadata.json")) print("Version:", metadata["version"]) if citation == "": From 3881691fc610a5912e750b1cdd520db789982823 Mon Sep 17 00:00:00 2001 From: Xingjian Zhang Date: Mon, 8 May 2023 15:42:39 -0400 Subject: [PATCH 3/4] Use UTC --- gli/io.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gli/io.py b/gli/io.py index 740d125c..e4e9f51e 100644 --- a/gli/io.py +++ b/gli/io.py @@ -1,4 +1,5 @@ """Helper functions for creating datasets in GLI format.""" +import datetime import json import os from typing import Dict, List, Optional, Tuple, Union @@ -248,8 +249,8 @@ def _attr_to_metadata_dict(key_to_loc, prefix, a): def _get_version(): - """Get the current time as the version.""" - return time.strftime("%Y%m%d%H%M%S") + """Get the current utc time as the version.""" + return datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d%H%M%S") def save_homograph( From 0db000091a6dcff0b9183600b84a43d263f308b0 Mon Sep 17 00:00:00 2001 From: Xingjian Zhang Date: Mon, 8 May 2023 15:49:15 -0400 Subject: [PATCH 4/4] Use metadata version as task version --- gli/io.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/gli/io.py b/gli/io.py index e4e9f51e..39435a85 100644 --- a/gli/io.py +++ b/gli/io.py @@ -983,6 +983,14 @@ def _check_feature(feature): "Each element in `feature` must be a node/edge/graph attribute." +def _get_metadata_version(metadata): + """Check the version of the metadata is valid.""" + assert "version" in metadata, \ + "The metadata does not contain the version information." \ + "Please add the version information to the metadata." + return metadata["version"] + + def _save_task_reg_or_cls(task_type, name, description, @@ -997,8 +1005,7 @@ def _save_task_reg_or_cls(task_type, test_ratio=0.1, num_samples=None, task_id=1, - save_dir=".", - latest_supported_ver=None): + save_dir="."): """Save the information of a regression or classification task into task json and data files. :param task_type: The type of the task. It should be either @@ -1056,9 +1063,6 @@ def _save_task_reg_or_cls(task_type, :param save_dir: The directory to save the task json and data files. Default: ".". :type save_dir: str - :param latest_supported_ver: The latest supported version of the - metadata.json file. - :type latest_supported_ver: str :raises ValueError: If `task_type` is not "NodeRegression" or "NodeClassification". @@ -1095,9 +1099,12 @@ def _save_task_reg_or_cls(task_type, test_set, train_ratio, val_ratio, test_ratio, num_samples) - if latest_supported_ver is None: - raise ValueError("The latest supported version of the metadata.json " - "file must be provided.") + + # Check if metadata.json exists. + metadata_path = os.path.join(save_dir, "metadata.json") + assert os.path.exists(metadata_path), \ + "metadata.json does not exist. Please create it first." + current_metadata_version = _get_metadata_version(metadata_path) # Task-dependent checks. if task_type in ("NodeClassification", "NodeRegression"): @@ -1118,7 +1125,7 @@ def _save_task_reg_or_cls(task_type, "type": task_type, "feature": feature, "target": target, - "latest_supported_ver": latest_supported_ver, + "version": current_metadata_version, } if num_classes is not None: task_dict["num_classes"] = num_classes