diff --git a/benchmarks/NodeRegression/README.md b/benchmarks/NodeRegression/README.md
new file mode 100644
index 00000000..9f1d8460
--- /dev/null
+++ b/benchmarks/NodeRegression/README.md
@@ -0,0 +1,60 @@
+# GLI Benchmarking on the `NodeRegression` Task
+
+The code in this folder can be used to benchmark some popular models on the `NodeRegression` task.
+
+## How to run
+
+Example commands to run the code:
+
+```bash
+# full batch
+python train.py --dataset <dataset> --model GCN
+python train.py --dataset <dataset> --model MLP
+python train.py --dataset <dataset> --model GAT --model-cfg configs/GAT.yaml
+python train.py --dataset <dataset> --model GraphSAGE --model-cfg configs/GraphSAGE.yaml
+python train.py --dataset <dataset> --model MoNet --model-cfg configs/MoNet.yaml
+python train.py --dataset <dataset> --model MixHop --model-cfg configs/MixHop.yaml
+python train.py --dataset <dataset> --model LINKX --model-cfg configs/LINKX.yaml --train-cfg configs/LINKX_train.yaml
+
+# mini batch
+python train_minibatch.py --dataset <dataset> --model GCN_minibatch
+
+# GBDT
+python train_gbdt.py --dataset <dataset> --model lightgbm
+python train_gbdt.py --dataset <dataset> --model catboost
+```
+
+One can provide a `yaml` file to the `--model-cfg` or `--train-cfg` argument to specify the model or training configuration, respectively. If not provided, the default configurations (see [model_default.yaml](https://github.com/Graph-Learning-Benchmarks/gli/blob/main/benchmarks/NodeRegression/configs/model_default.yaml) and [train_default.yaml](https://github.com/Graph-Learning-Benchmarks/gli/blob/main/benchmarks/NodeRegression/configs/train_default.yaml)) will be used.
+
+Note that some models may have unique hyperparameters that are not included in the default configuration files. In this case, one should pass the model-specific configuration file to `train.py`.
+
+## Supported models
+
+The following models are supported by this benchmark.
+
+### Full batch
+
+- `GCN`
+- `MLP`
+- `GAT`
+- `GraphSAGE`
+- `MoNet`
+- `MixHop`
+- `LINKX`
+
+### Mini batch
+
+- `GCN_minibatch`
+
+### Gradient Boosting Decision Tree (GBDT)
+
+- `catboost`
+- `lightgbm`
+
+To add a new model, one should add the model implementation under the `models` folder and add model-specific configurations under the `configs` folder when needed. We have tried to implement `train.py` in a generic way, so adding a new model should only require minimal modifications to `train.py` and `utils.py`.
+
+Contributions of new models are welcome through pull requests.
+
+## Supported datasets
+
+No `NodeRegression` datasets are available yet.
diff --git a/benchmarks/NodeRegression/config_gen.py b/benchmarks/NodeRegression/config_gen.py
new file mode 100644
index 00000000..c376101f
--- /dev/null
+++ b/benchmarks/NodeRegression/config_gen.py
@@ -0,0 +1,75 @@
+"""
+Random search.
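For reference, `config_gen.py` below writes paired `<model>_model_<i>_<j>.yaml` / `<model>_train_<i>_<j>.yaml` files into a timestamped folder under `grid/`. A minimal sketch of one way to consume those pairs with `train.py` follows; the grid folder name and `<dataset>` are placeholders, not part of the benchmark code.

```python
# Sketch: run train.py once per (model_cfg, train_cfg) pair emitted by
# config_gen.py. The grid folder name is hypothetical; replace <dataset>
# with a real GLI dataset name.
import subprocess
from pathlib import Path

grid_dir = Path("grid/GCN_20240101_000000")  # produced by config_gen.py
for train_cfg in sorted(grid_dir.glob("GCN_train_*.yaml")):
    model_cfg = grid_dir / train_cfg.name.replace("_train_", "_model_")
    subprocess.run(
        ["python", "train.py",
         "--dataset", "<dataset>",
         "--model", "GCN",
         "--model-cfg", str(model_cfg),
         "--train-cfg", str(train_cfg)],
        check=True,
    )
```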
+ +References: +https://github.com/pyg-team/pytorch_geometric/blob/master/graphgym/configs_gen.py +https://github.com/pyg-team/pytorch_geometric/blob/master/torch_geometric/ +graphgym/utils/io.py +""" + +import argparse +import yaml +import time +from utils import load_config_file, makedirs_rm_exist +from random import randint + +train_cfg_list = ["self_loop", "to_dense", "lr", "weight_decay", "num_trials", + "max_epoch", "early_stopping"] + + +def parse_args(): + """Parse the arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument("--model-cfg", type=str, + default="configs/model_default.yaml", + help="The model configuration file path.") + parser.add_argument("--train-cfg", type=str, + default="configs/train_default.yaml", + help="The training configuration file path.") + parser.add_argument("--grid", type=str, + help="configuration file for grid search.", + default="grid/grid_example.yaml") + parser.add_argument("--sample_num", dest="sample_num", + help="Number of random samples in the space.", + default=10, type=int) + parser.add_argument("--trial_num", type=int, default=5, + help="Number of trials for same configuration.") + parser.add_argument("--model", type=str, default="GCN", + help="model to be used. GCN, GAT, MoNet,\ + GraphSAGE, MLP for now.") + return parser.parse_args() + + +def grid_gen(args, gen_cfg, model_cfg, train_cfg): + """Generate random search configuration files.""" + dir_name = "./grid/" + args.model + time.strftime("_%Y%m%d_%H%M%S") + makedirs_rm_exist(dir_name) + for i in range(args.sample_num): + for key in gen_cfg: + key_len = len(gen_cfg[key]) + if key in train_cfg_list: + train_cfg[key] = gen_cfg[key][randint(0, key_len-1)] + else: + # otherwise, the key is for model + model_cfg[key] = gen_cfg[key][randint(0, key_len-1)] + for j in range(args.trial_num): + index_str = str(i) + "_" + str(j) + # the i-th configuration, j-th trial + train_cfg_name = args.model + "_train_" + index_str + ".yaml" + model_cfg_name = args.model + "_model_" + index_str + ".yaml" + train_cfg["seed"] = randint(1, 10000) + with open(dir_name + "/" + train_cfg_name, + "w", encoding="utf-8") as f: + yaml.dump(train_cfg, f, default_flow_style=False) + with open(dir_name + "/" + model_cfg_name, + "w", encoding="utf-8") as f: + yaml.dump(model_cfg, f, default_flow_style=False) + + +if __name__ == "__main__": + Args = parse_args() + Gen_cfg = load_config_file(Args.grid) + # load default configuration for training and model + Model_cfg = load_config_file(Args.model_cfg) + Train_cfg = load_config_file(Args.train_cfg) + grid_gen(Args, Gen_cfg, Model_cfg, Train_cfg) diff --git a/benchmarks/NodeRegression/configs/GAT.yaml b/benchmarks/NodeRegression/configs/GAT.yaml new file mode 100644 index 00000000..176f7a4c --- /dev/null +++ b/benchmarks/NodeRegression/configs/GAT.yaml @@ -0,0 +1,7 @@ +num_layers: 2 +num_hidden: 8 +num_heads: 8 +num_out_heads: 2 +residual: False +dropout: .6 +negative_slope: .2 diff --git a/benchmarks/NodeRegression/configs/GraphSAGE.yaml b/benchmarks/NodeRegression/configs/GraphSAGE.yaml new file mode 100644 index 00000000..0732681a --- /dev/null +++ b/benchmarks/NodeRegression/configs/GraphSAGE.yaml @@ -0,0 +1,4 @@ +num_layers: 2 +num_hidden: 8 +dropout: .6 +aggregator_type: gcn diff --git a/benchmarks/NodeRegression/configs/LINKX.yaml b/benchmarks/NodeRegression/configs/LINKX.yaml new file mode 100644 index 00000000..c12f8882 --- /dev/null +++ b/benchmarks/NodeRegression/configs/LINKX.yaml @@ -0,0 +1,7 @@ +num_hidden: 16 +num_layers: 1 +dropout: .5 
+inner_activation: False
+inner_dropout: False
+init_layers_A: 1
+init_layers_X: 1
diff --git a/benchmarks/NodeRegression/configs/LINKX_train.yaml b/benchmarks/NodeRegression/configs/LINKX_train.yaml
new file mode 100644
index 00000000..9e67cc7e
--- /dev/null
+++ b/benchmarks/NodeRegression/configs/LINKX_train.yaml
@@ -0,0 +1,11 @@
+loss_fcn: mse
+self_loop: False
+to_dense: False
+lr: .01
+weight_decay: 0.001
+max_epoch: 10000
+early_stopping: True
+seed: 0
+batch_size: 256
+to_undirected: False
+optimizer: "AdamW"
diff --git a/benchmarks/NodeRegression/configs/MixHop.yaml b/benchmarks/NodeRegression/configs/MixHop.yaml
new file mode 100644
index 00000000..ccc80a99
--- /dev/null
+++ b/benchmarks/NodeRegression/configs/MixHop.yaml
@@ -0,0 +1,6 @@
+num_hidden: 8
+p: [0, 1, 2]
+num_layers: 2
+dropout: .5
+layer_dropout: 0.9
+batchnorm: False
diff --git a/benchmarks/NodeRegression/configs/MoNet.yaml b/benchmarks/NodeRegression/configs/MoNet.yaml
new file mode 100644
index 00000000..989e365d
--- /dev/null
+++ b/benchmarks/NodeRegression/configs/MoNet.yaml
@@ -0,0 +1,5 @@
+num_layers: 2
+num_hidden: 8
+dropout: .6
+pseudo_dim: 2
+num_kernels: 3
diff --git a/benchmarks/NodeRegression/configs/catboost.yaml b/benchmarks/NodeRegression/configs/catboost.yaml
new file mode 100644
index 00000000..a2c887df
--- /dev/null
+++ b/benchmarks/NodeRegression/configs/catboost.yaml
@@ -0,0 +1,12 @@
+hp:
+  lr:
+    - 0.01
+    - 0.1
+  depth:
+    - 4
+    - 6
+  l2_leaf_reg:
+    - null
+num_epochs: 1000
+patience: 100
+verbose: false
diff --git a/benchmarks/NodeRegression/configs/lightgbm.yaml b/benchmarks/NodeRegression/configs/lightgbm.yaml
new file mode 100644
index 00000000..88130fc1
--- /dev/null
+++ b/benchmarks/NodeRegression/configs/lightgbm.yaml
@@ -0,0 +1,14 @@
+hp:
+  lr:
+    - 0.01
+    - 0.1
+  num_leaves:
+    - 15
+    - 63
+  lambda_l2:
+    - 0.0
+  boosting:
+    - gbdt
+num_epochs: 1000
+patience: 100
+ 
\ No newline at end of file
diff --git a/benchmarks/NodeRegression/configs/model_default.yaml b/benchmarks/NodeRegression/configs/model_default.yaml
new file mode 100644
index 00000000..fc98b689
--- /dev/null
+++ b/benchmarks/NodeRegression/configs/model_default.yaml
@@ -0,0 +1,3 @@
+num_layers: 2
+num_hidden: 8
+dropout: .6
diff --git a/benchmarks/NodeRegression/configs/train_default.yaml b/benchmarks/NodeRegression/configs/train_default.yaml
new file mode 100644
index 00000000..842c684e
--- /dev/null
+++ b/benchmarks/NodeRegression/configs/train_default.yaml
@@ -0,0 +1,11 @@
+loss_fcn: mse
+self_loop: True
+to_dense: False
+lr: .01
+weight_decay: 0.001
+max_epoch: 10000
+early_stopping: True
+seed: 0
+batch_size: 256
+to_undirected: False
+optimizer: "Adam"
diff --git a/benchmarks/NodeRegression/grid/grid_example.yaml b/benchmarks/NodeRegression/grid/grid_example.yaml
new file mode 100644
index 00000000..6235819a
--- /dev/null
+++ b/benchmarks/NodeRegression/grid/grid_example.yaml
@@ -0,0 +1,4 @@
+num_hidden: [32, 64]
+lr: [0.001, 0.005, 0.01, .1]
+dropout: [0.2, 0.4, 0.6, 0.8]
+weight_decay: [.0001, .001, .01, .1]
diff --git a/benchmarks/NodeRegression/models/gat.py b/benchmarks/NodeRegression/models/gat.py
new file mode 100644
index 00000000..436f442f
--- /dev/null
+++ b/benchmarks/NodeRegression/models/gat.py
@@ -0,0 +1,57 @@
+"""
+GAT model in GLI.
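The per-model YAML files above are consumed as plain keyword dictionaries. As a rough illustration of how `configs/GAT.yaml` maps onto the `GAT` model defined below (mirroring the head-list construction in `utils.generate_model`), with a toy random graph standing in for a real GLI dataset:

```python
# Sketch: build a GAT for single-target regression from configs/GAT.yaml,
# the same way utils.generate_model assembles the per-layer head list.
# The graph and features are synthetic placeholders.
import dgl
import torch
import torch.nn.functional as F
import yaml
from models.gat import GAT

with open("configs/GAT.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

g = dgl.add_self_loop(dgl.rand_graph(20, 40))
feats = torch.randn(20, 16)

heads = [cfg["num_heads"]] * cfg["num_layers"] + [cfg["num_out_heads"]]
model = GAT(g, cfg["num_layers"], 16, cfg["num_hidden"], 1, heads, F.elu,
            cfg["dropout"], cfg["dropout"], cfg["negative_slope"],
            cfg["residual"])
out = model(feats)  # shape: (20, 1)
```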
+ +References: +https://github.com/dmlc/dgl/tree/master/examples/pytorch/gat +""" + +from torch import nn +from dgl.nn import GATConv + + +class GAT(nn.Module): + """GAT network.""" + + def __init__(self, + g, + num_layers, + in_dim, + num_hidden, + num_classes, + heads, + activation, + feat_drop, + attn_drop, + negative_slope, + residual): + """Initiate model.""" + super().__init__() + self.g = g + self.num_layers = num_layers + self.gat_layers = nn.ModuleList() + self.activation = activation + + # input projection (no residual) + self.gat_layers.append(GATConv( + in_dim, num_hidden, heads[0], + feat_drop, attn_drop, negative_slope, False, self.activation)) + # hidden layers + for layer in range(1, num_layers - 2): + # due to multi-head, the in_dim = num_hidden * num_heads + self.gat_layers.append(GATConv(num_hidden * heads[layer-1], + num_hidden, heads[layer], + feat_drop, attn_drop, + negative_slope, residual, + self.activation)) + # output projection + self.gat_layers.append(GATConv( + num_hidden * heads[-2], num_classes, heads[-1], + feat_drop, attn_drop, negative_slope, residual, None)) + + def forward(self, inputs): + """Forward.""" + h = inputs + for layer in range(self.num_layers): + h = self.gat_layers[layer](self.g, h) + h = h.flatten(1) if layer != self.num_layers - 1 else h.mean(1) + return h diff --git a/benchmarks/NodeRegression/models/gbdt.py b/benchmarks/NodeRegression/models/gbdt.py new file mode 100644 index 00000000..d4e49b85 --- /dev/null +++ b/benchmarks/NodeRegression/models/gbdt.py @@ -0,0 +1,272 @@ +""" +CatBoost and Lightgbm model in GLI. + +References: +https://github.com/nd7141/bgnn/blob/master/models/GBDT.py +""" + +from catboost import Pool, CatBoostClassifier, CatBoostRegressor +import time +from sklearn.metrics import mean_squared_error, accuracy_score, r2_score +import numpy as np +from collections import defaultdict as ddict +import lightgbm + + +class GBDTCatBoost: + """GDBT CatBoost.""" + + def __init__(self, + task="regression", + depth=6, + lr=0.1, + l2_leaf_reg=None, + max_bin=None): + """Initiate class.""" + self.task = task + self.depth = depth + self.learning_rate = lr + self.l2_leaf_reg = l2_leaf_reg + self.max_bin = max_bin + + def init_model(self, num_epochs, patience): + """Initiate model.""" + catboost_model_obj = CatBoostRegressor if self.task == "regression" \ + else CatBoostClassifier + # self.loss_function = "RMSE" + # if self.task == "regression" else "CrossEntropy" + self.loss_function = "RMSE" if self.task == "regression" \ + else "MultiClass" + self.custom_metrics = ["R2"] if self.task == "regression" \ + else ["Accuracy"] + # ["Accuracy", "AUC", "Precision", "Recall", "F1", "MCC", "R2"], + + print("loss function: ", self.loss_function) + print("metric: ", self.custom_metrics) + + self.model = catboost_model_obj(iterations=num_epochs, + depth=self.depth, + learning_rate=self.learning_rate, + loss_function=self.loss_function, + custom_metric=self.custom_metrics, + random_seed=0, + early_stopping_rounds=patience, + l2_leaf_reg=self.l2_leaf_reg, + max_bin=self.max_bin, + nan_mode="Min") + + def get_metrics(self): + """Get metrics.""" + d = self.model.evals_result_ + metrics = ddict(list) + keys = ["learn", "validation_0", "validation_1"] \ + if "validation_0" in self.model.evals_result_ \ + else ["learn", "validation"] + for metric_name in d[keys[0]]: + perf = [d[key][metric_name] for key in keys] + if metric_name == self.loss_function: + metrics["loss"] = list(zip(*perf)) + else: + metrics[metric_name.lower()] = list(zip(*perf)) + + 
return metrics + + def get_test_metric(self, metrics, metric_name): + """Get test metric.""" + if metric_name == "loss": + val_epoch = np.argmin([acc[1] for acc in metrics[metric_name]]) + else: + val_epoch = np.argmax([acc[1] for acc in metrics[metric_name]]) + min_metric = metrics[metric_name][val_epoch] + return min_metric, val_epoch + + def save_metrics(self, metrics, fn): + """Save metrics.""" + with open(fn, "w+", encoding="utf-8") as f: + for key, value in metrics.items(): + print(key, value, file=f) + + def train_val_test_split(self, x, y, train_mask, val_mask, test_mask): + """Get train/val/test split.""" + x_train, y_train = x[train_mask], y[train_mask] + x_val, y_val = x[val_mask], y[val_mask] + x_test, y_test = x[test_mask], y[test_mask] + return x_train, y_train, x_val, y_val, x_test, y_test + + def fit(self, + x, y, train_mask, val_mask, test_mask, + num_epochs=1000, patience=200, + plot=False, verbose=False, + loss_fn="", metric_name="loss"): + """Fit model.""" + x_train, y_train, x_val, y_val, x_test, y_test = \ + self.train_val_test_split(x, y, train_mask, val_mask, test_mask) + self.init_model(num_epochs, patience) + + start = time.time() + # print("type(x_train)", type(x_train)) + # print("type(y_train)", type(y_train)) + # print("cat_features", cat_features) + pool = Pool(x_train.numpy(), y_train.numpy()) + eval_set = [(x_val.numpy(), y_val.numpy()), + (x_test.numpy(), y_test.numpy())] + self.model.fit(pool, eval_set=eval_set, plot=plot, verbose=verbose) + finish = time.time() + + num_trees = self.model.tree_count_ + print(f"Finished training. Total time: {finish - start:.2f} |\ + Number of trees: {num_trees:d} |\ + Time per tree: {(time.time() - start )/num_trees:.2f}") + + metrics = self.get_metrics() + min_metric, min_val_epoch = self.get_test_metric(metrics, metric_name) + if loss_fn: + self.save_metrics(metrics, loss_fn) + print(f"Best {metric_name} at iteration {min_val_epoch}:\ + {min_metric[0]:.3f}/{min_metric[1]:.3f}/{min_metric[2]:.3f}") + return metrics + + def predict(self, x_test, y_test): + """Predict.""" + pred = self.model.predict(x_test) + + metrics = {} + metrics["rmse"] = mean_squared_error(pred, y_test) ** .5 + + return metrics + + +class GBDTLGBM: + """GBDT Lightgbm.""" + + def __init__(self, task="regression", lr=0.1, num_leaves=31, max_bin=255, + lambda_l1=0., lambda_l2=0., boosting="gbdt"): + """Initiate lightgbm.""" + self.task = task + self.boosting = boosting + self.learning_rate = lr + self.num_leaves = num_leaves + self.max_bin = max_bin + self.lambda_l1 = lambda_l1 + self.lambda_l2 = lambda_l2 + + def accuracy(self, preds, train_data): + """Calculate accuracy.""" + labels = train_data.get_label() + preds_classes = preds.reshape((preds.shape[0]//labels.shape[0], + labels.shape[0])).argmax(0) + return "accuracy", accuracy_score(labels, preds_classes), True + + def r2(self, preds, train_data): + """Calculate R2.""" + labels = train_data.get_label() + return "r2", r2_score(labels, preds), True + + def init_model(self): + """Initiate model.""" + self.parameters = { + "objective": "regression" if self.task == "regression" + else "multiclass", + "metric": {"rmse"} if self.task == "regression" + else {"multiclass"}, + "num_classes": self.num_classes, + "boosting": self.boosting, + "num_leaves": self.num_leaves, + "max_bin": self.max_bin, + "learning_rate": self.learning_rate, + "lambda_l1": self.lambda_l1, + "lambda_l2": self.lambda_l2, + # "num_threads": 1, + # "feature_fraction": 0.9, + # "bagging_fraction": 0.8, + # "bagging_freq": 5, + 
"verbose": 1, + # "device_type": "gpu" + } + self.evals_result = {} + + def get_metrics(self): + """Get metrics.""" + d = self.evals_result + metrics = ddict(list) + keys = ["training", "valid_1", "valid_2"] \ + if "training" in d \ + else ["valid_0", "valid_1"] + for metric_name in d[keys[0]]: + perf = [d[key][metric_name] for key in keys] + if metric_name in ["regression", "multiclass", "rmse", "l2", + "multi_logloss", "binary_logloss"]: + metrics["loss"] = list(zip(*perf)) + else: + metrics[metric_name] = list(zip(*perf)) + return metrics + + def get_test_metric(self, metrics, metric_name): + """Get test metrics.""" + if metric_name == "loss": + val_epoch = np.argmin([acc[1] for acc in metrics[metric_name]]) + else: + val_epoch = np.argmax([acc[1] for acc in metrics[metric_name]]) + min_metric = metrics[metric_name][val_epoch] + return min_metric, val_epoch + + def save_metrics(self, metrics, fn): + """Save metrics.""" + with open(fn, "w+", encoding="utf-8") as f: + for key, value in metrics.items(): + print(key, value, file=f) + + def train_val_test_split(self, x, y, train_mask, val_mask, test_mask): + """Get train/val/test splits.""" + x_train, y_train = x[train_mask], y[train_mask] + x_val, y_val = x[val_mask], y[val_mask] + x_test, y_test = x[test_mask], y[test_mask] + return x_train, y_train, x_val, y_val, x_test, y_test + + def fit(self, + x, y, train_mask, val_mask, test_mask, + num_epochs=1000, patience=200, + loss_fn="", metric_name="loss"): + """Fit model.""" + x_train, y_train, x_val, y_val, x_test, y_test = \ + self.train_val_test_split(x.numpy(), y.numpy(), + train_mask.numpy(), + val_mask.numpy(), + test_mask.numpy()) + self.num_classes = None if self.task == "regression"\ + else int(y.max()+1) + self.init_model() + + start = time.time() + train_data = lightgbm.Dataset(x_train, label=y_train) + val_data = lightgbm.Dataset(x_val, label=y_val) + test_data = lightgbm.Dataset(x_test, label=y_test) + valid_sets = [train_data, val_data, test_data] + self.model = lightgbm.train(params=self.parameters, + train_set=train_data, + valid_sets=valid_sets, + num_boost_round=num_epochs, + early_stopping_rounds=patience, + evals_result=self.evals_result, + feval=self.r2 if self.task == "regression" + else self.accuracy, + verbose_eval=1) + finish = time.time() + print(f"Finished training. Total time: {finish - start:.2f}") + + metrics = self.get_metrics() + min_metric, min_val_epoch = self.get_test_metric(metrics, metric_name) + if loss_fn: + self.save_metrics(metrics, loss_fn) + print(f"Best {metric_name} at iteration {min_val_epoch}:\ + {min_metric[0]:.3f}/{min_metric[1]:.3f}/{min_metric[2]:.3f}") + return metrics + + def predict(self, x_test, y_test): + """Predict.""" + pred = self.model.predict(x_test) + + metrics = {} + metrics["rmse"] = mean_squared_error(pred, y_test) ** .5 + + return metrics diff --git a/benchmarks/NodeRegression/models/gcn.py b/benchmarks/NodeRegression/models/gcn.py new file mode 100644 index 00000000..ffd0a047 --- /dev/null +++ b/benchmarks/NodeRegression/models/gcn.py @@ -0,0 +1,45 @@ +""" +GCN model in GLI. 
+ +References: +https://github.com/dmlc/dgl/tree/master/examples/pytorch/gcn +""" + +from torch import nn +from dgl.nn.pytorch import GraphConv + + +class GCN(nn.Module): + """GCN network.""" + + def __init__(self, + g, + in_feats, + n_hidden, + n_classes, + n_layers, + activation, + dropout): + """Initiate model.""" + super().__init__() + self.g = g + self.layers = nn.ModuleList() + # input layer + self.layers.append(GraphConv(in_feats, n_hidden, + activation=activation)) + # hidden layers + for _ in range(n_layers - 2): + self.layers.append(GraphConv(n_hidden, n_hidden, + activation=activation)) + # output layer + self.layers.append(GraphConv(n_hidden, n_classes)) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, features): + """Forward.""" + h = features + for i, layer in enumerate(self.layers): + if i != 0: + h = self.dropout(h) + h = layer(self.g, h) + return h diff --git a/benchmarks/NodeRegression/models/gcn_minibatch.py b/benchmarks/NodeRegression/models/gcn_minibatch.py new file mode 100644 index 00000000..13a329b1 --- /dev/null +++ b/benchmarks/NodeRegression/models/gcn_minibatch.py @@ -0,0 +1,47 @@ +""" +GCN model in GLI. + +References: +https://github.com/dmlc/dgl/tree/master/examples/pytorch/gcn +https://docs.dgl.ai/guide/minibatch-node.html?highlight=sampling +""" + +from torch import nn +from dgl.nn.pytorch import GraphConv + + +class GCNminibatch(nn.Module): + """GCN network.""" + + def __init__(self, + in_feats, + n_hidden, + n_classes, + n_layers, + activation, + dropout): + """Initiate model.""" + super().__init__() + self.layers = nn.ModuleList() + # input layer + self.layers.append(GraphConv(in_feats, n_hidden, + activation=activation, + norm='none')) + # hidden layers + for _ in range(n_layers - 2): + self.layers.append(GraphConv(n_hidden, n_hidden, + activation=activation, + norm='none')) + # output layer + self.layers.append(GraphConv(n_hidden, n_classes, + norm='none')) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, blocks, features): + """Forward.""" + h = features + for i, layer in enumerate(self.layers): + if i != 0: + h = self.dropout(h) + h = layer(blocks[i], h) + return h diff --git a/benchmarks/NodeRegression/models/graph_sage.py b/benchmarks/NodeRegression/models/graph_sage.py new file mode 100644 index 00000000..a62efdb2 --- /dev/null +++ b/benchmarks/NodeRegression/models/graph_sage.py @@ -0,0 +1,47 @@ +""" +GraphSAGE model in GLI. 
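To make the regression setup concrete, here is a toy full-batch training step with the `GCN` model above, mirroring what `train.py` does with `loss_fcn: mse` and the Adam optimizer; the graph, features, and targets are random stand-ins for a GLI dataset.

```python
# Sketch: a few full-batch training steps of GCN on synthetic data,
# using MSE loss as in configs/train_default.yaml.
import dgl
import torch
import torch.nn.functional as F
from torch import nn
from models.gcn import GCN

g = dgl.add_self_loop(dgl.rand_graph(50, 200))
feats = torch.randn(50, 10)
targets = torch.randn(50)  # one scalar regression target per node

model = GCN(g, in_feats=10, n_hidden=8, n_classes=1, n_layers=2,
            activation=F.relu, dropout=0.6)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-3)
loss_fcn = nn.MSELoss()

for epoch in range(5):
    model.train()
    loss = loss_fcn(model(feats).squeeze(), targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"epoch {epoch}: loss {loss.item():.4f}")
```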
+ +References: +https://github.com/dmlc/dgl/blob/master/examples/pytorch/graphsage/train_full.py +""" + +from torch import nn +from dgl.nn.pytorch.conv import SAGEConv + + +class GraphSAGE(nn.Module): + """GraphSAGE model.""" + + def __init__(self, + g, + in_feats, + n_hidden, + n_classes, + n_layers, + activation, + dropout, + aggregator_type): + """Initiate model.""" + super().__init__() + self.g = g + self.layers = nn.ModuleList() + self.dropout = nn.Dropout(dropout) + self.activation = activation + + # input layer + self.layers.append(SAGEConv(in_feats, n_hidden, aggregator_type)) + # hidden layers + for _ in range(n_layers - 2): + self.layers.append(SAGEConv(n_hidden, n_hidden, aggregator_type)) + # output layer + self.layers.append(SAGEConv(n_hidden, n_classes, aggregator_type)) + + def forward(self, inputs): + """Forward.""" + h = self.dropout(inputs) + for length, layer in enumerate(self.layers): + h = layer(self.g, h) + if length != len(self.layers) - 1: + h = self.activation(h) + h = self.dropout(h) + return h diff --git a/benchmarks/NodeRegression/models/linkx.py b/benchmarks/NodeRegression/models/linkx.py new file mode 100644 index 00000000..8168f833 --- /dev/null +++ b/benchmarks/NodeRegression/models/linkx.py @@ -0,0 +1,112 @@ +""" +LINKX model in Non-Homophily-Large-Scale. + +References: +https://github.com/CUAI/Non-Homophily-Large-Scale +""" + +import torch +from torch import nn +import torch.nn.functional as F +from torch_sparse import SparseTensor + + +class LINKX(nn.Module): + """ + LINKX method with skip connections. + + a = MLP_1(A), x = MLP_2(X), MLP_3(sigma(W_1[a, x] + a + x)). + """ + + def __init__(self, g, in_channels, hidden_channels, out_channels, + num_layers, num_nodes, dropout=.5, inner_activation=False, + inner_dropout=False, init_layers_A=1, + init_layers_X=1): + """Initiate model.""" + super().__init__() + self.g = g + self.mlpa = MLP(num_nodes, hidden_channels, hidden_channels, + init_layers_A, dropout=0) + self.mlpx = MLP(in_channels, hidden_channels, hidden_channels, + init_layers_X, dropout=0) + self.w = nn.Linear(2*hidden_channels, hidden_channels) + self.mlp_final = MLP(hidden_channels, hidden_channels, out_channels, + num_layers, dropout=dropout) + self.in_channels = in_channels + self.num_nodes = num_nodes + self.inner_activation = inner_activation + self.inner_dropout = inner_dropout + + def reset_parameters(self): + """Reset parameters.""" + self.mlpa.reset_parameters() + self.mlpx.reset_parameters() + self.w.reset_parameters() + self.mlp_final.reset_parameters() + + def forward(self, feats): + """Forward.""" + m = self.num_nodes + feat_dim = feats + row, col = self.g.edges() + row = row-row.min() + aa = SparseTensor( + row=row, col=col, sparse_sizes=(m, m) + ).to_torch_sparse_coo_tensor() + + xa = self.mlpa(aa, input_tensor=True) + xx = self.mlpx(feat_dim, input_tensor=True) + x = torch.cat((xa, xx), axis=-1) + x = self.w(x) + if self.inner_dropout: + x = F.dropout(x) + if self.inner_activation: + x = F.relu(x) + x = F.relu(x + xa + xx) + x = self.mlp_final(x, input_tensor=True) + + return x + + +class MLP(nn.Module): + """MLP model.""" + + def __init__(self, in_channels, hidden_channels, out_channels, + num_layers, dropout=.5): + """Initiate layer.""" + super().__init__() + self.lins = nn.ModuleList() + self.bns = nn.ModuleList() + if num_layers == 1: + # just linear layer i.e. 
logistic regression + self.lins.append(nn.Linear(in_channels, out_channels)) + else: + self.lins.append(nn.Linear(in_channels, hidden_channels)) + self.bns.append(nn.BatchNorm1d(hidden_channels)) + for _ in range(num_layers - 2): + self.lins.append(nn.Linear(hidden_channels, hidden_channels)) + self.bns.append(nn.BatchNorm1d(hidden_channels)) + self.lins.append(nn.Linear(hidden_channels, out_channels)) + + self.dropout = dropout + + def reset_parameters(self): + """Reset parameters.""" + for lin in self.lins: + lin.reset_parameters() + for bn in self.bns: + bn.reset_parameters() + + def forward(self, data, input_tensor=False): + """Forward.""" + if not input_tensor: + x = data.graph['node_feat'] + else: + x = data + for i, lin in enumerate(self.lins[:-1]): + x = lin(x) + x = F.relu(x, inplace=True) + x = self.bns[i](x) + x = F.dropout(x, p=self.dropout, training=self.training) + x = self.lins[-1](x) + return x diff --git a/benchmarks/NodeRegression/models/mixhop.py b/benchmarks/NodeRegression/models/mixhop.py new file mode 100644 index 00000000..747106dc --- /dev/null +++ b/benchmarks/NodeRegression/models/mixhop.py @@ -0,0 +1,132 @@ +""" +MIXHOP model in GLI. + +References: +https://github.com/dmlc/dgl/tree/master/examples/pytorch/mixhop +""" + +import torch +from torch import nn +import torch.nn.functional as F +import dgl.function as fn + + +class MixHopConv(nn.Module): + r"""MixHopConv layer.""" + + def __init__(self, + in_dim, + out_dim, + p, + dropout=0, + activation=None, + batchnorm=False): + """Initiate layer.""" + super().__init__() + self.in_dim = in_dim + self.out_dim = out_dim + self.p = p + self.activation = activation + self.batchnorm = batchnorm + + # define dropout layer + self.dropout = nn.Dropout(dropout) + + # define batch norm layer + if self.batchnorm: + self.bn = nn.BatchNorm1d(out_dim * len(p)) + + # define weight dict for each power j + self.weights = nn.ModuleDict({ + str(j): nn.Linear(in_dim, out_dim, bias=False) for j in p + }) + + def forward(self, graph, feats): + """Forward.""" + with graph.local_scope(): + degs = graph.in_degrees().float().clamp(min=1) + norm = torch.pow(degs, -0.5).to(feats.device).unsqueeze(1) + max_j = max(self.p) + 1 + outputs = [] + for j in range(max_j): + + if j in self.p: + output = self.weights[str(j)](feats) + outputs.append(output) + + feats = feats * norm + graph.ndata['h'] = feats + graph.update_all(fn.copy_u('h', 'm'), fn.sum('m', 'h')) + feats = graph.ndata.pop('h') + feats = feats * norm + + final = torch.cat(outputs, dim=1) + + if self.batchnorm: + final = self.bn(final) + + if self.activation is not None: + final = self.activation(final) + + final = self.dropout(final) + + return final + + +class MixHop(nn.Module): + """MixHop model.""" + + def __init__(self, + g, + in_dim, + hid_dim, + out_dim, + p, + num_layers=2, + input_dropout=0.0, + layer_dropout=0.0, + batchnorm=False): + """Initiate model.""" + super().__init__() + self.g = g + self.in_dim = in_dim + self.hid_dim = hid_dim + self.out_dim = out_dim + self.num_layers = num_layers + self.p = p + self.input_dropout = input_dropout + self.layer_dropout = layer_dropout + self.activation = F.tanh + self.batchnorm = batchnorm + + self.layers = nn.ModuleList() + self.dropout = nn.Dropout(self.input_dropout) + + # Input layer + self.layers.append(MixHopConv(self.in_dim, + self.hid_dim, + p=self.p, + dropout=self.input_dropout, + activation=self.activation, + batchnorm=self.batchnorm)) + + # Hidden layers with n - 1 MixHopConv layers + for _ in range(self.num_layers - 2): + 
self.layers.append(MixHopConv(self.hid_dim * len(p), + self.hid_dim, + p=self.p, + dropout=self.layer_dropout, + activation=self.activation, + batchnorm=self.batchnorm)) + + self.fc_layers = nn.Linear(self.hid_dim * len(p), + self.out_dim, bias=False) + + def forward(self, feats): + """Forward.""" + feats = self.dropout(feats) + for layer in self.layers: + feats = layer(self.g, feats) + + feats = self.fc_layers(feats) + return feats diff --git a/benchmarks/NodeRegression/models/mlp.py b/benchmarks/NodeRegression/models/mlp.py new file mode 100644 index 00000000..393412ab --- /dev/null +++ b/benchmarks/NodeRegression/models/mlp.py @@ -0,0 +1,45 @@ +""" +MLP model in GLI. + +References: +https://github.com/dmlc/dgl/blob/195f99362d883f8b6d131b70a7868a537e55b786/examples/pytorch/grand/model.py +""" + +from torch import nn + + +class MLP(nn.Module): + """MLP network.""" + + def __init__(self, + in_feats, + n_hidden, + n_classes, + n_layers, + activation, + dropout): + """Initiate model.""" + super().__init__() + self.layers = nn.ModuleList() + self.activation = activation + # input layer + self.layers.append(nn.Linear(in_feats, n_hidden, bias=True)) + + # hidden layers + for _ in range(n_layers - 1): + self.layers.append(nn.Linear(n_hidden, n_hidden, bias=True)) + + # output layer + self.layers.append(nn.Linear(n_hidden, n_classes, bias=True)) + + self.dropout = nn.Dropout(dropout) + + def forward(self, features): + """Forward.""" + h = features + for i, layer in enumerate(self.layers): + if i != 0: + h = self.dropout(h) + h = layer(h) + h = self.activation(h) + return h diff --git a/benchmarks/NodeRegression/models/monet.py b/benchmarks/NodeRegression/models/monet.py new file mode 100644 index 00000000..c54c0af7 --- /dev/null +++ b/benchmarks/NodeRegression/models/monet.py @@ -0,0 +1,63 @@ +""" +GAT model in GLI. + +References: +https://github.com/dmlc/dgl/blob/master/examples/pytorch/monet/citation.py +""" + +import torch +from torch import nn +from dgl.nn.pytorch.conv import GMMConv + + +class MoNet(nn.Module): + """Monet model.""" + + def __init__(self, + g, + in_feats, + n_hidden, + out_feats, + n_layers, + dim, + n_kernels, + dropout): + """Initiate model.""" + super().__init__() + self.g = g + self.layers = nn.ModuleList() + self.pseudo_proj = nn.ModuleList() + + # process pseudo + us, vs = g.edges(order="eid") + udeg, vdeg = 1 / torch.sqrt(g.in_degrees(us).float()), 1 / \ + torch.sqrt(g.in_degrees(vs).float()) + self.pseudo = torch.cat([udeg.unsqueeze(1), vdeg.unsqueeze(1)], dim=1) + + # Input layer + self.layers.append( + GMMConv(in_feats, n_hidden, dim, n_kernels)) + self.pseudo_proj.append( + nn.Sequential(nn.Linear(2, dim), nn.Tanh())) + + # Hidden layer + for _ in range(n_layers - 2): + self.layers.append(GMMConv(n_hidden, n_hidden, dim, n_kernels)) + self.pseudo_proj.append( + nn.Sequential(nn.Linear(2, dim), nn.Tanh())) + + # Output layer + self.layers.append(GMMConv(n_hidden, out_feats, dim, n_kernels)) + self.pseudo_proj.append( + nn.Sequential(nn.Linear(2, dim), nn.Tanh())) + self.dropout = nn.Dropout(dropout) + + def forward(self, feat): + """Forward.""" + h = feat + for i in range(len(self.layers)): + if i != 0: + h = self.dropout(h) + h = self.layers[i]( + self.g, h, self.pseudo_proj[i](self.pseudo)) + return h diff --git a/benchmarks/NodeRegression/train.py b/benchmarks/NodeRegression/train.py new file mode 100644 index 00000000..b19c9838 --- /dev/null +++ b/benchmarks/NodeRegression/train.py @@ -0,0 +1,178 @@ +""" +Train for node regression dataset. 
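As with the other models, `MixHop` above is instantiated by `utils.generate_model` from the values in `configs/MixHop.yaml`; a minimal stand-alone sketch with a toy graph (all data here is synthetic):

```python
# Sketch: instantiate MixHop with the values shipped in configs/MixHop.yaml
# and run a forward pass on random data.
import dgl
import torch
from models.mixhop import MixHop

g = dgl.add_self_loop(dgl.rand_graph(30, 120))
feats = torch.randn(30, 12)

model = MixHop(g, in_dim=12, hid_dim=8, out_dim=1, p=[0, 1, 2],
               num_layers=2, input_dropout=0.5, layer_dropout=0.9,
               batchnorm=False)
out = model(feats)  # shape: (30, 1)
```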
+ +References: +https://github.com/dmlc/dgl/blob/master/examples/pytorch/gat/train.py +https://github.com/pyg-team/pytorch_geometric/blob/master/graphgym/main.py +""" + + +import time +import re +import torch +from torch import nn +import numpy as np +import dgl +import gli +from utils import generate_model, parse_args, Models_need_to_be_densed,\ + load_config_file, check_multiple_split,\ + EarlyStopping, set_seed, Datasets_need_to_be_undirected,\ + get_label_number +from gli.utils import to_dense + + +def evaluate(model, features, labels, mask, eval_func): + """Evaluate model.""" + model.eval() + with torch.no_grad(): + logits = model(features) + logits = logits[mask] + labels = labels[mask] + return eval_func(logits.squeeze(), labels) + + +def main(): + """Load dataset and train the model.""" + # Load cmd line args + args = parse_args() + print(args) + # Load config file + model_cfg = load_config_file(args.model_cfg) + train_cfg = load_config_file(args.train_cfg) + set_seed(train_cfg["seed"]) + + # load and preprocess dataset + if args.gpu < 0: + device = "cpu" + cuda = False + else: + device = args.gpu + cuda = True + + data = gli.dataloading.get_gli_dataset(args.dataset, args.task, + args.task_id, device, + args.verbose) + g = data[0] + if train_cfg["to_dense"] or \ + args.model in Models_need_to_be_densed: + g = to_dense(g) + # add self loop + if train_cfg["self_loop"]: + g = dgl.remove_self_loop(g) + g = dgl.add_self_loop(g) + # convert to undirected set + if train_cfg["to_undirected"] or \ + args.dataset in Datasets_need_to_be_undirected: + g = g.to("cpu") + g = dgl.to_bidirected(g, copy_ndata=True) + g = g.to(device) + + feature_name = re.search(r".*Node/(\w+)", data.features[0]).group(1) + label_name = re.search(r".*Node/(\w+)", data.target).group(1) + features = g.ndata[feature_name] + labels = g.ndata[label_name].squeeze() + train_mask = g.ndata["train_mask"] + val_mask = g.ndata["val_mask"] + test_mask = g.ndata["test_mask"] + + # for multi-split dataset, choose 0-th split for now + if check_multiple_split(args.dataset): + train_mask = train_mask[:, 0] + val_mask = val_mask[:, 0] + test_mask = test_mask[:, 0] + + # When labels contains -1, modify masks + if labels.min() < 0: + train_mask = train_mask * (labels >= 0) + val_mask = val_mask * (labels >= 0) + test_mask = test_mask * (labels >= 0) + + in_feats = features.shape[1] + n_classes = data.num_labels + n_edges = g.number_of_edges() + + print(f"""----Data statistics------' + #Edges {n_edges} + #Classes {n_classes} + #Train samples {train_mask.int().sum().item()} + #Val samples {val_mask.int().sum().item()} + #Test samples {test_mask.int().sum().item()}""") + + # create model + label_number = get_label_number(labels) + model = generate_model(args, g, in_feats, label_number, **model_cfg) + + # create loss function and evalution function + if train_cfg["loss_fcn"] == "mse": + eval_func = loss_fcn = nn.MSELoss() + elif train_cfg["loss_fcn"] == "mae": + eval_func = loss_fcn = nn.L1Loss() + else: + raise NotImplementedError(f"Loss function \ + {train_cfg['loss_fcn']} is not supported.") + + print(model) + if cuda: + model.cuda() + + # use optimizer + if train_cfg["optimizer"] == "AdamW": + optimizer = torch.optim.AdamW( + model.parameters(), lr=train_cfg["lr"], + weight_decay=train_cfg["weight_decay"]) + elif train_cfg["optimizer"] == "Adam": + optimizer = torch.optim.Adam( + model.parameters(), lr=train_cfg["lr"], + weight_decay=train_cfg["weight_decay"]) + else: + raise NotImplementedError(f"Optimizer \ + 
{train_cfg['optimizer']} is not supported.") + + ckpt_name = args.model + "_" + args.dataset + "_" + ckpt_name += args.train_cfg + stopper = EarlyStopping(ckpt_name=ckpt_name, + early_stop=train_cfg["early_stopping"], + patience=50) + + # initialize graph + dur = [] + for epoch in range(train_cfg["max_epoch"]): + model.train() + if epoch >= 3: + if cuda: + torch.cuda.synchronize() + t0 = time.time() + # forward + logits = model(features) + loss = loss_fcn(logits[train_mask].squeeze(), + labels[train_mask].float()) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + if epoch >= 3: + if cuda: + torch.cuda.synchronize() + dur.append(time.time() - t0) + + val_loss = evaluate(model, features, labels, val_mask, eval_func) + print(f"Epoch {epoch:05d} | Time(s) {np.mean(dur):.4f}" + f"| Loss {loss.item():.4f} | " + f" Val Loss {val_loss.item():.4f} | " + f"ETputs(KTEPS) {n_edges / np.mean(dur) / 1000:.2f}") + + if stopper.step(val_loss.item(), model): + break + + print() + + model.load_state_dict(torch.load(stopper.ckpt_dir)) + + loss = evaluate(model, features, labels, test_mask, eval_func) + val_loss = stopper.best_score + print(f"Test loss {loss:.4f}, Val loss {val_loss:.4f}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/NodeRegression/train_gbdt.py b/benchmarks/NodeRegression/train_gbdt.py new file mode 100644 index 00000000..4fa76f97 --- /dev/null +++ b/benchmarks/NodeRegression/train_gbdt.py @@ -0,0 +1,243 @@ +""" +Train for GBDT. + +Reference: +https://github.com/nd7141/bgnn/blob/master/scripts/run.py +""" +import os +import re +import json +import time +import datetime +from collections import defaultdict as ddict + +import numpy as np +from sklearn.model_selection import ParameterGrid + +import gli +from utils import parse_args, set_seed,\ + load_config_file, check_multiple_split +from gli.utils import to_dense +from models.gbdt import GBDTCatBoost, GBDTLGBM + + +class RunModel: + """Model class for gbdt.""" + + def __init__(self, args, model_cfg, train_cfg): + """Initiate model.""" + self.args = args + self.model_cfg = model_cfg + self.train_cfg = train_cfg + + def read_input(self): + """Read input.""" + data = gli.dataloading.get_gli_dataset(self.args.dataset, + "NodeClassification") + g = data[0] + g = to_dense(g) + feature_name = re.search(r".*Node/(\w+)", data.features[0]).group(1) + label_name = re.search(r".*Node/(\w+)", data.target).group(1) + features = g.ndata[feature_name] + labels = g.ndata[label_name].squeeze() + train_mask = g.ndata["train_mask"] + val_mask = g.ndata["val_mask"] + test_mask = g.ndata["test_mask"] + # for multi-split dataset, choose 0-th split for now + if check_multiple_split(self.args.dataset): + train_mask = train_mask[:, 0] + val_mask = val_mask[:, 0] + test_mask = test_mask[:, 0] + + # When labels contains -1, modify masks + if labels.min() < 0: + train_mask = train_mask * (labels >= 0) + val_mask = val_mask * (labels >= 0) + test_mask = test_mask * (labels >= 0) + + self.x = features + self.y = labels + + self.masks = {"0": {"train": train_mask, + "val": val_mask, + "test": test_mask}} + + def get_input(self): + """Get input.""" + if self.save_folder is None: + self.save_folder = f"grid/gdbt_results/{self.args.dataset}/\ + {datetime.datetime.now().strftime('%d_%m')}" + + self.read_input() + print("Save to folder:", self.save_folder) + + def run_one_model(self, config_fn, model_name): + """Run single model.""" + print(config_fn) + # self.config = OmegaConf.load(config_fn) + # print(type(self.config)) + # 
print(self.config) + # grid = ParameterGrid(dict(self.config.hp)) + + self.config = load_config_file(config_fn) + print(type(self.config)) + print(self.config) + grid = ParameterGrid(self.config["hp"]) + + for ps in grid: + print("hyper params: ", ps) + param_string = "".join([f"-{key}{ps[key]}" for key in ps]) + exp_name = f"{model_name}{param_string}" + print(f"\nSeed {self.seed} RUNNING:{exp_name}") + + runs = [] + runs_custom = [] + times = [] + for _ in range(self.repeat_exp): + start = time.time() + model = self.define_model(model_name, ps) + + inputs = {"x": self.x, "y": self.y, + "train_mask": self.train_mask, + "val_mask": self.val_mask, + "test_mask": self.test_mask} + + metrics = model.fit(num_epochs=self.config["num_epochs"], + patience=self.config["patience"], + loss_fn=f"{self.seed_folder}/\ + {exp_name}.txt", + metric_name="loss" + if self.task == "regression" + else "accuracy", **inputs) + finish = time.time() + best_loss = min(metrics["loss"], key=lambda x: x[1]) + best_custom = max(metrics["r2" if self.task == "regression" + else "accuracy"], + key=lambda x: x[1]) + runs.append(best_loss) + runs_custom.append(best_custom) + times.append(finish - start) + self.store_results[exp_name] = (list(map(np.mean, + zip(*runs))), + list(map(np.mean, + zip(*runs_custom))), + np.mean(times)) + + def define_model(self, model_name, ps): + """Define model.""" + if model_name == "catboost": + return GBDTCatBoost(self.task, **ps) + elif model_name == "lightgbm": + return GBDTLGBM(self.task, **ps) + + def create_save_folder(self, seed): + """Create folder to save output.""" + self.seed_folder = f"{self.save_folder}/{seed}" + os.makedirs(self.seed_folder, exist_ok=True) + + def split_masks(self, seed): + """Split masks.""" + self.train_mask = self.masks[seed]["train"] + self.val_mask = self.masks[seed]["val"] + self.test_mask = self.masks[seed]["test"] + + def save_results(self, seed): + """Save results.""" + self.seed_results[seed] = self.store_results + with open(f"{self.save_folder}/seed_results.json", "w+", + encoding="utf-8") as f: + json.dump(self.seed_results, f) + + self.aggregated = self.aggregate_results() + with open(f"{self.save_folder}/aggregated_results.json", "w+", + encoding="utf-8") as f: + json.dump(self.aggregated, f) + + def aggregate_results(self): + """Aggregate results.""" + model_best_score = ddict(list) + model_best_time = ddict(list) + + results = self.seed_results + # print("results:", results) + for seed_tuple in results.items(): + # print("seed_tuple", seed_tuple) + # print("seed_tuple[1]", seed_tuple[1]) + model_results_for_seed = ddict(list) + for _, output in seed_tuple[1].items(): + model_name = self.args.model + if self.task == "regression": # rmse metric + val_metric, test_metric = output[0][1], output[0][2] + cur_time = output[2] + else: # accuracy metric + val_metric, test_metric = output[1][1], output[1][2] + cur_time = output[2] + model_results_for_seed[model_name].append((val_metric, + test_metric, + cur_time)) + + for model_name, model_results in model_results_for_seed.items(): + if self.task == "regression": + best_result = min(model_results) # rmse + else: + best_result = max(model_results) # accuracy + model_best_score[model_name].append(best_result[1]) + model_best_time[model_name].append(best_result[2]) + + aggregated = {} + for model, scores in model_best_score.items(): + aggregated[model] = (np.mean(scores), np.std(scores), + np.mean(model_best_time[model]), + np.std(model_best_time[model])) + return aggregated + + def run(self, + save_folder: 
str = None, + task: str = "NodeClassification", + repeat_exp: int = 1, + max_seeds: int = 5, + ): + """Run the model.""" + start2run = time.time() + self.repeat_exp = repeat_exp + self.max_seeds = max_seeds + print(self.args.dataset, task, repeat_exp, max_seeds) + + self.task = task + self.save_folder = save_folder + self.get_input() + + self.seed_results = {} + for ix, seed in enumerate(self.masks): + print(f"{self.args.dataset} Seed {seed}") + self.seed = seed + + self.create_save_folder(seed) + self.split_masks(seed) + + self.store_results = {} + self.run_one_model("configs/" + self.args.model + ".yaml", + self.args.model) + + self.save_results(seed) + if ix+1 >= max_seeds: + break + + print(f"Finished {self.args.dataset}: {time.time() - start2run} sec.") + + +def main(): + """Load dataset and train the model.""" + # Load cmd line args + args = parse_args() + print(args) + # Load config file + model_cfg = load_config_file(args.model_cfg) + train_cfg = load_config_file(args.train_cfg) + set_seed(train_cfg["seed"]) + + RunModel(args, model_cfg, train_cfg).run() + + +if __name__ == "__main__": + main() diff --git a/benchmarks/NodeRegression/train_minibatch.py b/benchmarks/NodeRegression/train_minibatch.py new file mode 100644 index 00000000..5acc0006 --- /dev/null +++ b/benchmarks/NodeRegression/train_minibatch.py @@ -0,0 +1,207 @@ +""" +Train for node classification dataset. + +References: +https://github.com/dmlc/dgl/blob/master/examples/pytorch/gat/train.py +https://github.com/pyg-team/pytorch_geometric/blob/master/graphgym/main.py +https://docs.dgl.ai/guide/minibatch-node.html?highlight=sampling +""" + + +import time +import re +import torch +from torch import nn +import numpy as np +import dgl +import gli +from utils import generate_model, parse_args, \ + load_config_file, check_multiple_split,\ + EarlyStopping, set_seed +from gli.utils import to_dense +from dgl.dataloading import MultiLayerFullNeighborSampler as Sampler + + +# def accuracy(logits, labels): +# """Calculate accuracy.""" +# _, indices = torch.max(logits, dim=1) +# correct = torch.sum(indices == labels) +# return correct.item() * 1.0 / len(labels) + + +def evaluate(model, dataloader, eval_func): + """Evaluate model.""" + model.eval() + ys = [] + y_hats = [] + for _, _, blocks in dataloader: + with torch.no_grad(): + input_features = blocks[0].srcdata["NodeFeature"] + ys.append(blocks[-1].dstdata["NodeLabel"]) + y_hats.append(model(blocks, input_features)) + return eval_func(torch.cat(y_hats).squeeze(), torch.cat(ys).float()) + + +def main(): + """Load dataset and train the model.""" + # Load cmd line args + args = parse_args() + print(args) + # Load config file + model_cfg = load_config_file(args.model_cfg) + train_cfg = load_config_file(args.train_cfg) + set_seed(train_cfg["seed"]) + + # load and preprocess dataset + if args.gpu < 0: + device = "cpu" + cuda = False + else: + device = args.gpu + cuda = True + + data = gli.dataloading.get_gli_dataset(args.dataset, args.task, + device=device) + # check EdgeFeature and multi-modal node features + edge_cnt = node_cnt = 0 + if len(data.features) > 1: + for _, element in enumerate(data.features): + if "Edge" in element: + edge_cnt += 1 + if "Node" in element: + node_cnt += 1 + if edge_cnt >= 1: + raise NotImplementedError("Edge feature is not supported yet.") + elif node_cnt >= 2: + raise NotImplementedError("Multi-modal node features\ + is not supported yet.") + + g = data[0] + indice = data.get_node_indices() + + g = to_dense(g) + # add self loop + if train_cfg["self_loop"]: 
+ g = dgl.remove_self_loop(g) + g = dgl.add_self_loop(g) + + feature_name = re.search(r".*Node/(\w+)", data.features[0]).group(1) + features = g.ndata[feature_name] + + # for multi-split dataset, choose 0-th split for now + if check_multiple_split(args.dataset): + train_mask = train_mask[:, 0] + val_mask = val_mask[:, 0] + test_mask = test_mask[:, 0] + + in_feats = features.shape[1] + n_classes = data.num_labels + n_edges = g.number_of_edges() + + sampler = Sampler(model_cfg["num_layers"]) + train_dataloader = dgl.dataloading.DataLoader( + g, indice["train_set"], sampler, + batch_size=train_cfg["batch_size"], + device=device, + shuffle=True, + drop_last=False) + + valid_dataloader = dgl.dataloading.DataLoader( + g, indice["val_set"], sampler, + device=device, + batch_size=train_cfg["batch_size"], + shuffle=True, + drop_last=False) + + test_dataloader = dgl.dataloading.DataLoader( + g, indice["test_set"], sampler, + device=device, + batch_size=train_cfg["batch_size"], + shuffle=True, + drop_last=False) + + print(f"""----Data statistics------' + #Edges {n_edges} + #Classes {n_classes}""") + + # create model, supporting only single label task + label_number = 1 + model = generate_model(args, g, in_feats, label_number, **model_cfg) + + print(model) + if cuda: + model.cuda() + + # create loss function and evalution function + if train_cfg["loss_fcn"] == "mse": + eval_func = loss_fcn = nn.MSELoss() + elif train_cfg["loss_fcn"] == "mae": + eval_func = loss_fcn = nn.L1Loss() + else: + raise NotImplementedError(f"Loss function \ + {train_cfg['loss_fcn']} is not supported.") + + # use optimizer + optimizer = torch.optim.Adam( + model.parameters(), lr=train_cfg["lr"], + weight_decay=train_cfg["weight_decay"]) + + ckpt_name = args.model + "_" + args.dataset + "_" + ckpt_name += args.train_cfg + stopper = EarlyStopping(ckpt_name=ckpt_name, + early_stop=train_cfg["early_stopping"], + patience=50) + + # initialize graph + dur = [] + for epoch in range(train_cfg["max_epoch"]): + model.train() + if epoch >= 3: + if cuda: + torch.cuda.synchronize() + t0 = time.time() + + for it, (_, _, blocks) in enumerate(train_dataloader): + if cuda: + blocks = [b.to(torch.device("cuda")) for b in blocks] + input_features = blocks[0].srcdata["NodeFeature"] + output_labels = blocks[-1].dstdata["NodeLabel"] + + # When labels contains -1, modify labels + if min(output_labels) < 0: + output_labels = output_labels * (output_labels >= 0) + + logits = model(blocks, input_features) + loss = loss_fcn(logits.squeeze(), output_labels.float()) + optimizer.zero_grad() + loss.backward() + optimizer.step() + if it % 20 == 0: + # train_acc = loss_fcn(logits.squeeze(), output_labels.float()) + print("Loss", loss.item()) + + if epoch >= 3: + if cuda: + torch.cuda.synchronize() + dur.append(time.time() - t0) + + val_loss = evaluate(model, valid_dataloader, eval_func) + print(f"Epoch {epoch:05d} | Time(s) {np.mean(dur):.4f}" + f"| Loss {loss:.4f} | " + f" Val Loss {val_loss.item():.4f} | " + f"ETputs(KTEPS) {n_edges / np.mean(dur) / 1000:.2f}") + + if stopper.step(val_loss.item(), model): + break + + print() + + model.load_state_dict(torch.load(stopper.ckpt_dir)) + + loss = evaluate(model, test_dataloader, eval_func) + val_loss = stopper.best_score + print(f"Test loss {loss:.4f}, Val loss {val_loss:.4f}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/NodeRegression/utils.py b/benchmarks/NodeRegression/utils.py new file mode 100644 index 00000000..682fe141 --- /dev/null +++ b/benchmarks/NodeRegression/utils.py @@ -0,0 
+1,241 @@ +""" +Utility functions. + +References: +https://github.com/pyg-team/pytorch_geometric/blob/ +575611f4f5e2209c7923dba977a1eebc207bd2e2/torch_geometric/ +graphgym/cmd_args.py +https://github.com/dmlc/dgl/blob/a107993f106cecb1c375f7a6ae41088d04f29e29/ +examples/pytorch/caregnn/utils.py +https://github.com/CUAI/Non-Homophily-Large-Scale/blob/master/data_utils.py +""" +import argparse +import yaml +import os +import fnmatch +import json +import random +import shutil +import torch +import torch.nn.functional as F +import numpy as np +from models.gcn import GCN +from models.gat import GAT +from models.monet import MoNet +from models.graph_sage import GraphSAGE +from models.mlp import MLP +from models.gcn_minibatch import GCNminibatch +from models.mixhop import MixHop +from models.linkx import LINKX + +Models_need_to_be_densed = ["GCN", "GraphSAGE", "GAT", "MixHop", "LINKX"] +Datasets_need_to_be_undirected = ["pokec", "genius", "penn94", "twitch-gamers"] + + +def generate_model(args, g, in_feats, n_classes, **model_cfg): + """Generate required model.""" + # create models + if args.model == "GCN": + model = GCN(g=g, + in_feats=in_feats, + n_hidden=model_cfg["num_hidden"], + n_classes=n_classes, + n_layers=model_cfg["num_layers"], + activation=F.relu, + dropout=model_cfg["dropout"]) + elif args.model == "GAT": + heads = ([model_cfg["num_heads"]] * (model_cfg["num_layers"]))\ + + [model_cfg["num_out_heads"]] + model = GAT(g, + model_cfg["num_layers"], + in_feats, + model_cfg["num_hidden"], + n_classes, + heads, + F.elu, + model_cfg["dropout"], + model_cfg["dropout"], + model_cfg["negative_slope"], + model_cfg["residual"]) + elif args.model == "MoNet": + model = MoNet(g, + in_feats, + model_cfg["num_hidden"], + n_classes, + model_cfg["num_layers"], + model_cfg["pseudo_dim"], + model_cfg["num_kernels"], + model_cfg["dropout"]) + elif args.model == "GraphSAGE": + model = GraphSAGE(g, + in_feats, + model_cfg["num_hidden"], + n_classes, + model_cfg["num_layers"], + F.relu, + model_cfg["dropout"], + model_cfg["aggregator_type"]) + elif args.model == "MLP": + model = MLP(in_feats, + model_cfg["num_hidden"], + n_classes, + model_cfg["num_layers"], + F.relu, + model_cfg["dropout"]) + elif args.model == "GCN_minibatch": + model = GCNminibatch(in_feats, + model_cfg["num_hidden"], + n_classes, + model_cfg["num_layers"], + F.relu, + model_cfg["dropout"]) + elif args.model == "MixHop": + model = MixHop(g, + in_dim=in_feats, + hid_dim=model_cfg["num_hidden"], + out_dim=n_classes, + p=model_cfg["p"], + num_layers=model_cfg["num_layers"], + input_dropout=model_cfg["dropout"], + layer_dropout=model_cfg["layer_dropout"], + batchnorm=model_cfg["batchnorm"]) + elif args.model == "LINKX": + model = LINKX(g=g, + in_channels=in_feats, + num_nodes=g.ndata["NodeFeature"].shape[0], + hidden_channels=model_cfg["num_hidden"], + out_channels=n_classes, + num_layers=model_cfg["num_layers"], + dropout=model_cfg["dropout"], + inner_activation=model_cfg["inner_activation"], + inner_dropout=model_cfg["inner_dropout"], + init_layers_A=model_cfg["init_layers_A"], + init_layers_X=model_cfg["init_layers_X"]) + try: + model + except UnboundLocalError as exc: + raise NameError(f"model {args.model} is not supported yet.") from exc + else: + return model + + +def parse_args(): + """Parse the command line arguments.""" + parser = argparse.ArgumentParser(description="train for node\ + classification") + parser.add_argument("--model-cfg", type=str, + default="configs/model_default.yaml", + help="The model configuration file path.") + 
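A small usage sketch for `generate_model` above, building the MLP baseline from the default model configuration; the `argparse.Namespace`, toy graph, and feature sizes are placeholders for what `train.py` supplies at runtime.

```python
# Sketch: build the MLP baseline via generate_model using
# configs/model_default.yaml; only args.model is read by generate_model.
import argparse
import dgl
import torch
from utils import generate_model, load_config_file

args = argparse.Namespace(model="MLP")  # stand-in for parse_args()
model_cfg = load_config_file("configs/model_default.yaml")

g = dgl.rand_graph(10, 20)  # unused by MLP, but required by the signature
model = generate_model(args, g, in_feats=16, n_classes=1, **model_cfg)
out = model(torch.randn(10, 16))  # shape: (10, 1)
```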
parser.add_argument("--train-cfg", type=str, + default="configs/train_default.yaml", + help="The training configuration file path.") + parser.add_argument("--model", type=str, default="GCN", + help="model to be used. GCN, GAT, MoNet,\ + GraphSAGE, MLP, LINKX, MixHop for now") + parser.add_argument("--dataset", type=str, default="cora", + help="dataset to be trained") + parser.add_argument("--task", type=str, + default="NodeRegression", + help="task name. NodeClassification,\ + GraphClassification, LinkPrediction,\ + TimeDependentLinkPrediction,\ + KGRelationPrediction, NodeRegression.\ + KGEntityPrediction, GraphRegression,\ + for now") + parser.add_argument("--task-id", type=int, default=1, + help="task id, starting from 1") + parser.add_argument("--gpu", type=int, default=-1, + help="which GPU to use. Set -1 to use CPU.") + parser.add_argument("--verbose", type=bool, default=False, + help="whether to print verbosely") + return parser.parse_args() + + +def load_config_file(path): + """Load yaml files.""" + with open(path, "r", encoding="utf-8") as stream: + try: + parsed_yaml = yaml.full_load(stream) + print(parsed_yaml) + return parsed_yaml + except yaml.YAMLError as exc: + print(exc) + + +def check_multiple_split(dataset): + """Check whether the dataset has multiple splits.""" + dataset_directory = os.path.dirname(os.path.dirname(os.getcwd())) \ + + "/datasets/" + dataset + for file in os.listdir(dataset_directory): + if fnmatch.fnmatch(file, "task*.json"): + with open(dataset_directory + "/" + file, encoding="utf-8") as f: + task_dict = json.load(f) + if "num_splits" in task_dict and task_dict["num_splits"] > 1: + return 1 + else: + return 0 + + +class EarlyStopping: + """Do early stopping.""" + + def __init__(self, ckpt_name, early_stop, patience=50): + """Init early stopping.""" + self.patience = patience + self.counter = 0 + self.best_score = None + self.early_stop = False + self.early_stop_flag = early_stop + self.dir_name = "checkpoints/" + if ~os.path.isdir(self.dir_name): + os.makedirs(self.dir_name, exist_ok=True) + ckpt_name = ckpt_name.replace("/", "_") + ckpt_name = os.path.splitext(ckpt_name)[0] + self.ckpt_dir = self.dir_name + ckpt_name + "_checkpoint.pt" + + def step(self, loss, model): + """Step early stopping.""" + score = loss + if self.best_score is None: + self.best_score = score + self.save_checkpoint(model) + elif score > self.best_score: + if self.early_stop_flag: + self.counter += 1 + print(f"EarlyStopping counter: {self.counter}\ + out of {self.patience}") + if self.counter >= self.patience: + self.early_stop = True + else: + self.best_score = score + self.save_checkpoint(model) + self.counter = 0 + return self.early_stop + + def save_checkpoint(self, model): + """Save model when validation loss decrease.""" + torch.save(model.state_dict(), self.ckpt_dir) + + +def makedirs_rm_exist(dir_name): + """Make a directory, remove any existing data.""" + if os.path.isdir(dir_name): + shutil.rmtree(dir_name) + os.makedirs(dir_name, exist_ok=True) + + +def set_seed(seed): + """Set random seed.""" + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + + +def get_label_number(labels): + """Return the label number of dataset.""" + if len(labels.shape) > 1: + return labels.shape[1] + else: + return 1
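Finally, a small usage sketch for the `EarlyStopping` helper above, tracking a validation loss the way `train.py` does; the losses and the tiny model are synthetic, and the checkpoint is written under `checkpoints/`.

```python
# Sketch: EarlyStopping keeps the best (lowest) validation loss and stops
# after `patience` epochs without improvement; train.py uses patience=50.
import torch
from utils import EarlyStopping

model = torch.nn.Linear(4, 1)  # stand-in for a real GNN
stopper = EarlyStopping(ckpt_name="demo", early_stop=True, patience=3)

for epoch, val_loss in enumerate([1.0, 0.8, 0.9, 0.95, 0.97, 0.99]):
    if stopper.step(val_loss, model):
        print(f"early stop at epoch {epoch}")
        break

model.load_state_dict(torch.load(stopper.ckpt_dir))
```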