Graph-Learning-Benchmarks · JackLiuyiyao · Jan 31, 2024 · Feb 1, 2024 · Feb 1, 2024 · Feb 2, 2024
diff --git a/datasets/arxiv-2023/LICENSE b/datasets/arxiv-2023/LICENSE
diff --git a/datasets/arxiv-2023/LLM b/datasets/arxiv-2023/LLM
diff --git a/datasets/arxiv-2023/README.md b/datasets/arxiv-2023/README.md
@@ -0,0 +1,82 @@
+# ARXIV-2023
+
+## Dataset Description
+
+A text-attributed graph dataset in which each node is associated with multiple text attributes.
+It was collected as a counterpart to ogbn-arxiv, so that the two can be compared. Both datasets represent directed citation networks in which each node corresponds to a paper published on arXiv and each edge indicates one paper citing another.
+
+Statistics:
+- Nodes: 33868
+- Edges: 305672
+- Number of Classes: 40
+
+#### Citation
+
+- Original Source
+	+ [Website](https://github.com/TRAIS-Lab/LLM-Structured-Data)
+
+
+
+```
+@misc{huang2023llms,
+      title={Can LLMs Effectively Leverage Graph Structural Information: When and Why}, 
+      author={Jin Huang and Xingjian Zhang and Qiaozhu Mei and Jiaqi Ma},
+      year={2023},
+      eprint={2309.16595},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG}
+}
+```
+
+- Current Version
+	+ [Website](https://github.com/TRAIS-Lab/LLM-Structured-Data)
+
+
+
+```
+@misc{huang2023llms,
+      title={Can LLMs Effectively Leverage Graph Structural Information: When and Why}, 
+      author={Jin Huang and Xingjian Zhang and Qiaozhu Mei and Jiaqi Ma},
+      year={2023},
+      eprint={2309.16595},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG}
+}
+```
+
+
+## Available Tasks
+
+
+- Task type: `NodeClassification`
+
+
+#### Citation
+
+```
+@misc{huang2023llms,
+      title={Can LLMs Effectively Leverage Graph Structural Information: When and Why}, 
+      author={Jin Huang and Xingjian Zhang and Qiaozhu Mei and Jiaqi Ma},
+      year={2023},
+      eprint={2309.16595},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG}
+}
+```
+
+<!-- Insert the BibTeX citation into the above code block. -->
+
+## Preprocessing
+
+The data files and the task config file in GLI format are generated by the `arxiv-2023.ipynb` notebook. The raw data was acquired from the TRAIS-Lab/LLM-Structured-Data repository.
+
+### Requirements
+
+```
+openai
+pytorch
+PyG
+ogb
+```
+
+
diff --git a/datasets/arxiv-2023/arxiv-2023.ipynb b/datasets/arxiv-2023/arxiv-2023.ipynb
@@ -0,0 +1,194 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# arxiv-2023 conversion script"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/51/yl5_04f90f13_y68cyyqz0j80000gn/T/ipykernel_48974/3045227301.py:3: DeprecationWarning: \n",
+      "Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n",
+      "(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n",
+      "but was not found to be installed on your system.\n",
+      "If this would cause problems for you,\n",
+      "please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n",
+      "        \n",
+      "  import pandas as pd\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import torch\n",
+    "import pandas as pd\n",
+    "import numpy\n",
+    "import json"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "base_path = \"./LLM/dataset/arxiv_2023\"\n",
+    "\n",
+    "# Load processed data (torch.load may unpickle arbitrary objects: only use trusted files)\n",
+    "edge_index = torch.load(os.path.join(base_path, \"processed\", \"edge_index.pt\"))\n",
+    "features = torch.load(os.path.join(base_path, \"processed\", \"features.pt\"))\n",
+    "y = torch.load(os.path.join(base_path, \"processed\", \"labels.pt\"))\n",
+    "\n",
+    "# Load raw data\n",
+    "titles_df = pd.read_csv(os.path.join(base_path, \"raw\", \"titles.csv.gz\"), compression='gzip')\n",
+    "abstracts_df = pd.read_csv(os.path.join(base_path, \"raw\", \"abstracts.csv.gz\"), compression='gzip')\n",
+    "ids_df = pd.read_csv(os.path.join(base_path, \"raw\", \"ids.csv.gz\"), compression='gzip')\n",
+    "labels_df = pd.read_csv(os.path.join(base_path, \"raw\", \"labels.csv.gz\"), compression='gzip')\n",
+    "\n",
+    "# Load split data\n",
+    "train_id_df = pd.read_csv(os.path.join(base_path, \"split\", \"train.csv.gz\"), compression='gzip')\n",
+    "val_id_df = pd.read_csv(os.path.join(base_path, \"split\", \"valid.csv.gz\"), compression='gzip')\n",
+    "test_id_df = pd.read_csv(os.path.join(base_path, \"split\", \"test.csv.gz\"), compression='gzip')\n",
+    "\n",
+    "num_nodes = len(ids_df)\n",
+    "titles = titles_df['titles'].tolist()\n",
+    "abstracts = abstracts_df['abstracts'].tolist()\n",
+    "ids = ids_df['ids'].tolist()\n",
+    "labels = labels_df['labels'].tolist()\n",
+    "train_id = train_id_df['train_id'].tolist()\n",
+    "val_id = val_id_df['val_id'].tolist()\n",
+    "test_id = test_id_df['test_id'].tolist()\n",
+    "\n",
+    "# Build boolean split masks; set lookups avoid rescanning a whole list for every node\n",
+    "train_ids, val_ids, test_ids = set(train_id), set(val_id), set(test_id)\n",
+    "train_mask = torch.tensor([x in train_ids for x in range(num_nodes)])\n",
+    "val_mask = torch.tensor([x in val_ids for x in range(num_nodes)])\n",
+    "test_mask = torch.tensor([x in test_ids for x in range(num_nodes)])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from gli.io import save_graph, Attribute\n",
+    "\n",
+    "# One (name, values, description) entry per node attribute;\n",
+    "# paper ids are converted to strings, the other lists are used as loaded.\n",
+    "node_attr_specs = [\n",
+    "  (\"Titles\", titles, \"Title of each node\"),\n",
+    "  (\"Abstracts\", abstracts, \"Abstract of each article(node)\"),\n",
+    "  (\"Ids\", [str(paper_id) for paper_id in ids], \"Id of each article(node)\"),\n",
+    "  (\"Labels\", labels, \"Label\"),\n",
+    "]\n",
+    "\n",
+    "# Every attribute is declared with type \"str\" and format \"Tensor\".\n",
+    "node_attrs = [\n",
+    "  Attribute(attr_name, numpy.array(attr_values), attr_desc, \"str\", \"Tensor\")\n",
+    "  for attr_name, attr_values, attr_desc in node_attr_specs\n",
+    "]\n",
+    "\n",
+    "# BibTeX entry passed to save_graph as the dataset citation.\n",
+    "bibtex = (\n",
+    "  \"@misc{huang2023llms,\\n\"\n",
+    "  \"title={Can LLMs Effectively Leverage Graph Structural Information: When and Why},\\n\"\n",
+    "  \"author={Jin Huang and Xingjian Zhang and Qiaozhu Mei and Jiaqi Ma},\\n\"\n",
+    "  \"year={2023},\\n\"\n",
+    "  \"eprint={2309.16595},\\n\"\n",
+    "  \"archivePrefix={arXiv},\\n\"\n",
+    "  \"primaryClass={cs.LG}\\n\"\n",
+    "  \"}\"\n",
+    ")\n",
+    "\n",
+    "# The loaded edge_index is converted to a numpy array and transposed for save_graph.\n",
+    "edges = numpy.array(edge_index).T\n",
+    "\n",
+    "# Save the graph through GLI; the value it returns is kept in `metadata`.\n",
+    "metadata = save_graph(\n",
+    "  name=\"arxiv-2023\",\n",
+    "  edge=edges,\n",
+    "  num_nodes=num_nodes,\n",
+    "  node_attrs=node_attrs,\n",
+    "  description=\"ARXIV-2023 dataset.\",\n",
+    "  cite=bibtex,\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([    4,     6,     9, ..., 33865, 33866, 33867])"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from gli.io import save_task_node_classification\n",
+    "# Each split is the array of node indices at which its boolean mask is True.\n",
+    "task_data = save_task_node_classification(\n",
+    "  name=\"arxiv-2023\",\n",
+    "  description=\"Node classification on arxiv-2023 dataset.\",\n",
+    "  feature=[\"Node/Titles\",\"Node/Abstracts\"],\n",
+    "  target=\"Node/Labels\",\n",
+    "  num_classes=40,\n",
+    "  train_set=train_mask.nonzero().squeeze().numpy(),\n",
+    "  val_set=val_mask.nonzero().squeeze().numpy(),\n",
+    "  test_set=test_mask.nonzero().squeeze().numpy(),\n",
+    "  task_id=\"1\"\n",
+    ")\n",
+    "train_mask.nonzero().squeeze().numpy()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/datasets/arxiv-2023/metadata.json b/datasets/arxiv-2023/metadata.json
@@ -0,0 +1,48 @@
+{
+    "description": "ARXIV-2023 dataset.",
+    "data": {
+        "Node": {
+            "Titles": {
+                "description": "Title of each node",
+                "type": "str",
+                "format": "Tensor",
+                "file": "arxiv-2023__graph__4c1b3c1c22882e1d9660a6e5cafcd1a4.npz",
+                "key": "Node_Titles"
+            },
+            "Abstracts": {
+                "description": "Abstract of each article(node)",
+                "type": "str",
+                "format": "Tensor",
+                "file": "arxiv-2023__graph__4c1b3c1c22882e1d9660a6e5cafcd1a4.npz",
+                "key": "Node_Abstracts"
+            },
+            "Ids": {
+                "description": "Id of each article(node)",
+                "type": "str",
+                "format": "Tensor",
+                "file": "arxiv-2023__graph__4c1b3c1c22882e1d9660a6e5cafcd1a4.npz",
+                "key": "Node_Ids"
+            },
+            "Labels": {
+                "description": "Label",
+                "type": "str",
+                "format": "Tensor",
+                "file": "arxiv-2023__graph__4c1b3c1c22882e1d9660a6e5cafcd1a4.npz",
+                "key": "Node_Labels"
+            }
+        },
+        "Edge": {
+            "_Edge": {
+                "file": "arxiv-2023__graph__4c1b3c1c22882e1d9660a6e5cafcd1a4.npz",
+                "key": "Edge_Edge"
+            }
+        },
+        "Graph": {
+            "_NodeList": {
+                "file": "arxiv-2023__graph__Graph_NodeList__a133ca6cee0eff3cc4ae10d024cc0c02.sparse.npz"
+            }
+        }
+    },
+    "citation": "@misc{huang2023llms,\ntitle={Can LLMs Effectively Leverage Graph Structural Information: When and Why},\nauthor={Jin Huang and Xingjian Zhang and Qiaozhu Mei and Jiaqi Ma},\nyear={2023},\neprint={2309.16595},\narchivePrefix={arXiv},\nprimaryClass={cs.LG}\n}",
+    "is_heterogeneous": false
+}
diff --git a/datasets/arxiv-2023/task_node_classification_1.json b/datasets/arxiv-2023/task_node_classification_1.json
@@ -0,0 +1,22 @@
+{
+    "description": "Node classification on arxiv-2023 dataset.",
+    "type": "NodeClassification",
+    "feature": [
+        "Node/Titles",
+        "Node/Abstracts"
+    ],
+    "target": "Node/Labels",
+    "num_classes": 40,
+    "train_set": {
+        "file": "arxiv-2023__task_node_classification_1__707e9444940a9744a72ae8a990fe9136.npz",
+        "key": "train_set"
+    },
+    "val_set": {
+        "file": "arxiv-2023__task_node_classification_1__707e9444940a9744a72ae8a990fe9136.npz",
+        "key": "val_set"
+    },
+    "test_set": {
+        "file": "arxiv-2023__task_node_classification_1__707e9444940a9744a72ae8a990fe9136.npz",
+        "key": "test_set"
+    }
+}