Graph-Learning-Benchmarks · Jn-Huang · Mar 4, 2024 · Mar 4, 2024 · Mar 4, 2024 · Mar 4, 2024
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,9 @@
 # MacOS .DS_Store files
 .DS_Store
 
+# Raw dataset folders
+cora_raw/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/datasets/cora/cora.ipynb b/datasets/cora/cora.ipynb
@@ -7,6 +7,16 @@
     "# Cora Example"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -16,7 +26,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -54,7 +64,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -75,9 +85,53 @@
     "print(edge.shape)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load raw text for the Cora dataset"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/jinhuang/opt/miniconda3/envs/arxiv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "dict_keys(['title', 'abs', 'label'])\n",
+      "title ['Title: The megaprior heuristic for discovering protein sequence patterns  ']\n",
+      "abs ['Abstract: Several computer algorithms for discovering patterns in groups of protein sequences are in use that are based on fitting the parameters of a statistical model to a group of related sequences. These include hidden Markov model (HMM) algorithms for multiple sequence alignment, and the MEME and Gibbs sampler algorithms for discovering motifs. These algorithms are sometimes prone to producing models that are incorrect because two or more patterns have been combined. The statistical model produced in this situation is a convex combination (weighted average) of two or more different models. This paper presents a solution to the problem of convex combinations in the form of a heuristic based on using extremely low variance Dirichlet mixture priors as part of the statistical model. This heuristic, which we call the megaprior heuristic, increases the strength (i.e., decreases the variance) of the prior in proportion to the size of the sequence dataset. This causes each column in the final model to strongly resemble the mean of a single component of the prior, regardless of the size of the dataset. We describe the cause of the convex combination problem, analyze it mathematically, motivate and describe the implementation of the megaprior heuristic, and show how it can effectively eliminate the problem of convex combinations in protein sequence pattern discovery. ']\n",
+      "label ['Neural Networks']\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sys\n",
+    "sys.path.append(\"../../\")\n",
+    "from gli.raw_text_utils import load_data\n",
+    "\n",
+    "_, raw_text_dict = load_data(dataset=\"cora\", use_text=True)\n",
+    "\n",
+    "print(raw_text_dict.keys())\n",
+    "\n",
+    "for key, item in raw_text_dict.items():\n",
+    "    print(key, item[:1])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -97,9 +151,31 @@
     "        \"Node labels of Cora dataset, int ranged from 1 to 7.\",\n",
     "        \"int\",\n",
     "        \"Tensor\",\n",
+    "    ),\n",
+    "    Attribute(\n",
+    "        \"NodeRawTextTitle\",\n",
+    "        raw_text_dict[\"title\"],\n",
+    "        \"Raw text of title of each node in Cora dataset, list of strings.\",\n",
+    "        \"str\",\n",
+    "        \"List[str]\"\n",
+    "    ),\n",
+    "    Attribute(\n",
+    "        \"NodeRawTextAbstract\",\n",
+    "        raw_text_dict[\"abs\"],\n",
+    "        \"Raw text of abstract of each node in Cora dataset, list of strings.\",\n",
+    "        \"str\",\n",
+    "        \"List[str]\"\n",
+    "    ),\n",
+    "    Attribute(\n",
+    "        \"NodeRawTextLabel\",\n",
+    "        raw_text_dict[\"label\"],\n",
+    "        \"Raw text of label of each node in Cora dataset, list of strings.\",\n",
+    "        \"str\",\n",
+    "        \"List[str]\"\n",
     "    )\n",
     "]\n",
     "\n",
+    "\n",
     "metadata = save_graph(\n",
     "    name=\"cora\",\n",
     "    edge=edge,\n",
@@ -120,7 +196,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -143,6 +219,27 @@
       "        \"format\": \"Tensor\",\n",
       "        \"file\": \"cora__graph__6c912909fa18eff10797210ea5e485fe.npz\",\n",
       "        \"key\": \"Node_NodeLabel\"\n",
+      "      },\n",
+      "      \"NodeRawTextTitle\": {\n",
+      "        \"description\": \"Raw text of title of each node in Cora dataset, list of strings.\",\n",
+      "        \"type\": \"str\",\n",
+      "        \"format\": \"List[str]\",\n",
+      "        \"optional file\": \"cora__graph__Node_NodeRawTextTitle__4a9ad6575f5acfe3b828fe66f072bd5c.optional.npz\",\n",
+      "        \"key\": \"Node_NodeRawTextTitle\"\n",
+      "      },\n",
+      "      \"NodeRawTextAbstract\": {\n",
+      "        \"description\": \"Raw text of abstract of each node in Cora dataset, list of strings.\",\n",
+      "        \"type\": \"str\",\n",
+      "        \"format\": \"List[str]\",\n",
+      "        \"optional file\": \"cora__graph__Node_NodeRawTextAbstract__d0e5436087314624c74a9f040d6f394f.optional.npz\",\n",
+      "        \"key\": \"Node_NodeRawTextAbstract\"\n",
+      "      },\n",
+      "      \"NodeRawTextLabel\": {\n",
+      "        \"description\": \"Raw text of label of each node in Cora dataset, list of strings.\",\n",
+      "        \"type\": \"str\",\n",
+      "        \"format\": \"List[str]\",\n",
+      "        \"optional file\": \"cora__graph__Node_NodeRawTextLabel__06d184316789acc0902db2b8c1472f95.optional.npz\",\n",
+      "        \"key\": \"Node_NodeRawTextLabel\"\n",
       "      }\n",
       "    },\n",
       "    \"Edge\": {\n",
@@ -177,7 +274,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -188,7 +285,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -216,7 +313,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -260,7 +357,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -275,7 +372,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/Users/jimmy/Projects/Private/gli/gli/utils.py:254: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/SparseCsrTensorImpl.cpp:56.)\n",
+      "/Users/jinhuang/Documents/research/gli/datasets/cora/../../gli/utils.py:262: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /Users/runner/work/_temp/anaconda/conda-bld/pytorch_1682343673238/work/aten/src/ATen/SparseCsrTensorImpl.cpp:56.)\n",
       "  return torch.sparse_csr_tensor(crow_indices,\n"
      ]
     },
@@ -287,7 +384,7 @@
        "      edata_schemes={})"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -305,33 +402,79 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "After adding LICENSE and README.md, the dataset directory will be the following."
+    "Load the dataset together with its raw text."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "\u001b[1;36m.\u001b[00m\n",
-      "├── LICENSE\n",
-      "├── README.md\n",
-      "├── cora.ipynb\n",
-      "├── cora__graph__6c912909fa18eff10797210ea5e485fe.npz\n",
-      "├── cora__graph__Graph_NodeList__23bbef862fd6037395412eb03b4e1d9c.sparse.npz\n",
-      "├── cora__graph__Node_NodeFeature__7032c9c380d1889061dcbbcd76b8c427.sparse.npz\n",
-      "├── cora__task_node_classification_1__41e167258678b585872679839ce9c40f.npz\n",
-      "├── metadata.json\n",
-      "└── task_node_classification_1.json\n",
-      "\n",
-      "0 directories, 9 files\n"
+      "All data files already exist. Skip downloading.\n",
+      "CORA dataset.\n",
+      "All data files already exist. Skip downloading.\n",
+      "Node classification on CORA dataset. Planetoid split.\n",
+      "Graph(num_nodes=2708, num_edges=10556,\n",
+      "      ndata_schemes={'NodeFeature': Scheme(shape=(1433,), dtype=torch.float32), 'NodeLabel': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}\n",
+      "      edata_schemes={})\n"
      ]
     }
    ],
+   "source": [
+    "from gli.dataloading import get_gli_dataset\n",
+    "\n",
+    "dataset = get_gli_dataset(\"cora\", \"NodeClassification\", load_raw_text=True, verbose=True)\n",
+    "\n",
+    "data = dataset[0]\n",
+    "\n",
+    "print(data)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The raw text is available through the following attributes:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "('Title: The megaprior heuristic for discovering protein sequence patterns  ',\n",
+       " 'Abstract: Several computer algorithms for discovering patterns in groups of protein sequences are in use that are based on fitting the parameters of a statistical model to a group of related sequences. These include hidden Markov model (HMM) algorithms for multiple sequence alignment, and the MEME and Gibbs sampler algorithms for discovering motifs. These algorithms are sometimes prone to producing models that are incorrect because two or more patterns have been combined. The statistical model produced in this situation is a convex combination (weighted average) of two or more different models. This paper presents a solution to the problem of convex combinations in the form of a heuristic based on using extremely low variance Dirichlet mixture priors as part of the statistical model. This heuristic, which we call the megaprior heuristic, increases the strength (i.e., decreases the variance) of the prior in proportion to the size of the sequence dataset. This causes each column in the final model to strongly resemble the mean of a single component of the prior, regardless of the size of the dataset. We describe the cause of the convex combination problem, analyze it mathematically, motivate and describe the implementation of the megaprior heuristic, and show how it can effectively eliminate the problem of convex combinations in protein sequence pattern discovery. ',\n",
+       " 'Neural Networks')"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data.NodeRawTextTitle[0], data.NodeRawTextAbstract[0], data.NodeRawTextLabel[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "After adding LICENSE and README.md, the dataset directory will be the following."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "!tree ."
    ]
@@ -353,7 +496,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.13"
+   "version": "3.9.17"
   },
   "orig_nbformat": 4,
   "vscode": {

diff --git a/datasets/cora/metadata.json b/datasets/cora/metadata.json
@@ -14,6 +14,27 @@
                 "format": "Tensor",
                 "file": "cora__graph__6c912909fa18eff10797210ea5e485fe.npz",
                 "key": "Node_NodeLabel"
+            },
+            "NodeRawTextTitle": {
+                "description": "Raw text of title of each node in Cora dataset, list of strings.",
+                "type": "str",
+                "format": "List[str]",
+                "optional file": "cora__graph__Node_NodeRawTextTitle__4a9ad6575f5acfe3b828fe66f072bd5c.optional.npz",
+                "key": "Node_NodeRawTextTitle"
+            },
+            "NodeRawTextAbstract": {
+                "description": "Raw text of abstract of each node in Cora dataset, list of strings.",
+                "type": "str",
+                "format": "List[str]",
+                "optional file": "cora__graph__Node_NodeRawTextAbstract__d0e5436087314624c74a9f040d6f394f.optional.npz",
+                "key": "Node_NodeRawTextAbstract"
+            },
+            "NodeRawTextLabel": {
+                "description": "Raw text of label of each node in Cora dataset, list of strings.",
+                "type": "str",
+                "format": "List[str]",
+                "optional file": "cora__graph__Node_NodeRawTextLabel__06d184316789acc0902db2b8c1472f95.optional.npz",
+                "key": "Node_NodeRawTextLabel"
             }
         },
         "Edge": {