Commits (30)
31c5a64  feat: add config and operator node types (ChenZiHong-Gavin, Dec 3, 2025)
8bcbe51  refactor: refactor readers with ray data (ChenZiHong-Gavin, Dec 3, 2025)
246348f  fix: delete param parallelism for readers (ChenZiHong-Gavin, Dec 3, 2025)
319e1e7  fix: fix import error (ChenZiHong-Gavin, Dec 3, 2025)
42fcb09  refactor read and chunk operators with no side effects (ChenZiHong-Gavin, Dec 4, 2025)
b458e48  fix: fix import error (ChenZiHong-Gavin, Dec 4, 2025)
95c4783  fix: fix return logic (ChenZiHong-Gavin, Dec 4, 2025)
c844d65  refactor: rename operator split to chunk (ChenZiHong-Gavin, Dec 4, 2025)
c447936  refactor: refactor build_kg to accomodate ray data (ChenZiHong-Gavin, Dec 4, 2025)
3edbb81  feat: add StorageFactory & global params (ChenZiHong-Gavin, Dec 4, 2025)
ee0639d  refactor: refactor quiz to accomodata ray data engine (ChenZiHong-Gavin, Dec 5, 2025)
157f0b0  fix: reload graph before quizzing (ChenZiHong-Gavin, Dec 5, 2025)
99a6e5f  Merge branch 'main' of https://github.com/open-sciencelab/GraphGen in… (ChenZiHong-Gavin, Dec 5, 2025)
ec2033b  Potential fix for pull request finding 'Unreachable code' (ChenZiHong-Gavin, Dec 5, 2025)
bc07222  fix: fix quiz params (ChenZiHong-Gavin, Dec 5, 2025)
c9435d7  refactor: refactor quiz&judge to ray actors (ChenZiHong-Gavin, Dec 10, 2025)
c55fc09  Merge branch 'refactor/refactor-with-ray-data' of https://github.com/… (ChenZiHong-Gavin, Dec 10, 2025)
d7d6c2a  fix: fix transferring quizzed data to JudgeService (ChenZiHong-Gavin, Dec 10, 2025)
a6aedaf  refactor: refactor partition to accomodate ray data (ChenZiHong-Gavin, Dec 10, 2025)
ea1603b  fix: fix lint problem (ChenZiHong-Gavin, Dec 10, 2025)
244deb4  refactor: refactor op generate (ChenZiHong-Gavin, Dec 11, 2025)
d460a2a  feat: write results in output folder (ChenZiHong-Gavin, Dec 11, 2025)
cd011ad  fix: raise error when no dataset is created (ChenZiHong-Gavin, Dec 11, 2025)
aab7438  fix: return generator in ece_partitioner (ChenZiHong-Gavin, Dec 11, 2025)
7643b9f  fix: return generator in ece_partitioner (ChenZiHong-Gavin, Dec 11, 2025)
c42b604  refactor: refactor data format to support multi-modal input (ChenZiHong-Gavin, Dec 11, 2025)
42dc73e  fix: delete fetching schema to avoid ray's duplicate execution (ChenZiHong-Gavin, Dec 11, 2025)
73f70a5  fix: fix operators' registry (ChenZiHong-Gavin, Dec 11, 2025)
37cbfcf  feat: refactor schema_guided_extraction & add examples (ChenZiHong-Gavin, Dec 11, 2025)
b400d2e  feat: seperate ray logs and service logs (ChenZiHong-Gavin, Dec 12, 2025)
graphgen/bases/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -13,4 +13,4 @@
     StorageNameSpace,
 )
 from .base_tokenizer import BaseTokenizer
-from .datatypes import Chunk, QAPair, Token
+from .datatypes import Chunk, Config, Node, QAPair, Token
graphgen/bases/base_partitioner.py (49 changes: 22 additions & 27 deletions)
@@ -7,7 +7,7 @@

 class BasePartitioner(ABC):
     @abstractmethod
-    async def partition(
+    def partition(
         self,
         g: BaseGraphStorage,
         **kwargs: Any,
@@ -20,39 +20,34 @@ async def partition(
         """

     @staticmethod
-    async def community2batch(
-        communities: List[Community], g: BaseGraphStorage
-    ) -> list[
-        tuple[
-            list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]]
-        ]
+    def community2batch(
+        comm: Community, g: BaseGraphStorage
+    ) -> tuple[
+        list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]]
     ]:
         """
         Convert communities to batches of nodes and edges.
-        :param communities
+        :param comm: Community
         :param g: Graph storage instance
         :return: List of batches, each batch is a tuple of (nodes, edges)
         """
-        batches = []
-        for comm in communities:
-            nodes = comm.nodes
-            edges = comm.edges
-            nodes_data = []
-            for node in nodes:
-                node_data = g.get_node(node)
-                if node_data:
-                    nodes_data.append((node, node_data))
-            edges_data = []
-            for u, v in edges:
-                edge_data = g.get_edge(u, v)
-                if edge_data:
-                    edges_data.append((u, v, edge_data))
-                else:
-                    edge_data = g.get_edge(v, u)
-                    if edge_data:
-                        edges_data.append((v, u, edge_data))
-            batches.append((nodes_data, edges_data))
-        return batches
+        nodes = comm.nodes
+        edges = comm.edges
+        nodes_data = []
+        for node in nodes:
+            node_data = g.get_node(node)
+            if node_data:
+                nodes_data.append((node, node_data))
+        edges_data = []
+        for u, v in edges:
+            edge_data = g.get_edge(u, v)
+            if edge_data:
+                edges_data.append((u, v, edge_data))
+            else:
+                edge_data = g.get_edge(v, u)
+                if edge_data:
+                    edges_data.append((v, u, edge_data))
+        return nodes_data, edges_data

     @staticmethod
     def _build_adjacency_list(
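With community2batch now taking a single Community and returning one (nodes, edges) pair, batches can be produced lazily, one community at a time. A minimal usage sketch under assumed placeholder names (partitioner, communities, graph are not names from this PR):

# Hypothetical sketch: stream one batch per community instead of materializing
# the whole list, which fits a Ray-Data-style pipeline.
def iter_batches(partitioner, communities, graph):
    for comm in communities:
        # community2batch is a @staticmethod, so calling it via an instance also works
        yield partitioner.community2batch(comm, graph)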
graphgen/bases/base_reader.py (95 changes: 55 additions & 40 deletions)
@@ -1,8 +1,10 @@
 import os
 from abc import ABC, abstractmethod
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Union

+import pandas as pd
 import requests
+from ray.data import Dataset


 class BaseReader(ABC):
@@ -14,52 +16,65 @@ def __init__(self, text_column: str = "content"):
         self.text_column = text_column

     @abstractmethod
-    def read(self, file_path: str) -> List[Dict[str, Any]]:
+    def read(self, input_path: Union[str, List[str]]) -> Dataset:
         """
         Read data from the specified file path.

-        :param file_path: Path to the input file.
-        :return: List of dictionaries containing the data.
+        :param input_path: Path to the input file or list of file paths.
+        :return: Ray Dataset containing the read data.
         """

-    @staticmethod
-    def filter(data: List[dict]) -> List[dict]:
+    def _should_keep_item(self, item: Dict[str, Any]) -> bool:
         """
-        Filter out entries with empty or missing text in the specified column.
+        Determine whether to keep the given item based on the text column.

-        :param data: List of dictionaries containing the data.
-        :return: Filtered list of dictionaries.
+        :param item: Dictionary representing a data entry.
+        :return: True if the item should be kept, False otherwise.
         """
+        item_type = item.get("type")
+        assert item_type in [
+            "text",
+            "image",
+            "table",
+            "equation",
+            "protein",
+        ], f"Unsupported item type: {item_type}"
+        if item_type == "text":
+            content = item.get(self.text_column, "").strip()
+            return bool(content)
+        return True

-        def _image_exists(path_or_url: str, timeout: int = 3) -> bool:
-            """
-            Check if an image exists at the given local path or URL.
-            :param path_or_url: Local file path or remote URL of the image.
-            :param timeout: Timeout for remote URL requests in seconds.
-            :return: True if the image exists, False otherwise.
-            """
-            if not path_or_url:
-                return False
-            if not path_or_url.startswith(("http://", "https://", "ftp://")):
-                path = path_or_url.replace("file://", "", 1)
-                path = os.path.abspath(path)
-                return os.path.isfile(path)
-            try:
-                resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout)
-                return resp.status_code == 200
-            except requests.RequestException:
-                return False
+    def _validate_batch(self, batch: pd.DataFrame) -> pd.DataFrame:
+        """
+        Validate data format.
+        """
+        if "type" not in batch.columns:
+            raise ValueError(f"Missing 'type' column. Found: {list(batch.columns)}")

-        filtered_data = []
-        for item in data:
-            if item.get("type") == "text":
-                content = item.get("content", "").strip()
-                if content:
-                    filtered_data.append(item)
-            elif item.get("type") in ("image", "table", "equation"):
-                img_path = item.get("img_path")
-                if _image_exists(img_path):
-                    filtered_data.append(item)
-            else:
-                filtered_data.append(item)
-        return filtered_data
+        if "text" in batch["type"].values:
+            if self.text_column not in batch.columns:
+                raise ValueError(
+                    f"Missing '{self.text_column}' column for text documents"
+                )
+
+        return batch
+
+    @staticmethod
+    def _image_exists(path_or_url: str, timeout: int = 3) -> bool:
+        """
+        Check if an image exists at the given local path or URL.
+        :param path_or_url: Local file path or remote URL of the image.
+        :param timeout: Timeout for remote URL requests in seconds.
+        :return: True if the image exists, False otherwise.
+        """
+        if not path_or_url:
+            return False
+        if not path_or_url.startswith(("http://", "https://", "ftp://")):
+            path = path_or_url.replace("file://", "", 1)
+            path = os.path.abspath(path)
+            return os.path.isfile(path)
+        try:
+            resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout)
+            return resp.status_code == 200
+        except requests.RequestException:
+            return False
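For context, a minimal reader sketch against the new contract: read() returns a Ray Dataset instead of a list of dicts, and per-item filtering goes through _should_keep_item. The JSONL format, class name, and import path below are illustrative assumptions, not part of this PR:

# Hypothetical JSONL reader; every record is assumed to carry the "type"
# field that _should_keep_item() asserts on.
import json
from typing import List, Union

from ray.data import Dataset, from_items

from graphgen.bases.base_reader import BaseReader


class JsonlReader(BaseReader):
    def read(self, input_path: Union[str, List[str]]) -> Dataset:
        paths = [input_path] if isinstance(input_path, str) else input_path
        items = []
        for path in paths:
            with open(path, encoding="utf-8") as f:
                for line in f:
                    item = json.loads(line)
                    if self._should_keep_item(item):
                        items.append(item)
        # Materialize the kept records as a Ray Dataset
        return from_items(items)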
graphgen/bases/base_splitter.py (6 changes: 3 additions & 3 deletions)
@@ -4,7 +4,7 @@
 from typing import Callable, Iterable, List, Literal, Optional, Union

 from graphgen.bases.datatypes import Chunk
-from graphgen.utils import logger
+from graphgen.utils.log import logger


 class BaseSplitter(ABC):
@@ -33,7 +33,7 @@ def split_text(self, text: str) -> List[str]:
         """
         Split the input text into smaller chunks.

-        :param text: The input text to be split.
+        :param text: The input text to be chunk.
         :return: A list of text chunks.
         """

@@ -111,7 +111,7 @@ def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]:
 def _split_text_with_regex(
     text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]]
 ) -> List[str]:
-    # Now that we have the separator, split the text
+    # Now that we have the separator, chunk the text
     if separator:
         if keep_separator:
             # The parentheses in the pattern keep the delimiters in the result.
graphgen/bases/base_storage.py (17 changes: 0 additions & 17 deletions)
@@ -16,23 +16,6 @@ def query_done_callback(self):
         """commit the storage operations after querying"""


-class BaseListStorage(Generic[T], StorageNameSpace):
-    def all_items(self) -> list[T]:
-        raise NotImplementedError
-
-    def get_by_index(self, index: int) -> Union[T, None]:
-        raise NotImplementedError
-
-    def append(self, data: T):
-        raise NotImplementedError
-
-    def upsert(self, data: list[T]):
-        raise NotImplementedError
-
-    def drop(self):
-        raise NotImplementedError
-
-
 class BaseKVStorage(Generic[T], StorageNameSpace):
     def all_keys(self) -> list[str]:
         raise NotImplementedError
graphgen/bases/datatypes.py (44 changes: 44 additions & 0 deletions)
@@ -2,6 +2,8 @@
 from dataclasses import dataclass, field
 from typing import List, Union

+from pydantic import BaseModel, Field, field_validator
+

 @dataclass
 class Chunk:
@@ -48,3 +50,45 @@ class Community:
     nodes: List[str] = field(default_factory=list)
     edges: List[tuple] = field(default_factory=list)
     metadata: dict = field(default_factory=dict)
+
+
+class Node(BaseModel):
+    id: str = Field(..., description="unique node id")
+    op_name: str = Field(..., description="operator name")
+    type: str = Field(
+        ..., description="task type, e.g., map, filter, flatmap, aggregate, map_batch"
+    )
+    params: dict = Field(default_factory=dict, description="operator parameters")
+    dependencies: List[str] = Field(
+        default_factory=list, description="list of dependent node ids"
+    )
+    execution_params: dict = Field(
+        default_factory=dict, description="execution parameters like replicas, batch_size"
+    )
+
+    @classmethod
+    @field_validator("type")
+    def validate_type(cls, v: str) -> str:
+        valid_types = {"map", "filter", "flatmap", "aggregate", "map_batch"}
+        if v not in valid_types:
+            raise ValueError(f"Invalid node type: {v}. Must be one of {valid_types}.")
+        return v
+
+
+class Config(BaseModel):
+    global_params: dict = Field(
+        default_factory=dict, description="global context for the computation graph"
+    )
+
+    nodes: List[Node] = Field(
+        ..., min_length=1, description="list of nodes in the computation graph"
+    )
+
+    @classmethod
+    @field_validator("nodes")
+    def validate_unique_ids(cls, v: List[Node]) -> List[Node]:
+        ids = [node.id for node in v]
+        if len(ids) != len(set(ids)):
+            duplicates = {id_ for id_ in ids if ids.count(id_) > 1}
+            raise ValueError(f"Duplicate node ids found: {duplicates}")
+        return v
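A short sketch of how these models might be instantiated; the node ids, operator names, and params below are placeholders, not values defined in this PR:

# Hypothetical two-node pipeline config; the field validators above are meant
# to reject unknown type values and duplicate node ids.
from graphgen.bases import Config, Node

config = Config(
    global_params={"working_dir": "cache"},
    nodes=[
        Node(id="read", op_name="read", type="map", params={"input_path": "data.jsonl"}),
        Node(id="chunk", op_name="chunk", type="map_batch", dependencies=["read"]),
    ],
)
print([n.id for n in config.nodes])  # ['read', 'chunk']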
graphgen/common/__init__.py (2 changes: 2 additions & 0 deletions)
@@ -0,0 +1,2 @@
+from .init_llm import init_llm
+from .init_storage import init_storage
@@ -29,6 +29,7 @@ def create_llm_wrapper(backend: str, config: Dict[str, Any]) -> BaseLLMWrapper:
             return HTTPClient(**config)
         if backend in ("openai_api", "azure_openai_api"):
             from graphgen.models.llm.api.openai_client import OpenAIClient
+
             # pass in concrete backend to the OpenAIClient so that internally we can distinguish
             # between OpenAI and Azure OpenAI
             return OpenAIClient(**config, backend=backend)
@@ -79,3 +80,6 @@ def init_llm(model_type: str) -> Optional[BaseLLMWrapper]:
     backend = config.pop("backend")
     llm_wrapper = LLMFactory.create_llm_wrapper(backend, config)
     return llm_wrapper
+
+
+# TODO: use ray serve when loading large models to avoid re-loading in each actor
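Usage sketch; the model_type value below is a placeholder and may differ from the keys this project actually configures:

# Hypothetical call site; init_llm() resolves the backend from config and
# returns a BaseLLMWrapper (or None).
from graphgen.common import init_llm

synthesizer_llm = init_llm("synthesizer")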
graphgen/common/init_storage.py (28 changes: 28 additions & 0 deletions)
@@ -0,0 +1,28 @@
+from graphgen.models import JsonKVStorage, NetworkXStorage
+
+
+class StorageFactory:
+    """
+    Factory class to create storage instances based on backend.
+    Supported backends:
+    kv_storage(key-value storage):
+        - json_kv: JsonKVStorage
+    graph_storage:
+        - networkx: NetworkXStorage (graph storage)
+    """
+
+    @staticmethod
+    def create_storage(backend: str, working_dir: str, namespace: str):
+        if backend == "json_kv":
+            return JsonKVStorage(working_dir, namespace=namespace)
+
+        if backend == "networkx":
+            return NetworkXStorage(working_dir, namespace=namespace)
+
+        raise NotImplementedError(
+            f"Storage backend '{backend}' is not implemented yet."
+        )
+
+
+def init_storage(backend: str, working_dir: str, namespace: str):
+    return StorageFactory.create_storage(backend, working_dir, namespace)
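Usage sketch for the factory; the working directory and namespace values are placeholders:

# Hypothetical call sites; the two backends shown are the ones the factory supports.
from graphgen.common import init_storage

kv_store = init_storage("json_kv", working_dir="cache", namespace="chunks")
graph_store = init_storage("networkx", working_dir="cache", namespace="graph")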