Merged

40 commits (all by ChenZiHong-Gavin):
- `31c5a64` feat: add config and operator node types (Dec 3, 2025)
- `8bcbe51` refactor: refactor readers with ray data (Dec 3, 2025)
- `246348f` fix: delete param parallelism for readers (Dec 3, 2025)
- `319e1e7` fix: fix import error (Dec 3, 2025)
- `42fcb09` refactor read and chunk operators with no side effects (Dec 4, 2025)
- `b458e48` fix: fix import error (Dec 4, 2025)
- `95c4783` fix: fix return logic (Dec 4, 2025)
- `c844d65` refactor: rename operator split to chunk (Dec 4, 2025)
- `c447936` refactor: refactor build_kg to accomodate ray data (Dec 4, 2025)
- `3edbb81` feat: add StorageFactory & global params (Dec 4, 2025)
- `ee0639d` refactor: refactor quiz to accomodata ray data engine (Dec 5, 2025)
- `157f0b0` fix: reload graph before quizzing (Dec 5, 2025)
- `99a6e5f` Merge branch 'main' of https://github.com/open-sciencelab/GraphGen in… (Dec 5, 2025)
- `ec2033b` Potential fix for pull request finding 'Unreachable code' (Dec 5, 2025)
- `bc07222` fix: fix quiz params (Dec 5, 2025)
- `c9435d7` refactor: refactor quiz&judge to ray actors (Dec 10, 2025)
- `c55fc09` Merge branch 'refactor/refactor-with-ray-data' of https://github.com/… (Dec 10, 2025)
- `d7d6c2a` fix: fix transferring quizzed data to JudgeService (Dec 10, 2025)
- `a6aedaf` refactor: refactor partition to accomodate ray data (Dec 10, 2025)
- `ea1603b` fix: fix lint problem (Dec 10, 2025)
- `244deb4` refactor: refactor op generate (Dec 11, 2025)
- `d460a2a` feat: write results in output folder (Dec 11, 2025)
- `cd011ad` fix: raise error when no dataset is created (Dec 11, 2025)
- `aab7438` fix: return generator in ece_partitioner (Dec 11, 2025)
- `7643b9f` fix: return generator in ece_partitioner (Dec 11, 2025)
- `c42b604` refactor: refactor data format to support multi-modal input (Dec 11, 2025)
- `42dc73e` fix: delete fetching schema to avoid ray's duplicate execution (Dec 11, 2025)
- `73f70a5` fix: fix operators' registry (Dec 11, 2025)
- `37cbfcf` feat: refactor schema_guided_extraction & add examples (Dec 11, 2025)
- `b400d2e` feat: seperate ray logs and service logs (Dec 12, 2025)
- `0790ba4` feat: use storage actor (Dec 15, 2025)
- `68e5191` feat: add kuzu graph database (Dec 15, 2025)
- `0fbfcf2` feat: add llm as actors (Dec 15, 2025)
- `c7e32b0` refactor: delete old runner (Dec 15, 2025)
- `18a67be` fix: fix vllm wrapper (Dec 15, 2025)
- `b7d692a` docs: update .env.example (Dec 15, 2025)
- `52519e7` fix: use kuzudb in quiz_service (Dec 16, 2025)
- `ee6a927` fix: update webui (Dec 16, 2025)
- `86760e9` feat: make storage backend configuragble (Dec 16, 2025)
- `9b700f5` docs: update README (Dec 16, 2025)
20 changes: 20 additions & 0 deletions .env.example
@@ -35,3 +35,23 @@ TRAINEE_API_KEY=
#
# TRAINEE_BACKEND=huggingface
# TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct

# # sglang
# SYNTHESIZER_BACKEND=sglang
# SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct
# SYNTHESIZER_TP_SIZE=1
# SYNTHESIZER_NUM_GPUS=1

# TRAINEE_BACKEND=sglang
# TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct
# TRAINEE_TP_SIZE=1
# TRAINEE_NUM_GPUS=1

# # vllm
# SYNTHESIZER_BACKEND=vllm
# SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct
# SYNTHESIZER_NUM_GPUS=1

# TRAINEE_BACKEND=vllm
# TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct
# TRAINEE_NUM_GPUS=1
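
As a rough illustration of how these role-prefixed variables might be consumed at startup — `load_llm_config` is a hypothetical helper sketched here, not GraphGen's actual API:

```python
import os

def load_llm_config(role: str) -> dict:
    """Hypothetical helper: gather the SYNTHESIZER_*/TRAINEE_* variables
    above into one plain config dict per role."""
    prefix = role.upper()  # "SYNTHESIZER" or "TRAINEE"
    return {
        "backend": os.getenv(f"{prefix}_BACKEND", "openai_api"),
        "model": os.getenv(f"{prefix}_MODEL", ""),
        "base_url": os.getenv(f"{prefix}_BASE_URL", ""),
        "api_key": os.getenv(f"{prefix}_API_KEY", ""),
        # GPU knobs only matter for local backends such as sglang/vllm
        "tp_size": int(os.getenv(f"{prefix}_TP_SIZE", "1")),
        "num_gpus": int(os.getenv(f"{prefix}_NUM_GPUS", "1")),
    }

if __name__ == "__main__":
    for role in ("synthesizer", "trainee"):
        cfg = load_llm_config(role)
        print(role, cfg["backend"], cfg["model"] or "<unset>")
```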
105 changes: 84 additions & 21 deletions README.md
@@ -193,42 +193,105 @@ For any questions, please check [FAQ](https://github.com/open-sciencelab/GraphGe
```
- Set the following environment variables:
```bash
# Synthesizer is the model used to construct KG and generate data
SYNTHESIZER_MODEL=your_synthesizer_model_name
SYNTHESIZER_BASE_URL=your_base_url_for_synthesizer_model
SYNTHESIZER_API_KEY=your_api_key_for_synthesizer_model
# Trainee is the model used to train with the generated data
TRAINEE_MODEL=your_trainee_model_name
TRAINEE_BASE_URL=your_base_url_for_trainee_model
TRAINEE_API_KEY=your_api_key_for_trainee_model
# Tokenizer
TOKENIZER_MODEL=

# LLM
# Supported backends: http_api, openai_api, azure_openai_api, ollama_api, ollama, huggingface, tgi, sglang, vllm, tensorrt
# Synthesizer is the model used to construct KG and generate data
# Trainee is the model used to train with the generated data

# http_api / openai_api
SYNTHESIZER_BACKEND=openai_api
SYNTHESIZER_MODEL=gpt-4o-mini
SYNTHESIZER_BASE_URL=
SYNTHESIZER_API_KEY=
TRAINEE_BACKEND=openai_api
TRAINEE_MODEL=gpt-4o-mini
TRAINEE_BASE_URL=
TRAINEE_API_KEY=

# azure_openai_api
# SYNTHESIZER_BACKEND=azure_openai_api
# The following is the same as your "Deployment name" in Azure
# SYNTHESIZER_MODEL=<your-deployment-name>
# SYNTHESIZER_BASE_URL=https://<your-resource-name>.openai.azure.com/openai/deployments/<your-deployment-name>/chat/completions
# SYNTHESIZER_API_KEY=
# SYNTHESIZER_API_VERSION=<api-version>

# # ollama_api
# SYNTHESIZER_BACKEND=ollama_api
# SYNTHESIZER_MODEL=gemma3
# SYNTHESIZER_BASE_URL=http://localhost:11434
#
# Note: TRAINEE with ollama_api backend is not supported yet as ollama_api does not support logprobs.

# # huggingface
# SYNTHESIZER_BACKEND=huggingface
# SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct
#
# TRAINEE_BACKEND=huggingface
# TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct

# # sglang
# SYNTHESIZER_BACKEND=sglang
# SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct
# SYNTHESIZER_TP_SIZE=1
# SYNTHESIZER_NUM_GPUS=1

# TRAINEE_BACKEND=sglang
# TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct
# TRAINEE_TP_SIZE=1
# TRAINEE_NUM_GPUS=1

# # vllm
# SYNTHESIZER_BACKEND=vllm
# SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct
# SYNTHESIZER_NUM_GPUS=1

# TRAINEE_BACKEND=vllm
# TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct
# TRAINEE_NUM_GPUS=1
```
2. (Optional) Customize generation parameters in `graphgen/configs/` folder.
2. (Optional) Customize generation parameters in `config.yaml` (a minimal config-loading sketch appears after these steps).

Edit the corresponding YAML file, e.g.:

```yaml
# configs/cot_config.yaml
input_file: resources/input_examples/jsonl_demo.jsonl
output_data_type: cot
tokenizer: cl100k_base
# examples/generate/generate_aggregated_qa/aggregated_config.yaml
global_params:
  working_dir: cache
  graph_backend: kuzu # graph database backend; supported: kuzu, networkx
  kv_backend: rocksdb # key-value store backend; supported: rocksdb, json_kv

nodes:
  - id: read_files # ids are unique in the pipeline and can be referenced by other steps
    op_name: read
    type: source
    dependencies: []
    params:
      input_path:
        - examples/input_examples/jsonl_demo.jsonl # input file path; supports json, jsonl, txt, pdf (see examples/input_examples)

# additional settings...
```

3. Generate data

Pick the desired format and run the matching script:

| Format | Script to run | Notes |
|--------------|------------------------------------------------|-------------------------------------------------------------------|
| `cot` | `bash scripts/generate/generate_cot.sh` | Chain-of-Thought Q\&A pairs |
| `atomic` | `bash scripts/generate/generate_atomic.sh` | Atomic Q\&A pairs covering basic knowledge |
| `aggregated` | `bash scripts/generate/generate_aggregated.sh` | Aggregated Q\&A pairs incorporating complex, integrated knowledge |
| `multi-hop` | `bash scripts/generate/generate_multihop.sh` | Multi-hop reasoning Q\&A pairs |

| Format | Script to run | Notes |
| ------------ | ---------------------------------------------------------------------- | -------------------------------------------------------------------------- |
| `cot` | `bash examples/generate/generate_cot_qa/generate_cot.sh` | Chain-of-Thought Q\&A pairs |
| `atomic` | `bash examples/generate/generate_atomic_qa/generate_atomic.sh` | Atomic Q\&A pairs covering basic knowledge |
| `aggregated` | `bash examples/generate/generate_aggregated_qa/generate_aggregated.sh` | Aggregated Q\&A pairs incorporating complex, integrated knowledge |
| `multi-hop` | `bash examples/generate/generate_multi_hop_qa/generate_multi_hop.sh` | Multi-hop reasoning Q\&A pairs |
| `vqa` | `bash examples/generate/generate_vqa/generate_vqa.sh` | Visual Question Answering pairs combining visual and textual understanding |


4. Get the generated data
```bash
ls cache/data/graphgen
ls cache/output
```
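
As referenced in step 2, here is a minimal sketch of how a pipeline config such as `aggregated_config.yaml` could be parsed and scheduled. It assumes PyYAML and Python 3.9+, and `load_pipeline` is illustrative rather than GraphGen's real loader; the point is that the `dependencies` lists make the nodes a DAG, so a topological sort yields a valid execution order:

```python
import yaml  # pip install pyyaml
from graphlib import TopologicalSorter  # stdlib, Python 3.9+

def load_pipeline(path: str):
    """Illustrative only: parse a node-based pipeline config and order the
    nodes so each one runs after everything it depends on."""
    with open(path, encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    global_params = cfg.get("global_params", {})  # e.g. graph_backend, kv_backend
    nodes = {node["id"]: node for node in cfg["nodes"]}
    deps = {nid: set(node.get("dependencies", [])) for nid, node in nodes.items()}
    order = TopologicalSorter(deps).static_order()
    return global_params, [nodes[nid] for nid in order]

if __name__ == "__main__":
    params, ordered = load_pipeline(
        "examples/generate/generate_aggregated_qa/aggregated_config.yaml"
    )
    print("backends:", params.get("graph_backend"), params.get("kv_backend"))
    print("execution order:", [node["id"] for node in ordered])
```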

### Run with Docker
102 changes: 83 additions & 19 deletions README_zh.md
@@ -190,42 +190,106 @@ GraphGen first constructs a fine-grained knowledge graph from the source text, then leverages…
```
- Set the following environment variables:
```bash
# Synthesizer is the model used to construct KG and generate data
SYNTHESIZER_MODEL=your_synthesizer_model_name
SYNTHESIZER_BASE_URL=your_base_url_for_synthesizer_model
SYNTHESIZER_API_KEY=your_api_key_for_synthesizer_model
# Trainee is the model used to train with the generated data
TRAINEE_MODEL=your_trainee_model_name
TRAINEE_BASE_URL=your_base_url_for_trainee_model
TRAINEE_API_KEY=your_api_key_for_trainee_model
# Tokenizer
TOKENIZER_MODEL=

# LLM
# Supported backends: http_api, openai_api, azure_openai_api, ollama_api, ollama, huggingface, tgi, sglang, vllm, tensorrt
# Synthesizer is the model used to construct KG and generate data
# Trainee is the model used to train with the generated data

# http_api / openai_api
SYNTHESIZER_BACKEND=openai_api
SYNTHESIZER_MODEL=gpt-4o-mini
SYNTHESIZER_BASE_URL=
SYNTHESIZER_API_KEY=
TRAINEE_BACKEND=openai_api
TRAINEE_MODEL=gpt-4o-mini
TRAINEE_BASE_URL=
TRAINEE_API_KEY=

# azure_openai_api
# SYNTHESIZER_BACKEND=azure_openai_api
# The following is the same as your "Deployment name" in Azure
# SYNTHESIZER_MODEL=<your-deployment-name>
# SYNTHESIZER_BASE_URL=https://<your-resource-name>.openai.azure.com/openai/deployments/<your-deployment-name>/chat/completions
# SYNTHESIZER_API_KEY=
# SYNTHESIZER_API_VERSION=<api-version>

# # ollama_api
# SYNTHESIZER_BACKEND=ollama_api
# SYNTHESIZER_MODEL=gemma3
# SYNTHESIZER_BASE_URL=http://localhost:11434
#
# Note: TRAINEE with ollama_api backend is not supported yet as ollama_api does not support logprobs.

# # huggingface
# SYNTHESIZER_BACKEND=huggingface
# SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct
#
# TRAINEE_BACKEND=huggingface
# TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct

# # sglang
# SYNTHESIZER_BACKEND=sglang
# SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct
# SYNTHESIZER_TP_SIZE=1
# SYNTHESIZER_NUM_GPUS=1

# TRAINEE_BACKEND=sglang
# TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct
# TRAINEE_TP_SIZE=1
# TRAINEE_NUM_GPUS=1

# # vllm
# SYNTHESIZER_BACKEND=vllm
# SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct
# SYNTHESIZER_NUM_GPUS=1

# TRAINEE_BACKEND=vllm
# TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct
# TRAINEE_NUM_GPUS=1
```
2. (Optional) To modify the default generation configuration, edit the YAML files in the `graphgen/configs/` folder.

For example:

```yaml
# configs/cot_config.yaml
input_file: resources/input_examples/jsonl_demo.jsonl
output_data_type: cot
tokenizer: cl100k_base
# examples/generate/generate_aggregated_qa/aggregated_config.yaml
global_params:
  working_dir: cache
  graph_backend: kuzu # graph database backend; supported: kuzu, networkx
  kv_backend: rocksdb # key-value store backend; supported: rocksdb, json_kv

nodes:
  - id: read_files # ids are unique in the pipeline and can be referenced by other steps
    op_name: read
    type: source
    dependencies: []
    params:
      input_path:
        - examples/input_examples/jsonl_demo.jsonl # input file path; supports json, jsonl, txt, pdf (see examples/input_examples)

# additional settings...
```

3. Generate data

Pick the desired format and run the matching script:

| Format | Script to run | Notes |
|--------------|------------------------------------------------|-------------------------------------------------------------------|
| `cot` | `bash scripts/generate/generate_cot.sh` | Chain-of-Thought Q\&A pairs |
| `atomic` | `bash scripts/generate/generate_atomic.sh` | Atomic Q\&A pairs covering basic knowledge |
| `aggregated` | `bash scripts/generate/generate_aggregated.sh` | Aggregated Q\&A pairs incorporating complex, integrated knowledge |
| `multi-hop` | `bash scripts/generate/generate_multihop.sh` | Multi-hop reasoning Q\&A pairs |
| Format | Script to run | Notes |
| ------------ | ----------------------------------------------------------------------- | --------------------------------------------------------------------------- |
| `cot` | `bash examples/generate/generate_cot_qa/generate_cot.sh` | Chain-of-Thought Q\&A pairs |
| `atomic` | `bash examples/generate/generate_atomic_qa/generate_atomic.sh` | Atomic Q\&A pairs covering basic knowledge |
| `aggregated` | `bash examples/generate/generate_aggregated_qa/generate_aggregated.sh` | Aggregated Q\&A pairs incorporating complex, integrated knowledge |
| `multi-hop` | `bash examples/generate/generate_multi_hop_qa/generate_multi_hop.sh` | Multi-hop reasoning Q\&A pairs |
| `vqa` | `bash examples/generate/generate_vqa/generate_vqa.sh` | Visual Q\&A pairs combining visual and textual understanding |



4. Get the generated data
```bash
ls cache/data/graphgen
ls cache/output
```

### Run with Docker
6 changes: 2 additions & 4 deletions baselines/BDS/bds.py
@@ -8,8 +8,8 @@
from tqdm.asyncio import tqdm as tqdm_async

from graphgen.bases import BaseLLMWrapper
from graphgen.common import init_llm
from graphgen.models import NetworkXStorage
from graphgen.operators import init_llm
from graphgen.utils import create_event_loop

QA_GENERATION_PROMPT = """
@@ -54,9 +54,7 @@ def _post_process(text: str) -> dict:

class BDS:
def __init__(self, llm_client: BaseLLMWrapper = None, max_concurrent: int = 1000):
self.llm_client: BaseLLMWrapper = llm_client or init_llm(
"synthesizer"
)
self.llm_client: BaseLLMWrapper = llm_client or init_llm("synthesizer")
self.max_concurrent: int = max_concurrent

def generate(self, tasks: List[dict]) -> List[dict]:
File renamed without changes.
File renamed without changes.
File renamed without changes.
1 change: 1 addition & 0 deletions examples/extract/extract_schema_guided/README.md
@@ -0,0 +1 @@
# Extract Schema-Guided Information from Documents
@@ -0,0 +1,3 @@
python3 -m graphgen.run \
  --config_file examples/extract/extract_schema_guided/schema_guided_extraction_config.yaml \
  --output_dir cache/
@@ -0,0 +1,35 @@
global_params:
  working_dir: cache
  kv_backend: rocksdb # key-value store backend; supported: rocksdb, json_kv

nodes:
  - id: read
    op_name: read
    type: source
    dependencies: []
    params:
      input_path:
        - examples/input_examples/extract_demo.txt

  - id: chunk
    op_name: chunk
    type: map_batch
    dependencies:
      - read
    execution_params:
      replicas: 4
    params:
      chunk_size: 20480 # larger chunk size for better context
      chunk_overlap: 2000

  - id: extract
    op_name: extract
    type: map_batch
    dependencies:
      - chunk
    execution_params:
      replicas: 1
      batch_size: 128
    params:
      method: schema_guided
      schema_path: graphgen/templates/extraction/schemas/legal_contract.json
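
Since `read` is a `source` node and `chunk`/`extract` are `map_batch` stages, a stripped-down Ray Data equivalent of the first two stages might look like the sketch below. The chunking logic is a stand-in for illustration, not GraphGen's operator; `batch_size` mirrors the config above, while `replicas` would map onto Ray's concurrency settings.

```python
import numpy as np
import ray

def chunk_batch(batch: dict) -> dict:
    """Stand-in chunker: split each row's text into overlapping windows,
    mirroring chunk_size/chunk_overlap in the config above."""
    chunk_size, chunk_overlap = 20480, 2000
    step = chunk_size - chunk_overlap
    chunks = []
    for text in batch["text"]:
        chunks.extend(
            text[i : i + chunk_size] for i in range(0, max(len(text), 1), step)
        )
    return {"chunk": np.array(chunks)}

if __name__ == "__main__":
    ray.init(ignore_reinit_error=True)
    # read_text yields one row per line, with the content in a "text" column
    docs = ray.data.read_text("examples/input_examples/extract_demo.txt")
    chunks = docs.map_batches(chunk_batch, batch_size=128)
    print(chunks.take(2))
```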
3 changes: 3 additions & 0 deletions examples/generate/generate_aggregated_qa/README.md
@@ -0,0 +1,3 @@
# Generate Aggregated QAs

Aggregated mode is one of three question-answering scenarios in GraphGen (alongside atomic and multi-hop) designed to generate synthetic training data that incorporates complex, integrated knowledge from multiple sources.