Skip to content

Commit 837a1ee

Browse files
feat: add bds baseline (#48)
* feat: add bds baseline
* fix(baselines): fix tokenizer
* Update baselines/LongForm/README.md

Co-authored-by: Copilot <[email protected]>

* fix: fix OpenAIClient import error

---------

Co-authored-by: Copilot <[email protected]>
1 parent 720ba9b commit 837a1ee

File tree

20 files changed

+544
-17
lines changed

20 files changed

+544
-17
lines changed

baselines/BDS/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Bottom-up Domain-specific Superintelligence: A Reliable Knowledge Graph is What We Need
2+
https://arxiv.org/pdf/2507.13966

baselines/BDS/__init__.py

Whitespace-only changes.

baselines/BDS/bds.py

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
import argparse
2+
import asyncio
3+
import json
4+
import os
5+
from dataclasses import dataclass
6+
from typing import List
7+
8+
import networkx as nx
9+
from dotenv import load_dotenv
10+
from tqdm.asyncio import tqdm as tqdm_async
11+
12+
from graphgen.models import NetworkXStorage, OpenAIClient, Tokenizer
13+
from graphgen.utils import create_event_loop
14+
15+
# Prompt template sent to the LLM for each knowledge-graph path.
# Filled via str.format with three placeholders: {src} (first node of the
# path), {tgt} (last node of the path) and {path} (the rendered chain of
# (head,relation,tail) triples). The <Question>/<Options>/<Answer> tags
# requested here are the delimiters that _post_process splits on, so the tag
# spelling (including the colon after <Answer>) must stay in sync with it.
QA_GENERATION_PROMPT = """
16+
Create an agriculture examination question for advanced agricultural students that tests the relationship between {src} and {tgt}. The relationship is: {path}. The question should:
17+
1. Be in multiple choice format (4 options)
18+
2. Require agriculture reasoning along the relationship
19+
3. Include a brief farm or field scenario
20+
4. Not directly mention the relationship in the question stem
21+
5. Have one clearly correct answer
22+
Format:
23+
<Question>
24+
[Farm or Field Scenario]
25+
</Question>
26+
<Options>
27+
A. [Option]
28+
B. [Option]
29+
C. [Option]
30+
D. [Option]
31+
</Options>
32+
<Answer>:
33+
[Correct Option Letter]
34+
</Answer>
35+
"""
36+
37+
38+
def _post_process(text: str) -> dict:
39+
try:
40+
q = text.split("<Question>")[1].split("</Question>")[0].strip()
41+
opts = text.split("<Options>")[1].split("</Options>")[0].strip().splitlines()
42+
opts = [o.strip() for o in opts if o.strip()]
43+
ans = text.split("<Answer>:")[1].strip()[0].upper()
44+
return {
45+
"question": q,
46+
"options": opts,
47+
"answer": ord(ans) - ord("A"),
48+
"raw": text,
49+
}
50+
except Exception as e: # pylint: disable=broad-except
51+
print(f"Error in post-processing: {e}")
52+
return {}
53+
54+
55+
@dataclass
class BDS:
    """Generate multiple-choice QA pairs from knowledge-graph paths via an LLM.

    Each task is a dict with ``src``, ``tgt`` and ``path`` keys, where
    ``path`` is a sequence of ``(head, relation, tail)`` triples.
    """

    # Client whose ``generate_answer(prompt)`` coroutine is awaited per task.
    llm_client: OpenAIClient = None
    # Upper bound on the number of LLM requests in flight at once.
    max_concurrent: int = 1000

    def generate(self, tasks: List[dict]) -> List[dict]:
        """Synchronously run generation for ``tasks`` and return parsed results."""
        event_loop = create_event_loop()
        return event_loop.run_until_complete(self._async_generate(tasks))

    async def _async_generate(self, tasks: List[dict]) -> List[dict]:
        """Fan out one LLM request per task and gather the non-empty results.

        Results are collected in completion order; tasks that raise are
        reported on stdout and skipped, as are responses that parse to an
        empty value.
        """
        limiter = asyncio.Semaphore(self.max_concurrent)

        async def job(item):
            # Hold a semaphore slot for the whole request/parse cycle.
            async with limiter:
                triples = [f"({h},{r},{t})" for h, r, t in item["path"]]
                prompt = QA_GENERATION_PROMPT.format(
                    src=item["src"],
                    tgt=item["tgt"],
                    path=" -> ".join(triples),
                )
                resp = await self.llm_client.generate_answer(prompt)
                return _post_process(resp)

        pending = [job(entry) for entry in tasks]
        collected = []
        for finished in tqdm_async(asyncio.as_completed(pending), total=len(pending)):
            try:
                parsed = await finished
            except Exception as e:  # pylint: disable=broad-except
                print("Error:", e)
            else:
                if parsed:
                    collected.append(parsed)
        return collected
85+
86+
87+
if __name__ == "__main__":
    # Script entry point: load a GraphML knowledge graph, enumerate 4-node
    # simple paths, ask the LLM for one multiple-choice question per path and
    # write the parsed results to the output file.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_file",
        help="GraphML input file path.",
        default="resources/input_examples/graphml_demo.graphml",
        type=str,
    )
    parser.add_argument(
        "--output_file",
        help="Output file path.",
        default="cache/data/bds_qa.jsonl",
        type=str,
    )
    args = parser.parse_args()

    # Create the output directory up front so a missing folder cannot make the
    # final write fail after all LLM requests have already been made.
    output_dir = os.path.dirname(args.output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    load_dotenv()

    tokenizer_instance: Tokenizer = Tokenizer(
        model_name=os.getenv("TOKENIZER_MODEL", "cl100k_base")
    )
    llm_client = OpenAIClient(
        model_name=os.getenv("SYNTHESIZER_MODEL"),
        api_key=os.getenv("SYNTHESIZER_API_KEY"),
        base_url=os.getenv("SYNTHESIZER_BASE_URL"),
        tokenizer_instance=tokenizer_instance,
    )
    bds = BDS(llm_client=llm_client)

    graph = NetworkXStorage.load_nx_graph(args.input_file)

    # Hard cap on the number of paths (and therefore LLM requests).
    MAX_PATH = 20000
    all_paths = []

    G = graph.to_directed() if not graph.is_directed() else graph
    print(G)

    # Only nodes with outgoing edges can start a path; keep the first 1000.
    source_nodes = [n for n in G.nodes if G.out_degree(n) > 0][:1000]
    all_nodes = list(G.nodes)

    for src in source_nodes:
        # Exclude the source from the target set: a simple path never ends at
        # its own start, and some networkx versions yield nothing at all when
        # the source is among the targets.
        targets = [n for n in all_nodes if n != src]
        for path in nx.all_simple_paths(G, source=src, target=targets, cutoff=3):
            # cutoff=3 bounds paths to at most 3 edges; keep exactly 4 nodes.
            if len(path) == 4:
                all_paths.append(path)
                if len(all_paths) >= MAX_PATH:
                    break
        if len(all_paths) >= MAX_PATH:
            break

    print(f"Found {len(all_paths)} 4-node paths")

    items = []
    for path in all_paths:
        path_edges = []
        for head, tail in zip(path, path[1:]):
            edge_data = G.get_edge_data(head, tail)
            if edge_data is None:
                # Fall back to the reverse direction before giving up.
                edge_data = G.get_edge_data(tail, head)
            if edge_data is None:
                print(f"Warning: No edge data between {head} and {tail}")
                relation = "related_to"
            else:
                relation = edge_data.get("relation", "related_to")
            path_edges.append((head, relation, tail))
        items.append({"src": path[0], "tgt": path[-1], "path": path_edges})

    print(f"Prepared {len(items)} items for question generation")

    qa_pairs = bds.generate(items)
    print(f"Generated {len(qa_pairs)} QA pairs")

    # Save results.
    # NOTE: the payload is written as a single indented JSON array even though
    # the default file name ends in ".jsonl".
    with open(args.output_file, "w", encoding="utf-8") as f:
        json.dump(qa_pairs, f, indent=4, ensure_ascii=False)

baselines/EntiGraph/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# EntiGraph
2+
https://arxiv.org/abs/2409.07431
3+
https://github.com/zitongyang/synthetic_continued_pretraining

baselines/EntiGraph/entigraph.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,3 @@
1-
# https://arxiv.org/abs/2409.07431
2-
# https://github.com/zitongyang/synthetic_continued_pretraining
3-
41
import argparse
52
import asyncio
63
import json

baselines/Genie/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Genie
2+
https://arxiv.org/pdf/2401.14367

baselines/Genie/genie.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
# https://arxiv.org/pdf/2401.14367
2-
31
import argparse
42
import asyncio
53
import json

baselines/LongForm/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# LongForm
2+
https://arxiv.org/pdf/2304.08460
3+
https://github.com/akoksal/LongForm/tree/main

baselines/LongForm/longform.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,3 @@
1-
# https://arxiv.org/pdf/2304.08460
2-
# https://github.com/akoksal/LongForm/tree/main
3-
41
import argparse
52
import asyncio
63
import json

baselines/SELF-QA/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# SELF-QA
2+
https://arxiv.org/abs/2305.11952

0 commit comments

Comments
 (0)