Skip to content

Commit 761e64b

Browse files
ChenZiHong-Gavin, github-code-quality[bot], gemini-code-assist[bot]
authored
refactor: replace sqlite with rocksdb (#109)
* refactor: replace sqlite with rocksdb * Potential fix for pull request finding 'Empty except' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> * Update graphgen/models/storage/rocksdb_cache.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --------- Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent 13c2fa1 commit 761e64b

File tree

5 files changed

+50
-5
lines changed

5 files changed

+50
-5
lines changed

graphgen/models/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,5 +33,5 @@
3333
from .searcher.web.bing_search import BingSearch
3434
from .searcher.web.google_search import GoogleSearch
3535
from .splitter import ChineseRecursiveTextSplitter, RecursiveCharacterSplitter
36-
from .storage import JsonKVStorage, JsonListStorage, NetworkXStorage
36+
from .storage import JsonKVStorage, JsonListStorage, NetworkXStorage, RocksDBCache
3737
from .tokenizer import Tokenizer
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
from .json_storage import JsonKVStorage, JsonListStorage
22
from .networkx_storage import NetworkXStorage
3+
from .rocksdb_cache import RocksDBCache
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
from pathlib import Path
2+
from typing import Any, Iterator, Optional
3+
4+
# rocksdict is a lightweight C wrapper around RocksDB for Python; pylint may not recognize it
5+
# pylint: disable=no-name-in-module
6+
from rocksdict import Rdict
7+
8+
9+
class RocksDBCache:
    """Small key-value cache facade over a ``rocksdict.Rdict`` store.

    The instance keeps the store handle in ``self.db`` and the store
    location in ``self.db_path``. It can be used as a context manager;
    leaving the ``with`` block calls :meth:`close`.
    """

    def __init__(self, cache_dir: str):
        """Open an ``Rdict`` at *cache_dir* and remember the path."""
        location = Path(cache_dir)
        self.db_path = location
        self.db = Rdict(str(location))

    def get(self, key: str) -> Optional[Any]:
        """Return whatever the store's ``get`` yields for *key*."""
        found = self.db.get(key)
        return found

    def set(self, key: str, value: Any):
        """Store *value* under *key* via item assignment on the store."""
        self.db[key] = value

    def delete(self, key: str):
        """Remove *key* from the store; a missing key is silently ignored."""
        try:
            del self.db[key]
        except KeyError:
            # Deleting an absent key is treated as a no-op so that
            # cache deletion stays idempotent.
            return

    def close(self):
        """Close the store handle once; later calls do nothing."""
        # ``getattr`` covers instances on which ``db`` was never assigned.
        handle = getattr(self, "db", None)
        if handle is None:
            return
        handle.close()
        self.db = None

    def __del__(self):
        """Close the store when the object is being destroyed."""
        self.close()

    def __enter__(self):
        """Return this cache as the context-manager target."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the store on leaving the ``with`` block."""
        self.close()

    def __iter__(self) -> Iterator[str]:
        """Return an iterator over the store's ``keys()``."""
        stored_keys = self.db.keys()
        return iter(stored_keys)

graphgen/operators/read/parallel_file_scanner.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,15 @@
44
from pathlib import Path
55
from typing import Any, Dict, List, Set, Union
66

7-
from diskcache import Cache
8-
7+
from graphgen.models import RocksDBCache
98
from graphgen.utils import logger
109

1110

1211
class ParallelFileScanner:
1312
def __init__(
1413
self, cache_dir: str, allowed_suffix, rescan: bool = False, max_workers: int = 4
1514
):
16-
self.cache = Cache(cache_dir)
15+
self.cache = RocksDBCache(os.path.join(cache_dir, "file_paths_cache"))
1716
self.allowed_suffix = set(allowed_suffix) if allowed_suffix else None
1817
self.rescan = rescan
1918
self.max_workers = max_workers

requirements.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,15 @@ requests
2020
fastapi
2121
trafilatura
2222
aiohttp
23-
diskcache
2423
socksio
2524

2625
leidenalg
2726
igraph
2827
python-louvain
2928

29+
# storage
30+
rocksdict
31+
3032
# KG
3133
rdflib
3234

0 commit comments

Comments
 (0)