|
24 | 24 | level=logging.DEBUG
|
25 | 25 | )
|
26 | 26 |
|
| 27 | +""" |
| 28 | +This is intended as a simple end-to-end example of how to use SQL to get your |
| 29 | +data into the format that PyTorch BigGraph expects. It's implemented in SQLite |
| 30 | +for portability, but similar techniques scale to 100bn edges using cloud |
| 31 | +databases such as BigQuery. This pipeline can be split into three different |
| 32 | +components: |
| 33 | +
|
| 34 | +1. Data preparation |
| 35 | +2. Data verification/checking |
| 36 | +3. Training |
| 37 | +
|
| 38 | +To run the pipeline, you'll first need to download the edges.csv file, |
| 39 | +available HERE (TODO: INSERT LINK). This graph was constructed by |
| 40 | +taking the [ogbl-citation2](https://github.com/snap-stanford/ogb) graph, and |
| 41 | +adding edges for both paper-citations and years-published. While this graph |
| 42 | +might not make a huge amount of sense on its own, it is intended primarily |
| 43 | +as a teaching example. In the data preparation stage, we first load the graph |
| 44 | +into a SQLite database, and then we transform and partition it. |
| 45 | +""" |
| 46 | + |
27 | 47 | def remap_relationships(conn):
|
28 | 48 | """
|
29 | 49 | A function to remap relationships using SQL queries.
|
@@ -286,17 +306,18 @@ def write_training_data(outdir, rels, entity2partitions, conn):
|
286 | 306 |
|
287 | 307 | def main(NPARTS=2, edge_file_name='edges.csv', outdir='training_data/'):
|
288 | 308 | conn = sqlite3.connect("citationv2.db")
|
289 |
| - load_edges(edge_file_name, conn) |
| 309 | + # load_edges(edge_file_name, conn) |
290 | 310 |
|
291 | 311 | entity2partitions = {
|
292 | 312 | 'paper': NPARTS,
|
293 | 313 | 'year': 1,
|
294 | 314 | }
|
295 | 315 |
|
296 | 316 | rels = remap_relationships(conn)
|
297 |
| - remap_entities(conn, entity2partitions) |
298 |
| - remap_edges(conn, rels, entity2partitions) |
299 |
| - out = Path(outdir).mkdir(parents=True, exist_ok=True) |
| 317 | + # remap_entities(conn, entity2partitions) |
| 318 | + # remap_edges(conn, rels, entity2partitions) |
| 319 | + Path(outdir).mkdir(parents=True, exist_ok=True) |
| 320 | + out = Path(outdir) |
300 | 321 | write_training_data(out, rels, entity2partitions, conn)
|
301 | 322 |
|
302 | 323 |
|
|
0 commit comments