add docker for local development
jamesamcl authored Apr 23, 2024
1 parent c63980e commit 79624ad
Showing 32 changed files with 471 additions and 102 deletions.
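Most of the changes below follow one pattern: the hard-coded persistent_output_dir and worker_output_dir config keys are replaced by the GREBI_NFS_TMP and GREBI_HPS_TMP environment variables, so the same scripts can run on the cluster or inside a local Docker container. A minimal sketch of the pattern, with illustrative values that are not part of the commit:

    import os

    # Hypothetical local setup; the pipeline scripts themselves just read
    # os.environ['GREBI_NFS_TMP'] / os.environ['GREBI_HPS_TMP'] directly.
    os.environ.setdefault("GREBI_NFS_TMP", "./tmp/nfs")  # persistent outputs
    os.environ.setdefault("GREBI_HPS_TMP", "./tmp/hps")  # worker/scratch outputs

    out_path = os.path.join(os.environ["GREBI_HPS_TMP"], "03_merge", "merged.jsonl.")
    os.makedirs(os.path.dirname(out_path), exist_ok=True)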
10 changes: 10 additions & 0 deletions .dockerignore
@@ -0,0 +1,10 @@
tmp
.git
00_fetch_data
*.gz
*.tsv
*.csv
*.xml



1 change: 1 addition & 0 deletions .gitignore
@@ -6,6 +6,7 @@ foo.jsonl
*.jsonl
*.gz
*.csv
*.xml
cargo_home
testowls
.~lock.*
7 changes: 0 additions & 7 deletions 02_equivalences/grebi_assign_ids/src/main.rs
@@ -30,13 +30,6 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

fn main() {

let args: Vec<String> = env::args().collect();

if args.len() < 2 {
eprintln!("Usage: grebi_assign_ids");
std::process::exit(1);
}

let args = Args::parse();
let preserve_fields:HashSet<Vec<u8>> = args.preserve_field.iter().map(|x| x.as_bytes().to_vec()).collect();
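The removed argument-count check above was redundant rather than load-bearing: the clap-derived Args::parse() call that follows already exits with a usage message when required flags are missing. A rough Python analogue of the same cleanup, assuming an argparse-style CLI:

    import argparse

    parser = argparse.ArgumentParser(prog="grebi_assign_ids")
    parser.add_argument("--groups-txt", required=True)  # flag seen in the slurm wrapper below
    parser.add_argument("--preserve-field", action="append", default=[])

    # No manual len(sys.argv) check is needed: parse_args() prints usage
    # and exits non-zero on its own when --groups-txt is absent.
    args = parser.parse_args()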

2 changes: 1 addition & 1 deletion 02_equivalences/grebi_assign_ids_worker.slurm.py
@@ -56,7 +56,7 @@ def main():

cmd = ' '.join([
'zcat ' + shlex.quote(nodes_jsonl_gz_filename),
'| ./target/release/grebi_assign_ids --groups-txt ', shlex.quote(groups_txt_path),
'| ./target/release/grebi_assign_ids --groups-txt', shlex.quote(groups_txt_path),
'>', shlex.quote(expanded_subjects_jsonl_filename)
])
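The one-character fix above (dropping the trailing space in '--groups-txt ') is cosmetic: the fragments are glued with ' '.join, which already supplies the separator, so the old string produced a double space in the generated shell command. A quick illustration:

    import shlex

    groups_txt_path = "tmp/groups.txt"  # illustrative value

    old = ' '.join(['--groups-txt ', shlex.quote(groups_txt_path)])  # double space after join
    new = ' '.join(['--groups-txt', shlex.quote(groups_txt_path)])   # single space
    assert '  ' in old and '  ' not in new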

2 changes: 1 addition & 1 deletion 03_merge/grebi_merge.slurm.py
@@ -34,7 +34,7 @@ def main():
result_filenames = list(filter(lambda x: exists(x.split(':')[1]), result_filenames))


out_path = os.path.join(config['worker_output_dir'], "03_merge", "merged.jsonl.")
out_path = os.path.join(os.environ['GREBI_HPS_TMP'], "03_merge", "merged.jsonl.")
os.makedirs(os.path.dirname(out_path), exist_ok=True)

cmd = './target/release/grebi_merge ' + ' '.join(result_filenames)
6 changes: 3 additions & 3 deletions 04_index/grebi_index.slurm.py
@@ -18,9 +18,9 @@ def main():
with open(config_filename, 'r') as f:
config = json.load(f)

input_merged_gz_filenames = os.path.join(config['worker_output_dir'], "03_merge", "merged.jsonl.*")
out_path = os.path.join(config['worker_output_dir'], "04_index", "rocksdb")
output_metadata_filename = os.path.join(config['worker_output_dir'], "04_index", "metadata.json")
input_merged_gz_filenames = os.path.join(os.environ['GREBI_HPS_TMP'], "03_merge", "merged.jsonl.*")
out_path = os.path.join(os.environ['GREBI_HPS_TMP'], "04_index", "rocksdb")
output_metadata_filename = os.path.join(os.environ['GREBI_HPS_TMP'], "04_index", "metadata.json")

os.makedirs(os.path.dirname(out_path), exist_ok=True)

8 changes: 4 additions & 4 deletions 05_materialize_edges/grebi_rocks2neo.py
@@ -19,20 +19,20 @@ def main():
with open(config_filename, 'r') as f:
config = json.load(f)

input_merged_gz_filenames = os.path.join(config['worker_output_dir'], "03_merge", "merged.jsonl.0*")
input_merged_gz_filenames = os.path.join(os.environ['GREBI_HPS_TMP'], "03_merge", "merged.jsonl.0*")

all_files = glob.glob(input_merged_gz_filenames)
max_file_num = max(list(map(lambda f: int(f.split('.')[-2]), all_files)))
print(get_time() + " --- Max file num: " + str(max_file_num))

os.makedirs(os.path.join(config['persistent_output_dir'], '05_materialize_edges'), exist_ok=True)
os.makedirs(os.path.join(os.environ['GREBI_NFS_TMP'], '05_materialize_edges'), exist_ok=True)

if config['use_slurm'] == True:
print("Running rocks2neo on slurm (use_slurm = true)")
slurm_cmd = ' '.join([
'sbatch',
'--wait',
'-o ' + os.path.abspath(os.path.join(config['persistent_output_dir'], '05_materialize_edges', 'rocks2neo_%a.log')),
'-o ' + os.path.abspath(os.path.join(os.environ['GREBI_NFS_TMP'], '05_materialize_edges', 'rocks2neo_%a.log')),
'--array=0-' + str(max_file_num) + '%' + str(config['slurm_max_workers']['extract']),
'--time=' + config['slurm_max_time']['extract'],
'--mem=' + config['slurm_max_memory']['extract'],
@@ -42,7 +42,7 @@
if os.system(slurm_cmd) != 0:
print("rocks2neo failed")
exit(1)
os.system("tail -n +1 " + os.path.abspath(os.path.join(config['persistent_output_dir'], '05_materialize_edges', '*.log')))
os.system("tail -n +1 " + os.path.abspath(os.path.join(os.environ['GREBI_NFS_TMP'], '05_materialize_edges', '*.log')))
else:
for n in range(max_file_num+1):
print("Running " + str(n) + " of " + str(max_file_num))
14 changes: 7 additions & 7 deletions 05_materialize_edges/grebi_rocks2neo.slurm.py
@@ -26,13 +26,13 @@ def main():
with open(config_filename, 'r') as f:
config = json.load(f)

input_merged_gz_filenames = os.path.join(config['worker_output_dir'], "03_merge", "merged.jsonl.0*")
input_metadata_filename = os.path.join(config['worker_output_dir'], "04_index", "metadata.json")
input_rocksdb_path = os.path.join(config['worker_output_dir'], "04_index", "rocksdb")
# out_nodes_path = os.path.join(config['worker_output_dir'], "05_materialize_edges", "n4nodes_" + task_id + ".csv.gz")
# out_edges_path = os.path.join(config['worker_output_dir'], "05_materialize_edges", "n4edges_" + task_id + ".csv.gz")
out_nodes_path = os.path.join(config['worker_output_dir'], "05_materialize_edges", "n4nodes_" + task_id + ".csv")
out_edges_path = os.path.join(config['worker_output_dir'], "05_materialize_edges", "n4edges_" + task_id + ".csv")
input_merged_gz_filenames = os.path.join(os.environ['GREBI_HPS_TMP'], "03_merge", "merged.jsonl.0*")
input_metadata_filename = os.path.join(os.environ['GREBI_HPS_TMP'], "04_index", "metadata.json")
input_rocksdb_path = os.path.join(os.environ['GREBI_HPS_TMP'], "04_index", "rocksdb")
# out_nodes_path = os.path.join(os.environ['GREBI_HPS_TMP'], "05_materialize_edges", "n4nodes_" + task_id + ".csv.gz")
# out_edges_path = os.path.join(os.environ['GREBI_HPS_TMP'], "05_materialize_edges", "n4edges_" + task_id + ".csv.gz")
out_nodes_path = os.path.join(os.environ['GREBI_HPS_TMP'], "05_materialize_edges", "n4nodes_" + task_id + ".csv")
out_edges_path = os.path.join(os.environ['GREBI_HPS_TMP'], "05_materialize_edges", "n4edges_" + task_id + ".csv")

os.makedirs(os.path.dirname(out_edges_path), exist_ok=True)

12 changes: 4 additions & 8 deletions 05_materialize_edges/grebi_rocks2neo/src/main.rs
@@ -152,22 +152,18 @@ fn write_node(entity:&SlicedEntity, all_node_props:&Vec<String>, nodes_writer:&m
nodes_writer.write_all(b"\",\"").unwrap();

// :LABEL
let mut is_first = true;
nodes_writer.write_all(b"GraphNode;").unwrap();
entity.props.iter().for_each(|prop| {
if prop.key == "grebi:type".as_bytes() {
if is_first {
is_first = false;
} else {
nodes_writer.write_all(b";").unwrap();
}
nodes_writer.write_all(b";").unwrap();
parse_json_and_write(prop.value, nodes_writer);
}
});

nodes_writer.write_all(b"\",\"").unwrap();

// grebi:datasources
is_first = true;
let mut is_first = true;
entity.datasources.iter().for_each(|ds| {
if is_first {
is_first = false;
@@ -219,7 +215,7 @@ fn write_node(entity:&SlicedEntity, all_node_props:&Vec<String>, nodes_writer:&m

fn maybe_write_edge(from_id:&[u8], prop: &SlicedProperty, db:&DB, all_edge_props:&Vec<String>, edges_writer: &mut BufWriter<File>, exclude:&HashSet<Vec<u8>>, datasources:&Vec<&[u8]>) {

if exclude.contains(prop.key) {
if prop.key.eq(b"id") || exclude.contains(prop.key) {
return;
}
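Two behavioural tweaks in this file: every node now gets a fixed GraphNode label ahead of its grebi:type labels (replacing the per-iteration is_first bookkeeping), and maybe_write_edge now skips id properties in addition to the configured exclusions. For the neo4j CSV :LABEL column, the intent is roughly:

    def label_column(types):
        # Sketch of the intended :LABEL value for the bulk importer:
        # a constant GraphNode label plus any grebi:type values,
        # ';'-separated. The real code streams bytes rather than
        # joining Python strings.
        return ";".join(["GraphNode"] + types)

    assert label_column(["impc:Allele"]) == "GraphNode;impc:Allele"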

14 changes: 7 additions & 7 deletions 06_create_db/neo4j/neo4j_import.slurm.py
@@ -19,11 +19,11 @@ def main():
with open(config_filename, 'r') as f:
config = json.load(f)

nodes = glob.glob(os.path.join(config['worker_output_dir'], "05_materialize_edges", "n4nodes_*"))
edges = glob.glob(os.path.join(config['worker_output_dir'], "05_materialize_edges", "n4edges_*"))
neo_path = os.path.join(config['worker_output_dir'], "06_create_db", "neo4j")
neo_data_path = os.path.join(config['worker_output_dir'], "06_create_db", "neo4j", "data")
neo_logs_path = os.path.join(config['worker_output_dir'], "06_create_db", "neo4j", "logs")
nodes = glob.glob(os.path.join(os.environ['GREBI_HPS_TMP'], "05_materialize_edges", "n4nodes_*"))
edges = glob.glob(os.path.join(os.environ['GREBI_HPS_TMP'], "05_materialize_edges", "n4edges_*"))
neo_path = os.path.join(os.environ['GREBI_HPS_TMP'], "06_create_db", "neo4j")
neo_data_path = os.path.join(os.environ['GREBI_HPS_TMP'], "06_create_db", "neo4j", "data")
neo_logs_path = os.path.join(os.environ['GREBI_HPS_TMP'], "06_create_db", "neo4j", "logs")

os.system('rm -rf ' + shlex.quote(neo_path))
os.makedirs(neo_data_path, exist_ok=True)
@@ -43,7 +43,7 @@ def main():
cmd = ' '.join([
'JAVA_OPTS=\'-server -Xms50g -Xmx50g\'',
'singularity run',
'--bind ' + os.path.abspath(os.path.join(config['worker_output_dir'], "05_materialize_edges")) + ':/mnt',
'--bind ' + os.path.abspath(os.path.join(os.environ['GREBI_HPS_TMP'], "05_materialize_edges")) + ':/mnt',
'--bind ' + shlex.quote(neo_data_path) + ':/data',
'--bind ' + shlex.quote(neo_logs_path) + ':/logs',
'--writable-tmpfs',
@@ -52,7 +52,7 @@
else:
cmd = ' '.join([
'docker run',
'-v ' + os.path.abspath(os.path.join(config['worker_output_dir'], "05_materialize_edges")) + ':/mnt',
'-v ' + os.path.abspath(os.path.join(os.environ['GREBI_HPS_TMP'], "05_materialize_edges")) + ':/mnt',
'-v ' + shlex.quote(neo_data_path) + ':/data',
'-v ' + shlex.quote(neo_logs_path) + ':/logs',
'neo4j:5.18.0'
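Note that the singularity and docker branches mount the same three paths — the CSV directory as /mnt plus the neo4j data and logs directories — and differ only in flag syntax (--bind vs -v). Schematically, with placeholder values:

    csv_dir, data_dir, logs_dir = "/tmp/csv", "/tmp/neo4j/data", "/tmp/neo4j/logs"  # placeholders

    binds = [(csv_dir, "/mnt"), (data_dir, "/data"), (logs_dir, "/logs")]
    singularity_args = ['--bind ' + h + ':' + c for h, c in binds]
    docker_args      = ['-v ' + h + ':' + c for h, c in binds]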
11 changes: 11 additions & 0 deletions Dockerfile.dataload
@@ -0,0 +1,11 @@


FROM rust:1.74-buster as builder

RUN apt-get update && apt-get install -y cmake clang

COPY 01* 02* 03* 04* 05* 06* prefix_maps scripts Cargo.* build.rs grebi_shared /work/

RUN cd /work && ls && chmod +x /work/scripts/*.sh
RUN cd /work && cargo build --release
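This builder stage compiles all the pipeline steps in one image. A plausible local invocation, not part of the commit, would be docker build -f Dockerfile.dataload -t grebi-dataload . — the tag name is an assumption, and since the file only defines the builder stage, the compiled binaries end up under /work/target/release inside the image.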

4 changes: 3 additions & 1 deletion README.md
@@ -6,10 +6,12 @@ EBI Codon HPC pipeline for building integrated knowledge graphs from [EMBL-EBI r
* [GWAS Catalog](https://www.ebi.ac.uk/gwas)
* [OLS](https://www.ebi.ac.uk/ols4)
* [Reactome](https://reactome.org/)
* [OpenTargets](https://www.opentargets.org/)

GrEBI also imports complementary datasets, so far:

* The [MONARCH Initiative KG](https://monarch-initiative.github.io/monarch-ingest/Sources/)
* [MONARCH Initiative KG](https://monarch-initiative.github.io/monarch-ingest/Sources/)
* [Ubergraph](https://github.com/INCATools/ubergraph)

The resulting graphs can be downloaded from https://ftp.ebi.ac.uk/pub/databases/spot/kg/

10 changes: 10 additions & 0 deletions configs/datasource_configs/impc.json
@@ -164,6 +164,16 @@
{ "name": "--json-inject-key-prefix", "value": "impc:" },
{ "name": "--json-inject-value-prefix", "value": "pmId:pmid:" }
]
},
{
"ingest_files": ["/nfs/production/parkinso/spot/jmcl/impc-kg/mouse_allele_json/*.json.gz"],
"ingest_script": "./target/release/grebi_ingest_json",
"ingest_args": [
{ "name": "--json-rename-field", "value": "mouseAlleleId:id" },
{ "name": "--json-inject-type", "value": "impc:Allele" },
{ "name": "--json-inject-key-prefix", "value": "impc:" },
{ "name": "--json-inject-value-prefix", "value": "ensemblAccId:ENSEMBL:" }
]
}
]
}
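Reading the new allele block: the flag names suggest the record's mouseAlleleId field becomes its id, an impc:Allele type is injected, remaining keys gain an impc: prefix, and ensemblAccId values gain an ENSEMBL: prefix. A toy sketch of the transformation, assuming the flags behave as their names suggest (ordering and exact output keys are guesses):

    record = {"mouseAlleleId": "MGI:123", "ensemblAccId": "ENSMUSG00000000001"}  # toy input

    out = {"id": record.pop("mouseAlleleId"),  # --json-rename-field mouseAlleleId:id
           "grebi:type": "impc:Allele"}        # --json-inject-type (output key assumed)
    for k, v in record.items():
        if k == "ensemblAccId":
            v = "ENSEMBL:" + v                 # --json-inject-value-prefix ensemblAccId:ENSEMBL:
        out["impc:" + k] = v                   # --json-inject-key-prefix impc:

    # {'id': 'MGI:123', 'grebi:type': 'impc:Allele', 'impc:ensemblAccId': 'ENSEMBL:ENSMUSG00000000001'}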
2 changes: 1 addition & 1 deletion configs/datasource_configs/ols.json
@@ -6,7 +6,7 @@
"ingest_files": ["/nfs/production/parkinso/spot/ols4/prod/slurm_pipeline/ontologies.json.gz"],
"ingest_script": "./target/release/grebi_ingest_ols",
"ingest_args": [
{ "name": "--ontologies", "value": "efo,mp,hp,go,ro,iao,uberon,pato,oba,chebi" }
{ "name": "--ontologies", "value": "efo,mp,hp,go,ro,iao,uberon,pato,oba,chebi,bspo,iao,obi,bfo,cob,cl" }
]
}
]
8 changes: 4 additions & 4 deletions configs/pipeline_configs/ebi_full.json
@@ -1,6 +1,4 @@
{
"persistent_output_dir": "/nfs/production/parkinso/spot/grebi/tmp/ebi_full",
"worker_output_dir": "/hps/nobackup/parkinso/spot/grebi/tmp/ebi_full",
"use_slurm":true,
"slurm_max_workers": {
"ingest": 100,
@@ -13,15 +11,17 @@
"assign_ids": "64G",
"merge": "16G",
"index": "64G",
"extract": "8G"
"extract": "8G",
"materialize_edges": "8:0:0"
},
"slurm_max_time": {
"ingest": "8:0:0",
"build_equiv_groups": "23:0:0",
"assign_ids": "23:0:0",
"merge": "23:0:0",
"index": "23:0:0",
"extract": "23:0:0"
"extract": "23:0:0",
"materialize_edges": "8:0:0"
},
"bytes_per_merged_file": 104857600,
"equivalence_props": [
10 changes: 5 additions & 5 deletions configs/pipeline_configs/ebi_full_monarch.json
@@ -1,7 +1,5 @@
{
"persistent_output_dir": "/nfs/production/parkinso/spot/grebi/tmp/ebi_full_monarch",
"worker_output_dir": "/hps/nobackup/parkinso/spot/grebi/tmp/ebi_full_monarch",
"use_slurm":true,
"use_slurm": true,
"slurm_max_workers": {
"ingest": 100,
"assign_ids": 100,
@@ -13,15 +11,17 @@
"assign_ids": "64G",
"merge": "16G",
"index": "64G",
"extract": "8G"
"extract": "8G",
"materialize_edges": "8:0:0"
},
"slurm_max_time": {
"ingest": "8:0:0",
"build_equiv_groups": "23:0:0",
"assign_ids": "23:0:0",
"merge": "23:0:0",
"index": "23:0:0",
"extract": "23:0:0"
"extract": "23:0:0",
"materialize_edges": "8:0:0"
},
"bytes_per_merged_file": 104857600,
"equivalence_props": [
48 changes: 48 additions & 0 deletions configs/pipeline_configs/ebi_test.json
@@ -0,0 +1,48 @@
{
"use_slurm":true,
"slurm_max_workers": {
"ingest": 100,
"assign_ids": 100,
"extract": 100
},
"slurm_max_memory": {
"ingest": "32G",
"build_equiv_groups": "64G",
"assign_ids": "64G",
"merge": "16G",
"index": "64G",
"extract": "8G",
"materialize_edges": "8:0:0"
},
"slurm_max_time": {
"ingest": "8:0:0",
"build_equiv_groups": "23:0:0",
"assign_ids": "23:0:0",
"merge": "23:0:0",
"index": "23:0:0",
"extract": "23:0:0",
"materialize_edges": "8:0:0"
},
"bytes_per_merged_file": 104857600,
"equivalence_props": [
"owl:equivalentClass",
"owl:equivalentProperty",
"owl:sameAs",
"grebi:equivalentTo",
"ols:iri",
"hgnc:ensembl_gene_id",
"obo:chebi/inchi",
"obo:chebi/inchikey",
"obo:chebi/smiles",
"impc:pmId"
],
"exclude_edges": [
"ols:iri",
"ols:shortForm",
"ols:curie",
"oboinowl:id"
],
"datasource_configs": [
"./configs/datasource_configs/ols.json"
]
}
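The new ebi_test.json mirrors the ebi_full settings but lists only the OLS datasource, giving a cheap end-to-end smoke-test configuration. The pipeline scripts shown above consume it with the usual pattern:

    import json

    with open("configs/pipeline_configs/ebi_test.json") as f:
        config = json.load(f)

    # only ./configs/datasource_configs/ols.json is listed, so a test
    # run ingests OLS alone
    print(config["use_slurm"], config["datasource_configs"])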
2 changes: 0 additions & 2 deletions configs/pipeline_configs/local_test.json
@@ -1,6 +1,4 @@
{
"persistent_output_dir": "./tmp",
"worker_output_dir": "./tmp",
"use_slurm": false,
"bytes_per_merged_file": 104857600,
"equivalence_props": [