add docker for local development
jamesamcl authored Apr 23, 2024
1 parent c63980e commit 79624ad
Showing 32 changed files with 471 additions and 102 deletions.
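Most of the changes below follow one pattern: the hard-coded persistent_output_dir and worker_output_dir config keys are replaced by the GREBI_NFS_TMP and GREBI_HPS_TMP environment variables, so the same scripts can run on the cluster or inside a local Docker container. A minimal sketch of the pattern, with illustrative values that are not part of the commit:

    import os

    # Hypothetical local setup; the pipeline scripts themselves just read
    # os.environ['GREBI_NFS_TMP'] / os.environ['GREBI_HPS_TMP'] directly.
    os.environ.setdefault("GREBI_NFS_TMP", "./tmp/nfs")  # persistent outputs
    os.environ.setdefault("GREBI_HPS_TMP", "./tmp/hps")  # worker/scratch outputs

    out_path = os.path.join(os.environ["GREBI_HPS_TMP"], "03_merge", "merged.jsonl.")
    os.makedirs(os.path.dirname(out_path), exist_ok=True)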
10 changes: 10 additions & 0 deletions .dockerignore
@@ -0,0 +1,10 @@
tmp
.git
00_fetch_data
*.gz
*.tsv
*.csv
*.xml



1 change: 1 addition & 0 deletions .gitignore
@@ -6,6 +6,7 @@ foo.jsonl
*.jsonl
*.gz
*.csv
*.xml
cargo_home
testowls
.~lock.*
7 changes: 0 additions & 7 deletions 02_equivalences/grebi_assign_ids/src/main.rs
@@ -30,13 +30,6 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

fn main() {

let args: Vec<String> = env::args().collect();

if args.len() < 2 {
eprintln!("Usage: grebi_assign_ids");
std::process::exit(1);
}

let args = Args::parse();
let preserve_fields:HashSet<Vec<u8>> = args.preserve_field.iter().map(|x| x.as_bytes().to_vec()).collect();
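The removed argument-count check above was redundant rather than load-bearing: the clap-derived Args::parse() call that follows already exits with a usage message when required flags are missing. A rough Python analogue of the same cleanup, assuming an argparse-style CLI:

    import argparse

    parser = argparse.ArgumentParser(prog="grebi_assign_ids")
    parser.add_argument("--groups-txt", required=True)  # flag seen in the slurm wrapper below
    parser.add_argument("--preserve-field", action="append", default=[])

    # No manual len(sys.argv) check is needed: parse_args() prints usage
    # and exits non-zero on its own when --groups-txt is absent.
    args = parser.parse_args()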

2 changes: 1 addition & 1 deletion 02_equivalences/grebi_assign_ids_worker.slurm.py
@@ -56,7 +56,7 @@ def main():

cmd = ' '.join([
'zcat ' + shlex.quote(nodes_jsonl_gz_filename),
'| ./target/release/grebi_assign_ids --groups-txt ', shlex.quote(groups_txt_path),
'| ./target/release/grebi_assign_ids --groups-txt', shlex.quote(groups_txt_path),
'>', shlex.quote(expanded_subjects_jsonl_filename)
])
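The one-character fix above (dropping the trailing space in '--groups-txt ') is cosmetic: the fragments are glued with ' '.join, which already supplies the separator, so the old string produced a double space in the generated shell command. A quick illustration:

    import shlex

    groups_txt_path = "tmp/groups.txt"  # illustrative value

    old = ' '.join(['--groups-txt ', shlex.quote(groups_txt_path)])  # double space after join
    new = ' '.join(['--groups-txt', shlex.quote(groups_txt_path)])   # single space
    assert '  ' in old and '  ' not in new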

2 changes: 1 addition & 1 deletion 03_merge/grebi_merge.slurm.py
@@ -34,7 +34,7 @@ def main():
result_filenames = list(filter(lambda x: exists(x.split(':')[1]), result_filenames))


out_path = os.path.join(config['worker_output_dir'], "03_merge", "merged.jsonl.")
out_path = os.path.join(os.environ['GREBI_HPS_TMP'], "03_merge", "merged.jsonl.")
os.makedirs(os.path.dirname(out_path), exist_ok=True)

cmd = './target/release/grebi_merge ' + ' '.join(result_filenames)
6 changes: 3 additions & 3 deletions 04_index/grebi_index.slurm.py
@@ -18,9 +18,9 @@ def main():
with open(config_filename, 'r') as f:
config = json.load(f)

input_merged_gz_filenames = os.path.join(config['worker_output_dir'], "03_merge", "merged.jsonl.*")
out_path = os.path.join(config['worker_output_dir'], "04_index", "rocksdb")
output_metadata_filename = os.path.join(config['worker_output_dir'], "04_index", "metadata.json")
input_merged_gz_filenames = os.path.join(os.environ['GREBI_HPS_TMP'], "03_merge", "merged.jsonl.*")
out_path = os.path.join(os.environ['GREBI_HPS_TMP'], "04_index", "rocksdb")
output_metadata_filename = os.path.join(os.environ['GREBI_HPS_TMP'], "04_index", "metadata.json")

os.makedirs(os.path.dirname(out_path), exist_ok=True)

8 changes: 4 additions & 4 deletions 05_materialize_edges/grebi_rocks2neo.py
@@ -19,20 +19,20 @@ def main():
with open(config_filename, 'r') as f:
config = json.load(f)

input_merged_gz_filenames = os.path.join(config['worker_output_dir'], "03_merge", "merged.jsonl.0*")
input_merged_gz_filenames = os.path.join(os.environ['GREBI_HPS_TMP'], "03_merge", "merged.jsonl.0*")

all_files = glob.glob(input_merged_gz_filenames)
max_file_num = max(list(map(lambda f: int(f.split('.')[-2]), all_files)))
print(get_time() + " --- Max file num: " + str(max_file_num))

os.makedirs(os.path.join(config['persistent_output_dir'], '05_materialize_edges'), exist_ok=True)
os.makedirs(os.path.join(os.environ['GREBI_NFS_TMP'], '05_materialize_edges'), exist_ok=True)

if config['use_slurm'] == True:
print("Running rocks2neo on slurm (use_slurm = true)")
slurm_cmd = ' '.join([
'sbatch',
'--wait',
'-o ' + os.path.abspath(os.path.join(config['persistent_output_dir'], '05_materialize_edges', 'rocks2neo_%a.log')),
'-o ' + os.path.abspath(os.path.join(os.environ['GREBI_NFS_TMP'], '05_materialize_edges', 'rocks2neo_%a.log')),
'--array=0-' + str(max_file_num) + '%' + str(config['slurm_max_workers']['extract']),
'--time=' + config['slurm_max_time']['extract'],
'--mem=' + config['slurm_max_memory']['extract'],
@@ -42,7 +42,7 @@
if os.system(slurm_cmd) != 0:
print("rocks2neo failed")
exit(1)
os.system("tail -n +1 " + os.path.abspath(os.path.join(config['persistent_output_dir'], '05_materialize_edges', '*.log')))
os.system("tail -n +1 " + os.path.abspath(os.path.join(os.environ['GREBI_NFS_TMP'], '05_materialize_edges', '*.log')))
else:
for n in range(max_file_num+1):
print("Running " + str(n) + " of " + str(max_file_num))
14 changes: 7 additions & 7 deletions 05_materialize_edges/grebi_rocks2neo.slurm.py
@@ -26,13 +26,13 @@ def main():
with open(config_filename, 'r') as f:
config = json.load(f)

input_merged_gz_filenames = os.path.join(config['worker_output_dir'], "03_merge", "merged.jsonl.0*")
input_metadata_filename = os.path.join(config['worker_output_dir'], "04_index", "metadata.json")
input_rocksdb_path = os.path.join(config['worker_output_dir'], "04_index", "rocksdb")
# out_nodes_path = os.path.join(config['worker_output_dir'], "05_materialize_edges", "n4nodes_" + task_id + ".csv.gz")
# out_edges_path = os.path.join(config['worker_output_dir'], "05_materialize_edges", "n4edges_" + task_id + ".csv.gz")
out_nodes_path = os.path.join(config['worker_output_dir'], "05_materialize_edges", "n4nodes_" + task_id + ".csv")
out_edges_path = os.path.join(config['worker_output_dir'], "05_materialize_edges", "n4edges_" + task_id + ".csv")
input_merged_gz_filenames = os.path.join(os.environ['GREBI_HPS_TMP'], "03_merge", "merged.jsonl.0*")
input_metadata_filename = os.path.join(os.environ['GREBI_HPS_TMP'], "04_index", "metadata.json")
input_rocksdb_path = os.path.join(os.environ['GREBI_HPS_TMP'], "04_index", "rocksdb")
# out_nodes_path = os.path.join(os.environ['GREBI_HPS_TMP'], "05_materialize_edges", "n4nodes_" + task_id + ".csv.gz")
# out_edges_path = os.path.join(os.environ['GREBI_HPS_TMP'], "05_materialize_edges", "n4edges_" + task_id + ".csv.gz")
out_nodes_path = os.path.join(os.environ['GREBI_HPS_TMP'], "05_materialize_edges", "n4nodes_" + task_id + ".csv")
out_edges_path = os.path.join(os.environ['GREBI_HPS_TMP'], "05_materialize_edges", "n4edges_" + task_id + ".csv")

os.makedirs(os.path.dirname(out_edges_path), exist_ok=True)

12 changes: 4 additions & 8 deletions 05_materialize_edges/grebi_rocks2neo/src/main.rs
@@ -152,22 +152,18 @@ fn write_node(entity:&SlicedEntity, all_node_props:&Vec<String>, nodes_writer:&m
nodes_writer.write_all(b"\",\"").unwrap();

// :LABEL
let mut is_first = true;
nodes_writer.write_all(b"GraphNode;").unwrap();
entity.props.iter().for_each(|prop| {
if prop.key == "grebi:type".as_bytes() {
if is_first {
is_first = false;
} else {
nodes_writer.write_all(b";").unwrap();
}
nodes_writer.write_all(b";").unwrap();
parse_json_and_write(prop.value, nodes_writer);
}
});

nodes_writer.write_all(b"\",\"").unwrap();

// grebi:datasources
is_first = true;
let mut is_first = true;
entity.datasources.iter().for_each(|ds| {
if is_first {
is_first = false;
@@ -219,7 +215,7 @@ fn write_node(entity:&SlicedEntity, all_node_props:&Vec<String>, nodes_writer:&m

fn maybe_write_edge(from_id:&[u8], prop: &SlicedProperty, db:&DB, all_edge_props:&Vec<String>, edges_writer: &mut BufWriter<File>, exclude:&HashSet<Vec<u8>>, datasources:&Vec<&[u8]>) {

if exclude.contains(prop.key) {
if prop.key.eq(b"id") || exclude.contains(prop.key) {
return;
}
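Two behavioural tweaks in this file: every node now gets a fixed GraphNode label ahead of its grebi:type labels (replacing the per-iteration is_first bookkeeping), and maybe_write_edge now skips id properties in addition to the configured exclusions. For the neo4j CSV :LABEL column, the intent is roughly:

    def label_column(types):
        # Sketch of the intended :LABEL value for the bulk importer:
        # a constant GraphNode label plus any grebi:type values,
        # ';'-separated. The real code streams bytes rather than
        # joining Python strings.
        return ";".join(["GraphNode"] + types)

    assert label_column(["impc:Allele"]) == "GraphNode;impc:Allele"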

14 changes: 7 additions & 7 deletions 06_create_db/neo4j/neo4j_import.slurm.py
@@ -19,11 +19,11 @@ def main():
with open(config_filename, 'r') as f:
config = json.load(f)

nodes = glob.glob(os.path.join(config['worker_output_dir'], "05_materialize_edges", "n4nodes_*"))
edges = glob.glob(os.path.join(config['worker_output_dir'], "05_materialize_edges", "n4edges_*"))
neo_path = os.path.join(config['worker_output_dir'], "06_create_db", "neo4j")
neo_data_path = os.path.join(config['worker_output_dir'], "06_create_db", "neo4j", "data")
neo_logs_path = os.path.join(config['worker_output_dir'], "06_create_db", "neo4j", "logs")
nodes = glob.glob(os.path.join(os.environ['GREBI_HPS_TMP'], "05_materialize_edges", "n4nodes_*"))
edges = glob.glob(os.path.join(os.environ['GREBI_HPS_TMP'], "05_materialize_edges", "n4edges_*"))
neo_path = os.path.join(os.environ['GREBI_HPS_TMP'], "06_create_db", "neo4j")
neo_data_path = os.path.join(os.environ['GREBI_HPS_TMP'], "06_create_db", "neo4j", "data")
neo_logs_path = os.path.join(os.environ['GREBI_HPS_TMP'], "06_create_db", "neo4j", "logs")

os.system('rm -rf ' + shlex.quote(neo_path))
os.makedirs(neo_data_path, exist_ok=True)
@@ -43,7 +43,7 @@ def main():
cmd = ' '.join([
'JAVA_OPTS=\'-server -Xms50g -Xmx50g\'',
'singularity run',
'--bind ' + os.path.abspath(os.path.join(config['worker_output_dir'], "05_materialize_edges")) + ':/mnt',
'--bind ' + os.path.abspath(os.path.join(os.environ['GREBI_HPS_TMP'], "05_materialize_edges")) + ':/mnt',
'--bind ' + shlex.quote(neo_data_path) + ':/data',
'--bind ' + shlex.quote(neo_logs_path) + ':/logs',
'--writable-tmpfs',
@@ -52,7 +52,7 @@
else:
cmd = ' '.join([
'docker run',
'-v ' + os.path.abspath(os.path.join(config['worker_output_dir'], "05_materialize_edges")) + ':/mnt',
'-v ' + os.path.abspath(os.path.join(os.environ['GREBI_HPS_TMP'], "05_materialize_edges")) + ':/mnt',
'-v ' + shlex.quote(neo_data_path) + ':/data',
'-v ' + shlex.quote(neo_logs_path) + ':/logs',
'neo4j:5.18.0'
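Note that the singularity and docker branches mount the same three paths — the CSV directory as /mnt plus the neo4j data and logs directories — and differ only in flag syntax (--bind vs -v). Schematically, with placeholder values:

    csv_dir, data_dir, logs_dir = "/tmp/csv", "/tmp/neo4j/data", "/tmp/neo4j/logs"  # placeholders

    binds = [(csv_dir, "/mnt"), (data_dir, "/data"), (logs_dir, "/logs")]
    singularity_args = ['--bind ' + h + ':' + c for h, c in binds]
    docker_args      = ['-v ' + h + ':' + c for h, c in binds]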
11 changes: 11 additions & 0 deletions Dockerfile.dataload
@@ -0,0 +1,11 @@


FROM rust:1.74-buster as builder

RUN apt-get update && apt-get install -y cmake clang

COPY 01* 02* 03* 04* 05* 06* prefix_maps scripts Cargo.* build.rs grebi_shared /work/

RUN cd /work && ls && chmod +x /work/scripts/*.sh
RUN cd /work && cargo build --release
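This builder stage compiles all the pipeline steps in one image. A plausible local invocation, not part of the commit, would be docker build -f Dockerfile.dataload -t grebi-dataload . — the tag name is an assumption, and since the file only defines the builder stage, the compiled binaries end up under /work/target/release inside the image.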

4 changes: 3 additions & 1 deletion README.md
@@ -6,10 +6,12 @@ EBI Codon HPC pipeline for building integrated knowledge graphs from [EMBL-EBI r
* [GWAS Catalog](https://www.ebi.ac.uk/gwas)
* [OLS](https://www.ebi.ac.uk/ols4)
* [Reactome](https://reactome.org/)
* [OpenTargets](https://www.opentargets.org/)

GrEBI also imports complementary datasets, so far:

* The [MONARCH Initiative KG](https://monarch-initiative.github.io/monarch-ingest/Sources/)
* [MONARCH Initiative KG](https://monarch-initiative.github.io/monarch-ingest/Sources/)
* [Ubergraph](https://github.com/INCATools/ubergraph)

The resulting graphs can be downloaded from https://ftp.ebi.ac.uk/pub/databases/spot/kg/

10 changes: 10 additions & 0 deletions configs/datasource_configs/impc.json
@@ -164,6 +164,16 @@
{ "name": "--json-inject-key-prefix", "value": "impc:" },
{ "name": "--json-inject-value-prefix", "value": "pmId:pmid:" }
]
},
{
"ingest_files": ["/nfs/production/parkinso/spot/jmcl/impc-kg/mouse_allele_json/*.json.gz"],
"ingest_script": "./target/release/grebi_ingest_json",
"ingest_args": [
{ "name": "--json-rename-field", "value": "mouseAlleleId:id" },
{ "name": "--json-inject-type", "value": "impc:Allele" },
{ "name": "--json-inject-key-prefix", "value": "impc:" },
{ "name": "--json-inject-value-prefix", "value": "ensemblAccId:ENSEMBL:" }
]
}
]
}
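Reading the new allele block: the flag names suggest the record's mouseAlleleId field becomes its id, an impc:Allele type is injected, remaining keys gain an impc: prefix, and ensemblAccId values gain an ENSEMBL: prefix. A toy sketch of the transformation, assuming the flags behave as their names suggest (ordering and exact output keys are guesses):

    record = {"mouseAlleleId": "MGI:123", "ensemblAccId": "ENSMUSG00000000001"}  # toy input

    out = {"id": record.pop("mouseAlleleId"),  # --json-rename-field mouseAlleleId:id
           "grebi:type": "impc:Allele"}        # --json-inject-type (output key assumed)
    for k, v in record.items():
        if k == "ensemblAccId":
            v = "ENSEMBL:" + v                 # --json-inject-value-prefix ensemblAccId:ENSEMBL:
        out["impc:" + k] = v                   # --json-inject-key-prefix impc:

    # {'id': 'MGI:123', 'grebi:type': 'impc:Allele', 'impc:ensemblAccId': 'ENSEMBL:ENSMUSG00000000001'}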
2 changes: 1 addition & 1 deletion configs/datasource_configs/ols.json
@@ -6,7 +6,7 @@
"ingest_files": ["/nfs/production/parkinso/spot/ols4/prod/slurm_pipeline/ontologies.json.gz"],
"ingest_script": "./target/release/grebi_ingest_ols",
"ingest_args": [
{ "name": "--ontologies", "value": "efo,mp,hp,go,ro,iao,uberon,pato,oba,chebi" }
{ "name": "--ontologies", "value": "efo,mp,hp,go,ro,iao,uberon,pato,oba,chebi,bspo,iao,obi,bfo,cob,cl" }
]
}
]
8 changes: 4 additions & 4 deletions configs/pipeline_configs/ebi_full.json
@@ -1,6 +1,4 @@
{
"persistent_output_dir": "/nfs/production/parkinso/spot/grebi/tmp/ebi_full",
"worker_output_dir": "/hps/nobackup/parkinso/spot/grebi/tmp/ebi_full",
"use_slurm":true,
"slurm_max_workers": {
"ingest": 100,
@@ -13,15 +11,17 @@
"assign_ids": "64G",
"merge": "16G",
"index": "64G",
"extract": "8G"
"extract": "8G",
"materialize_edges": "8:0:0"
},
"slurm_max_time": {
"ingest": "8:0:0",
"build_equiv_groups": "23:0:0",
"assign_ids": "23:0:0",
"merge": "23:0:0",
"index": "23:0:0",
"extract": "23:0:0"
"extract": "23:0:0",
"materialize_edges": "8:0:0"
},
"bytes_per_merged_file": 104857600,
"equivalence_props": [
10 changes: 5 additions & 5 deletions configs/pipeline_configs/ebi_full_monarch.json
@@ -1,7 +1,5 @@
{
"persistent_output_dir": "/nfs/production/parkinso/spot/grebi/tmp/ebi_full_monarch",
"worker_output_dir": "/hps/nobackup/parkinso/spot/grebi/tmp/ebi_full_monarch",
"use_slurm":true,
"use_slurm": true,
"slurm_max_workers": {
"ingest": 100,
"assign_ids": 100,
@@ -13,15 +11,17 @@
"assign_ids": "64G",
"merge": "16G",
"index": "64G",
"extract": "8G"
"extract": "8G",
"materialize_edges": "8:0:0"
},
"slurm_max_time": {
"ingest": "8:0:0",
"build_equiv_groups": "23:0:0",
"assign_ids": "23:0:0",
"merge": "23:0:0",
"index": "23:0:0",
"extract": "23:0:0"
"extract": "23:0:0",
"materialize_edges": "8:0:0"
},
"bytes_per_merged_file": 104857600,
"equivalence_props": [
48 changes: 48 additions & 0 deletions configs/pipeline_configs/ebi_test.json
@@ -0,0 +1,48 @@
{
"use_slurm":true,
"slurm_max_workers": {
"ingest": 100,
"assign_ids": 100,
"extract": 100
},
"slurm_max_memory": {
"ingest": "32G",
"build_equiv_groups": "64G",
"assign_ids": "64G",
"merge": "16G",
"index": "64G",
"extract": "8G",
"materialize_edges": "8:0:0"
},
"slurm_max_time": {
"ingest": "8:0:0",
"build_equiv_groups": "23:0:0",
"assign_ids": "23:0:0",
"merge": "23:0:0",
"index": "23:0:0",
"extract": "23:0:0",
"materialize_edges": "8:0:0"
},
"bytes_per_merged_file": 104857600,
"equivalence_props": [
"owl:equivalentClass",
"owl:equivalentProperty",
"owl:sameAs",
"grebi:equivalentTo",
"ols:iri",
"hgnc:ensembl_gene_id",
"obo:chebi/inchi",
"obo:chebi/inchikey",
"obo:chebi/smiles",
"impc:pmId"
],
"exclude_edges": [
"ols:iri",
"ols:shortForm",
"ols:curie",
"oboinowl:id"
],
"datasource_configs": [
"./configs/datasource_configs/ols.json"
]
}
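The new ebi_test.json mirrors the ebi_full settings but lists only the OLS datasource, giving a cheap end-to-end smoke-test configuration. The pipeline scripts shown above consume it with the usual pattern:

    import json

    with open("configs/pipeline_configs/ebi_test.json") as f:
        config = json.load(f)

    # only ./configs/datasource_configs/ols.json is listed, so a test
    # run ingests OLS alone
    print(config["use_slurm"], config["datasource_configs"])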
2 changes: 0 additions & 2 deletions configs/pipeline_configs/local_test.json
@@ -1,6 +1,4 @@
{
"persistent_output_dir": "./tmp",
"worker_output_dir": "./tmp",
"use_slurm": false,
"bytes_per_merged_file": 104857600,
"equivalence_props": [