From 32c6a94da42047658ce3c03d025394f39d885912 Mon Sep 17 00:00:00 2001 From: Evan Morris Date: Fri, 3 Apr 2026 13:46:45 -0700 Subject: [PATCH 01/14] cleaning up legacy logging util usage --- orion/build_manager.py | 87 +++++++------- orion/ingest_pipeline.py | 7 +- orion/kgx_file_merger.py | 7 +- orion/kgx_file_normalizer.py | 44 ++++---- orion/kgx_file_writer.py | 18 ++- orion/loader_interface.py | 8 +- orion/logging.py | 49 ++++++++ orion/memgraph_tools.py | 6 +- orion/merging.py | 7 +- orion/neo4j_tools.py | 6 +- orion/normalization.py | 27 ++--- orion/supplementation.py | 19 ++-- orion/utils.py | 106 +++--------------- parsers/LitCoin/src/bagel/bagel_gpt.py | 6 +- parsers/PHAROS/src/legacy_pharos_mysql.py | 5 +- parsers/PHAROS/src/loadPHAROS.py | 2 +- .../src/get_uniref_taxon_indexes.py | 5 +- parsers/ViralProteome/src/loadUniRef.py | 4 +- 18 files changed, 179 insertions(+), 234 deletions(-) create mode 100644 orion/logging.py diff --git a/orion/build_manager.py b/orion/build_manager.py index 5fb6c2ce..33afc1d6 100644 --- a/orion/build_manager.py +++ b/orion/build_manager.py @@ -8,7 +8,8 @@ from pathlib import Path from xxhash import xxh64_hexdigest -from orion.utils import LoggingUtil, GetDataPullError +from orion.utils import GetDataPullError +from orion.logging import get_orion_logger from orion.data_sources import get_available_data_sources, get_data_source_metadata_path from orion.exceptions import DataVersionError, GraphSpecError from orion.ingest_pipeline import IngestPipeline @@ -27,6 +28,8 @@ from orion.kgx_metadata import KGXGraphMetadata, KGXKnowledgeSource, generate_kgx_schema_file +logger = get_orion_logger("orion.build_manager") + NODES_FILENAME = 'nodes.jsonl' EDGES_FILENAME = 'edges.jsonl' REDUNDANT_EDGES_FILENAME = 'redundant_edges.jsonl' @@ -41,10 +44,6 @@ def __init__(self, graph_specs_dir=None, graph_output_dir=None): - self.logger = LoggingUtil.init_logging("ORION.orion.GraphBuilder", - line_format='medium', - 
log_file_path=os.getenv('ORION_LOGS')) - self.graphs_dir = graph_output_dir if graph_output_dir else self.get_graph_output_dir() self.ingest_pipeline = IngestPipeline() # access to the data sources and their metadata self.graph_specs = {} # graph_id -> GraphSpec all potential graphs that could be built, including sub-graphs @@ -54,7 +53,7 @@ def __init__(self, def build_graph(self, graph_spec: GraphSpec): graph_id = graph_spec.graph_id - self.logger.info(f'Building graph {graph_id}...') + logger.info(f'Building graph {graph_id}...') graph_version = self.determine_graph_version(graph_spec) graph_metadata = self.get_graph_metadata(graph_id, graph_version) @@ -64,28 +63,28 @@ def build_graph(self, graph_spec: GraphSpec): # check for previous builds of this same graph build_status = graph_metadata.get_build_status() if build_status == Metadata.IN_PROGRESS: - self.logger.info(f'Graph {graph_id} version {graph_version} has status: in progress. ' + logger.info(f'Graph {graph_id} version {graph_version} has status: in progress. ' f'This means either the graph is already in the process of being built, ' f'or an error occurred previously that could not be handled. ' f'You may need to clean up and/or remove the failed build.') return False if build_status == Metadata.BROKEN or build_status == Metadata.FAILED: - self.logger.info(f'Graph {graph_id} version {graph_version} previously failed to build. Skipping..') + logger.info(f'Graph {graph_id} version {graph_version} previously failed to build. 
Skipping..') return False if build_status == Metadata.STABLE: self.build_results[graph_id] = {'version': graph_version} - self.logger.info(f'Graph {graph_id} version {graph_version} was already built.') + logger.info(f'Graph {graph_id} version {graph_version} was already built.') else: # if we get here we need to build the graph - self.logger.info(f'Building graph {graph_id} version {graph_version}, checking dependencies...') + logger.info(f'Building graph {graph_id} version {graph_version}, checking dependencies...') if not self.build_dependencies(graph_spec): - self.logger.warning(f'Aborting graph {graph_spec.graph_id} version {graph_version}, building ' + logger.warning(f'Aborting graph {graph_spec.graph_id} version {graph_version}, building ' f'dependencies failed.') return False - self.logger.info(f'Building graph {graph_id} version {graph_version}. ' + logger.info(f'Building graph {graph_id} version {graph_version}. ' f'Dependencies ready, merging sources...') graph_metadata.set_build_status(Metadata.IN_PROGRESS) graph_metadata.set_graph_version(graph_version) @@ -106,52 +105,52 @@ def build_graph(self, graph_spec: GraphSpec): if "merge_error" in merge_metadata: graph_metadata.set_build_error(merge_metadata["merge_error"], current_time) graph_metadata.set_build_status(Metadata.FAILED) - self.logger.error(f'Merge error occured while building graph {graph_id}: ' + logger.error(f'Merge error occured while building graph {graph_id}: ' f'{merge_metadata["merge_error"]}') return False graph_metadata.set_build_info(merge_metadata, current_time) graph_metadata.set_build_status(Metadata.STABLE) - self.logger.info(f'Building graph {graph_id} complete!') + logger.info(f'Building graph {graph_id} complete!') self.build_results[graph_id] = {'version': graph_version} nodes_filepath = os.path.join(graph_output_dir, NODES_FILENAME) edges_filepath = os.path.join(graph_output_dir, EDGES_FILENAME) if not graph_metadata.has_qc(): - self.logger.info(f'Running QC for graph 
{graph_id}...') + logger.info(f'Running QC for graph {graph_id}...') qc_results = validate_graph(nodes_file_path=nodes_filepath, edges_file_path=edges_filepath, graph_id=graph_id, graph_version=graph_version, - logger=self.logger) + logger=logger) graph_metadata.set_qc_results(qc_results) if qc_results['pass']: - self.logger.info(f'QC passed for graph {graph_id}.') + logger.info(f'QC passed for graph {graph_id}.') else: - self.logger.warning(f'QC failed for graph {graph_id}.') + logger.warning(f'QC failed for graph {graph_id}.') # Generate KGX metadata and schema files if not self.has_kgx_metadata(graph_output_dir): - self.logger.info(f'Generating KGX metadata for {graph_id}...') + logger.info(f'Generating KGX metadata for {graph_id}...') self.generate_kgx_metadata_files(graph_metadata=graph_metadata, graph_output_dir=graph_output_dir, graph_output_url=graph_output_url) - self.logger.info(f'KGX metadata generated for {graph_id}.') + logger.info(f'KGX metadata generated for {graph_id}.') if not self.has_kgx_schema(graph_output_dir): - self.logger.info(f'Generating KGX Schema for {graph_id}...') + logger.info(f'Generating KGX Schema for {graph_id}...') generate_kgx_schema_file(nodes_filepath=nodes_filepath, edges_filepath=edges_filepath, output_dir=graph_output_dir, graph_output_url=graph_output_url, graph_name=graph_spec.graph_name, biolink_version=graph_metadata.get_biolink_version()) - self.logger.info(f'KGX Schema generated for {graph_id}.') + logger.info(f'KGX Schema generated for {graph_id}.') needs_meta_kg = not self.has_meta_kg(graph_directory=graph_output_dir) needs_test_data = not self.has_test_data(graph_directory=graph_output_dir) if needs_meta_kg or needs_test_data: - self.logger.info(f'Generating MetaKG and test data for {graph_id}...') + logger.info(f'Generating MetaKG and test data for {graph_id}...') self.generate_meta_kg_and_test_data(graph_directory=graph_output_dir, generate_meta_kg=needs_meta_kg, generate_test_data=needs_test_data) @@ -170,16 
+169,16 @@ def build_graph(self, graph_spec: GraphSpec): # combinations, like: # output_format: [['redundant', 'neo4j', 'answercoalesce'], ['collapsed_qualifiers'], ['neo4j']] if 'redundant_jsonl' in output_formats: - self.logger.info(f'Generating redundant edge KG for {graph_id}...') + logger.info(f'Generating redundant edge KG for {graph_id}...') redundant_filepath = edges_filepath.replace(EDGES_FILENAME, REDUNDANT_EDGES_FILENAME) generate_redundant_kg(edges_filepath, redundant_filepath) if 'redundant_neo4j' in output_formats: - self.logger.info(f'Generating redundant edge KG for {graph_id}...') + logger.info(f'Generating redundant edge KG for {graph_id}...') redundant_filepath = edges_filepath.replace(EDGES_FILENAME, REDUNDANT_EDGES_FILENAME) if not os.path.exists(redundant_filepath): generate_redundant_kg(edges_filepath, redundant_filepath) - self.logger.info(f'Starting Neo4j dump pipeline for redundant {graph_id}...') + logger.info(f'Starting Neo4j dump pipeline for redundant {graph_id}...') dump_success = create_neo4j_dump(nodes_filepath=nodes_filepath, edges_filepath=redundant_filepath, output_directory=graph_output_dir, @@ -192,16 +191,16 @@ def build_graph(self, graph_spec: GraphSpec): dump_url=f'{graph_output_url}graph_{graph_version}_redundant.db.dump') if 'collapsed_qualifiers_jsonl' in output_formats: - self.logger.info(f'Generating collapsed qualifier predicates KG for {graph_id}...') + logger.info(f'Generating collapsed qualifier predicates KG for {graph_id}...') collapsed_qualifiers_filepath = edges_filepath.replace(EDGES_FILENAME, COLLAPSED_QUALIFIERS_FILENAME) generate_collapsed_qualifiers_kg(edges_filepath, collapsed_qualifiers_filepath) if 'collapsed_qualifiers_neo4j' in output_formats: - self.logger.info(f'Generating collapsed qualifier predicates KG for {graph_id}...') + logger.info(f'Generating collapsed qualifier predicates KG for {graph_id}...') collapsed_qualifiers_filepath = edges_filepath.replace(EDGES_FILENAME, 
COLLAPSED_QUALIFIERS_FILENAME) if not os.path.exists(collapsed_qualifiers_filepath): generate_collapsed_qualifiers_kg(edges_filepath, collapsed_qualifiers_filepath) - self.logger.info(f'Starting Neo4j dump pipeline for {graph_id} with collapsed qualifiers...') + logger.info(f'Starting Neo4j dump pipeline for {graph_id} with collapsed qualifiers...') dump_success = create_neo4j_dump(nodes_filepath=nodes_filepath, edges_filepath=collapsed_qualifiers_filepath, output_directory=graph_output_dir, @@ -215,7 +214,7 @@ def build_graph(self, graph_spec: GraphSpec): f'_collapsed_qualifiers.db.dump') if 'neo4j' in output_formats: - self.logger.info(f'Starting Neo4j dump pipeline for {graph_id}...') + logger.info(f'Starting Neo4j dump pipeline for {graph_id}...') dump_success = create_neo4j_dump(nodes_filepath=nodes_filepath, edges_filepath=edges_filepath, output_directory=graph_output_dir, @@ -228,7 +227,7 @@ def build_graph(self, graph_spec: GraphSpec): dump_url=f'{graph_output_url}graph_{graph_version}.db.dump') if 'memgraph' in output_formats: - self.logger.info(f'Starting memgraph dump pipeline for {graph_id}...') + logger.info(f'Starting memgraph dump pipeline for {graph_id}...') dump_success = create_memgraph_dump(nodes_filepath=nodes_filepath, edges_filepath=edges_filepath, output_directory=graph_output_dir, @@ -241,7 +240,7 @@ def build_graph(self, graph_spec: GraphSpec): dump_url=f'{graph_output_url}memgraph_{graph_version}.cypher') if 'answercoalesce' in output_formats: - self.logger.info(f'Generating answercoalesce files for {graph_id}...') + logger.info(f'Generating answercoalesce files for {graph_id}...') if 'redundant_jsonl' in output_formats or 'redundant_neo4j' in output_formats: edge_filepath_to_use = edges_filepath.replace(EDGES_FILENAME, REDUNDANT_EDGES_FILENAME) else: @@ -262,7 +261,7 @@ def determine_graph_version(self, graph_spec: GraphSpec): for source in graph_spec.sources: if not source.source_version: source.source_version = 
self.ingest_pipeline.get_latest_source_version(source.id) - self.logger.info(f'Using {source.id} version: {source.version}') + logger.info(f'Using {source.id} version: {source.version}') # for sub-graphs, if a graph version isn't specified, # use the graph spec for that subgraph to determine a graph version @@ -271,7 +270,7 @@ def determine_graph_version(self, graph_spec: GraphSpec): subgraph_graph_spec = self.graph_specs.get(subgraph.id, None) if subgraph_graph_spec: subgraph.graph_version = self.determine_graph_version(subgraph_graph_spec) - self.logger.info(f'Using subgraph {graph_spec.graph_id} version: {subgraph.graph_version}') + logger.info(f'Using subgraph {graph_spec.graph_id} version: {subgraph.graph_version}') else: raise GraphSpecError(f'Subgraph {subgraph.id} requested for graph {graph_spec.graph_id} ' f'but the version was not specified and could not be determined without ' @@ -293,7 +292,7 @@ def determine_graph_version(self, graph_spec: GraphSpec): for sub_graph_source in graph_spec.subgraphs]) graph_version = xxh64_hexdigest(composite_version_string) graph_spec.graph_version = graph_version - self.logger.info(f'Version determined for graph {graph_spec.graph_id}: {graph_version} ({composite_version_string})') + logger.info(f'Version determined for graph {graph_spec.graph_id}: {graph_version} ({composite_version_string})') return graph_version def build_dependencies(self, graph_spec: GraphSpec): @@ -306,12 +305,12 @@ def build_dependencies(self, graph_spec: GraphSpec): # subgraph as generated by the current graph spec, otherwise we won't be able to build it. 
subgraph_graph_spec = self.graph_specs.get(subgraph_id, None) if not subgraph_graph_spec: - self.logger.warning(f'Subgraph {subgraph_id} version {subgraph_version} was requested for graph ' + logger.warning(f'Subgraph {subgraph_id} version {subgraph_version} was requested for graph ' f'{graph_id} but it was not found and could not be built without a Graph Spec.') return False if subgraph_version != subgraph_graph_spec.graph_version: - self.logger.error(f'Subgraph {subgraph_id} version {subgraph_version} was specified, but that ' + logger.error(f'Subgraph {subgraph_id} version {subgraph_version} was specified, but that ' f'version of the graph could not be found. It can not be built now because the ' f'current version is {subgraph_graph_spec.graph_version}. Either specify a ' f'version that is already built, or remove the subgraph version specification to ' @@ -319,7 +318,7 @@ def build_dependencies(self, graph_spec: GraphSpec): return False # here the graph specs and versions all look right, but we still need to build the subgraph - self.logger.warning(f'Graph {graph_id}, subgraph dependency {subgraph_id} is not ready. Building now..') + logger.warning(f'Graph {graph_id}, subgraph dependency {subgraph_id} is not ready. 
Building now..') subgraph_build_success = self.build_graph(subgraph_graph_spec) if not subgraph_build_success: return False @@ -333,7 +332,7 @@ def build_dependencies(self, graph_spec: GraphSpec): subgraph_edges_path = self.get_graph_edges_file_path(subgraph_dir) subgraph_source.file_paths = [subgraph_nodes_path, subgraph_edges_path] else: - self.logger.warning(f'Attempting to build graph {graph_id} failed, dependency subgraph {subgraph_id} ' + logger.warning(f'Attempting to build graph {graph_id} failed, dependency subgraph {subgraph_id} ' f'version {subgraph_version} was not built successfully.') return False @@ -344,7 +343,7 @@ def build_dependencies(self, graph_spec: GraphSpec): release_version = data_source.generate_version() release_metadata = source_metadata.get_release_info(release_version) if release_metadata is None: - self.logger.info( + logger.info( f'Attempting to build graph {graph_id}, ' f'dependency {source_id} is not ready. Building now...') pipeline_sucess = self.ingest_pipeline.run_pipeline(source_id, @@ -353,7 +352,7 @@ def build_dependencies(self, graph_spec: GraphSpec): normalization_scheme=data_source.normalization_scheme, supplementation_version=data_source.supplementation_version) if not pipeline_sucess: - self.logger.info(f'While attempting to build {graph_spec.graph_id}, ' + logger.info(f'While attempting to build {graph_spec.graph_id}, ' f'data source pipeline failed for dependency {source_id}...') return False release_metadata = source_metadata.get_release_info(release_version) @@ -391,7 +390,7 @@ def generate_meta_kg_and_test_data(self, graph_edges_file_path = os.path.join(graph_directory, EDGES_FILENAME) mkgb = MetaKnowledgeGraphBuilder(nodes_file_path=graph_nodes_file_path, edges_file_path=graph_edges_file_path, - logger=self.logger) + logger=logger) if generate_meta_kg: meta_kg_file_path = os.path.join(graph_directory, META_KG_FILENAME) mkgb.write_meta_kg_to_file(meta_kg_file_path) @@ -522,7 +521,7 @@ def load_graph_specs(self, 
graph_specs_dir=None): graph_specs_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'graph_specs') graph_spec_path = os.path.join(graph_specs_dir, graph_spec_file) if os.path.exists(graph_spec_path): - self.logger.info(f'Loading graph spec: {graph_spec_file}') + logger.info(f'Loading graph spec: {graph_spec_file}') with open(graph_spec_path) as graph_spec_file: graph_spec_yaml = yaml.safe_load(graph_spec_file) self.parse_graph_spec(graph_spec_yaml) @@ -634,7 +633,7 @@ def parse_data_source_spec(self, source_yml): source_id = source_yml['source_id'] if source_id not in get_available_data_sources(): error_message = f'Data source {source_id} is not a valid data source id.' - self.logger.error(error_message + " " + + logger.error(error_message + " " + f'Valid sources are: {", ".join(get_available_data_sources())}') raise GraphSpecError(error_message) diff --git a/orion/ingest_pipeline.py b/orion/ingest_pipeline.py index 208011ac..e80eeb7a 100644 --- a/orion/ingest_pipeline.py +++ b/orion/ingest_pipeline.py @@ -7,7 +7,8 @@ from orion.data_sources import SourceDataLoaderClassFactory, RESOURCE_HOGS, get_available_data_sources from orion.exceptions import DataVersionError -from orion.utils import LoggingUtil, GetDataPullError +from orion.utils import GetDataPullError +from orion.logging import get_orion_logger from orion.kgx_file_normalizer import KGXFileNormalizer from orion.kgx_validation import validate_graph from orion.normalization import NormalizationScheme, NodeNormalizer, EdgeNormalizer, NormalizationFailedError @@ -18,9 +19,7 @@ SOURCE_DATA_LOADER_CLASSES = SourceDataLoaderClassFactory() -logger = LoggingUtil.init_logging("ORION.orion.IngestPipeline", - line_format='medium', - log_file_path=os.getenv('ORION_LOGS')) +logger = get_orion_logger("orion.ingest_pipeline") class IngestPipeline: diff --git a/orion/kgx_file_merger.py b/orion/kgx_file_merger.py index 6e0b423a..c9f0fa47 100644 --- a/orion/kgx_file_merger.py +++ b/orion/kgx_file_merger.py 
@@ -3,15 +3,14 @@ import json from datetime import datetime from itertools import chain -from orion.utils import LoggingUtil, quick_jsonl_file_iterator +from orion.utils import quick_jsonl_file_iterator +from orion.logging import get_orion_logger from orion.kgxmodel import GraphSpec, GraphSource, SubGraphSource from orion.biolink_constants import SUBJECT_ID, OBJECT_ID from orion.merging import GraphMerger, DiskGraphMerger, MemoryGraphMerger from orion.ingest_pipeline import RESOURCE_HOGS -logger = LoggingUtil.init_logging("ORION.orion.KGXFileMerger", - line_format='medium', - log_file_path=os.getenv('ORION_LOGS')) +logger = get_orion_logger("orion.kgx_file_merger") CONNECTED_EDGE_SUBSET = 'connected_edge_subset' DONT_MERGE = 'dont_merge_edges' diff --git a/orion/kgx_file_normalizer.py b/orion/kgx_file_normalizer.py index 3e7628ce..1c55238a 100644 --- a/orion/kgx_file_normalizer.py +++ b/orion/kgx_file_normalizer.py @@ -7,7 +7,8 @@ SUBCLASS_OF, ORIGINAL_OBJECT, ORIGINAL_SUBJECT) from orion.normalization import NormalizationScheme, NodeNormalizer, EdgeNormalizer, EdgeNormalizationResult, \ NormalizationFailedError -from orion.utils import LoggingUtil, chunk_iterator +from orion.utils import chunk_iterator +from orion.logging import get_orion_logger from orion.kgx_file_writer import KGXFileWriter @@ -16,16 +17,11 @@ EDGE_NORMALIZATION_BATCH_SIZE = 1_000_000 -# -# This piece takes KGX-like files and normalizes the nodes and edges for biolink compliance. -# Then it writes the normalized nodes and edges to new files. 
-# -class KGXFileNormalizer: +logger = get_orion_logger("orion.kgx_file_normalizer") - logger = LoggingUtil.init_logging("ORION.orion.KGXFileNormalizer", - line_format='medium', - level=logging.INFO, - log_file_path=os.getenv('ORION_LOGS')) +# KGXFileNormalizer takes KGX jsonl files, normalizes nodes using Babel's Node Normalizer and converts edge +# predicates to biolink compliant curies according to biolink model predicate mappings, outputting normalized KGX files. +class KGXFileNormalizer: def __init__(self, source_nodes_file_path: str, @@ -105,7 +101,7 @@ def normalize_node_file(self): variant_nodes_split_count = 0 variant_nodes_post_norm = 0 - self.logger.info(f'Normalizing nodes and writing to file...') + logger.info(f'Normalizing nodes and writing to file...') try: with jsonlines.open(self.source_nodes_file_path) as source_json_reader,\ KGXFileWriter(nodes_output_file_path=self.nodes_output_file_path) as output_file_writer: @@ -131,7 +127,7 @@ def normalize_node_file(self): # because nodes that fail to normalize are removed from the list regular_nodes_pre_norm += len(regular_nodes) if regular_nodes: - self.logger.debug(f'Normalizing {len(regular_nodes)} regular nodes...') + logger.debug(f'Normalizing {len(regular_nodes)} regular nodes...') try: self.node_normalizer.normalize_node_data(regular_nodes) except Exception as e: @@ -139,12 +135,12 @@ def normalize_node_file(self): actual_error=e) regular_nodes_post_norm += len(regular_nodes) if regular_nodes: - self.logger.info(f'Normalized {regular_nodes_pre_norm} nodes so far...') + logger.info(f'Normalized {regular_nodes_pre_norm} nodes so far...') variant_nodes_pre_norm += len(variant_nodes) if self.has_sequence_variants: if not self.sequence_variants_pre_normalized: - self.logger.debug(f'Normalizing {len(variant_nodes)} sequence variant nodes...') + logger.debug(f'Normalizing {len(variant_nodes)} sequence variant nodes...') self.node_normalizer.normalize_sequence_variants(variant_nodes) else: # skip 
normalizing variants but still @@ -165,13 +161,13 @@ def normalize_node_file(self): variant_nodes_split_count = 0 variant_nodes_post_norm += len(variant_nodes) if variant_nodes: - self.logger.info(f'Normalized {variant_nodes_pre_norm} variant nodes so far...') + logger.info(f'Normalized {variant_nodes_pre_norm} variant nodes so far...') if regular_nodes: - self.logger.debug(f'Writing nodes to file...') + logger.debug(f'Writing nodes to file...') output_file_writer.write_normalized_nodes(regular_nodes) if variant_nodes: - self.logger.debug(f'Writing sequence variant nodes to file...') + logger.debug(f'Writing sequence variant nodes to file...') output_file_writer.write_normalized_nodes(variant_nodes) # grab the number of repeat writes from the file writer @@ -186,7 +182,7 @@ def normalize_node_file(self): f'{e.line}' raise NormalizationFailedError(error_message=norm_error_msg, actual_error=e) - self.logger.debug(f'Writing normalization map to file...') + logger.debug(f'Writing normalization map to file...') normalization_map_info = {'normalization_map': self.node_normalizer.node_normalization_lookup} with open(self.node_norm_map_file_path, "w") as node_norm_map_file: json.dump(normalization_map_info, node_norm_map_file, indent=4) @@ -195,7 +191,7 @@ def normalize_node_file(self): regular_node_norm_failures = self.node_normalizer.failed_to_normalize_ids variant_node_norm_failures = self.node_normalizer.failed_to_normalize_variant_ids if regular_node_norm_failures or variant_node_norm_failures: - self.logger.debug(f'Writing normalization failures to file...') + logger.debug(f'Writing normalization failures to file...') with open(self.node_norm_failures_file_path, "w") as failed_norm_file: for failed_node_id in regular_node_norm_failures: failed_norm_file.write(f'{failed_node_id}\n') @@ -247,7 +243,7 @@ def normalize_edge_file(self): current_edge_norm_failures = self.edge_normalizer.normalize_edge_data(edges_subset) if current_edge_norm_failures: 
edge_norm_failures.update(current_edge_norm_failures) - self.logger.error( + logger.error( f'Edge normalization service failed to return results for {edge_norm_failures}') for edge in edges_subset: @@ -263,7 +259,7 @@ def normalize_edge_file(self): else: normalized_object_ids = node_norm_lookup[edge[OBJECT_ID]] except KeyError as e: - self.logger.error(f"One of the node IDs from the edge file was missing from the normalizer look up, " + logger.error(f"One of the node IDs from the edge file was missing from the normalizer look up, " f"it's probably not in the node file. ({e})") if not (normalized_subject_ids and normalized_object_ids): edges_failed_due_to_nodes += 1 @@ -277,7 +273,7 @@ def normalize_edge_file(self): normalized_edge_properties = edge_norm_result.properties except KeyError as e: norm_error_msg = f'Edge norm lookup failure - missing {edge[PREDICATE]}!' - self.logger.error(norm_error_msg) + logger.error(norm_error_msg) raise NormalizationFailedError(error_message=norm_error_msg, actual_error=e) else: normalized_predicate = edge[PREDICATE] @@ -331,14 +327,14 @@ def normalize_edge_file(self): if edge_count > 1: edge_splits += edge_count - 1 - self.logger.info(f'Processed {number_of_source_edges} edges so far...') + logger.info(f'Processed {number_of_source_edges} edges so far...') except OSError as e: norm_error_msg = f'Error normalizing edges file {self.source_edges_file_path}' raise NormalizationFailedError(error_message=norm_error_msg, actual_error=e) try: - self.logger.debug(f'Writing predicate map to file...') + logger.debug(f'Writing predicate map to file...') edge_norm_json = {} for original_predicate, edge_normalization in edge_norm_lookup.items(): edge_norm_json[original_predicate] = edge_normalization.__dict__ diff --git a/orion/kgx_file_writer.py b/orion/kgx_file_writer.py index 01f268ca..0295dfba 100644 --- a/orion/kgx_file_writer.py +++ b/orion/kgx_file_writer.py @@ -2,18 +2,16 @@ import jsonlines import logging -from orion.utils import 
LoggingUtil +from orion.logging import get_orion_logger from orion.kgxmodel import kgxnode, kgxedge from orion.biolink_constants import PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES, \ SUBJECT_ID, OBJECT_ID, PREDICATE -class KGXFileWriter: +logger = get_orion_logger("orion.kgx_file_writer") + - logger = LoggingUtil.init_logging("ORION.orion.KGXFileWriter", - line_format='medium', - level=logging.INFO, - log_file_path=os.getenv('ORION_LOGS')) +class KGXFileWriter: """ constructor :param nodes_output_file_path: the file path for the nodes file @@ -35,7 +33,7 @@ def __init__(self, if nodes_output_file_path: if os.path.isfile(nodes_output_file_path): # TODO verify - do we really want to overwrite existing files? we could remove them on previous errors instead - self.logger.warning(f'KGXFileWriter warning.. file already existed: {nodes_output_file_path}! Overwriting it!') + logger.warning(f'KGXFileWriter warning.. file already existed: {nodes_output_file_path}! Overwriting it!') self.nodes_output_file_handler = open(nodes_output_file_path, 'w') self.nodes_jsonl_writer = jsonlines.Writer(self.nodes_output_file_handler) @@ -43,7 +41,7 @@ def __init__(self, if edges_output_file_path: if os.path.isfile(edges_output_file_path): # TODO verify - do we really want to overwrite existing files? we could remove them on previous errors instead - self.logger.warning(f'KGXFileWriter warning.. file already existed: {edges_output_file_path}! Overwriting it!') + logger.warning(f'KGXFileWriter warning.. file already existed: {edges_output_file_path}! 
Overwriting it!') self.edges_output_file_handler = open(edges_output_file_path, 'w') self.edges_jsonl_writer = jsonlines.Writer(self.edges_output_file_handler) @@ -103,7 +101,7 @@ def __write_node_to_file(self, node): self.nodes_jsonl_writer.write(node) self.nodes_written += 1 except jsonlines.InvalidLineError as e: - self.logger.error(f'KGXFileWriter: Failed to write json data: {e.line}.') + logger.error(f'KGXFileWriter: Failed to write json data: {e.line}.') raise e def write_edge(self, @@ -155,5 +153,5 @@ def __write_edge_to_file(self, edge): self.edges_jsonl_writer.write(edge) self.edges_written += 1 except jsonlines.InvalidLineError as e: - self.logger.error(f'KGXFileWriter: Failed to write json data: {e.line}.') + logger.error(f'KGXFileWriter: Failed to write json data: {e.line}.') raise e diff --git a/orion/loader_interface.py b/orion/loader_interface.py index 3c771618..2f033f55 100644 --- a/orion/loader_interface.py +++ b/orion/loader_interface.py @@ -3,7 +3,7 @@ import json import inspect from orion.kgx_file_writer import KGXFileWriter -from orion.utils import LoggingUtil +from orion.logging import get_orion_logger class SourceDataLoader: @@ -46,10 +46,8 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): self.output_file_writer: KGXFileWriter = None # create a logger - self.logger = LoggingUtil.init_logging(f"ORION.parsers.{self.get_name()}", - level=logging.INFO, - line_format='medium', - log_file_path=os.getenv('ORION_LOGS')) + # this uses an instance level logger instead of a module level because the name changes based on the ingest + self.logger = get_orion_logger(f"parsers.{self.get_name()}") def get_latest_source_version(self): """Determine and return the latest source version ie. 
a unique identifier associated with the latest version.""" diff --git a/orion/logging.py b/orion/logging.py new file mode 100644 index 00000000..f8a95513 --- /dev/null +++ b/orion/logging.py @@ -0,0 +1,49 @@ +import os +import logging +from logging.handlers import RotatingFileHandler + +from orion.config import config + + +def get_orion_logger(name): + """ + Logging utility controlling format and setting initial logging level + """ + + # get the logger with the specified name + logger = logging.getLogger(name) + + # if it already has handlers, it was already instantiated - return it + if logger.hasHandlers(): + return logger + + formatter = logging.Formatter('%(asctime)-15s - %(funcName)s(): %(message)s') + + level = logging.DEBUG if config.ORION_TEST_MODE else logging.INFO + logger.setLevel(level) + + # if ORION_LOGS is set, write logs to files there + if config.ORION_LOGS is not None: + # create a rotating file handler, 100mb max per file with a max number of 10 files + file_handler = RotatingFileHandler(filename=os.path.join(config.ORION_LOGS, name + '.log'), maxBytes=100000000, backupCount=10) + + # set the formatter + file_handler.setFormatter(formatter) + + # set the log level + file_handler.setLevel(level) + + # add the handler to the logger + logger.addHandler(file_handler) + + # create a stream handler as well (default to console/stdout) + stream_handler = logging.StreamHandler() + + # set the formatter on the console stream + stream_handler.setFormatter(formatter) + + # add the console handler to the logger + logger.addHandler(stream_handler) + + # return to the caller + return logger \ No newline at end of file diff --git a/orion/memgraph_tools.py b/orion/memgraph_tools.py index 35b9157a..c0eedef7 100644 --- a/orion/memgraph_tools.py +++ b/orion/memgraph_tools.py @@ -1,10 +1,8 @@ import os import orion.kgx_file_converter as kgx_file_converter -from orion.utils import LoggingUtil +from orion.logging import get_orion_logger -logger = 
LoggingUtil.init_logging("ORION.orion.memgraph_tools", - line_format='medium', - log_file_path=os.getenv('ORION_LOGS')) +logger = get_orion_logger("orion.memgraph_tools") def create_memgraph_dump(nodes_filepath: str, diff --git a/orion/merging.py b/orion/merging.py index 25bded66..ca41d115 100644 --- a/orion/merging.py +++ b/orion/merging.py @@ -5,7 +5,8 @@ from xxhash import xxh64_hexdigest from orion.biolink_utils import BiolinkUtils from orion.biolink_constants import * -from orion.utils import quick_json_loads, quick_json_dumps, LoggingUtil +from orion.utils import quick_json_loads, quick_json_dumps +from orion.logging import get_orion_logger ORION_UUID_NAMESPACE = uuid.UUID('e2a5b21f-4e4d-4a6e-b64a-1f3c78e2a9d0') @@ -15,9 +16,7 @@ # TODO ideally we'd make the biolink model version configurable here bmt = BiolinkUtils() -logger = LoggingUtil.init_logging("ORION.orion.merging", - line_format='medium', - log_file_path=os.getenv('ORION_LOGS')) +logger = get_orion_logger("orion.merging") # Key functions for identifying duplicates during entity merging. # Add entries to CUSTOM_KEY_FUNCTIONS to define custom matching logic for specific properties. 
diff --git a/orion/neo4j_tools.py b/orion/neo4j_tools.py index 300890b3..75890c0c 100644 --- a/orion/neo4j_tools.py +++ b/orion/neo4j_tools.py @@ -4,12 +4,10 @@ import subprocess import orion.kgx_file_converter as kgx_file_converter from orion.biolink_constants import NAMED_THING -from orion.utils import LoggingUtil +from orion.logging import get_orion_logger -logger = LoggingUtil.init_logging("ORION.orion.neo4j_tools", - line_format='medium', - log_file_path=os.getenv('ORION_LOGS')) +logger = get_orion_logger("orion.neo4j_tools") class Neo4jTools: diff --git a/orion/normalization.py b/orion/normalization.py index 750c1175..ad33a300 100644 --- a/orion/normalization.py +++ b/orion/normalization.py @@ -8,7 +8,9 @@ from robokop_genetics.genetics_normalization import GeneticsNormalizer from orion.biolink_constants import * -from orion.utils import LoggingUtil +from orion.logging import get_orion_logger + +logger = get_orion_logger("orion.normalization") NORMALIZATION_CODE_VERSION = '1.4' @@ -66,7 +68,6 @@ class NodeNormalizer: """ def __init__(self, - log_level=logging.INFO, node_normalization_version: str = 'latest', biolink_version: str = 'latest', strict_normalization: bool = True, @@ -74,14 +75,8 @@ def __init__(self, include_taxa: bool = False): """ constructor - :param log_level - overrides default log level :param node_normalization_version - not implemented yet """ - # create a logger - self.logger = LoggingUtil.init_logging("ORION.orion.NodeNormalizer", - level=log_level, - line_format='medium', - log_file_path=os.getenv('ORION_LOGS')) # storage for regular nodes that failed to normalize self.failed_to_normalize_ids = set() # storage for variant nodes that failed to normalize @@ -121,7 +116,7 @@ def hit_node_norm_service(self, curies, retries=0): raise NormalizationFailedError(error_message=error_message) else: error_message = f'Node norm response code: {resp.status_code} (curies: {curies})' - self.logger.error(error_message) + logger.error(error_message) 
resp.raise_for_status() def normalize_node_data(self, node_list: list, batch_size: int = 5000) -> list: @@ -282,7 +277,7 @@ def normalize_node_data(self, node_list: list, batch_size: int = 5000) -> list: if self.strict_normalization: node_list[:] = [d for d in node_list if d is not None] - self.logger.debug(f'End of normalize_node_data.') + logger.debug(f'End of normalize_node_data.') # return the failed list to the caller return failed_to_normalize @@ -390,14 +385,10 @@ class EdgeNormalizer: DEFAULT_EDGE_NORM_ENDPOINT = f'https://bl-lookup-sri.renci.org/' def __init__(self, - edge_normalization_version: str = 'latest', - log_level=logging.INFO): + edge_normalization_version: str = 'latest'): """ constructor - :param log_level - overrides default log level """ - # create a logger - self.logger = LoggingUtil.init_logging("ORION.orion.EdgeNormalizer", level=log_level, line_format='medium', log_file_path=os.getenv('ORION_LOGS')) # normalization map for future look up of all normalized predicates self.edge_normalization_lookup = {} self.cached_edge_norms = {} @@ -458,7 +449,7 @@ def normalize_edge_data(self, # hit the edge normalization service request_url = f'{self.edge_norm_endpoint}resolve_predicate?version={self.edge_norm_version}&predicate=' request_url += '&predicate='.join(predicate_chunk) - self.logger.debug(f'Sending request: {request_url}') + logger.debug(f'Sending request: {request_url}') resp: requests.models.Response = requests.get(request_url) # if we get a success status code @@ -473,7 +464,7 @@ def normalize_edge_data(self, else: # this is a real error with the edge normalizer so we bail error_message = f'Edge norm response code: {resp.status_code}' - self.logger.error(error_message) + logger.error(error_message) resp.raise_for_status() # move on down the list @@ -505,7 +496,7 @@ def normalize_edge_data(self, # if something failed to normalize output it # if failed_to_normalize: - # self.logger.error(f'Failed to normalize: {", 
".join(failed_to_normalize)}') + # logger.error(f'Failed to normalize: {", ".join(failed_to_normalize)}') # return the failed list to the caller return failed_to_normalize diff --git a/orion/supplementation.py b/orion/supplementation.py index 9f3a7ff9..4d17bf79 100644 --- a/orion/supplementation.py +++ b/orion/supplementation.py @@ -9,10 +9,12 @@ from collections import defaultdict from orion.biolink_constants import * from orion.normalization import FALLBACK_EDGE_PREDICATE, NormalizationScheme -from orion.utils import LoggingUtil +from orion.logging import get_orion_logger from orion.kgx_file_writer import KGXFileWriter from orion.kgx_file_normalizer import KGXFileNormalizer +logger = get_orion_logger("orion.supplementation") + SNPEFF_PROVENANCE = "infores:robokop-snpeff" # These are terms from Sequence Ontology that SNPEFF uses for annotations. SNPEFF doesn't provide the SO identifiers, @@ -61,15 +63,12 @@ class SequenceVariantSupplementation: def __init__(self, output_dir="."): - self.logger = LoggingUtil.init_logging("ORION.orion.SequenceVariantSupplementation", - line_format='medium', - log_file_path=os.getenv('ORION_LOGS')) workspace_dir = os.getenv("ORION_STORAGE", output_dir) # if the snpEff dir exists, assume we already downloaded it self.snpeff_dir = path.join(workspace_dir, "snpEff") if not path.isdir(self.snpeff_dir): - self.logger.info('SNPEFF not found, downloading and installing..') + logger.info('SNPEFF not found, downloading and installing..') # TODO # Snpeff is building their latest versions with Java 21 which is not compatible with the docker @@ -98,15 +97,15 @@ def find_supplemental_data(self, workspace_dir = supp_nodes_norm_file_path.rsplit("/", 1)[0] vcf_file_path = f'{workspace_dir}/variants.vcf' - self.logger.info('Creating VCF file from source nodes..') + logger.info('Creating VCF file from source nodes..') self.create_vcf_from_variant_nodes(nodes_file_path, vcf_file_path) - self.logger.info('Running SNPEFF, creating annotated VCF..') + 
logger.info('Running SNPEFF, creating annotated VCF..') annotated_vcf_path = f'{workspace_dir}/variants_ann.vcf' self.run_snpeff(vcf_file_path, annotated_vcf_path) - self.logger.info('Converting annotated VCF to KGX File..') + logger.info('Converting annotated VCF to KGX File..') supplementation_metadata = self.convert_snpeff_to_kgx(annotated_vcf_path, supp_nodes_file_path, supp_edges_file_path) @@ -114,7 +113,7 @@ def find_supplemental_data(self, os.remove(vcf_file_path) os.remove(annotated_vcf_path) - self.logger.info('Normalizing Supplemental KGX File..') + logger.info('Normalizing Supplemental KGX File..') file_normalizer = KGXFileNormalizer(source_nodes_file_path=supp_nodes_file_path, nodes_output_file_path=supp_nodes_norm_file_path, node_norm_map_file_path=supp_node_norm_map_file_path, @@ -148,7 +147,7 @@ def run_snpeff(self, if snpeff_results.returncode != 0: error_message = f'SNPEFF subprocess error (ExitCode {snpeff_results.returncode}): ' \ f'{snpeff_results.stderr.decode("UTF-8")}' - self.logger.error(error_message) + logger.error(error_message) raise SupplementationFailedError(error_message) def convert_snpeff_to_kgx(self, diff --git a/orion/utils.py b/orion/utils.py index c3a97f7f..843bcb19 100644 --- a/orion/utils.py +++ b/orion/utils.py @@ -1,6 +1,4 @@ import os -import logging -import tarfile import gzip import requests import orjson @@ -14,82 +12,9 @@ from csv import DictReader from ftplib import FTP from datetime import datetime -from logging.handlers import RotatingFileHandler +from orion.logging import get_orion_logger - -class LoggingUtil(object): - """ - creates and configures a logger - """ - @staticmethod - def init_logging(name, level=logging.INFO, line_format='minimum', log_file_path=None): - """ - Logging utility controlling format and setting initial logging level - """ - - # get the logger with the specified name - logger = logging.getLogger(name) - - # if it already has handlers, it was already instantiated - return it - if 
logger.hasHandlers(): - return logger - - # define the various output formats - format_type = { - "minimum": '%(message)s', - "short": '%(funcName)s(): %(message)s', - "medium": '%(asctime)-15s - %(funcName)s(): %(message)s', - "long": '%(asctime)-15s - %(filename)s %(funcName)s() %(levelname)s: %(message)s' - }[line_format] - - # create a formatter - formatter = logging.Formatter(format_type) - - # set the logging level - if os.getenv('ORION_TEST_MODE'): - level = logging.DEBUG - logger.setLevel(level) - - # if there was a file path passed in use it - if log_file_path is not None: - # create a rotating file handler, 100mb max per file with a max number of 10 files - file_handler = RotatingFileHandler(filename=os.path.join(log_file_path, name + '.log'), maxBytes=100000000, backupCount=10) - - # set the formatter - file_handler.setFormatter(formatter) - - # set the log level - file_handler.setLevel(level) - - # add the handler to the logger - logger.addHandler(file_handler) - - # create a stream handler as well (default to console) - stream_handler = logging.StreamHandler() - - # set the formatter on the console stream - stream_handler.setFormatter(formatter) - - # add the console handler to the logger - logger.addHandler(stream_handler) - - # return to the caller - return logger - - @staticmethod - def print_debug_msg(msg: str): - """ - Adds a timestamp to a printed message - - :param msg: the message that gets appended onto a timestamp and output to console - :return: None - """ - - # get the timestamp - now: datetime = datetime.now() - - # output the text - print(f'{now.strftime("%Y/%m/%d %H:%M:%S")} - {msg}') +logger = get_orion_logger("orion.utils") class GetDataPullError(Exception): @@ -102,13 +27,10 @@ class GetData: Class that contains methods that can be used to get various data sets. 
""" - def __init__(self, log_level=logging.INFO): + def __init__(self): """ constructor - :param log_level - overrides default log level """ - # create a logger - self.logger = LoggingUtil.init_logging("ORION.orion.GetData", level=log_level, line_format='medium', log_file_path=os.getenv('ORION_LOGS')) @staticmethod def pull_via_ftp_binary(ftp_site, ftp_dir, ftp_file): @@ -183,7 +105,7 @@ def get_ftp_file_date(self, ftp_site, ftp_dir, ftp_file, exclude_day=False) -> s except Exception as e: error_message = f'Error getting modification date for ftp file: {ftp_site}{ftp_dir}{ftp_file}. {e}' - self.logger.error(error_message) + logger.error(error_message) raise GetDataPullError(error_message) def pull_via_ftp(self, ftp_site: str, ftp_dir: str, ftp_files: list, data_file_path: str) -> int: @@ -212,7 +134,7 @@ def pull_via_ftp(self, ftp_site: str, ftp_dir: str, ftp_files: list, data_file_p # for each file requested for f in ftp_files: - self.logger.debug(f'Retrieving {ftp_site}{ftp_dir}{f} -> {data_file_path}') + logger.debug(f'Retrieving {ftp_site}{ftp_dir}{f} -> {data_file_path}') # does the file exist and has data in it try: @@ -233,15 +155,15 @@ def pull_via_ftp(self, ftp_site: str, ftp_dir: str, ftp_files: list, data_file_p # progress output if file_counter % 50 == 0: - self.logger.debug(f'{file_counter} files retrieved, {len(ftp_files) - file_counter} to go.') + logger.debug(f'{file_counter} files retrieved, {len(ftp_files) - file_counter} to go.') - self.logger.debug(f'{file_counter} file(s) retrieved of {len(ftp_files)} requested.') + logger.debug(f'{file_counter} file(s) retrieved of {len(ftp_files)} requested.') # close the ftp object ftp.quit() except Exception as e: error_message = f'GetDataPullError pull_via_ftp() failed for {ftp_site}. 
Exception: {e}' - self.logger.error(error_message) + logger.error(error_message) raise GetDataPullError(error_message) # return pass/fail to the caller @@ -257,7 +179,7 @@ def get_http_file_modified_date(self, file_url: str): return modified_datetime.strftime("%-m_%-d_%Y") except Exception as e: error_message = f'Error getting modification date for http file: {file_url}. {repr(e)}-{e}' - self.logger.error(error_message) + logger.error(error_message) raise GetDataPullError(error_message) def pull_via_http(self, url: str, data_dir: str, is_gzip=False, saved_file_name: str = None) -> int: @@ -286,7 +208,7 @@ def pull_via_http(self, url: str, data_dir: str, is_gzip=False, saved_file_name: # check if the file exists already if not os.path.exists(os.path.join(data_dir, data_file)): - self.logger.debug(f'Retrieving {url} -> {data_dir}') + logger.debug(f'Retrieving {url} -> {data_dir}') try: hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'} req = request.Request(url, headers=hdr) @@ -314,7 +236,7 @@ def pull_via_http(self, url: str, data_dir: str, is_gzip=False, saved_file_name: fp.write(buffer) except Exception as e: error_message = f'GetDataPullError pull_via_http() failed. URL: {url}. 
Exception: {e}' - self.logger.error(error_message) + logger.error(error_message) raise GetDataPullError(error_message) else: @@ -333,7 +255,7 @@ def get_swiss_prot_id_set(self, data_dir: str, debug_mode=False) -> set: :return: a set of uniprot kb ids """ - self.logger.debug('Start of swiss-prot curated uniprot id retrieval') + logger.debug('Start of swiss-prot curated uniprot id retrieval') # init the return value ret_val: set = set() @@ -362,11 +284,11 @@ def get_swiss_prot_id_set(self, data_dir: str, debug_mode=False) -> set: ret_val.add(item.strip(';\n')) # do not remove the file if in debug mode - # if self.logger.level != logging.DEBUG and not debug_mode: + # if logger.level != logging.DEBUG and not debug_mode: # # remove the target file # os.remove(os.path.join(data_dir, data_file_name)) - self.logger.debug(f'End of swiss-prot uniprot id retrieval. {len(ret_val)} retrieved.') + logger.debug(f'End of swiss-prot uniprot id retrieval. {len(ret_val)} retrieved.') # return the list return ret_val diff --git a/parsers/LitCoin/src/bagel/bagel_gpt.py b/parsers/LitCoin/src/bagel/bagel_gpt.py index 843ba4d3..c38ec552 100644 --- a/parsers/LitCoin/src/bagel/bagel_gpt.py +++ b/parsers/LitCoin/src/bagel/bagel_gpt.py @@ -3,16 +3,14 @@ from collections import defaultdict from orion.config import CONFIG -from orion.utils import LoggingUtil +from orion.logging import get_orion_logger OPENAI_API_KEY = CONFIG.get("OPENAI_API_KEY") LLM_RESULTS = [] -logger = LoggingUtil.init_logging("ORION.orion.BagelGPT", - line_format='medium', - log_file_path=os.getenv('ORION_LOGS')) +logger = get_orion_logger("orion.bagel_gpt") def ask_classes_and_descriptions(text, term, termlist, abstract_id, requests_session): """Get GPT results based only on the labels of the terms.""" diff --git a/parsers/PHAROS/src/legacy_pharos_mysql.py b/parsers/PHAROS/src/legacy_pharos_mysql.py index fc354bef..a5961be8 100644 --- a/parsers/PHAROS/src/legacy_pharos_mysql.py +++ 
b/parsers/PHAROS/src/legacy_pharos_mysql.py @@ -1,12 +1,13 @@ import os import mysql.connector import logging -from orion.utils import LoggingUtil, GetData, NodeNormUtils, EdgeNormUtils +from orion.utils import GetData, NodeNormUtils, EdgeNormUtils +from orion.logging import get_orion_logger from pathlib import Path # create a logger -logger = LoggingUtil.init_logging("ORION.PHAROS.PHAROSLoader", line_format='medium', log_file_path=os.path.join(Path(__file__).parents[2], 'logs')) +logger = get_orion_logger("parsers.pharos") class PharosMySQL(): def __init__(self, context): diff --git a/parsers/PHAROS/src/loadPHAROS.py b/parsers/PHAROS/src/loadPHAROS.py index 44a95f17..e3449523 100644 --- a/parsers/PHAROS/src/loadPHAROS.py +++ b/parsers/PHAROS/src/loadPHAROS.py @@ -108,7 +108,7 @@ def get_latest_source_version(self) -> str: return 'v6_13_4' def get_data(self): - gd: GetData = GetData(self.logger.level) + gd: GetData = GetData() byte_count: int = gd.pull_via_http(f'{self.data_url}{self.data_file}', self.data_path) if not byte_count: diff --git a/parsers/ViralProteome/src/get_uniref_taxon_indexes.py b/parsers/ViralProteome/src/get_uniref_taxon_indexes.py index aa681b58..5f46ae78 100644 --- a/parsers/ViralProteome/src/get_uniref_taxon_indexes.py +++ b/parsers/ViralProteome/src/get_uniref_taxon_indexes.py @@ -1,11 +1,12 @@ import os import argparse # from parsers.ViralProteome.src.loadUniRef import UniRefSimLoader -from orion.utils import LoggingUtil, GetData +from orion.utils import GetData +from orion.logging import get_orion_logger from pathlib import Path # create a logger -logger = LoggingUtil.init_logging("ORION.ViralProteome.get_uniref_taxon_indexes", line_format='medium', log_file_path=os.path.join(Path(__file__).parents[2], 'logs')) +logger = get_orion_logger("parsers.get_uniref_taxon_indexes") if __name__ == '__main__': diff --git a/parsers/ViralProteome/src/loadUniRef.py b/parsers/ViralProteome/src/loadUniRef.py index be5181bf..e8d019f3 100644 --- 
a/parsers/ViralProteome/src/loadUniRef.py +++ b/parsers/ViralProteome/src/loadUniRef.py @@ -5,7 +5,7 @@ import datetime from xml.etree import ElementTree as ETree -from orion.utils import LoggingUtil, GetData +from orion.utils import GetData from orion.kgx_file_writer import KGXFileWriter from orion.loader_interface import SourceDataLoader @@ -81,7 +81,7 @@ def get_uniref_data(self) -> set: """ # get a reference to the get data util class - gd: GetData = GetData(self.logger.level) + gd: GetData = GetData() # are we in test mode if not self.test_mode: From c5fcf13f65da025b5edd0db07f994fd70dea032a Mon Sep 17 00:00:00 2001 From: Evan Morris Date: Fri, 3 Apr 2026 13:59:35 -0700 Subject: [PATCH 02/14] reorganizing .env and set_up_test_env script --- .env | 5 ---- .env.example | 59 ++++++++++++++++++++++++++++++++++++++++++++++ README.md | 2 +- docs/ORION.ipynb | 8 +++++-- set_up_dev_env.sh | 17 +++++++++++++ set_up_test_env.sh | 40 ------------------------------- 6 files changed, 83 insertions(+), 48 deletions(-) delete mode 100644 .env create mode 100644 .env.example create mode 100644 set_up_dev_env.sh delete mode 100644 set_up_test_env.sh diff --git a/.env b/.env deleted file mode 100644 index da0872f7..00000000 --- a/.env +++ /dev/null @@ -1,5 +0,0 @@ -OPENAI_API_KEY=fake-key-do-not-commit-a-real-one!!! -OPENAI_API_ORGANIZATION=fake-org-do-not-commit-a-real-one!!! -BAGEL_SERVICE_USERNAME=fake-username-do-not-commit-a-real-one!!! -BAGEL_SERVICE_PASSWORD=fake-password-do-not-commit-a-real-one!!! 
-SHARED_SOURCE_DATA_PATH=/tmp/shared_data diff --git a/.env.example b/.env.example new file mode 100644 index 00000000..2165f415 --- /dev/null +++ b/.env.example @@ -0,0 +1,59 @@ +# ---- Storage & Output ---- + +# Directory for source data downloads and ingest pipeline files +# ORION_STORAGE= + +# Directory for final graph releases +# ORION_GRAPHS= + +# Directory for log files (if unset, logs go to stdout only) +# ORION_LOGS= + +# Base URL used when generating graph metadata +# ORION_OUTPUT_URL=https://localhost/ + +# ---- Graph Spec ---- + +# Local graph spec filename (set one of ORION_GRAPH_SPEC or ORION_GRAPH_SPEC_URL, not both) +# ORION_GRAPH_SPEC=example-graph-spec.yaml + +# URL pointing to a remote graph spec file +# ORION_GRAPH_SPEC_URL= + +# ---- Mode ---- + +# Enable test/debug mode (sets log level to DEBUG and runs ingests with a smaller subset of data if possible) +# ORION_TEST_MODE=false + +# ---- Normalization Endpoints ---- + +# Edge normalization / BioLink Lookup endpoint +# EDGE_NORMALIZATION_ENDPOINT=https://bl-lookup-sri.renci.org/ + +# Node normalization endpoint +# NODE_NORMALIZATION_ENDPOINT=https://nodenormalization-sri.renci.org/ + +# ---- LitCoin / Bagel (may be removed in the future) ---- + +# Name resolution service URL +# NAMERES_URL=https://name-resolution-sri.renci.org/ + +# SapBERT service URL +# SAPBERT_URL=https://babel-sapbert.apps.renci.org/ + +# Shared source data path for LitCoin pipeline +# SHARED_SOURCE_DATA_PATH=/tmp/shared_data + +# LitCoin predicate mapping service URL +# LITCOIN_PRED_MAPPING_URL=https://pred-mapping.apps.renci.org/ + +# Bagel service endpoint +# BAGEL_ENDPOINT=https://bagel.apps.renci.org/ + +# Bagel service credentials +# BAGEL_SERVICE_USERNAME= +# BAGEL_SERVICE_PASSWORD= + +# OpenAI credentials for LitCoin GPT features +# OPENAI_API_KEY= +# OPENAI_API_ORGANIZATION= \ No newline at end of file diff --git a/README.md b/README.md index 41fa5ec3..6fbb6740 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 
@@ ORION uses three directories for its data, configured via environment variables: You can set these up manually or use the provided script: ```bash -source ./set_up_test_env.sh +source ./set_up_dev_env.sh ``` #### Graph Spec diff --git a/docs/ORION.ipynb b/docs/ORION.ipynb index 46c44dc7..5d5fce00 100644 --- a/docs/ORION.ipynb +++ b/docs/ORION.ipynb @@ -85,7 +85,11 @@ { "cell_type": "code", "id": "g6i460bvtda", - "source": "%%bash\ncd ~/ORION_root/ORION/\nsource ./set_up_test_env.sh", + "source": [ + "%%bash\n", + "cd ~/ORION_root/ORION/\n", + "source ./set_up_dev_env.sh" + ], "metadata": {}, "execution_count": null, "outputs": [] @@ -130,4 +134,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/set_up_dev_env.sh b/set_up_dev_env.sh new file mode 100644 index 00000000..f824f091 --- /dev/null +++ b/set_up_dev_env.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +# ORION requires directories to store data ingest files and graph outputs. +# This script creates those directories and sets the environment variables pointing to them. +# See the README for more information. + +# ORION_STORAGE - a directory for storing ingest pipeline files +mkdir -p "$PWD/../ORION_storage" +export ORION_STORAGE="$PWD/../ORION_storage/" + +# ORION_GRAPHS - a directory for storing knowledge graph outputs +mkdir -p "$PWD/../ORION_graphs" +export ORION_GRAPHS="$PWD/../ORION_graphs/" + +# ORION_LOGS - a directory for storing logs +mkdir -p "$PWD/../ORION_logs" +export ORION_LOGS="$PWD/../ORION_logs/" diff --git a/set_up_test_env.sh b/set_up_test_env.sh deleted file mode 100644 index 46ed120c..00000000 --- a/set_up_test_env.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash - -#These environment variables are required by Data Services. See the README for more information. 
- -#ORION_STORAGE - a directory for storing data sources -mkdir -p "$PWD/../ORION_storage" -export ORION_STORAGE="$PWD/../ORION_storage/" - -#ORION_GRAPHS - a directory for storing knowledge graphs -mkdir -p "$PWD/../ORION_graphs" -export ORION_GRAPHS="$PWD/../ORION_graphs/" - -#ORION_LOGS - a directory for storing logs -mkdir -p "$PWD/../ORION_logs" -export ORION_LOGS="$PWD/../ORION_logs/" - -#Use EITHER of the following, ORION_GRAPH_SPEC or ORION_GRAPH_SPEC_URL - -#ORION_GRAPH_SPEC - the name of a Graph Spec file located in the graph_specs directory of ORION -export ORION_GRAPH_SPEC=example-graph-spec.yaml - -#ORION_GRAPH_SPEC_URL - a URL pointing to a Graph Spec file -#export ORION_GRAPH_SPEC_URL=https://raw.githubusercontent.com/RENCI-AUTOMAT/ORION/helm_deploy/graph_specs/yeast-graph-spec.yml - -export PYTHONPATH="$PYTHONPATH:$PWD" - -# The following environment variables are optional -# -# export EDGE_NORMALIZATION_ENDPOINT=https://bl-lookup-sri.renci.org/ -# export NODE_NORMALIZATION_ENDPOINT=https://nodenormalization-sri.renci.org/ -# export NAMERES_URL=https://name-resolution-sri.renci.org/ -# export SAPBERT_URL=https://babel-sapbert.apps.renci.org/ -# export LITCOIN_PRED_MAPPING_URL=https://pred-mapping.apps.renci.org/ - -# export ORION_OUTPUT_URL=https://localhost/ # this is currently only used to generate metadata -# export BL_VERSION=4.2.1 - -# if you are building your own docker image and issues occur, setting the correct platform may help -# export DOCKER_PLATFORM=linux/arm64 - From e60309644b2720e7ee9d7be10f2c95edd7fbd0b0 Mon Sep 17 00:00:00 2001 From: Evan Morris Date: Thu, 2 Apr 2026 16:12:38 -0700 Subject: [PATCH 03/14] use pydantic settings config for env vars --- orion/biolink_utils.py | 4 ++- orion/build_manager.py | 9 ++--- orion/config.py | 40 ++++++++++++++++++---- orion/ingest_pipeline.py | 17 ++++----- orion/loader_interface.py | 3 +- orion/neo4j_tools.py | 5 ++- orion/normalization.py | 12 +++---- orion/supplementation.py | 9 +++-- 
parsers/LitCoin/src/NER/nameres.py | 4 +-- parsers/LitCoin/src/NER/sapbert.py | 4 +-- parsers/LitCoin/src/bagel/bagel_gpt.py | 4 +-- parsers/LitCoin/src/bagel/bagel_service.py | 18 ++++------ parsers/LitCoin/src/loadLitCoin.py | 5 +-- pyproject.toml | 2 +- uv.lock | 18 ++++++++-- 15 files changed, 97 insertions(+), 57 deletions(-) diff --git a/orion/biolink_utils.py b/orion/biolink_utils.py index 011f9f96..28e5b178 100644 --- a/orion/biolink_utils.py +++ b/orion/biolink_utils.py @@ -6,7 +6,9 @@ from requests.adapters import HTTPAdapter, Retry from functools import cache -BIOLINK_MODEL_VERSION = os.environ.get("BL_VERSION", "v4.3.4") +from orion.config import config + +BIOLINK_MODEL_VERSION = config.BL_VERSION def get_biolink_model_toolkit(biolink_version: str = None) -> Toolkit: version = biolink_version if biolink_version else BIOLINK_MODEL_VERSION diff --git a/orion/build_manager.py b/orion/build_manager.py index 33afc1d6..505e0950 100644 --- a/orion/build_manager.py +++ b/orion/build_manager.py @@ -10,6 +10,7 @@ from orion.utils import GetDataPullError from orion.logging import get_orion_logger +from orion.config import config from orion.data_sources import get_available_data_sources, get_data_source_metadata_path from orion.exceptions import DataVersionError, GraphSpecError from orion.ingest_pipeline import IngestPipeline @@ -508,8 +509,8 @@ def generate_kgx_metadata_files(self, f.write(kgx_graph_metadata.to_json()) def load_graph_specs(self, graph_specs_dir=None): - graph_spec_file = os.getenv('ORION_GRAPH_SPEC') - graph_spec_url = os.getenv('ORION_GRAPH_SPEC_URL') + graph_spec_file = config.ORION_GRAPH_SPEC + graph_spec_url = config.ORION_GRAPH_SPEC_URL if graph_spec_file and graph_spec_url: raise GraphSpecError(f'Configuration Error - the environment variables ORION_GRAPH_SPEC and ' @@ -685,7 +686,7 @@ def get_graph_dir_path(self, graph_id: str, graph_version: str): @staticmethod def get_graph_output_url(graph_id: str, graph_version: str): - graph_output_url 
= os.environ.get('ORION_OUTPUT_URL', "https://localhost/").removesuffix('/') + graph_output_url = config.ORION_OUTPUT_URL.removesuffix('/') return f'{graph_output_url}/{graph_id}/{graph_version}/' @@ -712,7 +713,7 @@ def get_graph_metadata(self, graph_id: str, graph_version: str): @staticmethod def get_graph_output_dir(): # confirm the directory specified by the environment variable ORION_GRAPHS is valid - graphs_dir = os.getenv('ORION_GRAPHS') + graphs_dir = config.ORION_GRAPHS if graphs_dir and Path(graphs_dir).is_dir(): return graphs_dir diff --git a/orion/config.py b/orion/config.py index f5ec3b7d..2cb314ff 100644 --- a/orion/config.py +++ b/orion/config.py @@ -1,10 +1,38 @@ -import os from pathlib import Path -from dotenv import dotenv_values +from pydantic_settings import BaseSettings, SettingsConfigDict -CONFIG = { - **dotenv_values(Path(__file__).parents[1] / '.env'), # load config variables from .env - **os.environ, # override loaded values with environment variables -} +class Config(BaseSettings): + model_config = SettingsConfigDict( + env_file=Path(__file__).parent.parent/".env", + env_file_encoding="utf-8", + env_ignore_empty=True + ) + ORION_STORAGE: str | None = None + ORION_GRAPHS: str | None = None + ORION_LOGS: str | None = None + + ORION_OUTPUT_URL: str = "https://localhost" + ORION_TEST_MODE: bool = False + + ORION_GRAPH_SPEC: str = "example-graph-spec.yaml" + ORION_GRAPH_SPEC_URL: str = "" + + BL_VERSION: str = "v4.3.4" + + EDGE_NORMALIZATION_ENDPOINT: str = "https://bl-lookup-sri.renci.org/" + NODE_NORMALIZATION_ENDPOINT: str = "https://nodenormalization-sri.renci.org/" + + # the following were used for the LitCoin project and may be removed in the future + NAMERES_URL: str = "https://name-resolution-sri.renci.org" + SAPBERT_URL: str = "https://babel-sapbert.apps.renci.org" + SHARED_SOURCE_DATA_PATH: str = "/tmp/shared_data" + LITCOIN_PRED_MAPPING_URL: str = "https://pred-mapping.apps.renci.org" + BAGEL_ENDPOINT: str = 
"https://bagel.apps.renci.org" + BAGEL_SERVICE_USERNAME: str | None = None + BAGEL_SERVICE_PASSWORD: str | None = None + OPENAI_API_KEY: str | None = None + OPENAI_API_ORGANIZATION: str | None = None + +config = Config() \ No newline at end of file diff --git a/orion/ingest_pipeline.py b/orion/ingest_pipeline.py index e80eeb7a..1f0a41c7 100644 --- a/orion/ingest_pipeline.py +++ b/orion/ingest_pipeline.py @@ -9,6 +9,7 @@ from orion.exceptions import DataVersionError from orion.utils import GetDataPullError from orion.logging import get_orion_logger +from orion.config import config from orion.kgx_file_normalizer import KGXFileNormalizer from orion.kgx_validation import validate_graph from orion.normalization import NormalizationScheme, NodeNormalizer, EdgeNormalizer, NormalizationFailedError @@ -698,14 +699,13 @@ def init_storage_dir(storage_dir: str=None): raise IOError(f'Storage directory not valid: {storage_dir}') # otherwise use the storage directory specified by the environment variable ORION_STORAGE # check to make sure it's set and valid, otherwise fail - storage_dir_from_env = os.getenv("ORION_STORAGE") - if storage_dir_from_env is None: + if config.ORION_STORAGE is None: raise Exception(f'No storage directory was specified. 
You must either provide a path programmatically or ' f'use the environment variable ORION_STORAGE to configure a storage directory.') - if os.path.isdir(storage_dir_from_env): - return storage_dir_from_env + if os.path.isdir(config.ORION_STORAGE): + return config.ORION_STORAGE else: - raise IOError(f'Storage directory not valid: {storage_dir_from_env}') + raise IOError(f'Storage directory not valid: {config.ORION_STORAGE}') def init_source_output_dir(self, source_id: str): source_dir_path = os.path.join(self.storage_dir, source_id) @@ -730,12 +730,7 @@ def main(): 'in the finalized kgx files.') args = parser.parse_args() - if 'ORION_TEST_MODE' in os.environ: - test_mode_from_env = os.environ['ORION_TEST_MODE'] - else: - test_mode_from_env = False - - loader_test_mode = args.test_mode or test_mode_from_env + loader_test_mode = args.test_mode or config.ORION_TEST_MODE loader_strict_normalization = (not args.lenient_normalization) ingest_pipeline = IngestPipeline(test_mode=loader_test_mode, fresh_start_mode=args.fresh_start_mode) diff --git a/orion/loader_interface.py b/orion/loader_interface.py index 2f033f55..23158c50 100644 --- a/orion/loader_interface.py +++ b/orion/loader_interface.py @@ -4,6 +4,7 @@ import inspect from orion.kgx_file_writer import KGXFileWriter from orion.logging import get_orion_logger +from orion.config import config class SourceDataLoader: @@ -36,7 +37,7 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): if not os.path.exists(self.data_path): os.mkdir(self.data_path) else: - self.data_path = os.environ.get("ORION_STORAGE") + self.data_path = config.ORION_STORAGE # the final output lists of nodes and edges self.final_node_list: list = [] diff --git a/orion/neo4j_tools.py b/orion/neo4j_tools.py index 75890c0c..107db9a5 100644 --- a/orion/neo4j_tools.py +++ b/orion/neo4j_tools.py @@ -5,6 +5,7 @@ import orion.kgx_file_converter as kgx_file_converter from orion.biolink_constants import NAMED_THING from orion.logging 
import get_orion_logger +from orion.config import config logger = get_orion_logger("orion.neo4j_tools") @@ -22,7 +23,9 @@ def __init__(self, self.http_port = http_port self.https_port = https_port self.bolt_port = bolt_port - self.password = password if password else os.environ.get('ORION_NEO4J_PASSWORD', 'orion-password') + # This is only the password for generating a dump on a temporary neo4j instance + # (so it doesn't really matter what it is or that its secure) + self.password = 'orion-password' self.graph_db_uri = f'bolt://{neo4j_host}:{bolt_port}' self.graph_db_auth = ("neo4j", self.password) self.neo4j_driver = neo4j.GraphDatabase.driver(self.graph_db_uri, auth=self.graph_db_auth) diff --git a/orion/normalization.py b/orion/normalization.py index ad33a300..1e72f6bb 100644 --- a/orion/normalization.py +++ b/orion/normalization.py @@ -9,6 +9,7 @@ from robokop_genetics.genetics_normalization import GeneticsNormalizer from orion.biolink_constants import * from orion.logging import get_orion_logger +from orion.config import config logger = get_orion_logger("orion.normalization") @@ -51,7 +52,7 @@ def __init__(self, error_message: str, actual_error: Exception = None): self.error_message = error_message self.actual_error = actual_error -NODE_NORMALIZATION_URL = os.environ.get('NODE_NORMALIZATION_ENDPOINT', 'https://nodenormalization-sri.renci.org/') +NODE_NORMALIZATION_URL = config.NODE_NORMALIZATION_URL class NodeNormalizer: @@ -382,8 +383,6 @@ class EdgeNormalizer: Class that contains methods relating to edge normalization. 
""" - DEFAULT_EDGE_NORM_ENDPOINT = f'https://bl-lookup-sri.renci.org/' - def __init__(self, edge_normalization_version: str = 'latest'): """ @@ -393,10 +392,7 @@ def __init__(self, self.edge_normalization_lookup = {} self.cached_edge_norms = {} - if 'EDGE_NORMALIZATION_ENDPOINT' in os.environ and os.environ['EDGE_NORMALIZATION_ENDPOINT']: - self.edge_norm_endpoint = os.environ['EDGE_NORMALIZATION_ENDPOINT'] - else: - self.edge_norm_endpoint = self.DEFAULT_EDGE_NORM_ENDPOINT + self.edge_norm_endpoint = config.EDGE_NORMALIZATION_ENDPOINT if edge_normalization_version != 'latest': if self.check_bl_version_valid(edge_normalization_version): @@ -552,7 +548,7 @@ def get_valid_node_types(self): resp.raise_for_status() -NAME_RESOLVER_URL = os.getenv('NAMERES_URL', 'https://name-resolution-sri.renci.org') +NAME_RESOLVER_URL = config.NAMERES_URL NAME_RESOLVER_ENDPOINT = f'{NAME_RESOLVER_URL}/lookup' NAME_RESOLVER_HEADERS = {"accept": "application/json"} NAME_RESOLVER_API_ERROR = 'api_error' diff --git a/orion/supplementation.py b/orion/supplementation.py index 4d17bf79..a1acd252 100644 --- a/orion/supplementation.py +++ b/orion/supplementation.py @@ -10,6 +10,7 @@ from orion.biolink_constants import * from orion.normalization import FALLBACK_EDGE_PREDICATE, NormalizationScheme from orion.logging import get_orion_logger +from orion.config import config from orion.kgx_file_writer import KGXFileWriter from orion.kgx_file_normalizer import KGXFileNormalizer @@ -61,10 +62,12 @@ class SequenceVariantSupplementation: SUPPLEMENTATION_VERSION = "1.1" - def __init__(self, output_dir="."): - - workspace_dir = os.getenv("ORION_STORAGE", output_dir) + def __init__(self, output_dir=None): + workspace_dir = output_dir or config.ORION_STORAGE + if not path.isdir(workspace_dir): + raise RuntimeError(f'Workspace directory not valid for SequenceVariantSupplementation.') + # if the snpEff dir exists, assume we already downloaded it self.snpeff_dir = path.join(workspace_dir, "snpEff") if not 
path.isdir(self.snpeff_dir): diff --git a/parsers/LitCoin/src/NER/nameres.py b/parsers/LitCoin/src/NER/nameres.py index 5156e3c8..774f903b 100644 --- a/parsers/LitCoin/src/NER/nameres.py +++ b/parsers/LitCoin/src/NER/nameres.py @@ -1,12 +1,12 @@ -import os import logging import requests from parsers.LitCoin.src.NER.base import BaseNEREngine +from orion.config import config # Configuration: NameRes -NAMERES_URL = os.getenv('NAMERES_URL', 'https://name-resolution-sri.renci.org/') +NAMERES_URL = config.NAMERES_URL NAMERES_ENDPOINT = f'{NAMERES_URL}lookup' NAMERES_RL_ENDPOINT = f'{NAMERES_URL}reverse_lookup' diff --git a/parsers/LitCoin/src/NER/sapbert.py b/parsers/LitCoin/src/NER/sapbert.py index ab50389b..59907302 100644 --- a/parsers/LitCoin/src/NER/sapbert.py +++ b/parsers/LitCoin/src/NER/sapbert.py @@ -1,12 +1,12 @@ -import os import logging import requests from parsers.LitCoin.src.NER.base import BaseNEREngine +from orion.config import config # Configuration: get the SAPBERT URL and figure out the annotate path. -SAPBERT_URL = os.getenv('SAPBERT_URL', 'https://babel-sapbert.apps.renci.org/') +SAPBERT_URL = config.SAPBERT_URL SAPBERT_ANNOTATE_ENDPOINT = SAPBERT_URL + 'annotate/' SAPBERT_MODEL_NAME = "sapbert" SAPBERT_COUNT = 1000 # We've found that 1000 is about the minimum you need for reasonable results. 
diff --git a/parsers/LitCoin/src/bagel/bagel_gpt.py b/parsers/LitCoin/src/bagel/bagel_gpt.py index c38ec552..a4b863e9 100644 --- a/parsers/LitCoin/src/bagel/bagel_gpt.py +++ b/parsers/LitCoin/src/bagel/bagel_gpt.py @@ -2,10 +2,10 @@ import os from collections import defaultdict -from orion.config import CONFIG +from orion.config import config from orion.logging import get_orion_logger -OPENAI_API_KEY = CONFIG.get("OPENAI_API_KEY") +OPENAI_API_KEY = config.OPENAI_API_KEY LLM_RESULTS = [] diff --git a/parsers/LitCoin/src/bagel/bagel_service.py b/parsers/LitCoin/src/bagel/bagel_service.py index 772691e1..4af5d472 100644 --- a/parsers/LitCoin/src/bagel/bagel_service.py +++ b/parsers/LitCoin/src/bagel/bagel_service.py @@ -1,21 +1,17 @@ import requests from requests.auth import HTTPBasicAuth -from orion.config import CONFIG +from orion.config import config -BAGEL_ENDPOINT = 'https://bagel.apps.renci.org/' -BAGEL_ENDPOINT += 'find_curies_openai' +BAGEL_ENDPOINT = config.BAGEL_ENDPOINT + 'find_curies_openai' -bagel_nameres_url = CONFIG.get('NAMERES_ENDPOINT', 'https://name-resolution-sri.renci.org/') -bagel_nameres_url += 'lookup?autocomplete=false&offset=0&limit=10&string="' +bagel_nameres_url = config.NAMERES_URL + 'lookup?autocomplete=false&offset=0&limit=10&string="' -bagel_sapbert_url = CONFIG.get('SAPBERT_URL', 'https://sap-qdrant.apps.renci.org/') -bagel_sapbert_url += "annotate/" +bagel_sapbert_url = config.SAPBERT_URL + "annotate/" -bagel_nodenorm_url = CONFIG.get('NODE_NORMALIZATION_ENDPOINT', 'https://nodenormalization-sri.renci.org/') -bagel_nodenorm_url += 'get_normalized_nodes' +bagel_nodenorm_url = config.NODE_NORMALIZATION_ENDPOINT + 'get_normalized_nodes' -BAGEL_SERVICE_USERNAME = CONFIG.get("BAGEL_SERVICE_USERNAME", 'default_bagel_username') -BAGEL_SERVICE_PASSWORD = CONFIG.get("BAGEL_SERVICE_PASSWORD", 'default_bagel_password') +BAGEL_SERVICE_USERNAME = config.BAGEL_SERVICE_USERNAME +BAGEL_SERVICE_PASSWORD = config.BAGEL_SERVICE_PASSWORD def 
call_bagel_service(text, entity, entity_type=''): diff --git a/parsers/LitCoin/src/loadLitCoin.py b/parsers/LitCoin/src/loadLitCoin.py index 292365bc..6e708d25 100644 --- a/parsers/LitCoin/src/loadLitCoin.py +++ b/parsers/LitCoin/src/loadLitCoin.py @@ -12,6 +12,7 @@ from orion.utils import GetData, quick_jsonl_file_iterator from orion.normalization import call_name_resolution, NAME_RESOLVER_API_ERROR from orion.prefixes import PUBMED +from orion.config import config from parsers.LitCoin.src.bagel.bagel_service import call_bagel_service @@ -68,7 +69,7 @@ class LITCOIN: ABSTRACT_JOURNAL_EDGE_PROP = 'journal' -LITCOIN_PRED_MAPPING_URL = os.getenv('LITCOIN_PRED_MAPPING_URL', 'https://pred-mapping.apps.renci.org') +LITCOIN_PRED_MAPPING_URL = config.LITCOIN_PRED_MAPPING_URL PRED_MAPPING_ENDPOINT = f'{LITCOIN_PRED_MAPPING_URL}/query/' @@ -133,7 +134,7 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): :param source_data_dir - the specific storage directory to save files in """ super().__init__(test_mode=test_mode, source_data_dir=source_data_dir) - self.shared_source_data_path = os.getenv('SHARED_SOURCE_DATA_PATH', None) + self.shared_source_data_path = config.SHARED_SOURCE_DATA_PATH self.data_url = 'https://stars.renci.org/var/data_services/litcoin/' self.version_file = 'litcoin.yaml' self.abstracts_file = 'abstracts_CompAndHeal.json' diff --git a/pyproject.toml b/pyproject.toml index 6fe1b2b9..fcadba77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ dependencies = [ "bmt>=1.4.6", "jsonlines>=4.0.0", "orjson>=3.11.7", - "python-dotenv>=1.0.1", + "pydantic-settings>=2.13.0", "pyyaml>=6.0.1", "requests>=2.33.1", "requests-toolbelt>=1.0.0", diff --git a/uv.lock b/uv.lock index ba525c96..c6eeb1e7 100644 --- a/uv.lock +++ b/uv.lock @@ -733,6 +733,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6f/9a/e73262f6c6656262b5fdd723ad90f518f579b7bc8622e43a942eec53c938/pydantic_core-2.33.2-cp313-cp313t-win_amd64.whl", hash 
= "sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9", size = 1935777, upload-time = "2025-04-23T18:32:25.088Z" }, ] +[[package]] +name = "pydantic-settings" +version = "2.13.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "python-dotenv" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/52/6d/fffca34caecc4a3f97bda81b2098da5e8ab7efc9a66e819074a11955d87e/pydantic_settings-2.13.1.tar.gz", hash = "sha256:b4c11847b15237fb0171e1462bf540e294affb9b86db4d9aa5c01730bdbe4025", size = 223826, upload-time = "2026-02-19T13:45:08.055Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/4b/ccc026168948fec4f7555b9164c724cf4125eac006e176541483d2c959be/pydantic_settings-2.13.1-py3-none-any.whl", hash = "sha256:d56fd801823dbeae7f0975e1f8c8e25c258eb75d278ea7abb5d9cebb01b56237", size = 58929, upload-time = "2026-02-19T13:45:06.034Z" }, +] + [[package]] name = "pygments" version = "2.19.2" @@ -937,7 +951,7 @@ dependencies = [ { name = "bmt" }, { name = "jsonlines" }, { name = "orjson" }, - { name = "python-dotenv" }, + { name = "pydantic-settings" }, { name = "pyyaml" }, { name = "requests" }, { name = "requests-toolbelt" }, @@ -984,8 +998,8 @@ requires-dist = [ { name = "polars", marker = "extra == 'robokop'", specifier = ">=1.19.0" }, { name = "prefixmaps", marker = "extra == 'robokop'", specifier = ">=0.2.6" }, { name = "psycopg2-binary", marker = "extra == 'robokop'", specifier = ">=2.9.9" }, + { name = "pydantic-settings", specifier = ">=2.13.0" }, { name = "pyoxigraph", marker = "extra == 'robokop'", specifier = ">=0.3.22" }, - { name = "python-dotenv", specifier = ">=1.0.1" }, { name = "pyyaml", specifier = ">=6.0.1" }, { name = "redis", marker = "extra == 'robokop'", specifier = ">=5.2.1" }, { name = "requests", specifier = ">=2.33.1" }, From 3a1121ab0af7621fb8826b708b996e6d1da1f523 Mon Sep 17 00:00:00 2001 From: Evan Morris Date: 
Fri, 3 Apr 2026 15:05:46 -0700 Subject: [PATCH 04/14] standardize url slash usage and nomenclature --- .env.example | 27 +++++++++++++--------- helm/orion/templates/graph-builder.yaml | 12 +++++----- orion/build_manager.py | 3 +-- orion/config.py | 14 ++++++++--- orion/normalization.py | 12 +++++----- parsers/LitCoin/src/NER/nameres.py | 4 ++-- parsers/LitCoin/src/NER/sapbert.py | 2 +- parsers/LitCoin/src/bagel/bagel.py | 4 ++-- parsers/LitCoin/src/bagel/bagel_service.py | 8 +++---- 9 files changed, 49 insertions(+), 37 deletions(-) diff --git a/.env.example b/.env.example index 2165f415..97529bc3 100644 --- a/.env.example +++ b/.env.example @@ -10,7 +10,7 @@ # ORION_LOGS= # Base URL used when generating graph metadata -# ORION_OUTPUT_URL=https://localhost/ +# ORION_OUTPUT_URL=https://localhost # ---- Graph Spec ---- @@ -25,30 +25,35 @@ # Enable test/debug mode (sets log level to DEBUG and runs ingests with a smaller subset of data if possible) # ORION_TEST_MODE=false -# ---- Normalization Endpoints ---- +# ---- Biolink Model ---- -# Edge normalization / BioLink Lookup endpoint -# EDGE_NORMALIZATION_ENDPOINT=https://bl-lookup-sri.renci.org/ +# Biolink model version +# BL_VERSION=v4.3.4 -# Node normalization endpoint -# NODE_NORMALIZATION_ENDPOINT=https://nodenormalization-sri.renci.org/ +# ---- Normalization URLs ---- + +# Edge normalization / BioLink Lookup URL +# EDGE_NORMALIZATION_URL=https://bl-lookup-sri.renci.org + +# Node normalization URL +# NODE_NORMALIZATION_URL=https://nodenormalization-sri.renci.org # ---- LitCoin / Bagel (may be removed in the future) ---- # Name resolution service URL -# NAMERES_URL=https://name-resolution-sri.renci.org/ +# NAMERES_URL=https://name-resolution-sri.renci.org # SapBERT service URL -# SAPBERT_URL=https://babel-sapbert.apps.renci.org/ +# SAPBERT_URL=https://babel-sapbert.apps.renci.org # Shared source data path for LitCoin pipeline # SHARED_SOURCE_DATA_PATH=/tmp/shared_data # LitCoin predicate mapping service URL -# 
LITCOIN_PRED_MAPPING_URL=https://pred-mapping.apps.renci.org/ +# LITCOIN_PRED_MAPPING_URL=https://pred-mapping.apps.renci.org -# Bagel service endpoint -# BAGEL_ENDPOINT=https://bagel.apps.renci.org/ +# Bagel service URL +# BAGEL_URL=https://bagel.apps.renci.org # Bagel service credentials # BAGEL_SERVICE_USERNAME= diff --git a/helm/orion/templates/graph-builder.yaml b/helm/orion/templates/graph-builder.yaml index ef2e9a8b..6e6513f8 100644 --- a/helm/orion/templates/graph-builder.yaml +++ b/helm/orion/templates/graph-builder.yaml @@ -70,15 +70,15 @@ spec: - name: BL_VERSION value: {{ .Values.orion.normalization.bl_version }} {{- if .Values.orion.normalization.nodeNormEndpoint }} - - name: NODE_NORMALIZATION_ENDPOINT + - name: NODE_NORMALIZATION_URL value: {{ .Values.orion.normalization.nodeNormEndpoint }} {{- end }} {{- if .Values.orion.normalization.edgeNormEndpoint }} - - name: EDGE_NORMALIZATION_ENDPOINT + - name: EDGE_NORMALIZATION_URL value: {{ .Values.orion.normalization.edgeNormEndpoint }} {{- end }} {{- if .Values.orion.normalization.nameResolverEndpoint }} - - name: NAMERES_ENDPOINT + - name: NAMERES_URL value: {{ .Values.orion.normalization.nameResolverEndpoint }} {{- end }} {{- if .Values.orion.normalization.sapbertEndpoint }} @@ -157,15 +157,15 @@ spec: - name: BL_VERSION value: {{ .Values.orion.normalization.bl_version }} {{- if .Values.orion.normalization.nodeNormEndpoint }} - - name: NODE_NORMALIZATION_ENDPOINT + - name: NODE_NORMALIZATION_URL value: {{ .Values.orion.normalization.nodeNormEndpoint }} {{- end }} {{- if .Values.orion.normalization.edgeNormEndpoint }} - - name: EDGE_NORMALIZATION_ENDPOINT + - name: EDGE_NORMALIZATION_URL value: {{ .Values.orion.normalization.edgeNormEndpoint }} {{- end }} {{- if .Values.orion.normalization.nameResolverEndpoint }} - - name: NAMERES_ENDPOINT + - name: NAMERES_URL value: {{ .Values.orion.normalization.nameResolverEndpoint }} {{- end }} {{- if .Values.orion.normalization.sapbertEndpoint }} diff --git 
a/orion/build_manager.py b/orion/build_manager.py index 505e0950..73e6233e 100644 --- a/orion/build_manager.py +++ b/orion/build_manager.py @@ -686,8 +686,7 @@ def get_graph_dir_path(self, graph_id: str, graph_version: str): @staticmethod def get_graph_output_url(graph_id: str, graph_version: str): - graph_output_url = config.ORION_OUTPUT_URL.removesuffix('/') - return f'{graph_output_url}/{graph_id}/{graph_version}/' + return f'{config.ORION_OUTPUT_URL}/{graph_id}/{graph_version}/' @staticmethod def get_graph_nodes_file_path(graph_output_dir: str): diff --git a/orion/config.py b/orion/config.py index 2cb314ff..f80a4fee 100644 --- a/orion/config.py +++ b/orion/config.py @@ -1,4 +1,5 @@ from pathlib import Path +from pydantic import field_validator from pydantic_settings import BaseSettings, SettingsConfigDict @@ -9,6 +10,13 @@ class Config(BaseSettings): env_ignore_empty=True ) + @field_validator("*", mode="before") + @classmethod + def strip_trailing_slashes(cls, v, info): + if isinstance(v, str) and info.field_name.endswith("_URL"): + return v.rstrip("/") + return v + ORION_STORAGE: str | None = None ORION_GRAPHS: str | None = None ORION_LOGS: str | None = None @@ -21,15 +29,15 @@ class Config(BaseSettings): BL_VERSION: str = "v4.3.4" - EDGE_NORMALIZATION_ENDPOINT: str = "https://bl-lookup-sri.renci.org" - NODE_NORMALIZATION_ENDPOINT: str = "https://nodenormalization-sri.renci.org" + EDGE_NORMALIZATION_URL: str = "https://bl-lookup-sri.renci.org" + NODE_NORMALIZATION_URL: str = "https://nodenormalization-sri.renci.org" # the following were used for the LitCoin project and may be removed in the future NAMERES_URL: str = "https://name-resolution-sri.renci.org" SAPBERT_URL: str = "https://babel-sapbert.apps.renci.org" SHARED_SOURCE_DATA_PATH: str = "/tmp/shared_data" LITCOIN_PRED_MAPPING_URL: str = "https://pred-mapping.apps.renci.org" - BAGEL_ENDPOINT: str = "https://bagel.apps.renci.org" + BAGEL_URL: str = "https://bagel.apps.renci.org" BAGEL_SERVICE_USERNAME: str 
| None = None BAGEL_SERVICE_PASSWORD: str | None = None OPENAI_API_KEY: str | None = None diff --git a/orion/normalization.py b/orion/normalization.py index 1e72f6bb..aff29759 100644 --- a/orion/normalization.py +++ b/orion/normalization.py @@ -100,7 +100,7 @@ def __init__(self, def hit_node_norm_service(self, curies, retries=0): resp: requests.models.Response = \ - self.requests_session.post(f'{NODE_NORMALIZATION_URL}get_normalized_nodes', + self.requests_session.post(f'{NODE_NORMALIZATION_URL}/get_normalized_nodes', json={'curies': curies, 'conflate': self.conflate_node_types, 'drug_chemical_conflate': self.conflate_node_types, @@ -346,7 +346,7 @@ def get_current_node_norm_version(self): Retrieves the current production version from the node normalization service """ # hit the node norm status endpoint - node_norm_status_url = f'{NODE_NORMALIZATION_URL}status' + node_norm_status_url = f'{NODE_NORMALIZATION_URL}/status' resp: requests.models.Response = requests.get(node_norm_status_url) resp.raise_for_status() status: dict = resp.json() @@ -392,7 +392,7 @@ def __init__(self, self.edge_normalization_lookup = {} self.cached_edge_norms = {} - self.edge_norm_endpoint = config.EDGE_NORMALIZATION_ENDPOINT + self.edge_norm_endpoint = config.EDGE_NORMALIZATION_URL if edge_normalization_version != 'latest': if self.check_bl_version_valid(edge_normalization_version): @@ -443,7 +443,7 @@ def normalize_edge_data(self, predicate_chunk: list = predicates_to_normalize_list[start_index: end_index] # hit the edge normalization service - request_url = f'{self.edge_norm_endpoint}resolve_predicate?version={self.edge_norm_version}&predicate=' + request_url = f'{self.edge_norm_endpoint}/resolve_predicate?version={self.edge_norm_version}&predicate=' request_url += '&predicate='.join(predicate_chunk) logger.debug(f'Sending request: {request_url}') resp: requests.models.Response = requests.get(request_url) @@ -515,7 +515,7 @@ def check_bl_version_valid(self, bl_version: str): def 
get_available_versions(self): # call the versions endpoint - edge_norm_versions_url = f'{self.edge_norm_endpoint}versions' + edge_norm_versions_url = f'{self.edge_norm_endpoint}/versions' resp: requests.models.Response = requests.get(edge_norm_versions_url) # did we get a good status code @@ -535,7 +535,7 @@ def check_node_type_valid(self, node_type: str): def get_valid_node_types(self): # call the descendants endpoint with the root node type - edge_norm_descendants_url = f'{self.edge_norm_endpoint}bl/{NAMED_THING}/descendants?version={self.edge_norm_version}' + edge_norm_descendants_url = f'{self.edge_norm_endpoint}/bl/{NAMED_THING}/descendants?version={self.edge_norm_version}' resp: requests.models.Response = requests.get(edge_norm_descendants_url) # did we get a good status code diff --git a/parsers/LitCoin/src/NER/nameres.py b/parsers/LitCoin/src/NER/nameres.py index 774f903b..1c4cbbbc 100644 --- a/parsers/LitCoin/src/NER/nameres.py +++ b/parsers/LitCoin/src/NER/nameres.py @@ -7,8 +7,8 @@ # Configuration: NameRes NAMERES_URL = config.NAMERES_URL -NAMERES_ENDPOINT = f'{NAMERES_URL}lookup' -NAMERES_RL_ENDPOINT = f'{NAMERES_URL}reverse_lookup' +NAMERES_ENDPOINT = f'{NAMERES_URL}/lookup' +NAMERES_RL_ENDPOINT = f'{NAMERES_URL}/reverse_lookup' class NameResNEREngine(BaseNEREngine): diff --git a/parsers/LitCoin/src/NER/sapbert.py b/parsers/LitCoin/src/NER/sapbert.py index 59907302..ae28c66c 100644 --- a/parsers/LitCoin/src/NER/sapbert.py +++ b/parsers/LitCoin/src/NER/sapbert.py @@ -7,7 +7,7 @@ # Configuration: get the SAPBERT URL and figure out the annotate path. SAPBERT_URL = config.SAPBERT_URL -SAPBERT_ANNOTATE_ENDPOINT = SAPBERT_URL + 'annotate/' +SAPBERT_ANNOTATE_ENDPOINT = f'{SAPBERT_URL}/annotate/' SAPBERT_MODEL_NAME = "sapbert" SAPBERT_COUNT = 1000 # We've found that 1000 is about the minimum you need for reasonable results. 
diff --git a/parsers/LitCoin/src/bagel/bagel.py b/parsers/LitCoin/src/bagel/bagel.py index 88ea96f3..4d14eaec 100644 --- a/parsers/LitCoin/src/bagel/bagel.py +++ b/parsers/LitCoin/src/bagel/bagel.py @@ -119,7 +119,7 @@ def augment_results(terms, nameres, taxes): augs = nameres.reverse_lookup(curies) for curie in augs: terms[curie].update(augs[curie]) - resp = requests.get(f"{NODE_NORMALIZATION_URL}get_normalized_nodes?curie="+curie+"&conflate=true&drug_chemical_conflate=true&description=true") + resp = requests.get(f"{NODE_NORMALIZATION_URL}/get_normalized_nodes?curie="+curie+"&conflate=true&drug_chemical_conflate=true&description=true") if resp.status_code == 200: result = resp.json() try: @@ -131,7 +131,7 @@ def augment_results(terms, nameres, taxes): if len(annotation["taxa"]) > 0: tax_id = annotation["taxa"][0] if tax_id not in taxes: - resp = requests.get(f"{NODE_NORMALIZATION_URL}get_normalized_nodes?curie="+tax_id) + resp = requests.get(f"{NODE_NORMALIZATION_URL}/get_normalized_nodes?curie="+tax_id) if resp.status_code == 200: result = resp.json() try: diff --git a/parsers/LitCoin/src/bagel/bagel_service.py b/parsers/LitCoin/src/bagel/bagel_service.py index 4af5d472..3d5e672e 100644 --- a/parsers/LitCoin/src/bagel/bagel_service.py +++ b/parsers/LitCoin/src/bagel/bagel_service.py @@ -2,13 +2,13 @@ from requests.auth import HTTPBasicAuth from orion.config import config -BAGEL_ENDPOINT = config.BAGEL_ENDPOINT + 'find_curies_openai' +BAGEL_ENDPOINT = f'{config.BAGEL_URL}/find_curies_openai' -bagel_nameres_url = config.NAMERES_URL + 'lookup?autocomplete=false&offset=0&limit=10&string="' +bagel_nameres_url = f'{config.NAMERES_URL}/lookup?autocomplete=false&offset=0&limit=10&string="' -bagel_sapbert_url = config.SAPBERT_URL + "annotate/" +bagel_sapbert_url = f'{config.SAPBERT_URL}/annotate/' -bagel_nodenorm_url = config.NODE_NORMALIZATION_ENDPOINT + 'get_normalized_nodes' +bagel_nodenorm_url = f'{config.NODE_NORMALIZATION_URL}/get_normalized_nodes' 
BAGEL_SERVICE_USERNAME = config.BAGEL_SERVICE_USERNAME BAGEL_SERVICE_PASSWORD = config.BAGEL_SERVICE_PASSWORD From a62b56375c0c42439aa4a84520803363b4ee35ba Mon Sep 17 00:00:00 2001 From: Evan Morris Date: Mon, 6 Apr 2026 13:12:10 -0700 Subject: [PATCH 05/14] dont hardcode env passthroughs, use env file if there --- docker-compose-worker.yml | 33 ++++++++------------------------- docker-compose.yml | 23 +++-------------------- 2 files changed, 11 insertions(+), 45 deletions(-) diff --git a/docker-compose-worker.yml b/docker-compose-worker.yml index 94018b3f..52857447 100644 --- a/docker-compose-worker.yml +++ b/docker-compose-worker.yml @@ -5,40 +5,23 @@ services: dockerfile: Dockerfile container_name: orion-worker command: [celery, "-A", "celery_worker.celery_app", "worker", "--loglevel=info", "-Q", "orion"] + env_file: + - .env environment: - - CELERY_BROKER_URL=redis://redis:6379/0 - - CELERY_RESULT_BACKEND=redis://redis:6379/0 - - SHARED_SOURCE_DATA_PATH=/tmp/shared_data + # override paths from env, use paths volumes are mounted to inside the container - ORION_STORAGE=/ORION_storage - ORION_GRAPHS=/ORION_graphs - ORION_LOGS=/ORION_logs - - BAGEL_SERVICE_USERNAME=fake-username-do-not-commit-a-real-one!!! - - BAGEL_SERVICE_PASSWORD=fake-password-do-not-commit-a-real-one!!! 
- - ORION_GRAPH_SPEC - - ORION_GRAPH_SPEC_URL - - ORION_OUTPUT_URL - - EDGE_NORMALIZATION_ENDPOINT - - NODE_NORMALIZATION_ENDPOINT - - NAMERES_URL - - SAPBERT_URL - - LITCOIN_PRED_MAPPING_URL - - BL_VERSION - - PHAROS_DB_HOST - - PHAROS_DB_USER - - PHAROS_DB_PASSWORD - - PHAROS_DB_NAME - - PHAROS_DB_PORT - - DRUGCENTRAL_DB_HOST - - DRUGCENTRAL_DB_USER - - DRUGCENTRAL_DB_PASSWORD - - DRUGCENTRAL_DB_NAME - - DRUGCENTRAL_DB_PORT + - SHARED_SOURCE_DATA_PATH=/tmp/shared_data + # specific to celery + - CELERY_BROKER_URL=redis://redis:6379/0 + - CELERY_RESULT_BACKEND=redis://redis:6379/0 volumes: - .:/ORION - - "${SHARED_SOURCE_DATA_PATH}:/tmp/shared_data" - "${ORION_STORAGE}:/ORION_storage" - "${ORION_GRAPHS}:/ORION_graphs" - "${ORION_LOGS}:/ORION_logs" + - "${SHARED_SOURCE_DATA_PATH}:/tmp/shared_data" user: 1000:7474 networks: - app-network diff --git a/docker-compose.yml b/docker-compose.yml index c8794725..6b9842ef 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,33 +3,16 @@ services: build: context: . 
command: [orion-build, all] + env_file: + - .env environment: + # override paths from env, use paths volumes are mounted to inside the container - ORION_STORAGE=/ORION_storage - ORION_GRAPHS=/ORION_graphs - ORION_LOGS=/ORION_logs - - ORION_GRAPH_SPEC - - ORION_GRAPH_SPEC_URL - - ORION_OUTPUT_URL - - EDGE_NORMALIZATION_ENDPOINT - - NODE_NORMALIZATION_ENDPOINT - - NAMERES_URL - - SAPBERT_URL - - BL_VERSION - - PHAROS_DB_HOST - - PHAROS_DB_USER - - PHAROS_DB_PASSWORD - - PHAROS_DB_NAME - - PHAROS_DB_PORT - - DRUGCENTRAL_DB_HOST - - DRUGCENTRAL_DB_USER - - DRUGCENTRAL_DB_PASSWORD - - DRUGCENTRAL_DB_NAME - - DRUGCENTRAL_DB_PORT volumes: - .:/ORION - "${ORION_STORAGE}:/ORION_storage" - "${ORION_GRAPHS}:/ORION_graphs" - "${ORION_LOGS}:/ORION_logs" user: 7474:7474 - - From 10364adb901ae343937da6d56e99689cdf139bfa Mon Sep 17 00:00:00 2001 From: Evan Morris Date: Mon, 6 Apr 2026 16:08:43 -0700 Subject: [PATCH 06/14] making env file optional, removing persistent logs from docker compose --- docker-compose-worker.yml | 3 ++- docker-compose.yml | 5 ++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docker-compose-worker.yml b/docker-compose-worker.yml index 52857447..5a9efc57 100644 --- a/docker-compose-worker.yml +++ b/docker-compose-worker.yml @@ -6,7 +6,8 @@ services: container_name: orion-worker command: [celery, "-A", "celery_worker.celery_app", "worker", "--loglevel=info", "-Q", "orion"] env_file: - - .env + - path: .env + required: false environment: # override paths from env, use paths volumes are mounted to inside the container - ORION_STORAGE=/ORION_storage diff --git a/docker-compose.yml b/docker-compose.yml index 6b9842ef..8898f706 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,15 +4,14 @@ services: context: . 
command: [orion-build, all] env_file: - - .env + - path: .env + required: false environment: # override paths from env, use paths volumes are mounted to inside the container - ORION_STORAGE=/ORION_storage - ORION_GRAPHS=/ORION_graphs - - ORION_LOGS=/ORION_logs volumes: - .:/ORION - "${ORION_STORAGE}:/ORION_storage" - "${ORION_GRAPHS}:/ORION_graphs" - - "${ORION_LOGS}:/ORION_logs" user: 7474:7474 From f51b5ea6fcf928e7981efc22f4407a6809606356 Mon Sep 17 00:00:00 2001 From: Evan Morris Date: Mon, 6 Apr 2026 16:10:27 -0700 Subject: [PATCH 07/14] lazy load the storage dir --- orion/ingest_pipeline.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/orion/ingest_pipeline.py b/orion/ingest_pipeline.py index 1f0a41c7..b057ff5d 100644 --- a/orion/ingest_pipeline.py +++ b/orion/ingest_pipeline.py @@ -39,7 +39,9 @@ def __init__(self, logger.info(f'IngestPipeline running in fresh start mode... previous state and files ignored.') # lazy load the storage directory path - self.storage_dir = self.init_storage_dir(storage_dir) + # store the storage_dir parameter to override the Config if provided programmatically or through CLI + self._storage_dir_override = storage_dir + self._storage_dir = None # dict of source_id -> latest source version (to prevent double lookups) self.latest_source_version_lookup = {} @@ -689,8 +691,14 @@ def get_final_file_paths(self, source_id: str, source_version: str, parsing_vers def get_source_version_path(self, source_id: str, source_version: str): return os.path.join(self.storage_dir, source_id, source_version) + @property + def storage_dir(self): + if self._storage_dir is None: + self._storage_dir = self._resolve_storage_dir(self._storage_dir_override) + return self._storage_dir + @staticmethod - def init_storage_dir(storage_dir: str=None): + def _resolve_storage_dir(storage_dir: str = None): # if a dir was provided programmatically try to use that if storage_dir is not None: if os.path.isdir(storage_dir): From 
7662095067f2d69f2c289e49e52f2965e13db9fc Mon Sep 17 00:00:00 2001 From: Evan Morris Date: Mon, 6 Apr 2026 16:10:40 -0700 Subject: [PATCH 08/14] adapt tests for new config --- tests/test_graph_spec.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/tests/test_graph_spec.py b/tests/test_graph_spec.py index 91030e90..e76d7a68 100644 --- a/tests/test_graph_spec.py +++ b/tests/test_graph_spec.py @@ -4,16 +4,21 @@ from unittest.mock import MagicMock from orion.build_manager import GraphBuilder, GraphSpecError +from orion import config as config_module + + +def set_config(**overrides): + """Override the config""" + for key, value in overrides.items(): + object.__setattr__(config_module.config, key, value) def clear_graph_spec_config(): - os.environ['ORION_GRAPH_SPEC'] = '' - os.environ['ORION_GRAPH_SPEC_URL'] = '' + set_config(ORION_GRAPH_SPEC='', ORION_GRAPH_SPEC_URL='') def reset_graph_spec_config(): - os.environ['ORION_GRAPH_SPEC'] = 'testing-graph-spec.yaml' - os.environ['ORION_GRAPH_SPEC_URL'] = '' + set_config(ORION_GRAPH_SPEC='testing-graph-spec.yaml', ORION_GRAPH_SPEC_URL='') @pytest.fixture(scope='module') @@ -43,16 +48,14 @@ def test_empty_graph_spec_config(test_graph_spec_dir, test_graph_output_dir): def test_invalid_graph_spec_config(test_graph_spec_dir, test_graph_output_dir): - clear_graph_spec_config() - os.environ['ORION_GRAPH_SPEC'] = 'invalid-spec.yaml' + set_config(ORION_GRAPH_SPEC='invalid-spec.yaml', ORION_GRAPH_SPEC_URL='') with pytest.raises(GraphSpecError): graph_builder = GraphBuilder(graph_specs_dir=test_graph_spec_dir, graph_output_dir=test_graph_output_dir) def test_invalid_graph_spec_url_config(test_graph_output_dir): - clear_graph_spec_config() - os.environ['ORION_GRAPH_SPEC_URL'] = 'http://localhost/invalid_graph_spec_url' + set_config(ORION_GRAPH_SPEC='', ORION_GRAPH_SPEC_URL='http://localhost/invalid_graph_spec_url') with pytest.raises(requests.exceptions.ConnectionError): graph_builder = 
GraphBuilder(graph_output_dir=test_graph_output_dir) From 020ab4ab41c58d08a6ed23009b9f47e20a06d88d Mon Sep 17 00:00:00 2001 From: Evan Morris Date: Mon, 6 Apr 2026 16:41:23 -0700 Subject: [PATCH 09/14] updating readme --- .env.example | 4 ++-- README.md | 43 ++++++++++++++++++++++++------------------- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/.env.example b/.env.example index 97529bc3..75cecf91 100644 --- a/.env.example +++ b/.env.example @@ -1,10 +1,10 @@ # ---- Storage & Output ---- # Directory for source data downloads and ingest pipeline files -# ORION_STORAGE= +ORION_STORAGE=~/ORION_storage/ # Directory for final graph releases -# ORION_GRAPHS= +ORION_GRAPHS=~/ORION_graphs/ # Directory for log files (if unset, logs go to stdout only) # ORION_LOGS= diff --git a/README.md b/README.md index 6fbb6740..067d762f 100644 --- a/README.md +++ b/README.md @@ -42,31 +42,34 @@ After installation, the following commands are available (prefix with `uv run` i ### Configuring ORION -ORION uses three directories for its data, configured via environment variables: +ORION is configured via environment variables, which can be set directly or through an `.env` file. -| Variable | Purpose | -|---|--------------------------------------| -| `ORION_STORAGE` | Data ingest pipeline storage | -| `ORION_GRAPHS` | Knowledge graph outputs | -| `ORION_LOGS` | Log files | - -You can set these up manually or use the provided script: +In most cases, you can simply use this provided script to set up a local environment. It will create directories for ORION outputs next to where ORION was installed and set env vars pointing to them. ```bash source ./set_up_dev_env.sh ``` -#### Graph Spec +For more customization and settings, use an .env file. Copy or rename the `.env.example` file to `.env`. -A Graph Spec yaml file defines which sources to include in a knowledge graph. 
Set one of the following environment variables (not both): +Then uncomment and edit `.env` as desired to set values for your environment. -```bash -# Option 1: Name of a file in the graph_specs/ directory -export ORION_GRAPH_SPEC=example-graph-spec.yaml +| Variable | Purpose | Default | +|---|------------------------------------------------------------|---| +| `ORION_STORAGE` | Path to a directory for data ingest pipeline storage | (required) | +| `ORION_GRAPHS` | Path to a directory for Knowledge Graph outputs | (required) | +| `ORION_LOGS` | Path to a Log file directory (if unset, logs go to stdout) | `None` | +| `ORION_GRAPH_SPEC` | Graph Spec filename from `graph_specs/` | `example-graph-spec.yaml` | +| `ORION_GRAPH_SPEC_URL` | URL to a remote Graph Spec file | | -# Option 2: URL pointing to a Graph Spec yaml file -export ORION_GRAPH_SPEC_URL=https://stars.renci.org/var/data_services/graph_specs/default-graph-spec.yaml -``` +Configuration is managed by [pydantic-settings](https://docs.pydantic.dev/latest/concepts/pydantic_settings/) — environment variables override `.env` file values, and sensible defaults are provided where possible. See `orion/config.py` for the full list of settings. + +#### Graph Spec + +A Graph Spec yaml file defines which sources to include in a knowledge graph. Set one of the following (not both): + +- `ORION_GRAPH_SPEC` - name of a file in the `graph_specs/` directory +- `ORION_GRAPH_SPEC_URL` - URL pointing to a Graph Spec yaml file Here is a simple Graph Spec example: @@ -100,6 +103,8 @@ See the `graph_specs/` directory for more examples. ### Running with Docker +Make sure environment variables are set or an `.env` file is configured with at least `ORION_STORAGE` and `ORION_GRAPHS` pointing to valid host directories. The compose file reads these env vars and mounts those directories as volumes in the container. 
+ Build the image: ```bash @@ -115,19 +120,19 @@ docker compose up Build a specific graph: ```bash -docker compose run --rm orion orion-build Example_Graph +docker compose run orion orion-build Example_Graph ``` Run the ingest pipeline for a single data source: ```bash -docker compose run --rm orion orion-ingest DrugCentral +docker compose run orion orion-ingest DrugCentral ``` See available data sources and options: ```bash -docker compose run --rm orion orion-ingest -h +docker compose run orion orion-ingest -h ``` ### Development From 2c53dd31c2c4833195954225b5a2191cbf57daed Mon Sep 17 00:00:00 2001 From: Evan Morris Date: Mon, 6 Apr 2026 16:42:27 -0700 Subject: [PATCH 10/14] adding dockerignore --- .dockerignore | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..25573bec --- /dev/null +++ b/.dockerignore @@ -0,0 +1,9 @@ +.git +.env +.idea +.DS_Store +.pytest_cache +.venv +__pycache__ +*.egg-info +dist \ No newline at end of file From e0b0275127b7d97ae3bee96a490f00baa82dd670 Mon Sep 17 00:00:00 2001 From: Evan Morris Date: Mon, 6 Apr 2026 20:23:41 -0700 Subject: [PATCH 11/14] removing unnecessary logs --- .github/workflows/test.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1b126e2a..945c8e29 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -14,10 +14,8 @@ jobs: - name: create env params run: | echo "ROBOKOP_HOME=$PWD" >> $GITHUB_ENV - mkdir -p $PWD/tests/workspace/logs mkdir -p $PWD/tests/workspace/storage mkdir -p $PWD/tests/workspace/graphs - echo "ORION_LOGS=$PWD/tests/workspace/logs" >> $GITHUB_ENV echo "ORION_STORAGE=$PWD/tests/workspace/storage" >> $GITHUB_ENV echo "ORION_GRAPHS=$PWD/tests/workspace/graphs" >> $GITHUB_ENV From ca0a15a5780abaf22a2489db383c59159c68ffbd Mon Sep 17 00:00:00 2001 From: Evan Morris Date: Mon, 6 Apr 2026 20:24:14 -0700 Subject: 
[PATCH 12/14] updating actions and removing unused command --- .github/workflows/release.yml | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index eb959fe5..4dee91cd 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -11,29 +11,28 @@ jobs: push_to_registry: name: Push Docker image to GitHub Packages tagged with "latest" and version number. runs-on: ubuntu-latest + permissions: + contents: read + packages: write steps: - name: Check out the repo uses: actions/checkout@v4 - - name: Get the version - id: get_version - run: echo ::set-output name=VERSION::${GITHUB_REF/refs\/tags\//} - name: Login to ghcr - uses: docker/login-action@f4ef78c080cd8ba55a85445d5b36e214a81df20a + uses: docker/login-action@v3 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Extract metadata (tags, labels) for Docker id: meta - uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 + uses: docker/metadata-action@v5 with: images: ghcr.io/${{ github.repository }} - name: Push to GitHub Packages - uses: docker/build-push-action@3b5e8027fcad23fda98b2e3ac259d8d67585f671 + uses: docker/build-push-action@v6 with: context: . 
push: true tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - build-args: VERSION=${{ steps.get_version.outputs.VERSION }} + labels: ${{ steps.meta.outputs.labels }} \ No newline at end of file From 88c7d8d70ae0be1c499d11301c3cc048ab4788a3 Mon Sep 17 00:00:00 2001 From: Evan Morris Date: Mon, 6 Apr 2026 21:12:23 -0700 Subject: [PATCH 13/14] bumping robokop-genetics --- pyproject.toml | 2 +- uv.lock | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fcadba77..c79d7b72 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ dependencies = [ "pyyaml>=6.0.1", "requests>=2.33.1", "requests-toolbelt>=1.0.0", - "robokop-genetics>=0.7.0", + "robokop-genetics>=0.8.0", "uuid-utils>=0.14.1", "xxhash>=3.6.0", ] diff --git a/uv.lock b/uv.lock index c6eeb1e7..d9e0afa0 100644 --- a/uv.lock +++ b/uv.lock @@ -931,16 +931,16 @@ wheels = [ [[package]] name = "robokop-genetics" -version = "0.7.0" +version = "0.8.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "bmt" }, { name = "redis" }, { name = "requests" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/41/a6/75b9edf1186d3dbfb485910b40570ea9c7452ffce195934de2a40d17167f/robokop_genetics-0.7.0.tar.gz", hash = "sha256:87eb12250867c18f7e149d869fe9173f664f83e90d6c7b910303fd9ba9efc931", size = 18837, upload-time = "2025-10-07T07:18:29.144Z" } +sdist = { url = "https://files.pythonhosted.org/packages/a7/17/063c21735ef3ce7fb5abb3d3c7bae122db37e45185ecec510bcab3dfb2b2/robokop_genetics-0.8.0.tar.gz", hash = "sha256:4aeb333e5b373b7e2d72f4d56329748a559f4196d0360e7b7d23d4c1a58a1985", size = 18669, upload-time = "2026-04-07T04:08:32.127Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7f/96/7e941b2ad392429aac56b0826965534e79edff20784eb767c702cab8fbef/robokop_genetics-0.7.0-py3-none-any.whl", hash = "sha256:fe33f004138f5feb5c43157411b146411bf249b97a1a6900348607f874dd8494", size = 
18695, upload-time = "2025-10-07T07:18:27.895Z" }, + { url = "https://files.pythonhosted.org/packages/ae/e8/dd70c2cc4e0e076d31a0266aaeb710620d987016eddc65c84f2dee08d07f/robokop_genetics-0.8.0-py3-none-any.whl", hash = "sha256:3ea16f1c72d8c0f3a4f73f9e5c7122347c304b425aafc3d67906ebc68d56fefe", size = 18313, upload-time = "2026-04-07T04:08:30.278Z" }, ] [[package]] @@ -1004,7 +1004,7 @@ requires-dist = [ { name = "redis", marker = "extra == 'robokop'", specifier = ">=5.2.1" }, { name = "requests", specifier = ">=2.33.1" }, { name = "requests-toolbelt", specifier = ">=1.0.0" }, - { name = "robokop-genetics", specifier = ">=0.7.0" }, + { name = "robokop-genetics", specifier = ">=0.8.0" }, { name = "uuid-utils", specifier = ">=0.14.1" }, { name = "xxhash", specifier = ">=3.6.0" }, ] From a050f3cbf2306de0a2f93cc8066deef9997d56c5 Mon Sep 17 00:00:00 2001 From: Evan Morris Date: Mon, 6 Apr 2026 21:55:22 -0700 Subject: [PATCH 14/14] adding more comments to the env.example --- .env.example | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.env.example b/.env.example index 75cecf91..856092b2 100644 --- a/.env.example +++ b/.env.example @@ -1,3 +1,8 @@ +# Copy or rename this file to ".env" to use it for environment variable configurations. +# +# ATTENTION: The only required environment variables are ORION_STORAGE and ORION_GRAPHS. The rest are optional and it's +# usually fine to leave them commented out or delete them, as the ORION config module will assign defaults. + # ---- Storage & Output ---- # Directory for source data downloads and ingest pipeline files @@ -9,7 +14,8 @@ ORION_GRAPHS=~/ORION_graphs/ # Directory for log files (if unset, logs go to stdout only) # ORION_LOGS= -# Base URL used when generating graph metadata +# Base URL utilized to generate URI identifiers utilized by metadata. 
+# For example, ROBOKOP graphs use https://robokop.renci.org/ # ORION_OUTPUT_URL=https://localhost # ---- Graph Spec ---- @@ -27,7 +33,7 @@ ORION_GRAPHS=~/ORION_graphs/ # ---- Biolink Model ---- -# Biolink model version +# Biolink model version (optional - don't set this and ORION will use the latest) # BL_VERSION=v4.3.4 # ---- Normalization URLs ----