diff --git a/.env b/.env deleted file mode 100644 index da0872f7..00000000 --- a/.env +++ /dev/null @@ -1,5 +0,0 @@ -OPENAI_API_KEY=fake-key-do-not-commit-a-real-one!!! -OPENAI_API_ORGANIZATION=fake-org-do-not-commit-a-real-one!!! -BAGEL_SERVICE_USERNAME=fake-username-do-not-commit-a-real-one!!! -BAGEL_SERVICE_PASSWORD=fake-password-do-not-commit-a-real-one!!! -SHARED_SOURCE_DATA_PATH=/tmp/shared_data diff --git a/.env.sample b/.env.sample new file mode 100644 index 00000000..49858fa7 --- /dev/null +++ b/.env.sample @@ -0,0 +1,38 @@ +# ORION Core Configuration +# ALL VARIABLES ARE OPTIONAL +# Values shown here are examples; any variable left unset falls back to the default defined in Common/config.py + +# Paths for ORION's primary data storage + +# Location to store data from the data sources +# ORION_STORAGE and ORION_GRAPHS are created automatically if they do not exist +# ORION_STORAGE=Storage/ORION_STORAGE +# Location to store graph databases +# ORION_GRAPHS=Storage/ORION_KG +# Location to store logs (must be an existing directory; file logging is disabled when unset) +# ORION_LOGS= +# Data Storage Configuration +# Path for shared data across services +# SHARED_SOURCE_DATA_PATH= + +# ORION_OUTPUT_URL=https://localhost/ # this is currently only used to generate metadata +# ORION_TEST_MODE=False + +# Graph Specification +# Use either ORION_GRAPH_SPEC or ORION_GRAPH_SPEC_URL, not both (ORION_GRAPH_SPEC defaults to example-graph-spec.yaml, so set it to an empty value when using ORION_GRAPH_SPEC_URL) +# Name of the Graph Spec file located in the graph_specs directory +# ORION_GRAPH_SPEC=example-graph-spec.yaml +# ORION_GRAPH_SPEC_URL=https://stars.renci.org/var/data_services/graph_specs/default-graph-spec.yaml + +# Bagel Service Authentication +# Required for accessing the Bagel service +# BAGEL_SERVICE_USERNAME=your-username-here +# BAGEL_SERVICE_PASSWORD=your-password-here + +# URLs +# EDGE_NORMALIZATION_ENDPOINT=https://bl-lookup-sri.renci.org/ +# NODE_NORMALIZATION_ENDPOINT=https://nodenormalization-sri.renci.org/ +# NAMERES_URL=https://name-resolution-sri.renci.org/ +# SAPBERT_URL=https://babel-sapbert.apps.renci.org/ +# 
LITCOIN_PRED_MAPPING_URL=https://pred-mapping.apps.renci.org/ +# BAGEL_ENDPOINT=https://bagel.apps.renci.org/ diff --git a/.gitignore b/.gitignore index 5f1d0ea7..80d73592 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,56 @@ -# pycache -**/__pycache__ -*.pycache +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# Virtual environments +venv/ +ENV/ +env/ +.venv + +# IDE +.idea/ +.vscode/ +*.swp +*.swo +*~ +.DS_Store + +# Testing +.pytest_cache/ +.coverage +.tox/ +htmlcov/ + +# Environment variables .env -.idea \ No newline at end of file +.env.local +.env.*.local + +# Project-specific +Storage/ +*.log + +# Lock files (optional - comment out if you want to track them) +# uv.lock \ No newline at end of file diff --git a/Common/build_manager.py b/Common/build_manager.py index 1ce05bd8..aca8e5ab 100644 --- a/Common/build_manager.py +++ b/Common/build_manager.py @@ -22,6 +22,7 @@ from Common.meta_kg import MetaKnowledgeGraphBuilder, META_KG_FILENAME, TEST_DATA_FILENAME, EXAMPLE_DATA_FILENAME from Common.redundant_kg import generate_redundant_kg from Common.collapse_qualifiers import generate_collapsed_qualifiers_kg +from Common.config import CONFIG NODES_FILENAME = 'nodes.jsonl' EDGES_FILENAME = 'edges.jsonl' @@ -33,10 +34,9 @@ class GraphBuilder: def __init__(self, graph_specs_dir=None): - self.logger = LoggingUtil.init_logging("ORION.Common.GraphBuilder", line_format='medium', - log_file_path=os.environ['ORION_LOGS']) + log_file_path=CONFIG['ORION_LOGS']) self.graphs_dir = self.get_graphs_dir() # path to the graphs output directory self.source_data_manager = SourceDataManager() # access to the data sources and their metadata @@ -360,8 +360,8 @@ def generate_meta_kg_and_test_data(self, mkgb.write_example_data_to_file(example_data_file_path) def 
load_graph_specs(self, graph_specs_dir=None): - graph_spec_file = os.environ.get('ORION_GRAPH_SPEC', None) - graph_spec_url = os.environ.get('ORION_GRAPH_SPEC_URL', None) + graph_spec_file = CONFIG["ORION_GRAPH_SPEC"] + graph_spec_url = CONFIG["ORION_GRAPH_SPEC_URL"] if graph_spec_file and graph_spec_url: raise GraphSpecError(f'Configuration Error - the environment variables ORION_GRAPH_SPEC and ' @@ -524,7 +524,7 @@ def get_graph_dir_path(self, graph_id: str, graph_version: str): @staticmethod def get_graph_output_url(graph_id: str, graph_version: str): - graph_output_url = os.environ.get('ORION_OUTPUT_URL', "https://localhost/").removesuffix('/') + graph_output_url = CONFIG["ORION_OUTPUT_URL"] return f'{graph_output_url}/{graph_id}/{graph_version}/' @staticmethod @@ -551,14 +551,8 @@ def get_graph_metadata(self, graph_id: str, graph_version: str): @staticmethod def get_graphs_dir(): # confirm the directory specified by the environment variable ORION_GRAPHS is valid - graphs_dir = os.environ.get('ORION_GRAPHS', None) - if graphs_dir and Path(graphs_dir).is_dir(): - return os.environ['ORION_GRAPHS'] - - # if invalid or not specified back out - raise IOError('ORION graphs directory not configured properly. 
' - 'Specify a valid directory with environment variable ORION_GRAPHS.') - + graphs_dir = CONFIG.get_path("ORION_GRAPHS") + return graphs_dir if __name__ == '__main__': parser = argparse.ArgumentParser(description="Merge data sources into complete graphs.") diff --git a/Common/config.py b/Common/config.py index f5ec3b7d..8a5844ae 100644 --- a/Common/config.py +++ b/Common/config.py @@ -1,10 +1,99 @@ -import os +# import os +# from pathlib import Path +# from dotenv import dotenv_values + +# CONFIG = { +# **dotenv_values(Path(__file__).parents[1] / '.env'), # load config variables from .env +# **os.environ, # override loaded values with environment variables +# } + +from dataclasses import dataclass +from .utils import LoggingUtil from pathlib import Path -from dotenv import dotenv_values +from typing import ClassVar +from pydantic import field_validator, model_validator +from pydantic_settings import BaseSettings, SettingsConfigDict + +logger = LoggingUtil.init_logging("ORION.Common.config", + line_format='medium') + +class Config(BaseSettings): + model_config = SettingsConfigDict( + env_file= Path(__file__).parent.parent/".env", + env_file_encoding="utf-8", + ) # Load .env file and OS ENV Variables. 
+ + ORION_STORAGE: Path = Path.cwd()/"storage/orion_storage" + ORION_GRAPHS: Path = Path.cwd()/"storage/orion_graphs" + ORION_LOGS: Path | None = None + SHARED_SOURCE_DATA_PATH: Path | None = None + + ORION_OUTPUT_URL: str="https://localhost/" + ORION_TEST_MODE: bool=False + + ORION_GRAPH_SPEC: str="example-graph-spec.yaml" + ORION_GRAPH_SPEC_URL: str="" + + BAGEL_SERVICE_USERNAME: str="default_bagel_username" + BAGEL_SERVICE_PASSWORD: str="default_bagel_password" + + EDGE_NORMALIZATION_ENDPOINT: str="https://bl-lookup-sri.renci.org/" + NODE_NORMALIZATION_ENDPOINT: str="https://nodenormalization-sri.renci.org/" + NAMERES_URL: str="https://name-resolution-sri.renci.org/" + SAPBERT_URL: str="https://babel-sapbert.apps.renci.org/" + LITCOIN_PRED_MAPPING_URL: str="https://pred-mapping.apps.renci.org/" + BAGEL_ENDPOINT: str="https://bagel.apps.renci.org/" + + # class method to get an instance of the class, with an option to be able to reload + _instance: ClassVar["Config | None"] = None + + @classmethod + def get(cls, refresh: bool = False) -> "Config": + if cls._instance is None or refresh: + cls._instance = cls() + return cls._instance + + # Validation function for ORION_LOGS + @field_validator("ORION_LOGS") + @classmethod + def validate_logs_path(cls, v: Path | None) -> Path | None: + if v is None: + return None + if not v.exists(): + raise ValueError(f"ORION_LOGS path does not exist: {v}") + elif not v.is_dir(): + raise ValueError(f"ORION_LOGS is not a directory: {v}") + return v + + ## Making sure that either orion graph spec or orion graph spec url are set (not both) + @model_validator(mode="after") + def check_graph_spec(self) -> "Config": + if self.ORION_GRAPH_SPEC and self.ORION_GRAPH_SPEC_URL: + raise ValueError("Set either ORION_GRAPH_SPEC or ORION_GRAPH_SPEC_URL, not both") + if not self.ORION_GRAPH_SPEC_URL and not self.ORION_GRAPH_SPEC: + raise ValueError("Must set either ORION_GRAPH_SPEC or ORION_GRAPH_SPEC_URL") + return self + + ## Make relevant 
directory, and return the path for orion_storage, and orion_graphs + def get_path(self, name: str) -> Path: + if name not in ("ORION_STORAGE", "ORION_GRAPHS"): + raise ValueError(f"Unknown directory field: {name}") + + path = getattr(self, name) + try: + path.mkdir(parents=True, exist_ok=True) + except OSError as e: + raise ValueError(f"Failed to create {name} directory: {e}") + return path -CONFIG = { - **dotenv_values(Path(__file__).parents[1] / '.env'), # load config variables from .env - **os.environ, # override loaded values with environment variables -} +class ConfigProxy: + def __getattr__(self, name: str): + return getattr(Config.get(), name) + + def __getitem__(self, name: str): + return getattr(Config.get(), name) + def refresh(self): + Config.get(refresh=True) +CONFIG = ConfigProxy() \ No newline at end of file diff --git a/Common/kgx_file_merger.py b/Common/kgx_file_merger.py index 39b49ad0..a576b3d3 100644 --- a/Common/kgx_file_merger.py +++ b/Common/kgx_file_merger.py @@ -6,15 +6,15 @@ from Common.biolink_constants import SUBJECT_ID, OBJECT_ID from Common.merging import GraphMerger, DiskGraphMerger, MemoryGraphMerger from Common.load_manager import RESOURCE_HOGS +from Common.config import CONFIG # import line_profiler # import atexit # profile = line_profiler.LineProfiler() # atexit.register(profile.print_stats) - logger = LoggingUtil.init_logging("ORION.Common.KGXFileMerger", line_format='medium', - log_file_path=os.environ['ORION_LOGS']) + log_file_path=CONFIG["ORION_LOGS"]) CONNECTED_EDGE_SUBSET = 'connected_edge_subset' DONT_MERGE = 'dont_merge_edges' diff --git a/Common/kgx_file_normalizer.py b/Common/kgx_file_normalizer.py index 3ac25521..9f522ed7 100644 --- a/Common/kgx_file_normalizer.py +++ b/Common/kgx_file_normalizer.py @@ -9,7 +9,7 @@ NormalizationFailedError from Common.utils import LoggingUtil, chunk_iterator from Common.kgx_file_writer import KGXFileWriter - +from Common.config import CONFIG EDGE_PROPERTIES_THAT_SHOULD_BE_SETS = 
{AGGREGATOR_KNOWLEDGE_SOURCES, PUBLICATIONS} NODE_NORMALIZATION_BATCH_SIZE = 1_000_000 @@ -25,7 +25,7 @@ class KGXFileNormalizer: logger = LoggingUtil.init_logging("ORION.Common.KGXFileNormalizer", line_format='medium', level=logging.INFO, - log_file_path=os.environ['ORION_LOGS']) + log_file_path=CONFIG["ORION_LOGS"]) def __init__(self, source_nodes_file_path: str, diff --git a/Common/kgx_file_writer.py b/Common/kgx_file_writer.py index 54bf9e45..e171d2dd 100644 --- a/Common/kgx_file_writer.py +++ b/Common/kgx_file_writer.py @@ -6,14 +6,14 @@ from Common.kgxmodel import kgxnode, kgxedge from Common.biolink_constants import PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES, \ SUBJECT_ID, OBJECT_ID, PREDICATE - +from Common.config import CONFIG class KGXFileWriter: logger = LoggingUtil.init_logging("ORION.Common.KGXFileWriter", line_format='medium', level=logging.INFO, - log_file_path=os.environ.get('ORION_LOGS')) + log_file_path=CONFIG["ORION_LOGS"]) """ constructor :param nodes_output_file_path: the file path for the nodes file diff --git a/Common/load_manager.py b/Common/load_manager.py index 894dae60..ddc181b4 100644 --- a/Common/load_manager.py +++ b/Common/load_manager.py @@ -14,13 +14,13 @@ from Common.metadata import SourceMetadata from Common.loader_interface import SourceDataBrokenError, SourceDataFailedError from Common.supplementation import SequenceVariantSupplementation, SupplementationFailedError - +from Common.config import CONFIG SOURCE_DATA_LOADER_CLASSES = SourceDataLoaderClassFactory() logger = LoggingUtil.init_logging("ORION.Common.SourceDataManager", line_format='medium', - log_file_path=os.environ['ORION_LOGS']) + log_file_path=CONFIG["ORION_LOGS"]) class SourceDataManager: @@ -691,13 +691,7 @@ def get_source_version_path(self, source_id: str, source_version: str): def init_storage_dir(self): # use the storage directory specified by the environment variable ORION_STORAGE # check to make sure it's set and valid, otherwise fail - if 
"ORION_STORAGE" not in os.environ: - raise Exception(f'You must use the environment variable ORION_STORAGE ' - f'to specify a storage directory.') - if os.path.isdir(os.environ["ORION_STORAGE"]): - return os.environ["ORION_STORAGE"] - else: - raise IOError(f'Storage directory not valid: {os.environ["ORION_STORAGE"]}') + return CONFIG.get_path("ORION_STORAGE") def init_source_output_dir(self, source_id: str): source_dir_path = os.path.join(self.storage_dir, source_id) @@ -723,12 +717,8 @@ def init_source_output_dir(self, source_id: str): 'in the finalized kgx files.') args = parser.parse_args() - if 'ORION_TEST_MODE' in os.environ: - test_mode_from_env = os.environ['ORION_TEST_MODE'] - else: - test_mode_from_env = False - - loader_test_mode = args.test_mode or test_mode_from_env + test_mode_from_env = CONFIG["ORION_TEST_MODE"] + loader_test_mode = args.test_mode or test_mode_from_env ## TODO: Is this redundant? loader_strict_normalization = (not args.lenient_normalization) load_manager = SourceDataManager(test_mode=loader_test_mode, fresh_start_mode=args.fresh_start_mode) diff --git a/Common/loader_interface.py b/Common/loader_interface.py index 5ca94b6f..bc413950 100644 --- a/Common/loader_interface.py +++ b/Common/loader_interface.py @@ -4,7 +4,7 @@ import inspect from Common.kgx_file_writer import KGXFileWriter from Common.utils import LoggingUtil - +from Common.config import CONFIG class SourceDataLoader: @@ -36,7 +36,7 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): if not os.path.exists(self.data_path): os.mkdir(self.data_path) else: - self.data_path = os.environ.get("ORION_STORAGE") + self.data_path = CONFIG.get_path("ORION_STORAGE") # the final output lists of nodes and edges self.final_node_list: list = [] @@ -49,7 +49,7 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): self.logger = LoggingUtil.init_logging(f"ORION.parsers.{self.get_name()}", level=logging.INFO, line_format='medium', - 
log_file_path=os.environ.get('ORION_LOGS')) + log_file_path=CONFIG["ORION_LOGS"]) def get_latest_source_version(self): """Determine and return the latest source version ie. a unique identifier associated with the latest version.""" diff --git a/Common/merging.py b/Common/merging.py index 53071510..8349e36c 100644 --- a/Common/merging.py +++ b/Common/merging.py @@ -5,6 +5,7 @@ from Common.biolink_utils import BiolinkUtils from Common.biolink_constants import * from Common.utils import quick_json_loads, quick_json_dumps, chunk_iterator, LoggingUtil +from Common.config import CONFIG NODE_PROPERTIES_THAT_SHOULD_BE_SETS = {SYNONYMS, NODE_TYPES, SYNONYM} EDGE_PROPERTIES_THAT_SHOULD_BE_SETS = {AGGREGATOR_KNOWLEDGE_SOURCES, PUBLICATIONS, XREFS} @@ -13,10 +14,9 @@ EDGE_ENTITY_TYPE = 'edge' bmt = BiolinkUtils() - logger = LoggingUtil.init_logging("ORION.Common.merging", line_format='medium', - log_file_path=os.environ['ORION_LOGS']) + log_file_path=CONFIG["ORION_LOGS"]) def node_key_function(node): diff --git a/Common/neo4j_tools.py b/Common/neo4j_tools.py index fefce11d..6bb72eb1 100644 --- a/Common/neo4j_tools.py +++ b/Common/neo4j_tools.py @@ -5,7 +5,7 @@ import Common.kgx_file_converter as kgx_file_converter from Common.biolink_constants import NAMED_THING from Common.utils import LoggingUtil - +from Common.config import CONFIG class Neo4jTools: @@ -25,7 +25,7 @@ def __init__(self, self.neo4j_driver = neo4j.GraphDatabase.driver(self.graph_db_uri, auth=self.graph_db_auth) self.logger = LoggingUtil.init_logging("ORION.Common.neo4j_tools", line_format='medium', - log_file_path=os.environ['ORION_LOGS']) + log_file_path=CONFIG["ORION_LOGS"]) def import_csv_files(self, graph_directory: str, diff --git a/Common/normalization.py b/Common/normalization.py index 66a9e70a..defd5a89 100644 --- a/Common/normalization.py +++ b/Common/normalization.py @@ -9,6 +9,7 @@ from robokop_genetics.genetics_normalization import GeneticsNormalizer from Common.biolink_constants import * from 
Common.utils import LoggingUtil +from Common.config import CONFIG NORMALIZATION_CODE_VERSION = '1.4' @@ -18,7 +19,6 @@ # predicate to use when normalization fails FALLBACK_EDGE_PREDICATE = 'biolink:related_to' - @dataclass class NormalizationScheme: node_normalization_version: str = 'latest' @@ -49,8 +49,7 @@ def __init__(self, error_message: str, actual_error: Exception = None): self.error_message = error_message self.actual_error = actual_error -NODE_NORMALIZATION_URL = os.environ.get('NODE_NORMALIZATION_ENDPOINT', 'https://nodenormalization-sri.renci.org/') - +NODE_NORMALIZATION_URL = CONFIG["NODE_NORMALIZATION_ENDPOINT"] class NodeNormalizer: """ @@ -81,7 +80,7 @@ def __init__(self, self.logger = LoggingUtil.init_logging("ORION.Common.NodeNormalizer", level=log_level, line_format='medium', - log_file_path=os.environ.get('ORION_LOGS')) + log_file_path=CONFIG["ORION_LOGS"]) # storage for regular nodes that failed to normalize self.failed_to_normalize_ids = set() # storage for variant nodes that failed to normalize @@ -386,9 +385,6 @@ class EdgeNormalizer: """ Class that contains methods relating to edge normalization. 
""" - - DEFAULT_EDGE_NORM_ENDPOINT = f'https://bl-lookup-sri.renci.org/' - def __init__(self, edge_normalization_version: str = 'latest', log_level=logging.INFO): @@ -397,15 +393,13 @@ def __init__(self, :param log_level - overrides default log level """ # create a logger - self.logger = LoggingUtil.init_logging("ORION.Common.EdgeNormalizer", level=log_level, line_format='medium', log_file_path=os.environ.get('ORION_LOGS')) + self.logger = LoggingUtil.init_logging("ORION.Common.EdgeNormalizer", level=log_level, line_format='medium', + log_file_path=CONFIG["ORION_LOGS"]) # normalization map for future look up of all normalized predicates self.edge_normalization_lookup = {} self.cached_edge_norms = {} - - if 'EDGE_NORMALIZATION_ENDPOINT' in os.environ and os.environ['EDGE_NORMALIZATION_ENDPOINT']: - self.edge_norm_endpoint = os.environ['EDGE_NORMALIZATION_ENDPOINT'] - else: - self.edge_norm_endpoint = self.DEFAULT_EDGE_NORM_ENDPOINT + + self.edge_norm_endpoint = CONFIG["EDGE_NORMALIZATION_ENDPOINT"] if edge_normalization_version != 'latest': if self.check_bl_version_valid(edge_normalization_version): @@ -560,13 +554,11 @@ def get_valid_node_types(self): # this shouldn't happen, raise an exception resp.raise_for_status() - -NAME_RESOLVER_URL = os.getenv('NAMERES_URL', 'https://name-resolution-sri.renci.org') -NAME_RESOLVER_ENDPOINT = f'{NAME_RESOLVER_URL}/lookup' +NAME_RESOLVER_URL = CONFIG['NAMERES_URL'] +NAME_RESOLVER_ENDPOINT = f'{NAME_RESOLVER_URL}lookup' NAME_RESOLVER_HEADERS = {"accept": "application/json"} NAME_RESOLVER_API_ERROR = 'api_error' - def call_name_resolution(name: str, biolink_type: str, retries=0, logger=None): nameres_payload = { "string": name, diff --git a/Common/predicates.py b/Common/predicates.py index 3fdd7df2..54950e6b 100644 --- a/Common/predicates.py +++ b/Common/predicates.py @@ -2,6 +2,7 @@ import requests from Common.prefixes import * import time +from Common.config import CONFIG # these are predicates from DGIDB as well as drug and 
chemical activity types from drug central DGIDB_PREDICATE_MAPPING = { @@ -52,8 +53,8 @@ "xc50": f"RO:0002436" # This is related to ec50 and ic50 both of which describe binding events } -LITCOIN_PRED_MAPPING_URL = os.getenv('LITCOIN_PRED_MAPPING_URL', 'https://pred-mapping.apps.renci.org') -PRED_MAPPING_ENDPOINT = f'{LITCOIN_PRED_MAPPING_URL}/query/' +LITCOIN_PRED_MAPPING_URL = CONFIG["LITCOIN_PRED_MAPPING_URL"] +PRED_MAPPING_ENDPOINT = f'{LITCOIN_PRED_MAPPING_URL}query/' def call_pred_mapping(subject: str, obj: str, predicate: str, abstract: str, retries=0, logger=None): headers = { diff --git a/Common/utils.py b/Common/utils.py index 0adf94cd..be9eab57 100644 --- a/Common/utils.py +++ b/Common/utils.py @@ -51,7 +51,7 @@ def init_logging(name, level=logging.INFO, line_format='minimum', log_file_path= logger.setLevel(level) # if there was a file path passed in use it - if log_file_path is not None: + if log_file_path: # create a rotating file handler, 100mb max per file with a max number of 10 files file_handler = RotatingFileHandler(filename=os.path.join(log_file_path, name + '.log'), maxBytes=100000000, backupCount=10) diff --git a/celery_worker.py b/celery_worker.py index 26f2fdf7..8d9138db 100644 --- a/celery_worker.py +++ b/celery_worker.py @@ -1,7 +1,7 @@ from celery import Celery import subprocess import os - +from Common.config import CONFIG # Configure Celery to connect to the Redis broker celery_app = Celery( @@ -30,10 +30,11 @@ def run_build_manager(task_data): print(f'task_data: {task_data}', flush=True) # Run build_manager.py as a subprocess with the provided config os.environ["ORION_GRAPH_SPEC"] = task_data["graph_spec_filename"] + CONFIG.refresh() # no need to catch CalledProcessError exception, but rather let it propogate to Celery task handling result = subprocess.run( ["python", "build_manager.py", task_data["graph_id"], "--graph_specs_dir", - os.getenv('SHARED_SOURCE_DATA_PATH', None)], + CONFIG["SHARED_SOURCE_DATA_PATH"]], capture_output=True, 
text=True, check=True diff --git a/cli/memgraph_dump.py b/cli/memgraph_dump.py index 59e4ccbc..3999fd10 100644 --- a/cli/memgraph_dump.py +++ b/cli/memgraph_dump.py @@ -1,11 +1,11 @@ import argparse -import os from Common.utils import LoggingUtil from Common.memgraph_tools import create_memgraph_dump +from Common.config import CONFIG logger = LoggingUtil.init_logging("ORION.cli.memgraph_dump", line_format='medium', - log_file_path=os.environ['ORION_LOGS']) + log_file_path=CONFIG['ORION_LOGS']) if __name__ == '__main__': ap = argparse.ArgumentParser(description='') diff --git a/cli/neo4j_dump.py b/cli/neo4j_dump.py index 1f7b50ee..a697989f 100644 --- a/cli/neo4j_dump.py +++ b/cli/neo4j_dump.py @@ -2,10 +2,11 @@ import os from Common.utils import LoggingUtil from Common.neo4j_tools import create_neo4j_dump +from Common.config import CONFIG logger = LoggingUtil.init_logging("ORION.cli.neo4j_dump", line_format='medium', - log_file_path=os.environ['ORION_LOGS']) + log_file_path= CONFIG['ORION_LOGS']) if __name__ == '__main__': ap = argparse.ArgumentParser(description='') diff --git a/parsers/LitCoin/src/NER/nameres.py b/parsers/LitCoin/src/NER/nameres.py index 5156e3c8..17d4fb61 100644 --- a/parsers/LitCoin/src/NER/nameres.py +++ b/parsers/LitCoin/src/NER/nameres.py @@ -5,12 +5,14 @@ from parsers.LitCoin.src.NER.base import BaseNEREngine +from Common.config import CONFIG + # Configuration: NameRes -NAMERES_URL = os.getenv('NAMERES_URL', 'https://name-resolution-sri.renci.org/') + +NAMERES_URL = CONFIG["NAMERES_URL"] NAMERES_ENDPOINT = f'{NAMERES_URL}lookup' NAMERES_RL_ENDPOINT = f'{NAMERES_URL}reverse_lookup' - class NameResNEREngine(BaseNEREngine): def __init__(self, requests_session): """ diff --git a/parsers/LitCoin/src/NER/sapbert.py b/parsers/LitCoin/src/NER/sapbert.py index ab50389b..95beb81a 100644 --- a/parsers/LitCoin/src/NER/sapbert.py +++ b/parsers/LitCoin/src/NER/sapbert.py @@ -5,8 +5,10 @@ from parsers.LitCoin.src.NER.base import BaseNEREngine +from 
Common.config import CONFIG + # Configuration: get the SAPBERT URL and figure out the annotate path. -SAPBERT_URL = os.getenv('SAPBERT_URL', 'https://babel-sapbert.apps.renci.org/') +SAPBERT_URL = CONFIG["SAPBERT_URL"] SAPBERT_ANNOTATE_ENDPOINT = SAPBERT_URL + 'annotate/' SAPBERT_MODEL_NAME = "sapbert" SAPBERT_COUNT = 1000 # We've found that 1000 is about the minimum you need for reasonable results. diff --git a/parsers/LitCoin/src/bagel/bagel_gpt.py b/parsers/LitCoin/src/bagel/bagel_gpt.py deleted file mode 100644 index 93ab0869..00000000 --- a/parsers/LitCoin/src/bagel/bagel_gpt.py +++ /dev/null @@ -1,124 +0,0 @@ -import json -import os -from collections import defaultdict - -from Common.config import CONFIG -from Common.utils import LoggingUtil - -OPENAI_API_KEY = CONFIG.get("OPENAI_API_KEY") - -LLM_RESULTS = [] - - -logger = LoggingUtil.init_logging("ORION.Common.BagelGPT", - line_format='medium', - log_file_path=os.environ['ORION_LOGS']) - -def ask_classes_and_descriptions(text, term, termlist, abstract_id, requests_session): - """Get GPT results based only on the labels of the terms.""" - - # Get the Labels - labels = defaultdict(list) - descriptions = defaultdict(list) - for curie, annotation in termlist.items(): - labels[(annotation["label"], annotation["biolink_type"])].append(curie) - descriptions[(annotation["label"], annotation["biolink_type"])].append(annotation["description"]) - synonym_list = [(x[0], x[1], d) for x, d in descriptions.items()] - - # Define the Prompt - prompt = f""" You are an expert in biomedical vocabularies and ontologies. I will provide you with the abstract to a scientific paper, as well as - a query term: biomedical entity that occurs in that abstract. I will also provide you a list of possible synonyms for the query term, along - with their class as defined within their vocabulary, such as Gene or Disease. 
This will help you distinguish between - entities with the same name such as HIV, which could refer to either a particular virus (class OrganismTaxon) or a disease (class Disease). It can also - help distinguish between a disease hyperlipidemia (class Disease) versus hyperlipidemia as a symptom of another disease (class PhenotpyicFeature). - For some entities, I will also provide a description of the entity along with the name and class. - Please determine whether the query term, as it is used in the abstract, is an exact synonym of any of the terms in the list. There should be at most one - exact synonym of the query term. If there are no exact synonyms for the query term in the list, please look for narrow, broad, or related synonyms, - The synonym is narrow if the query term is a more specific form of one of the list terms. For example, the query term "Type 2 Diabetes" would be a - narrow synonym of "Diabetes" because it is not an exact synonym, but a more specific form. - The synonym is broad if the query term is a more general form of the list term. For instance, the query term "brain injury" would be a broad synonym - of "Cerebellar Injury" because it is more generic. - The synonym is related if it is neither exact, narrow, or broad, but is still a similar enough term. For instance the query term "Pain" would be - a related synonym of "Pain Disorder". - It is also possible that there are neither exact nor narrow synonyms of the query term in the list. - Provide your answers in the following JSON structure: - [ - {{ - "synonym": ..., - "vocabulary class": ..., - "synonymType": ... - }} - ] - where the value for synonym is the element from the synonym list, vocabulary class is the class that I input - associated with that synonym, and synonymType is either "exact", "narrow", "broad", or "related". 
- - abstract: {text} - query term: {term} - possible_synonyms_classes_and_descriptions: {synonym_list} - """ - - results = query(prompt, requests_session) - - LLM_RESULTS.append({ - 'abstract_id': abstract_id, - 'term': term, - 'prompt': prompt, - 'output': results - }) - - for result in results: - syn = result['synonym'] - cls = result['vocabulary class'] - syntype = result['synonymType'] - curies = labels[(syn, cls)] - for curie in curies: - termlist[curie]["synonym_Type"] = syntype - - grouped_by_syntype = defaultdict(list) - for curie in termlist: - syntype = termlist[curie].get("synonym_Type", None) - if syntype: - termlist[curie]["curie"] = curie - grouped_by_syntype[syntype].append(termlist[curie]) - return grouped_by_syntype - - -def query(prompt, requests_session): - headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer {OPENAI_API_KEY}" - } - - payload = { - "model": "gpt-4o-mini", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": prompt - } - ] - } - ] - } - - response = requests_session.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) - if response.status_code != 200: - logger.error(f'openai call returned non-200 status: {response.status_code}') - response.raise_for_status() - try: - content = response.json()["choices"][0]["message"]["content"] - # print(content) - except KeyError as k: - logger.warning(f'openai json did not contain expected key {k}: {response.json()}') - raise k - - try: - chunk = content[content.index("["):(content.rindex("]") + 1)] - output = json.loads(chunk) - except (json.JSONDecodeError, ValueError) as e: - logger.warning(f'openai results did not contain valid json chunk: {content}') - raise e - return output diff --git a/parsers/LitCoin/src/bagel/bagel_service.py b/parsers/LitCoin/src/bagel/bagel_service.py index 0706d787..8fe28cdd 100644 --- a/parsers/LitCoin/src/bagel/bagel_service.py +++ b/parsers/LitCoin/src/bagel/bagel_service.py @@ 
-2,20 +2,20 @@ from requests.auth import HTTPBasicAuth from Common.config import CONFIG -BAGEL_ENDPOINT = 'https://bagel.apps.renci.org/' +BAGEL_ENDPOINT = CONFIG["BAGEL_ENDPOINT"] BAGEL_ENDPOINT += 'find_curies_openai' -bagel_nameres_url = CONFIG.get('NAMERES_ENDPOINT', 'https://name-resolution-sri.renci.org/') +bagel_nameres_url = CONFIG["NAMERES_URL"] bagel_nameres_url += 'lookup?autocomplete=false&offset=0&limit=10&string="' -bagel_sapbert_url = CONFIG.get('SAPBERT_URL', 'https://sap-qdrant.apps.renci.org/') +bagel_sapbert_url = CONFIG["SAPBERT_URL"] # This default is different: 'https://sap-qdrant.apps.renci.org/' bagel_sapbert_url += "annotate/" -bagel_nodenorm_url = CONFIG.get('NODE_NORMALIZATION_ENDPOINT', 'https://nodenormalization-sri.renci.org/') +bagel_nodenorm_url = CONFIG["NODE_NORMALIZATION_ENDPOINT"] bagel_nodenorm_url += 'get_normalized_nodes' -BAGEL_SERVICE_USERNAME = CONFIG.get("BAGEL_SERVICE_USERNAME", 'default_bagel_username') -BAGEL_SERVICE_PASSWORD = CONFIG.get("BAGEL_SERVICE_PASSWORD", 'default_bagel_password') +BAGEL_SERVICE_USERNAME = CONFIG["BAGEL_SERVICE_USERNAME"] +BAGEL_SERVICE_PASSWORD = CONFIG["BAGEL_SERVICE_PASSWORD"] def call_bagel_service(text, entity, entity_type=''): diff --git a/parsers/LitCoin/src/loadLitCoin.py b/parsers/LitCoin/src/loadLitCoin.py index ab43e53f..76687976 100644 --- a/parsers/LitCoin/src/loadLitCoin.py +++ b/parsers/LitCoin/src/loadLitCoin.py @@ -12,6 +12,7 @@ from Common.normalization import call_name_resolution, NAME_RESOLVER_API_ERROR from Common.predicates import call_pred_mapping from Common.prefixes import PUBMED +from Common.config import CONFIG from parsers.LitCoin.src.bagel.bagel_service import call_bagel_service from parsers.LitCoin.src.bagel.bagel import get_orion_bagel_results, extract_best_match, \ @@ -84,7 +85,7 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): :param source_data_dir - the specific storage directory to save files in """ 
super().__init__(test_mode=test_mode, source_data_dir=source_data_dir) - self.shared_source_data_path = os.getenv('SHARED_SOURCE_DATA_PATH', None) + self.shared_source_data_path = CONFIG["SHARED_SOURCE_DATA_PATH"] self.data_url = 'https://stars.renci.org/var/data_services/litcoin/' self.version_file = 'litcoin.yaml' self.abstracts_file = 'abstracts_CompAndHeal.json' diff --git a/requirements.txt b/requirements.txt index 10ca6b31..ee7657b0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,3 +22,4 @@ python-dotenv>=1.0.1 polars>=1.19.0 celery>=5.4.0 redis>=5.2.1 +pydantic-settings diff --git a/set_up_test_env.sh b/set_up_test_env.sh deleted file mode 100644 index 46ed120c..00000000 --- a/set_up_test_env.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash - -#These environment variables are required by Data Services. See the README for more information. - -#ORION_STORAGE - a directory for storing data sources -mkdir -p "$PWD/../ORION_storage" -export ORION_STORAGE="$PWD/../ORION_storage/" - -#ORION_GRAPHS - a directory for storing knowledge graphs -mkdir -p "$PWD/../ORION_graphs" -export ORION_GRAPHS="$PWD/../ORION_graphs/" - -#ORION_LOGS - a directory for storing logs -mkdir -p "$PWD/../ORION_logs" -export ORION_LOGS="$PWD/../ORION_logs/" - -#Use EITHER of the following, ORION_GRAPH_SPEC or ORION_GRAPH_SPEC_URL - -#ORION_GRAPH_SPEC - the name of a Graph Spec file located in the graph_specs directory of ORION -export ORION_GRAPH_SPEC=example-graph-spec.yaml - -#ORION_GRAPH_SPEC_URL - a URL pointing to a Graph Spec file -#export ORION_GRAPH_SPEC_URL=https://raw.githubusercontent.com/RENCI-AUTOMAT/ORION/helm_deploy/graph_specs/yeast-graph-spec.yml - -export PYTHONPATH="$PYTHONPATH:$PWD" - -# The following environment variables are optional -# -# export EDGE_NORMALIZATION_ENDPOINT=https://bl-lookup-sri.renci.org/ -# export NODE_NORMALIZATION_ENDPOINT=https://nodenormalization-sri.renci.org/ -# export NAMERES_URL=https://name-resolution-sri.renci.org/ -# 
export SAPBERT_URL=https://babel-sapbert.apps.renci.org/ -# export LITCOIN_PRED_MAPPING_URL=https://pred-mapping.apps.renci.org/ - -# export ORION_OUTPUT_URL=https://localhost/ # this is currently only used to generate metadata -# export BL_VERSION=4.2.1 - -# if you are building your own docker image and issues occur, setting the correct platform may help -# export DOCKER_PLATFORM=linux/arm64 - diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..1372a11c --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +from Common import utils \ No newline at end of file diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 00000000..cfa92584 --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,22 @@ +import os +from unittest import mock +from pathlib import Path +from Common.config import CONFIG + +@mock.patch.dict(os.environ, { + "ORION_GRAPHS": str(Path.cwd()/"tmp/orion_graphs"), +}) + +def test_config_created_from_env_vars(): + if (Path.cwd()/"tmp/orion_graphs").exists(): + os.rmdir(Path.cwd()/"tmp/orion_graphs") + + CONFIG.refresh() + assert(CONFIG.ORION_STORAGE == Path.cwd()/"storage/orion_storage") + assert(CONFIG.ORION_GRAPHS == Path.cwd()/"tmp/orion_graphs") + assert(CONFIG.SHARED_SOURCE_DATA_PATH == Path("Storage/SHARED_DATA")) + assert(CONFIG.ORION_TEST_MODE) + + CONFIG.get_path('ORION_GRAPHS') + assert(CONFIG['ORION_GRAPHS'].exists()) + os.rmdir(Path.cwd()/"tmp/orion_graphs") diff --git a/tests/test_graph_spec.py b/tests/test_graph_spec.py index f82931cc..673e0fe0 100644 --- a/tests/test_graph_spec.py +++ b/tests/test_graph_spec.py @@ -4,7 +4,7 @@ from unittest.mock import MagicMock from Common.build_manager import GraphBuilder, GraphSpecError - +from Common.config import CONFIG def clear_graph_spec_config(): os.environ['ORION_GRAPH_SPEC'] = '' @@ -31,6 +31,8 @@ def get_source_data_manager_mock(): def test_empty_graph_spec_config(): clear_graph_spec_config() + with pytest.raises(ValueError): + 
CONFIG.refresh() with pytest.raises(GraphSpecError): graph_builder = GraphBuilder(graph_specs_dir=get_testing_graph_spec_dir()) @@ -38,6 +40,7 @@ def test_empty_graph_spec_config(): def test_invalid_graph_spec_config(): clear_graph_spec_config() os.environ['ORION_GRAPH_SPEC'] = 'invalid-spec.yaml' + CONFIG.refresh() with pytest.raises(GraphSpecError): graph_builder = GraphBuilder(graph_specs_dir=get_testing_graph_spec_dir()) @@ -45,6 +48,7 @@ def test_invalid_graph_spec_config(): def test_invalid_graph_spec_url_config(): clear_graph_spec_config() os.environ['ORION_GRAPH_SPEC_URL'] = 'http://localhost/invalid_graph_spec_url' + CONFIG.refresh() with pytest.raises(requests.exceptions.ConnectionError): graph_builder = GraphBuilder() @@ -52,6 +56,7 @@ def test_invalid_graph_spec_url_config(): # the graph spec is loaded up properly but doesn't attempt to determine versions when unspecified def test_valid_graph_spec_config(): reset_graph_spec_config() + CONFIG.refresh() graph_builder = GraphBuilder(graph_specs_dir=get_testing_graph_spec_dir()) assert len(graph_builder.graph_specs) testing_graph_spec = graph_builder.graph_specs.get('Testing_Graph', None) @@ -65,6 +70,7 @@ def test_valid_graph_spec_config(): # graph spec sources are able to return versions once source_version(s) are set def test_graph_spec_lazy_versions(): reset_graph_spec_config() + CONFIG.refresh() graph_builder = GraphBuilder(graph_specs_dir=get_testing_graph_spec_dir()) testing_graph_spec = graph_builder.graph_specs.get('Testing_Graph', None) for source in testing_graph_spec.sources: @@ -79,6 +85,7 @@ def test_graph_spec_lazy_versions(): # then see if a graph with a subgraph can properly determine graph versions def test_graph_spec_subgraph_version(): reset_graph_spec_config() + CONFIG.refresh() graph_builder = GraphBuilder(graph_specs_dir=get_testing_graph_spec_dir()) graph_builder.source_data_manager = get_source_data_manager_mock() @@ -100,6 +107,7 @@ def test_graph_spec_subgraph_version(): # make 
sure a graph spec with an invalid subgraph fails with the appropriate exception def test_graph_spec_invalid_subgraph(): reset_graph_spec_config() + CONFIG.refresh() graph_builder = GraphBuilder(graph_specs_dir=get_testing_graph_spec_dir()) graph_builder.source_data_manager = get_source_data_manager_mock() testing_graph_spec = graph_builder.graph_specs.get('Testing_Graph_3', None) @@ -111,6 +119,7 @@ def test_graph_spec_invalid_subgraph(): # make sure a graph spec with an invalid subgraph version (which is otherwise valid) fails to build def test_graph_spec_invalid_subgraph_version(): reset_graph_spec_config() + CONFIG.refresh() graph_builder = GraphBuilder(graph_specs_dir=get_testing_graph_spec_dir()) graph_builder.source_data_manager = get_source_data_manager_mock() testing_graph_spec = graph_builder.graph_specs.get('Testing_Graph_4', None)