Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions .env

This file was deleted.

38 changes: 38 additions & 0 deletions .env.sample
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# ORION Core Configuration
# ALL VARIABLES ARE OPTIONAL
# Values here are default values that the system will set if these environment variables are not set

# Paths for ORION's primary data storage

# Location to store data from the data sources
# The ORION_STORAGE and ORION_GRAPHS directories are created automatically if they do not exist
# ORION_STORAGE=Storage/ORION_STORAGE
# Location to store graph databases
# ORION_GRAPHS=Storage/ORION_KG
# Location to store logs
# ORION_LOGS=
# Data Storage Configuration
# Path for shared data across services
# SHARED_SOURCE_DATA_PATH=

# ORION_OUTPUT_URL=https://localhost/ # this is currently only used to generate metadata
# ORION_TEST_MODE=False

# Graph Specification
# Use either ORION_GRAPH_SPEC or ORION_GRAPH_SPEC_URL, not both
# Name of the Graph Spec file located in the graph_specs directory
# ORION_GRAPH_SPEC=example-graph-spec.yaml
# ORION_GRAPH_SPEC_URL=https://stars.renci.org/var/data_services/graph_specs/default-graph-spec.yaml

# Bagel Service Authentication
# Required for accessing the Bagel service
# BAGEL_SERVICE_USERNAME=your-username-here
# BAGEL_SERVICE_PASSWORD=your-password-here

# URLs
# EDGE_NORMALIZATION_ENDPOINT=https://bl-lookup-sri.renci.org/
# NODE_NORMALIZATION_ENDPOINT=https://nodenormalization-sri.renci.org/
# NAMERES_URL=https://name-resolution-sri.renci.org/
# SAPBERT_URL=https://babel-sapbert.apps.renci.org/
# LITCOIN_PRED_MAPPING_URL=https://pred-mapping.apps.renci.org/
# BAGEL_ENDPOINT=https://bagel.apps.renci.org/
59 changes: 55 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,56 @@
# pycache
**/__pycache__
*.pycache
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Virtual environments
venv/
ENV/
env/
.venv

# IDE
.idea/
.vscode/
*.swp
*.swo
*~
.DS_Store

# Testing
.pytest_cache/
.coverage
.tox/
htmlcov/

# Environment variables
.env
.idea
.env.local
.env.*.local

# Project-specific
Storage/
*.log

# Lock files (optional - comment out if you want to track them)
# uv.lock
20 changes: 7 additions & 13 deletions Common/build_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from Common.meta_kg import MetaKnowledgeGraphBuilder, META_KG_FILENAME, TEST_DATA_FILENAME, EXAMPLE_DATA_FILENAME
from Common.redundant_kg import generate_redundant_kg
from Common.collapse_qualifiers import generate_collapsed_qualifiers_kg
from Common.config import CONFIG

NODES_FILENAME = 'nodes.jsonl'
EDGES_FILENAME = 'edges.jsonl'
Expand All @@ -33,10 +34,9 @@ class GraphBuilder:

def __init__(self,
graph_specs_dir=None):

self.logger = LoggingUtil.init_logging("ORION.Common.GraphBuilder",
line_format='medium',
log_file_path=os.environ['ORION_LOGS'])
log_file_path=CONFIG['ORION_LOGS'])

self.graphs_dir = self.get_graphs_dir() # path to the graphs output directory
self.source_data_manager = SourceDataManager() # access to the data sources and their metadata
Expand Down Expand Up @@ -360,8 +360,8 @@ def generate_meta_kg_and_test_data(self,
mkgb.write_example_data_to_file(example_data_file_path)

def load_graph_specs(self, graph_specs_dir=None):
graph_spec_file = os.environ.get('ORION_GRAPH_SPEC', None)
graph_spec_url = os.environ.get('ORION_GRAPH_SPEC_URL', None)
graph_spec_file = CONFIG["ORION_GRAPH_SPEC"]
graph_spec_url = CONFIG["ORION_GRAPH_SPEC_URL"]

if graph_spec_file and graph_spec_url:
raise GraphSpecError(f'Configuration Error - the environment variables ORION_GRAPH_SPEC and '
Expand Down Expand Up @@ -524,7 +524,7 @@ def get_graph_dir_path(self, graph_id: str, graph_version: str):

@staticmethod
def get_graph_output_url(graph_id: str, graph_version: str):
    """Build the public URL for one version of a graph.

    :param graph_id: identifier of the graph
    :param graph_version: version string of the graph
    :return: a URL of the form ``<ORION_OUTPUT_URL>/<graph_id>/<graph_version>/``
    """
    # Strip a trailing slash from the configured base URL; the default value
    # ("https://localhost/") ends with one and would otherwise yield "//".
    graph_output_url = str(CONFIG["ORION_OUTPUT_URL"]).removesuffix('/')
    return f'{graph_output_url}/{graph_id}/{graph_version}/'

@staticmethod
Expand All @@ -551,14 +551,8 @@ def get_graph_metadata(self, graph_id: str, graph_version: str):
@staticmethod
def get_graphs_dir():
    """Return the path of the graphs output directory.

    The location comes from the ORION_GRAPHS setting. CONFIG.get_path creates
    the directory (including parents) when it is missing and raises ValueError
    if it cannot be created.

    :return: the graphs directory path as a string
    """
    # Returned as str, the type this method returned when the value was read
    # straight from os.environ, so existing callers keep working unchanged.
    return str(CONFIG.get_path("ORION_GRAPHS"))

if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Merge data sources into complete graphs.")
Expand Down
101 changes: 95 additions & 6 deletions Common/config.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,99 @@
import os
# import os
# from pathlib import Path
# from dotenv import dotenv_values

# CONFIG = {
# **dotenv_values(Path(__file__).parents[1] / '.env'), # load config variables from .env
# **os.environ, # override loaded values with environment variables
# }

from dataclasses import dataclass
from .utils import LoggingUtil
from pathlib import Path
from dotenv import dotenv_values
from typing import ClassVar
from pydantic import field_validator, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict

logger = LoggingUtil.init_logging("ORION.Common.config",
line_format='medium')

class Config(BaseSettings):
    """Typed ORION settings, loaded from OS environment variables and the .env file.

    Every field has a default, so no variable is strictly required. Use
    ``Config.get()`` to obtain the shared instance instead of instantiating
    the class directly.
    """

    model_config = SettingsConfigDict(
        env_file=Path(__file__).parent.parent / ".env",
        env_file_encoding="utf-8",
        # pydantic-settings rejects unknown keys by default, so a .env file that
        # also holds variables not declared on this class would fail validation.
        extra="ignore",
    )  # Load .env file and OS ENV Variables.

    # Storage locations. The defaults are resolved against the working directory
    # at import time.
    # NOTE(review): .env.sample documents Storage/ORION_STORAGE and Storage/ORION_KG
    # as the defaults, which differ from the paths below - confirm which is intended.
    ORION_STORAGE: Path = Path.cwd() / "storage/orion_storage"
    ORION_GRAPHS: Path = Path.cwd() / "storage/orion_graphs"
    ORION_LOGS: Path | None = None
    SHARED_SOURCE_DATA_PATH: Path | None = None

    ORION_OUTPUT_URL: str = "https://localhost/"
    ORION_TEST_MODE: bool = False

    # Graph spec selection; check_graph_spec enforces that exactly one is in effect.
    ORION_GRAPH_SPEC: str = "example-graph-spec.yaml"
    ORION_GRAPH_SPEC_URL: str = ""

    BAGEL_SERVICE_USERNAME: str = "default_bagel_username"
    BAGEL_SERVICE_PASSWORD: str = "default_bagel_password"

    # Service endpoints
    EDGE_NORMALIZATION_ENDPOINT: str = "https://bl-lookup-sri.renci.org/"
    NODE_NORMALIZATION_ENDPOINT: str = "https://nodenormalization-sri.renci.org/"
    NAMERES_URL: str = "https://name-resolution-sri.renci.org/"
    SAPBERT_URL: str = "https://babel-sapbert.apps.renci.org/"
    LITCOIN_PRED_MAPPING_URL: str = "https://pred-mapping.apps.renci.org/"
    BAGEL_ENDPOINT: str = "https://bagel.apps.renci.org/"

    # Cached shared instance, populated lazily by get()
    _instance: ClassVar["Config | None"] = None

    @classmethod
    def get(cls, refresh: bool = False) -> "Config":
        """Return the shared Config instance, creating it on first use.

        :param refresh: when True, rebuild the instance so the environment and
            .env file are read again
        :return: the shared Config instance
        """
        if cls._instance is None or refresh:
            cls._instance = cls()
        return cls._instance

    @field_validator("ORION_LOGS")
    @classmethod
    def validate_logs_path(cls, v: Path | None) -> Path | None:
        """Check that ORION_LOGS, when provided, is an existing directory.

        :raises ValueError: if the path does not exist or is not a directory
        """
        if v is None:
            return None
        if not v.exists():
            raise ValueError(f"ORION_LOGS path does not exist: {v}")
        if not v.is_dir():
            raise ValueError(f"ORION_LOGS is not a directory: {v}")
        return v

    @model_validator(mode="after")
    def check_graph_spec(self) -> "Config":
        """Ensure exactly one of ORION_GRAPH_SPEC / ORION_GRAPH_SPEC_URL is in effect.

        :raises ValueError: if both are set, or neither is set
        """
        # ORION_GRAPH_SPEC has a non-empty default, so supplying only the URL
        # would always trip the "not both" check. When the URL is given and the
        # file name was not explicitly provided, drop the default file name.
        if self.ORION_GRAPH_SPEC_URL and "ORION_GRAPH_SPEC" not in self.model_fields_set:
            self.ORION_GRAPH_SPEC = ""

        if self.ORION_GRAPH_SPEC and self.ORION_GRAPH_SPEC_URL:
            raise ValueError("Set either ORION_GRAPH_SPEC or ORION_GRAPH_SPEC_URL, not both")
        if not self.ORION_GRAPH_SPEC_URL and not self.ORION_GRAPH_SPEC:
            raise ValueError("Must set either ORION_GRAPH_SPEC or ORION_GRAPH_SPEC_URL")
        return self

    def get_path(self, name: str) -> Path:
        """Return the ORION_STORAGE or ORION_GRAPHS directory, creating it if needed.

        :param name: either "ORION_STORAGE" or "ORION_GRAPHS"
        :return: the configured directory path
        :raises ValueError: if name is not one of the two supported fields, or
            the directory cannot be created
        """
        if name not in ("ORION_STORAGE", "ORION_GRAPHS"):
            raise ValueError(f"Unknown directory field: {name}")

        path = getattr(self, name)
        try:
            path.mkdir(parents=True, exist_ok=True)
        except OSError as e:
            # chain the original error so the underlying OS failure stays visible
            raise ValueError(f"Failed to create {name} directory: {e}") from e
        return path

CONFIG = {
**dotenv_values(Path(__file__).parents[1] / '.env'), # load config variables from .env
**os.environ, # override loaded values with environment variables
}
class ConfigProxy:
    """Lazy accessor for the shared Config instance.

    Supports both attribute access (``CONFIG.ORION_LOGS``) and item access
    (``CONFIG["ORION_LOGS"]``). The Config instance is only built when a value
    is first requested.
    """

    def __getattr__(self, name: str):
        """Forward attribute lookups (fields and methods) to the shared Config."""
        return getattr(Config.get(), name)

    def __getitem__(self, name: str):
        """Return the setting called name.

        :raises KeyError: if the shared Config has no attribute with that name
        """
        settings = Config.get()
        try:
            return getattr(settings, name)
        except AttributeError:
            # Item access should fail the way a mapping does; a plain dict
            # lookup raises KeyError for a missing name.
            raise KeyError(name) from None

    def refresh(self):
        """Rebuild the shared Config so the environment and .env file are re-read."""
        Config.get(refresh=True)

CONFIG = ConfigProxy()
4 changes: 2 additions & 2 deletions Common/kgx_file_merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@
from Common.biolink_constants import SUBJECT_ID, OBJECT_ID
from Common.merging import GraphMerger, DiskGraphMerger, MemoryGraphMerger
from Common.load_manager import RESOURCE_HOGS
from Common.config import CONFIG

# import line_profiler
# import atexit
# profile = line_profiler.LineProfiler()
# atexit.register(profile.print_stats)

logger = LoggingUtil.init_logging("ORION.Common.KGXFileMerger",
line_format='medium',
log_file_path=os.environ['ORION_LOGS'])
log_file_path=CONFIG["ORION_LOGS"])

CONNECTED_EDGE_SUBSET = 'connected_edge_subset'
DONT_MERGE = 'dont_merge_edges'
Expand Down
4 changes: 2 additions & 2 deletions Common/kgx_file_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
NormalizationFailedError
from Common.utils import LoggingUtil, chunk_iterator
from Common.kgx_file_writer import KGXFileWriter

from Common.config import CONFIG

EDGE_PROPERTIES_THAT_SHOULD_BE_SETS = {AGGREGATOR_KNOWLEDGE_SOURCES, PUBLICATIONS}
NODE_NORMALIZATION_BATCH_SIZE = 1_000_000
Expand All @@ -25,7 +25,7 @@ class KGXFileNormalizer:
logger = LoggingUtil.init_logging("ORION.Common.KGXFileNormalizer",
line_format='medium',
level=logging.INFO,
log_file_path=os.environ['ORION_LOGS'])
log_file_path=CONFIG["ORION_LOGS"])

def __init__(self,
source_nodes_file_path: str,
Expand Down
4 changes: 2 additions & 2 deletions Common/kgx_file_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@
from Common.kgxmodel import kgxnode, kgxedge
from Common.biolink_constants import PRIMARY_KNOWLEDGE_SOURCE, AGGREGATOR_KNOWLEDGE_SOURCES, \
SUBJECT_ID, OBJECT_ID, PREDICATE

from Common.config import CONFIG

class KGXFileWriter:

logger = LoggingUtil.init_logging("ORION.Common.KGXFileWriter",
line_format='medium',
level=logging.INFO,
log_file_path=os.environ.get('ORION_LOGS'))
log_file_path=CONFIG["ORION_LOGS"])
"""
constructor
:param nodes_output_file_path: the file path for the nodes file
Expand Down
20 changes: 5 additions & 15 deletions Common/load_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@
from Common.metadata import SourceMetadata
from Common.loader_interface import SourceDataBrokenError, SourceDataFailedError
from Common.supplementation import SequenceVariantSupplementation, SupplementationFailedError

from Common.config import CONFIG

SOURCE_DATA_LOADER_CLASSES = SourceDataLoaderClassFactory()

logger = LoggingUtil.init_logging("ORION.Common.SourceDataManager",
line_format='medium',
log_file_path=os.environ['ORION_LOGS'])
log_file_path=CONFIG["ORION_LOGS"])


class SourceDataManager:
Expand Down Expand Up @@ -691,13 +691,7 @@ def get_source_version_path(self, source_id: str, source_version: str):
def init_storage_dir(self):
    """Return the path of the storage directory for source data.

    The location comes from the ORION_STORAGE setting. CONFIG.get_path creates
    the directory (including parents) when it is missing and raises ValueError
    if it cannot be created.

    :return: the storage directory path as a string
    """
    # Returned as str, the type this method returned when the value was read
    # straight from os.environ, so existing callers keep working unchanged.
    return str(CONFIG.get_path("ORION_STORAGE"))

def init_source_output_dir(self, source_id: str):
source_dir_path = os.path.join(self.storage_dir, source_id)
Expand All @@ -723,12 +717,8 @@ def init_source_output_dir(self, source_id: str):
'in the finalized kgx files.')
args = parser.parse_args()

if 'ORION_TEST_MODE' in os.environ:
test_mode_from_env = os.environ['ORION_TEST_MODE']
else:
test_mode_from_env = False

loader_test_mode = args.test_mode or test_mode_from_env
test_mode_from_env = CONFIG["ORION_TEST_MODE"]
loader_test_mode = args.test_mode or test_mode_from_env ## TODO: Is this redundant?
loader_strict_normalization = (not args.lenient_normalization)
load_manager = SourceDataManager(test_mode=loader_test_mode,
fresh_start_mode=args.fresh_start_mode)
Expand Down
6 changes: 3 additions & 3 deletions Common/loader_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import inspect
from Common.kgx_file_writer import KGXFileWriter
from Common.utils import LoggingUtil

from Common.config import CONFIG

class SourceDataLoader:

Expand Down Expand Up @@ -36,7 +36,7 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None):
if not os.path.exists(self.data_path):
os.mkdir(self.data_path)
else:
self.data_path = os.environ.get("ORION_STORAGE")
self.data_path = CONFIG.get_path("ORION_STORAGE")

# the final output lists of nodes and edges
self.final_node_list: list = []
Expand All @@ -49,7 +49,7 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None):
self.logger = LoggingUtil.init_logging(f"ORION.parsers.{self.get_name()}",
level=logging.INFO,
line_format='medium',
log_file_path=os.environ.get('ORION_LOGS'))
log_file_path=CONFIG["ORION_LOGS"])

def get_latest_source_version(self):
"""Determine and return the latest source version ie. a unique identifier associated with the latest version."""
Expand Down
Loading
Loading