
Commit bc6e4e2

adding ArangoDB support

1 parent 55c9e41 commit bc6e4e2

File tree: 16 files changed, +279 -32 lines changed

bin/eda.py (+2 -1)

@@ -7,7 +7,6 @@
 import hydra
 import matplotlib.pyplot as plt
 import networkx as nx
-import numpy as np
 from omegaconf import DictConfig
 from os.path import join as join_path
 import pandas as pd
@@ -159,10 +158,12 @@ def log_results(
 
     mlflow.end_run()
 
+
 ######################################
 # Main
 ######################################
 
+
 @hydra.main(version_base=None, config_path="../conf", config_name="config")
 def main(config: DictConfig) -> None:
     """

bin/process.py (+17 -10)

@@ -5,9 +5,7 @@
 ######################################
 
 import hydra
-import matplotlib.pyplot as plt
 import networkx as nx
-import numpy as np
 from omegaconf import DictConfig
 from os.path import join as join_path
 import pandas as pd
@@ -19,8 +17,11 @@
 
 
 def process_network(
-    feature_matrix: pd.DataFrame, edge_list: pd.DataFrame, from_col: str, to_col: str,
-    len_component: int = 5
+    feature_matrix: pd.DataFrame,
+    edge_list: pd.DataFrame,
+    from_col: str,
+    to_col: str,
+    len_component: int = 5,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """
     Construct a graph from edge list data.
@@ -49,7 +50,7 @@ def process_network(
         if len(component) <= len_component:
             for node in component:
                 G.remove_node(node)
-
+
     nodes = list(G.nodes)
     filtered_feature_matrix = feature_matrix[nodes]
     filtered_edge_list = nx.to_pandas_edgelist(G, source=from_col, target=to_col)
@@ -60,8 +61,8 @@ def log_results(
     tracking_uri: str,
     experiment_prefix: str,
     grn_name: str,
-    feature_matrix: pd.DataFrame,
-    edge_list: pd.DataFrame
+    feature_matrix: pd.DataFrame,
+    edge_list: pd.DataFrame,
 ) -> None:
     """
     Log experiment results to the experiment tracker.
@@ -94,14 +95,16 @@ def log_results(
 
     mlflow.log_metric("num_features", len(feature_matrix.index))
     mlflow.log_metric("num_nodes", len(feature_matrix.columns))
-    mlflow.log_metric("num_1st_order_relationships", len(edge_list.index))
+    mlflow.log_metric("num_edges", len(edge_list.index))
 
     mlflow.end_run()
 
+
 ######################################
 # Main
 ######################################
 
+
 @hydra.main(version_base=None, config_path="../conf", config_name="config")
 def main(config: DictConfig) -> None:
     """
@@ -116,6 +119,7 @@ def main(config: DictConfig) -> None:
 
     DATA_DIR = config["dir"]["data_dir"]
     PREPROCESS_DIR = config["dir"]["preprocessed_dir"]
+    PROCESS_DIR = config["dir"]["processed_dir"]
     OUT_DIR = config["dir"]["out_dir"]
 
     GRN_NAME = config["grn"]["input_dir"]
@@ -131,9 +135,11 @@ def main(config: DictConfig) -> None:
     feature_matrix = pd.read_csv(join_path(input_dir, FEATURE_MATRIX_FILE))
     edge_list = pd.read_csv(join_path(input_dir, EDGE_LIST_FILE))
 
-    filtered_feature_matrix, filtered_edge_list = process_network(feature_matrix, edge_list, FROM_COL, TO_COL)
+    filtered_feature_matrix, filtered_edge_list = process_network(
+        feature_matrix, edge_list, FROM_COL, TO_COL
+    )
 
-    output_dir = join_path(DATA_DIR, OUT_DIR, GRN_NAME, "process")
+    output_dir = join_path(DATA_DIR, OUT_DIR, GRN_NAME, PROCESS_DIR)
     Path(output_dir).mkdir(parents=True, exist_ok=True)
 
     filtered_feature_matrix.to_csv(join_path(output_dir, FEATURE_MATRIX_FILE))
@@ -148,5 +154,6 @@ def main(config: DictConfig) -> None:
         filtered_edge_list,
     )
 
+
 if __name__ == "__main__":
     main()
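
A minimal sketch of how the reworked process_network signature could be exercised on a toy network; the node names, values, and the `from bin.process import ...` path are illustrative assumptions, not part of the commit.

import pandas as pd

from bin.process import process_network  # assumes bin/ is importable as a package

# Toy inputs: a feature matrix whose columns are node names, and an edge list
# with explicit "from"/"to" columns, mirroring the DataFrames read in main().
feature_matrix = pd.DataFrame({"G1": [0.1, 0.4], "G2": [0.2, 0.5], "G3": [0.3, 0.6]})
edge_list = pd.DataFrame({"from": ["G1", "G2"], "to": ["G2", "G3"]})

# Connected components with at most len_component nodes are dropped, so this
# three-node component only survives because len_component is lowered here.
filtered_features, filtered_edges = process_network(
    feature_matrix, edge_list, "from", "to", len_component=2
)
print(filtered_features.columns.tolist(), len(filtered_edges.index))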

bin/to_db.py (new file, +186)

@@ -0,0 +1,186 @@
+#!/usr/bin/env python
+
+######################################
+# Imports
+######################################
+
+from adbnx_adapter import ADBNX_Adapter
+from arango import ArangoClient
+import hydra
+import networkx as nx
+from omegaconf import DictConfig
+from os.path import join as join_path
+import pandas as pd
+
+
+######################################
+# Functions
+######################################
+
+
+def log_results(
+    tracking_uri: str,
+    experiment_prefix: str,
+    grn_name: str,
+    feature_matrix: pd.DataFrame,
+    edge_list: pd.DataFrame,
+) -> None:
+    """
+    Log experiment results to the experiment tracker.
+
+    Args:
+        tracking_uri (str):
+            The tracking URI.
+        experiment_prefix (str):
+            The experiment name prefix.
+        grn_name (str):
+            The name of the GRN.
+        feature_matrix (pd.DataFrame):
+            The feature matrix.
+        edge_list (pd.DataFrame):
+            The edge list.
+    """
+    import mlflow
+
+    mlflow.set_tracking_uri(tracking_uri)
+
+    experiment_name = f"{experiment_prefix}_process"
+    existing_exp = mlflow.get_experiment_by_name(experiment_name)
+    if not existing_exp:
+        mlflow.create_experiment(experiment_name)
+    mlflow.set_experiment(experiment_name)
+
+    mlflow.set_tag("grn", grn_name)
+
+    mlflow.log_param("grn", grn_name)
+
+    mlflow.log_metric("num_features", len(feature_matrix.index))
+    mlflow.log_metric("num_nodes", len(feature_matrix.columns))
+    mlflow.log_metric("num_edges", len(edge_list.index))
+
+    mlflow.end_run()
+
+
+######################################
+# Main
+######################################
+
+
+def get_graph(
+    feature_matrix: pd.DataFrame, edge_list: pd.DataFrame, from_col: str, to_col: str
+) -> nx.Graph:
+    """
+    Construct a graph from edge list data.
+
+    Args:
+        feature_matrix (pd.DataFrame):
+            The feature matrix.
+        edge_list (pd.DataFrame):
+            The edge list.
+        from_col (str):
+            The "from" column name.
+        to_col (str):
+            The "to" column name.
+
+    Returns:
+        nx.Graph:
+            The graph to write to the database.
+    """
+    edges = edge_list.sort_values(from_col)
+
+    G = nx.from_pandas_edgelist(edges, from_col, to_col, create_using=nx.Graph())
+    node_features = feature_matrix.to_dict()
+    nx.set_node_attributes(G, node_features, "expression")
+
+    return G
+
+
+def to_db(
+    db_host: str,
+    db_name: str,
+    db_username: str,
+    db_password: str,
+    collection: str,
+    G: nx.Graph,
+) -> None:
+    """
+    Write the graph to the database.
+
+    Args:
+        db_host (str):
+            The database host.
+        db_name (str):
+            The database name.
+        db_username (str):
+            The database username.
+        db_password (str):
+            The database password.
+        collection (str):
+            The database collection.
+        G (nx.Graph):
+            The graph.
+    """
+    sys_db = ArangoClient(hosts=db_host).db(
+        "_system", username=db_username, password=db_password
+    )
+    if not sys_db.has_database(db_name):
+        sys_db.create_database(db_name)
+    db = ArangoClient(hosts=db_host).db(
+        db_name, username=db_username, password=db_password
+    )
+
+    edges_collection = f"{collection}_edges"
+    for db_collection in [collection, edges_collection]:
+        if db.has_collection(db_collection):
+            db.delete_collection(db_collection)
+
+    if db.has_graph(collection):
+        db.delete_graph(collection)
+
+    graph_definitions = [
+        {
+            "edge_collection": edges_collection,
+            "from_vertex_collections": [collection],
+            "to_vertex_collections": [collection],
+        }
+    ]
+
+    adapter = ADBNX_Adapter(db)
+    adapter.networkx_to_arangodb(collection, G, graph_definitions)
+
+
+@hydra.main(version_base=None, config_path="../conf", config_name="config")
+def main(config: DictConfig) -> None:
+    """
+    The main entry point for the plotting pipeline.
+
+    Args:
+        config (DictConfig):
+            The pipeline configuration.
+    """
+    # Constants
+    DATA_DIR = config["dir"]["data_dir"]
+    PROCESS_DIR = config["dir"]["processed_dir"]
+    OUT_DIR = config["dir"]["out_dir"]
+
+    GRN_NAME = config["grn"]["input_dir"]
+    FEATURE_MATRIX_FILE = config["grn"]["feature_matrix"]
+    EDGE_LIST_FILE = config["grn"]["edge_list"]
+    FROM_COL = config["grn"]["from_col"]
+    TO_COL = config["grn"]["to_col"]
+
+    DB_HOST = config["db"]["host"]
+    DB_NAME = config["db"]["name"]
+    DB_USERNAME = config["db"]["username"]
+    DB_PASSWORD = config["db"]["password"]
+
+    input_dir = join_path(DATA_DIR, OUT_DIR, GRN_NAME, PROCESS_DIR)
+    feature_matrix = pd.read_csv(join_path(input_dir, FEATURE_MATRIX_FILE))
+    edge_list = pd.read_csv(join_path(input_dir, EDGE_LIST_FILE))
+
+    G = get_graph(feature_matrix, edge_list, FROM_COL, TO_COL)
+    to_db(DB_HOST, DB_NAME, DB_USERNAME, DB_PASSWORD, GRN_NAME, G)
+
+
+if __name__ == "__main__":
+    main()
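
Once the pipeline has run, the upload can be spot-checked with python-arango using the settings from conf/db/graph.yaml. A rough sketch; the collection name "in_silico" is a guess at GRN_NAME (config["grn"]["input_dir"]) and is not confirmed by this commit.

from arango import ArangoClient

# Connection values mirror conf/db/graph.yaml; adjust for a non-default setup.
db = ArangoClient(hosts="http://arangodb:8529").db(
    "grn", username="root", password="password"
)

collection = "in_silico"  # assumed value of GRN_NAME
print("graph exists:", db.has_graph(collection))
print("vertices:", db.collection(collection).count())
print("edges:", db.collection(f"{collection}_edges").count())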

conf/config.yaml (+2)

@@ -1,6 +1,7 @@
 defaults:
   - _self_
   - grn: in_silico
+  - db: graph
   - experiment_tracking: docker
 
 experiment:
@@ -9,4 +10,5 @@ experiment:
 dir:
   data_dir: data
   preprocessed_dir: preprocessed
+  processed_dir: processed
   out_dir: out

conf/db/graph.yaml (new file, +4)

@@ -0,0 +1,4 @@
+host: http://arangodb:8529
+name: grn
+username: root
+password: password
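
The new db config group is pulled in through the `- db: graph` default added to conf/config.yaml, so any script decorated with @hydra.main sees it under config["db"]. Below is a minimal sketch of reading it outside a pipeline run via Hydra's compose API; the relative config_path is an assumption about where the snippet lives.

from hydra import compose, initialize

# config_path is resolved relative to the calling file, so this assumes the
# snippet sits next to the conf/ directory at the repository root.
with initialize(version_base=None, config_path="conf"):
    config = compose(config_name="config")
    print(config["db"]["host"])  # http://arangodb:8529
    print(config["db"]["name"])  # grn

At the command line the same values can be overridden per run, e.g. python bin/to_db.py db.host=http://localhost:8529.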

docker-compose.yml (+1)

@@ -16,6 +16,7 @@ services:
       AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID
       AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY
       MLFLOW_S3_ENDPOINT_URL: $MLFLOW_S3_ENDPOINT_URL
+      ARANGO_ROOT_PASSWORD: $ARANGO_ROOT_PASSWORD
     volumes:
       - ${PWD}:${PWD}:Z
       - /var/run/docker.sock:/var/run/docker.sock

docs/source/pipelines/index.rst (+2)

@@ -6,3 +6,5 @@ Nextflow Graph Machine Learning Pipelines
    :caption: Contents:
 
    eda.rst
+   process.rst
+   to_db.rst

docs/source/pipelines/process.rst (new file, +7)

@@ -0,0 +1,7 @@
+Process Data
+=================================================
+
+*Date published:* |today|
+
+.. automodule:: bin.process
+   :members:

docs/source/pipelines/to_db.rst (new file, +7)

@@ -0,0 +1,7 @@
+To Graph Database
+=================================================
+
+*Date published:* |today|
+
+.. automodule:: bin.to_db
+   :members:
