Skip to content

Commit 55c9e41

Browse files
committed
adding processing script
1 parent bc9bf24 commit 55c9e41

File tree

5 files changed

+213
-5
lines changed

5 files changed

+213
-5
lines changed

bin/eda.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from pathlib import Path
1515

1616
######################################
17-
# Main
17+
# Functions
1818
######################################
1919

2020

@@ -159,6 +159,9 @@ def log_results(
159159

160160
mlflow.end_run()
161161

162+
######################################
163+
# Main
164+
######################################
162165

163166
@hydra.main(version_base=None, config_path="../conf", config_name="config")
164167
def main(config: DictConfig) -> None:

bin/process.py

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
#!/usr/bin/env python
2+
3+
######################################
4+
# Imports
5+
######################################
6+
7+
import hydra
8+
import matplotlib.pyplot as plt
9+
import networkx as nx
10+
import numpy as np
11+
from omegaconf import DictConfig
12+
from os.path import join as join_path
13+
import pandas as pd
14+
from pathlib import Path
15+
16+
######################################
17+
# Functions
18+
######################################
19+
20+
21+
def process_network(
    feature_matrix: pd.DataFrame, edge_list: pd.DataFrame, from_col: str, to_col: str,
    len_component: int = 5
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Remove small connected components from a network.

    An undirected graph is built from the edge list solely to find connected
    components. Every component with ``len_component`` nodes or fewer is
    dropped, and the feature matrix and edge list are restricted to the
    surviving nodes.

    Args:
        feature_matrix (pd.DataFrame):
            The feature matrix. Its columns are selected by node name.
        edge_list (pd.DataFrame):
            The edge list.
        from_col (str):
            The "from" column name.
        to_col (str):
            The "to" column name.
        len_component (int, optional):
            The largest component size (in nodes) that is still removed; only
            components with more nodes than this are kept. Defaults to 5.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]:
            The feature matrix restricted to the surviving nodes' columns, and
            the edge list (``from_col`` and ``to_col`` columns only) restricted
            to edges between surviving nodes.

    Raises:
        KeyError: If a surviving node has no column in ``feature_matrix``.
    """
    edges = edge_list.sort_values(from_col)

    # The undirected graph is used for connectivity and node order only.
    G = nx.from_pandas_edgelist(edges, from_col, to_col, create_using=nx.Graph())

    # Collect every node of every small component before mutating the graph.
    small_component_nodes = {
        node
        for component in nx.connected_components(G)
        if len(component) <= len_component
        for node in component
    }
    G.remove_nodes_from(small_component_nodes)

    nodes = list(G.nodes)
    filtered_feature_matrix = feature_matrix[nodes]

    # Filter the original rows instead of exporting the undirected graph:
    # exporting reports each edge in node-iteration order, which can swap the
    # "from" and "to" endpoints and merges reciprocal edges into one.
    keep_edge = edges[from_col].isin(nodes) & edges[to_col].isin(nodes)
    filtered_edge_list = (
        edges.loc[keep_edge, [from_col, to_col]]
        .drop_duplicates()  # repeated rows were also collapsed by the graph
        .reset_index(drop=True)
    )
    return filtered_feature_matrix, filtered_edge_list
57+
58+
59+
def log_results(
    tracking_uri: str,
    experiment_prefix: str,
    grn_name: str,
    feature_matrix: pd.DataFrame,
    edge_list: pd.DataFrame
) -> None:
    """
    Log experiment results to the experiment tracker.

    Logs to the experiment named ``"<experiment_prefix>_process"``. The GRN
    name is recorded as both a tag and a parameter, together with three
    metrics: the number of feature matrix rows (``num_features``), the number
    of feature matrix columns (``num_nodes``) and the number of edge list rows
    (``num_1st_order_relationships``).

    Args:
        tracking_uri (str):
            The tracking URI.
        experiment_prefix (str):
            The experiment name prefix.
        grn_name (str):
            The name of the GRN.
        feature_matrix (pd.DataFrame):
            The feature matrix.
        edge_list (pd.DataFrame):
            The edge list.
    """
    # Local import: mlflow is only loaded when this function is called.
    import mlflow

    mlflow.set_tracking_uri(tracking_uri)

    # set_experiment creates the experiment when it does not exist yet, so no
    # separate lookup/create step is needed.
    mlflow.set_experiment(f"{experiment_prefix}_process")

    run_status = "FAILED"
    try:
        # The first logging call implicitly starts a run.
        mlflow.set_tag("grn", grn_name)

        mlflow.log_param("grn", grn_name)

        mlflow.log_metric("num_features", len(feature_matrix.index))
        mlflow.log_metric("num_nodes", len(feature_matrix.columns))
        mlflow.log_metric("num_1st_order_relationships", len(edge_list.index))

        run_status = "FINISHED"
    finally:
        # Always close the run so a failure cannot leave it active.
        mlflow.end_run(status=run_status)
100+
101+
######################################
102+
# Main
103+
######################################
104+
105+
@hydra.main(version_base=None, config_path="../conf", config_name="config")
def main(config: DictConfig) -> None:
    """
    The main entry point for the network processing pipeline.

    Reads the configured feature matrix and edge list from the preprocessed
    data directory, filters them with ``process_network``, writes both results
    to the GRN's "process" output directory and, when experiment tracking is
    enabled, logs summary results.

    Args:
        config (DictConfig):
            The pipeline configuration.
    """
    # Read every setting up front so a missing key fails before any I/O.
    experiment_prefix = config["experiment"]["name"]

    dir_cfg = config["dir"]
    data_dir = dir_cfg["data_dir"]
    preprocessed_dir = dir_cfg["preprocessed_dir"]
    out_dir = dir_cfg["out_dir"]

    grn_cfg = config["grn"]
    grn_name = grn_cfg["input_dir"]
    feature_matrix_file = grn_cfg["feature_matrix"]
    edge_list_file = grn_cfg["edge_list"]
    from_col = grn_cfg["from_col"]
    to_col = grn_cfg["to_col"]

    tracking_cfg = config["experiment_tracking"]
    tracking_uri = tracking_cfg["tracking_uri"]
    tracking_enabled = tracking_cfg["enabled"]

    # Load the inputs.
    source_dir = Path(data_dir, preprocessed_dir, grn_name)
    raw_features = pd.read_csv(source_dir / feature_matrix_file)
    raw_edges = pd.read_csv(source_dir / edge_list_file)

    kept_features, kept_edges = process_network(raw_features, raw_edges, from_col, to_col)

    # Write the outputs.
    target_dir = Path(data_dir, out_dir, grn_name, "process")
    target_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): the feature matrix is written with its index column while
    # the edge list is not -- confirm downstream readers expect that.
    kept_features.to_csv(target_dir / feature_matrix_file)
    kept_edges.to_csv(target_dir / edge_list_file, index=False)

    if tracking_enabled:
        log_results(
            tracking_uri,
            experiment_prefix,
            grn_name,
            kept_features,
            kept_edges,
        )


if __name__ == "__main__":
    main()

data/out/in_silico/process/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
/expression_data.csv
2+
/gold_standard.csv

dvc.lock

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,12 @@ stages:
3030
size: 48365
3131
isexec: true
3232
eda:
33-
cmd: python bin/eda.py grn.input_dir=in_silico
33+
cmd: python bin/eda.py grn.input_dir=in_silico grn.edge_list=gold_standard.csv
3434
deps:
3535
- path: bin/eda.py
3636
hash: md5
37-
md5: 91405f1eed106719693d4de10e9cd9ca
38-
size: 5343
37+
md5: c2f1b995f4f9f7b040c43935a25d4b62
38+
size: 5432
3939
- path: conf/config.yaml
4040
hash: md5
4141
md5: 1dda4ab35427c3f6d36d6649032c2cd2
@@ -63,3 +63,42 @@ stages:
6363
md5: ec1ac03c4923b48c493eab3886bea489
6464
size: 191
6565
isexec: true
66+
process:
67+
cmd: python bin/process.py grn.input_dir=in_silico grn.feature_matrix=expression_data.csv
68+
grn.edge_list=gold_standard.csv
69+
deps:
70+
- path: bin/process.py
71+
hash: md5
72+
md5: 00b4bc49fcac48a47cb61cc8940d48fd
73+
size: 4582
74+
- path: conf/config.yaml
75+
hash: md5
76+
md5: 1dda4ab35427c3f6d36d6649032c2cd2
77+
size: 210
78+
- path: conf/experiment_tracking/docker.yaml
79+
hash: md5
80+
md5: f9a686d34db5162fc959ca7470695aad
81+
size: 47
82+
- path: conf/grn/in_silico.yaml
83+
hash: md5
84+
md5: 53861163fdc851b7a76f234bbe196701
85+
size: 117
86+
- path: data/preprocessed/in_silico/expression_data.csv
87+
hash: md5
88+
md5: bc5772dc41a43050fb822f065a5c4d2a
89+
size: 13063512
90+
- path: data/preprocessed/in_silico/gold_standard.csv
91+
hash: md5
92+
md5: 280e46f849dffad692ec41922b1304e3
93+
size: 48325
94+
outs:
95+
- path: data/out/in_silico/process/expression_data.csv
96+
hash: md5
97+
md5: 70b70518c6dbd8489451b8fc88c5e000
98+
size: 12415950
99+
isexec: true
100+
- path: data/out/in_silico/process/gold_standard.csv
101+
hash: md5
102+
md5: 8988a2f805eaa860e550ea6613eef862
103+
size: 40110
104+
isexec: true

dvc.yaml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
stages:
22
eda:
3-
cmd: python bin/eda.py grn.input_dir=in_silico
3+
cmd: python bin/eda.py grn.input_dir=in_silico grn.edge_list=gold_standard.csv
44
deps:
55
- conf/config.yaml
66
- conf/grn/in_silico.yaml
@@ -10,3 +10,15 @@ stages:
1010
outs:
1111
- data/out/in_silico/eda/graph.png
1212
- data/out/in_silico/eda/metrics.csv
13+
process:
14+
cmd: python bin/process.py grn.input_dir=in_silico grn.feature_matrix=expression_data.csv grn.edge_list=gold_standard.csv
15+
deps:
16+
- conf/config.yaml
17+
- conf/grn/in_silico.yaml
18+
- conf/experiment_tracking/docker.yaml
19+
- data/preprocessed/in_silico/expression_data.csv
20+
- data/preprocessed/in_silico/gold_standard.csv
21+
- bin/process.py
22+
outs:
23+
- data/out/in_silico/process/expression_data.csv
24+
- data/out/in_silico/process/gold_standard.csv

0 commit comments

Comments
 (0)