
MVP done but need to refine statistical model
Gabrielstav committed Mar 24, 2024
1 parent a1764cc commit 3144b66
Showing 12 changed files with 141 additions and 64 deletions.
27 changes: 14 additions & 13 deletions config/config.yaml
@@ -6,11 +6,11 @@

version: 1.0

run_name: "" # Name of the run, used for creating output dir with data. If empty, the run dirname will be the timestamp.
run_name: "huvec_250kb" # Name of the run, used for creating output dir with data. If empty, the run dirname will be the timestamp.

paths:
# Directories for input and output data (absolute paths)
input_dir: "/Users/GBS/hic_data/imr90/matrix/imr90" # Directory containing HiC-Pro output files
input_dir: "/Users/GBS/hic_data/huvec/matrix/huvec" # Directory containing HiC-Pro output files
run_dir: "/Users/GBS/hiedge_testing/mcf10_tests" # Output directory (named by run name) containing HiEdge results
output_dir: ""
temp_dir: = ""
@@ -32,22 +32,26 @@ pipeline_settings:
hicpro_raw_dirname: "raw" # Directory containing raw HiC-Pro matrix output (default name from HiC-Pro is always: raw)
hicpro_norm_dirname: "iced" # Directory containing balanced HiC-Pro matrix output and bias matrices (default name from HiC-Pro is always: iced)

# Output format for HiEdge results (csv, hdf5, parquet, text)
output_type: "default" # Output type for HiEdge results (default, verbose, qval)
output_format: "csv" # Output format for HiEdge results (csv, hdf5, parquet, txt)

# Make plots (visualize data)
make_plots: True # If true, make plots for HiEdge results (spline fit, distance dependent decay, p-value distribution, q-value distribution, etc.)

# Execution settings
interaction_type: "intra" # Type of interactions to consider (inter, intra, mixed). Mixed will process all interactions, but will perform separate statistical testing for inter and intra interactions.
iced_data: False # If true, use matrix files balanced by iterative correction (ICE) from HiC-Pro (iced matrices) instead of raw matrix files (not recommended; the statistical model assumes raw matrices)
round_iced_matrices: False # If true, round iced matrices to integer values.
intra_resolutions: [1000000] # Resolution in base pairs for intrachromosomal data to be processed (if empty, process all resolutions for which data is available)
intra_resolutions: [250000] # Resolution in base pairs for intrachromosomal data to be processed (if empty, process all resolutions for which data is available)
inter_resolutions: [1000000] # Resolutions in base pairs to consider for interchromosomal data (if empty, process all resolutions for which data is available)

# Filtering settings
filter_blacklist: True # If true, filter out interactions in blacklist regions (blacklist file path)
filter_cytobands: True # If true, filter out interactions in cytoband regions (cytobands file path)
remove_chromosomes: ["chrM", "chrY", "chrX"] # List of strings of chromosomes to be omitted from filtering. Empty list means no chromosomes are selected.
select_chromosomes: [] # List of strings of chromosome(s) to include in filtering; all chromosomes not selected are omitted. Empty list means no specific chromosomes are selected.
filter_self_interactions: False # If true, remove self-interacting bins (interactions between hic-contacts in the same bin)

# Make plots (visualize data)
make_plots: False # If true, make plots for HiEdge results (spline fit, distance dependent decay, p-value distribution, q-value distribution, etc.)
filter_self_interactions: True # If true, remove self-interacting bins (interactions between hic-contacts in the same bin)

select_specific_regions: False
select_regions:
@@ -69,20 +73,17 @@ pipeline_settings:
interaction_distance_filters:
1000000:
min_distance: 5000000 # Minimum distance for interactions to be considered
max_distance: 90000000 # Maximum distance for interactions to be considered
max_distance: 9000000 # Maximum distance for interactions to be considered
500000:
min_distance: 1000000 # Minimum distance for interactions to be considered
max_distance: 5000000 # Maximum distance for interactions to be considered

# Output settings
output_format: "csv" # Output format for HiEdge results (csv, hdf5, parquet, text)


statistical_settings:
spline_passes: 2 # Number of spline passes
fdr_threshold: 0.05 # False Discovery Rate threshold for adjusting p-values by Benjamini-Hochberg procedure
metabin_occupancy: 200 # Number of metabins to use for occupancy calculation (number of bins used in spline fit)
use_hicpro_bias: False # If true, use bias files from HiC-Pro for adjusting expected interaction frequencies before statistical testing. If false, no bias files are used.
use_hicpro_bias: True # If true, use bias files from HiC-Pro for adjusting expected interaction frequencies before statistical testing. If false, no bias files are used.
bias_lower_bound: 0.5 # Lower bound for bias values for filtering
bias_upper_bound: 2 # Upper bound for bias values for filtering
use_filtered_data_for_average_contact_probability: False # If true, use filtered data for average contact probability calculation used in spline fitting. If false, use raw (unfiltered) data.
use_sequential_fdr: True # If true, use sequential FDR correction for multiple testing. If false, use partition-based FDR correction.
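
For reference, fdr_threshold and use_sequential_fdr refer to Benjamini-Hochberg adjustment of the per-interaction p-values. A minimal sketch of how BH q-values are typically computed (illustrative only, not HiEdge's actual implementation):

import numpy as np

def bh_qvalues(pvalues: np.ndarray) -> np.ndarray:
    # Benjamini-Hochberg: sort p-values, scale by n/rank, enforce monotonicity
    n = len(pvalues)
    order = np.argsort(pvalues)
    ranked = pvalues[order] * n / np.arange(1, n + 1)
    qvals = np.minimum.accumulate(ranked[::-1])[::-1].clip(max=1.0)
    out = np.empty(n)
    out[order] = qvals
    return out

# Interactions with q <= fdr_threshold (0.05 above) would be called significant.
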
12 changes: 10 additions & 2 deletions main.py
@@ -9,11 +9,12 @@
from src.data_preparation.preparation_controller import DataPreparationController
from src.filtering.filtering_controller import FilteringController
from src.statistics.stat_controller import StatController
from src.output.output_formatter import OutputConfigurator
from src.output.output_formatter import OutputConfiguratorRunner
from pathlib import Path
from dask import compute
from dask.delayed import delayed
from typing import List
from time import time


class Pipeline:
@@ -24,6 +25,9 @@ def __init__(self, config_path: Path):

def run(self):

# start time of the pipeline
start_time = time()

# Set up the run directory and instantiate config
setup_tool = RunDirectorySetup(config=self.config, config_path=self.config_path)
setup_tool.prepare_run_environment()
@@ -46,9 +50,12 @@ def run(self):
print(f"Doing statistical modeling: {statistical_output}")

# Configure and write output
self._execute_in_parallel(statistical_output, OutputConfigurator)
self._execute_in_parallel(statistical_output, OutputConfiguratorRunner)
print(f"Writing output!")

# print time taken to run the pipeline
print(f"Time taken to run the pipeline: {time() - start_time} seconds.")

def _execute_in_parallel(self, inputs, controller_class) -> List:
# Ensure inputs is a list for uniform processing
if not isinstance(inputs, list):
@@ -77,6 +84,7 @@ def _load_config(self) -> Config:
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run the pipeline with the specified configuration.")
parser.add_argument("--config", "-con", "-c", type=Path, required=True, help="Path to the configuration YAML file.")
parser.add_argument("--run_name", "-r", type=str, required=False, help="Name of the run.")
args = parser.parse_args()

pipeline = Pipeline(args.config)
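
For context, _execute_in_parallel (unchanged in this commit) wraps each controller run in dask.delayed and evaluates the tasks together. A minimal sketch of that pattern, using a hypothetical DummyController in place of the real controllers:

from dask import compute
from dask.delayed import delayed

class DummyController:
    # Stand-in for e.g. StatController: takes one input item, exposes run()
    def __init__(self, item):
        self.item = item

    def run(self):
        return self.item * 2

def execute_in_parallel(inputs, controller_class):
    if not isinstance(inputs, list):
        inputs = [inputs]  # normalize single inputs, as the pipeline does
    tasks = [delayed(controller_class(i).run)() for i in inputs]
    return list(compute(*tasks))  # evaluate all delayed tasks at once

print(execute_in_parallel([1, 2, 3], DummyController))  # [2, 4, 6]

With the new --run_name flag, an invocation would look like: python main.py --config config/config.yaml --run_name huvec_250kb (note that args.run_name is parsed but not yet passed to Pipeline in this commit).
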
6 changes: 5 additions & 1 deletion src/filtering/blacklist_filter.py
@@ -24,7 +24,7 @@ def __init__(self, config: Config):
def filter_single_partition(self, partition: pd.DataFrame, resolution) -> pd.DataFrame:
pbt_partition = pbt.BedTool.from_dataframe(partition)
filtered_partition = pbt_partition.window(self.blacklisted_regions, r=False, v=True, w=resolution)
filtered_partition_df = filtered_partition.to_dataframe(names=["chr_1", "start_1", "end_1", "chr_2", "start_2", "end_2", "interaction_count", "idx_1", "idx_2", "bias_1", "bias_2"])
filtered_partition_df = filtered_partition.to_dataframe(names=["chr_1", "start_1", "end_1", "chr_2", "start_2", "end_2", "interaction_count", "idx_1", "idx_2", "bias_1", "bias_2"], engine="python")
return filtered_partition_df

def filter_blacklist(self, bedpe_data: Union[dd.DataFrame, pd.DataFrame], resolution) -> dd.DataFrame:
@@ -38,3 +38,7 @@ def filter_blacklist(self, bedpe_data: Union[dd.DataFrame, pd.DataFrame], resolu
print(filtered_partitions.compute().head())

return filtered_partitions

class RemoveBlacklistedRegionsDask:
# TODO: Implement filtering logic in Dask: make a series of blacklisted regions and filter each partition using overlap or region midpoints
pass
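
One way to realize the midpoint idea in this stub (and in its cytobands counterpart below), assuming the blacklist is a small pandas DataFrame with chrom/start/end columns, would be a per-partition filter like this sketch:

import pandas as pd

def filter_partition_by_midpoint(partition: pd.DataFrame,
                                 blacklist: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical implementation of the TODO: drop an interaction if either
    # anchor's bin midpoint falls inside any blacklisted region.
    keep = pd.Series(True, index=partition.index)
    for anchor in ("1", "2"):
        mid = (partition[f"start_{anchor}"] + partition[f"end_{anchor}"]) // 2
        for _, region in blacklist.iterrows():  # blacklist assumed small
            hit = ((partition[f"chr_{anchor}"] == region["chrom"])
                   & (mid >= region["start"]) & (mid < region["end"]))
            keep &= ~hit
    return partition[keep]

# With Dask: bedpe_ddf.map_partitions(filter_partition_by_midpoint, blacklist)
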
5 changes: 5 additions & 0 deletions src/filtering/cytobands_filter.py
@@ -35,3 +35,8 @@ def filter_cytobands(self, bedpe_ddf: dd.DataFrame, resolution) -> dd.DataFrame:
if not isinstance(filtered_partitions, dd.DataFrame):
print("Filtered partitions are not a dask dataframe!")
return filtered_partitions


class RemoveCytobandRegionsDask:
# TODO: Implement filtering logic in Dask: make a series of cytoband regions and filter each partition using overlap or region midpoints
pass
43 changes: 34 additions & 9 deletions src/output/output_formatter.py
@@ -2,14 +2,26 @@

# Import modules
from pathlib import Path
from dask import dataframe as dd

class OutputConfiguratorRunner:
def __init__(self, config, input_data):
self.config = config
self.input_data = input_data

def run(self):
output_configurator = OutputConfigurator(config=self.config)
print(f"Running output configurator with input data: {self.input_data}")
output_configurator.run(self.input_data)


class OutputConfigurator:
def __init__(self, config):
self.config = config

def run(self, input_data):
output_type = self.config.output_type
print(f"Head of input data in output configuration: {input_data.data.head(5)}")
output_type = self.config.pipeline_settings.output_type
if output_type == "default":
self._write_default_output(input_data)
elif output_type == "verbose":
@@ -21,7 +33,7 @@ def run(self, input_data):

def _write_default_output(self, input_data):
file_path = self._construct_file_path(input_data, suffix="default")
columns = ["chr_1", "start_1", "end_1", "chr_2", "start_2", "end_2", "q_value", "confidence_estimate", "interaction_count"]
columns = ["chr_1", "start_1", "end_1", "chr_2", "start_2", "end_2", "interaction_count", "p_value", "q_value"]
self._write_to_file(input_data.data[columns], file_path)

def _write_verbose_output(self, input_data):
@@ -35,26 +47,39 @@ def _write_qval_output(self, input_data):

def _construct_file_path(self, input_data, suffix):
interaction_type = input_data.metadata.interaction_type
file_name = f"{input_data.metadata.experiment}_{input_data.metadata.resolution}_{suffix}.csv"
output_dir = Path(self.config.output_dir) / interaction_type

if self.config.pipeline_settings.output_format == "csv":
file_name = f"{input_data.metadata.experiment}_{input_data.metadata.resolution}_{suffix}.csv"
elif self.config.pipeline_settings.output_format == "parquet":
file_name = f"{input_data.metadata.experiment}_{input_data.metadata.resolution}_{suffix}.parquet"
elif self.config.pipeline_settings.output_format == "hdf5":
file_name = f"{input_data.metadata.experiment}_{input_data.metadata.resolution}_{suffix}.h5"
elif self.config.pipeline_settings.output_format == "txt":
file_name = f"{input_data.metadata.experiment}_{input_data.metadata.resolution}_{suffix}.txt"
else:
raise ValueError(f"Unsupported output format: {self.config.pipeline_settings.output_format}. Supported formats are csv, parquet, hdf5, txt.")

output_dir = Path(self.config.paths.output_dir) / interaction_type
output_dir.mkdir(parents=True, exist_ok=True)
return output_dir / file_name

def _write_to_file(self, df, file_path):
output_format = self.config.pipeline_settings.output_format
file_name = file_path.name

if not isinstance(df, dd.DataFrame):
print(f"Output dataframe is not a Dask dataframe: {type(df)}")

if output_format == "csv":
df.to_csv(file_path, single_file=True)
df.to_csv(file_path)
elif output_format == "parquet":
df.to_parquet(file_path)
elif output_format == "hdf5":
# HDF5 writing needs special handling for Dask
df.compute().to_hdf(file_path, key="data", mode="w")
elif output_format == "txt":
df.to_csv(file_path, single_file=True, sep="\t", header=False, index=False)
df.compute().to_hdf(file_path.hdf5, key="data", mode="w")
else:
raise ValueError(f"Unsupported output format: {output_format}")
df.to_csv(file_path, sep="\t", header=False, index=False)


print(f"Output file {file_name} written to {file_path}")

5 changes: 0 additions & 5 deletions src/plots/spline_plots.py
@@ -12,7 +12,6 @@ def __init__(self, spline_fitter, metadata, config):
self.metadata = metadata
self.spline_fitter = spline_fitter


def plot_spline_fit(self):
x = self.spline_fitter.x
y = self.spline_fitter.y
@@ -26,7 +25,6 @@ def plot_spline_fit(self):
plt.ylabel("Probability")
plt.legend()
plt.savefig(self.config.paths.plot_dir / f"{self.metadata.experiment}_{self.metadata.resolution}_spline_fit.png")
plt.show()

def plot_spline_fit_with_error(self):
pass
@@ -42,7 +40,6 @@ def plot_distance_distributuion(self):
plt.xlabel("Genomic Distance")
plt.ylabel("Frequency")
plt.savefig(self.config.paths.plot_dir / f"{self.metadata.experiment}_{self.metadata.resolution}_distance_distribution.png")
plt.show()

def plot_probability_distribution(self):
y = self.spline_fitter.y
@@ -52,7 +49,6 @@ def plot_probability_distribution(self):
plt.xlabel("Probability")
plt.ylabel("Frequency")
plt.savefig(self.config.paths.plot_dir / f"{self.metadata.experiment}_{self.metadata.resolution}_probability_distribution.png")
plt.show()


class DistanceDecayPlotter:
@@ -72,4 +68,3 @@ def plot_distance_dependant_decay(self):
plt.ylabel("Contact Count")
plt.scatter(x, y, alpha=0.5)
plt.savefig(self.config.paths.plot_dir / f"{self.metadata.experiment}_{self.metadata.resolution}_distance_decay.png")
plt.show()
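
Since these plotting methods now only call savefig (the plt.show() calls were removed), closing each figure after saving keeps memory flat when many experiments and resolutions are plotted in one run. A small suggested pattern, not part of this commit:

import matplotlib.pyplot as plt

fig = plt.figure()
plt.scatter([1, 2, 3], [3, 1, 2], alpha=0.5)
plt.savefig("example_spline_fit.png")
plt.close(fig)  # free the figure; nothing is shown interactively anymore
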
48 changes: 45 additions & 3 deletions src/plots/statistics_plots.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,49 @@
# Copyright Gabriel B. Stav. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

# Import modules
from matplotlib import pyplot as plt
import dask.dataframe as dd

# TODO:
# Make various plots of the data, like distributions of p- and q-values, confidence intervals, etc.
# Also maybe total interactions, total significant interactions, outliers, etc.
class PValuePlotter:

def __init__(self, data: dd.DataFrame, config, metadata):
self.data = data
self.config = config
self.metadata = metadata

def plot_pvalues_vs_distance(self):
sample_data = self.data.sample(frac=1).compute() if isinstance(self.data, dd.DataFrame) else self.data

plt.figure(figsize=(10, 6))
plt.scatter(sample_data["genomic_distance"], sample_data["p_value"], alpha=0.5)
plt.title("P-Values vs Genomic Distance")
plt.xlabel("Genomic Distance")
plt.ylabel("P-Value")
plt.xscale("log")
plt.yscale("log")
plt.grid(True, which="both", ls="--")
plt.savefig(self.config.paths.plot_dir / f"{self.metadata.experiment}_{self.metadata.resolution}_pvalues_vs_distance_distribution.png")

def plot_pvalue_distribution(self):
sample_data = self.data.sample(frac=1).compute() if isinstance(self.data, dd.DataFrame) else self.data

plt.figure(figsize=(10, 6))
plt.hist(sample_data["p_value"], bins=100, alpha=0.5)
plt.title("P-Value Distribution")
plt.xlabel("P-Value")
plt.ylabel("Frequency")
plt.yscale("log")
plt.grid(True, which="both", ls="--")
plt.savefig(self.config.paths.plot_dir / f"{self.metadata.experiment}_{self.metadata.resolution}_pvalue_distribution.png")

def plot_qvalue_distribution(self):
sample_data = self.data.sample(frac=1).compute() if isinstance(self.data, dd.DataFrame) else self.data

plt.figure(figsize=(10, 6))
plt.hist(sample_data["q_value"], bins=100, alpha=0.5)
plt.title("Q-Value Distribution")
plt.xlabel("Q-Value")
plt.ylabel("Frequency")
plt.yscale("log")
plt.grid(True, which="both", ls="--")
plt.savefig(self.config.paths.plot_dir / f"{self.metadata.experiment}_{self.metadata.resolution}_qvalue_distribution.png")
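
Note that sample(frac=1) in these methods shuffles but keeps every row, so each plot still materializes the full interaction table. A hedged variant that bounds memory by downsampling before compute():

import dask.dataframe as dd

def sample_for_plot(data, frac: float = 0.01):
    # Hypothetical helper: draw a fraction of rows before materializing,
    # instead of frac=1, which computes the entire Dask frame.
    if isinstance(data, dd.DataFrame):
        return data.sample(frac=frac).compute()  # small pandas DataFrame
    return data.sample(frac=frac)
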
10 changes: 3 additions & 7 deletions src/setup/config_loader.py
@@ -21,20 +21,16 @@ class InteractionDistanceFilter(BaseModel):
min_distance: int
max_distance: int


class Version(BaseModel):
version: float


class RunName(BaseModel):
run_name: str


class ReferencePaths(BaseModel):
blacklist_dir: Optional[Path] = None
cytoband_dir: Optional[Path] = None


class Paths(BaseModel):
input_dir: Path
run_dir: Path
@@ -78,6 +74,7 @@ class PipelineSettings:
use_interaction_distance_filters: bool
interaction_distance_filters: Dict[int, InteractionDistanceFilter]
output_format: str
output_type: str
select_regions: Optional[Dict[str, List[str]]] = None
omit_regions: Optional[Dict[str, List[str]]] = None

@@ -102,7 +99,6 @@ def validate_interaction_distance_filters(cls, v: Dict[int, InteractionDistanceFilter]
raise ValueError(f"Keys in interaction_distance_filters must be integers, got {type(k).__name__}")
parsed[k] = val


class StatisticalSettings(BaseModel):
spline_passes: int
fdr_threshold: float
@@ -111,15 +107,15 @@ class StatisticalSettings(BaseModel):
bias_lower_bound: float
bias_upper_bound: float
use_filtered_data_for_average_contact_probability: bool
use_sequential_fdr: bool


class Config(BaseModel):
version: float
run_name: str
paths: Paths
pipeline_settings: PipelineSettings
statistical_settings: StatisticalSettings
# dask_settings: DaskSettings


class InstantiateConfig:
def __init__(self, config_path: Path):
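
For context, these pydantic models validate the YAML shown in config/config.yaml at the top of this commit. A minimal loading sketch, assuming InstantiateConfig does something along these lines with PyYAML (its body is not shown in this diff):

import yaml
from pathlib import Path

def load_config(config_path: Path) -> Config:
    with open(config_path) as handle:
        raw = yaml.safe_load(handle)  # nested dict mirroring the YAML sections
    return Config(**raw)  # pydantic coerces and validates the nested models

config = load_config(Path("config/config.yaml"))
print(config.statistical_settings.fdr_threshold)  # 0.05 in this commit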