diff --git a/pairtools/cli/stats.py b/pairtools/cli/stats.py
index c7166ac8..48cc15ef 100644
--- a/pairtools/cli/stats.py
+++ b/pairtools/cli/stats.py
@@ -28,6 +28,11 @@
     " all overlapping statistics. Non-overlapping statistics are appended to"
     " the end of the file. Supported for tsv stats with single filter.",
 )
+@click.option(
+    "--single-mapped-by-side",
+    is_flag=True,
+    help="If specified, count single-mapped reads separately for read1 and read2.",
+)
 @click.option(
     "--n-dist-bins-decade",
     type=int,
@@ -115,7 +120,15 @@
 )
 @common_io_options
 def stats(
-    input_path, output, merge, n_dist_bins_decade, bytile_dups, output_bytile_stats, filter, **kwargs
+    input_path,
+    output,
+    merge,
+    single_mapped_by_side,
+    n_dist_bins_decade,
+    bytile_dups,
+    output_bytile_stats,
+    filter,
+    **kwargs,
 ):
     """Calculate pairs statistics.
 
@@ -131,6 +144,7 @@ def stats(
         input_path,
         output,
         merge,
+        single_mapped_by_side,
         n_dist_bins_decade,
         bytile_dups,
         output_bytile_stats,
@@ -140,7 +154,15 @@ def stats(
 
 
 def stats_py(
-    input_path, output, merge, n_dist_bins_decade, bytile_dups, output_bytile_stats, filter, **kwargs
+    input_path,
+    output,
+    merge,
+    single_mapped_by_side,
+    n_dist_bins_decade,
+    bytile_dups,
+    output_bytile_stats,
+    filter,
+    **kwargs,
 ):
     if merge:
         do_merge(output, input_path, n_dist_bins_decade=n_dist_bins_decade, **kwargs)
@@ -190,6 +212,7 @@ def stats_py(
         filter = None
 
     stats = PairCounter(
+        single_mapped_by_side=single_mapped_by_side,
        n_dist_bins_decade=n_dist_bins_decade,
        bytile_dups=bytile_dups,
        filters=filter,
@@ -213,9 +236,9 @@ def stats_py(
     stats.save(
         outstream,
         yaml=kwargs.get("yaml", False),  # format as yaml
-        filter=first_filter_name
-        if not kwargs.get("yaml", False)
-        else None,  # output only the first filter if non-YAML output
+        filter=(
+            first_filter_name if not kwargs.get("yaml", False) else None
+        ),  # output only the first filter if non-YAML output
     )
 
     if instream != sys.stdin:
diff --git a/pairtools/lib/stats.py b/pairtools/lib/stats.py
index e008314b..dac67a80 100644
--- a/pairtools/lib/stats.py
+++ b/pairtools/lib/stats.py
@@ -18,10 +18,10 @@ def parse_number(s):
     elif s.replace(".", "", 1).isdigit():
         return float(s)
     else:
-        return s 
+        return s
 
 
-def flat_dict_to_nested(input_dict, sep='/'):
+def flat_dict_to_nested(input_dict, sep="/"):
 
     output_dict = {}
     for key, value in input_dict.items():
@@ -30,8 +30,10 @@
         elif type(key) == str:
             key_parts = key.split(sep)
         else:
-            raise ValueError(f"Key type can be either str or tuple. Found key {key} of type {type(key)}.")
-
+            raise ValueError(
+                f"Key type can be either str or tuple. Found key {key} of type {type(key)}."
+            )
+
         current_dict = output_dict
         for key_part in key_parts[:-1]:
             current_dict = current_dict.setdefault(key_part, {})
@@ -40,7 +42,7 @@
     return output_dict
 
 
-def nested_dict_to_flat(d, tuple_keys=False, sep='/'):
+def nested_dict_to_flat(d, tuple_keys=False, sep="/"):
     """Flatten a nested dictionary to a flat dictionary.
 
     Parameters
@@ -56,28 +58,40 @@ def nested_dict_to_flat(d, tuple_keys=False, sep='/'):
     dict
         A flat dictionary.
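+
+    Examples
+    --------
+    A minimal illustration of both key styles:
+
+    >>> nested_dict_to_flat({"a": {"b": 1}})
+    {'a/b': 1}
+    >>> nested_dict_to_flat({"a": {"b": 1}}, tuple_keys=True)
+    {('a', 'b'): 1}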
""" - + if tuple_keys: - join_keys = lambda k1,k2: (k1,) + k2 + join_keys = lambda k1, k2: (k1,) + k2 else: - join_keys = lambda k1,k2: (k1+sep+k2) if k2 else k1 + join_keys = lambda k1, k2: (k1 + sep + k2) if k2 else k1 out = {} for k1, v1 in d.items(): if isinstance(v1, dict): - out.update({ - join_keys(k1,k2): v2 - for k2, v2 in nested_dict_to_flat(v1, tuple_keys, sep).items() - }) + out.update( + { + join_keys(k1, k2): v2 + for k2, v2 in nested_dict_to_flat(v1, tuple_keys, sep).items() + } + ) else: if tuple_keys: out[(k1,)] = v1 else: out[k1] = v1 - + return out + def is_nested_dict(d): """Check if a dictionary is nested. @@ -100,6 +105,7 @@ def is_nested_dict(d): return False + def is_tuple_keyed_dict(d): """Check if a dictionary is tuple-keyed. @@ -116,7 +122,7 @@ def is_tuple_keyed_dict(d): if not isinstance(d, dict): return False - for k,v in d.items(): + for k, v in d.items(): if not isinstance(k, tuple): return False if isinstance(v, dict): @@ -124,6 +130,7 @@ def is_tuple_keyed_dict(d): return True + def is_str_keyed_dict(d): """Check if a dictionary is string-keyed. @@ -140,7 +147,7 @@ def is_str_keyed_dict(d): if not isinstance(d, dict): return False - for k,v in d.keys(): + for k, v in d.keys(): if not isinstance(k, str): return False if isinstance(v, dict): @@ -149,7 +156,7 @@ def is_str_keyed_dict(d): return True -def swap_levels_nested_dict(nested_dict, level1, level2, sep='/'): +def swap_levels_nested_dict(nested_dict, level1, level2, sep="/"): """Swap the order of two levels in a nested dictionary. Parameters @@ -171,23 +178,26 @@ def swap_levels_nested_dict(nested_dict, level1, level2, sep='/'): for k1, v1 in nested_dict.items(): k1_list = list(k1) k1_list[level1], k1_list[level2] = k1_list[level2], k1_list[level1] - out[tuple(k1_list)] = v1 + out[tuple(k1_list)] = v1 return out - + elif is_nested_dict(nested_dict): out = nested_dict_to_flat(nested_dict, tuple_keys=True) out = swap_levels_nested_dict(out, level1, level2) out = flat_dict_to_nested(out) return out - + elif is_str_keyed_dict(nested_dict): out = nested_dict_to_flat(nested_dict, sep=sep) out = swap_levels_nested_dict(out, level1, level2) - out = {sep.join(k):v for k,v in out.items()} + out = {sep.join(k): v for k, v in out.items()} return out - + else: - raise ValueError("Input dictionary must be either nested, string-keyed or tuple-keyed") + raise ValueError( + "Input dictionary must be either nested, string-keyed or tuple-keyed" + ) + class PairCounter(Mapping): """ @@ -205,10 +215,11 @@ class PairCounter(Mapping): DIST_FREQ_REL_DIFF_THRESHOLD = 0.05 N_DIST_BINS_DECADE_DEFAULT = 8 MIN_LOG10_DIST_DEFAULT = 0 - MAX_LOG10_DIST_DEFAULT = 9 + MAX_LOG10_DIST_DEFAULT = 9 def __init__( self, + single_mapped_by_side=False, min_log10_dist=MIN_LOG10_DIST_DEFAULT, max_log10_dist=MAX_LOG10_DIST_DEFAULT, n_dist_bins_decade=N_DIST_BINS_DECADE_DEFAULT, @@ -216,6 +227,7 @@ def __init__( filters=None, **kwargs, ): + self.single_mapped_by_side = single_mapped_by_side # Define filters and parameters for filters evaluation: if filters is not None: self.filters = filters @@ -234,7 +246,7 @@ def __init__( # genomic distance bining for the ++/--/-+/+- distribution log10_dist_bin_step = 1.0 / n_dist_bins_decade self._dist_bins = np.unique( - np.r_[ + np.r_[ 0, np.round( 10 @@ -251,6 +263,9 @@ def __init__( self._stat[key]["total"] = 0 self._stat[key]["total_unmapped"] = 0 self._stat[key]["total_single_sided_mapped"] = 0 + if single_mapped_by_side: + self._stat[key]["total_left_only_mapped"] = 0 + 
self._stat[key]["total_right_only_mapped"] = 0 # total_mapped = total_dups + total_nodups self._stat[key]["total_mapped"] = 0 self._stat[key]["total_dups"] = 0 @@ -261,7 +276,7 @@ def __init__( self._stat[key]["cis"] = 0 self._stat[key]["trans"] = 0 self._stat[key]["pair_types"] = {} - + # to be removed: # self._stat[key]["dedup"] = {} @@ -307,7 +322,6 @@ def __init__( ) self._summaries_calculated = False - def __getitem__(self, key, filter="no_filter"): if isinstance(key, str): # let's strip any unintentional '/' @@ -378,40 +392,45 @@ def __getitem__(self, key, filter="no_filter"): else: raise ValueError("{} is not a valid key".format(k)) - def __iter__(self): return iter(self._stat) - def __len__(self): return len(self._stat) - def find_dist_freq_convergence_distance(self, rel_threshold): - """Finds the largest distance at which the frequency of pairs of reads - with different strands deviates from their average by the specified + """Finds the largest distance at which the frequency of pairs of reads + with different strands deviates from their average by the specified relative threshold.""" - + out = {} all_strands = ["++", "--", "-+", "+-"] for filter in self.filters: out[filter] = {} - + dist_freqs_by_strands = { - strands: np.array(list(self._stat[filter]["dist_freq"][strands].values())) - for strands in all_strands} - + strands: np.array( + list(self._stat[filter]["dist_freq"][strands].values()) + ) + for strands in all_strands + } + # Calculate the average frequency of pairs with different strands - avg_freq_all_strands = np.mean(np.vstack(list(dist_freqs_by_strands.values())), axis=0) + avg_freq_all_strands = np.mean( + np.vstack(list(dist_freqs_by_strands.values())), axis=0 + ) # Calculate the largest distance at which the frequency of pairs of at least one strand combination deviates from the average by the given threshold - rel_deviations = {strands: np.nan_to_num( - np.abs(dist_freqs_by_strands[strands] - avg_freq_all_strands) - / avg_freq_all_strands) - for strands in all_strands} - - idx_maxs = {strand:0 for strand in all_strands} + rel_deviations = { + strands: np.nan_to_num( + np.abs(dist_freqs_by_strands[strands] - avg_freq_all_strands) + / avg_freq_all_strands + ) + for strands in all_strands + } + + idx_maxs = {strand: 0 for strand in all_strands} for strands in all_strands: bin_exceeds = rel_deviations[strands] > rel_threshold if np.any(bin_exceeds): @@ -419,70 +438,83 @@ def find_dist_freq_convergence_distance(self, rel_threshold): # Find the largest distance and the strand combination where frequency of pairs deviates from the average by the given threshold: convergence_bin_idx = 0 - convergence_strands = '??' - convergence_dist = '0' + convergence_strands = "??" 
+ convergence_dist = "0" for strands in all_strands: - if (idx_maxs[strands] > convergence_bin_idx): + if idx_maxs[strands] > convergence_bin_idx: convergence_bin_idx = idx_maxs[strands] convergence_strands = strands if idx_maxs[strands] < len(self._dist_bins): - convergence_dist = self._dist_bins[convergence_bin_idx+1] + convergence_dist = self._dist_bins[convergence_bin_idx + 1] else: convergence_dist = np.iinfo(np.int64) - - + out[filter]["convergence_dist"] = convergence_dist out[filter]["strands_w_max_convergence_dist"] = convergence_strands - out[filter]['convergence_rel_diff_threshold'] = rel_threshold + out[filter]["convergence_rel_diff_threshold"] = rel_threshold - out[filter]['n_cis_pairs_below_convergence_dist'] = { - strands:dist_freqs_by_strands[strands][:convergence_bin_idx+1].sum() for strands in all_strands + out[filter]["n_cis_pairs_below_convergence_dist"] = { + strands: dist_freqs_by_strands[strands][: convergence_bin_idx + 1].sum() + for strands in all_strands for strands in all_strands } - out[filter]['n_cis_pairs_below_convergence_dist_all_strands'] = sum( - list(out[filter]['n_cis_pairs_below_convergence_dist'].values())) + out[filter]["n_cis_pairs_below_convergence_dist_all_strands"] = sum( + list(out[filter]["n_cis_pairs_below_convergence_dist"].values()) + ) n_cis_pairs_above_convergence_dist = { - strands:dist_freqs_by_strands[strands][convergence_bin_idx+1:].sum() for strands in all_strands + strands: dist_freqs_by_strands[strands][convergence_bin_idx + 1 :].sum() + for strands in all_strands for strands in all_strands } - - out[filter]['n_cis_pairs_above_convergence_dist_all_strands'] = sum( - list(n_cis_pairs_above_convergence_dist.values())) + + out[filter]["n_cis_pairs_above_convergence_dist_all_strands"] = sum( + list(n_cis_pairs_above_convergence_dist.values()) + ) norms = dict( - cis=self._stat[filter]['cis'], - total_mapped=self._stat[filter]['total_mapped'] + cis=self._stat[filter]["cis"], + total_mapped=self._stat[filter]["total_mapped"], ) - if 'total_nodups' in self._stat[filter]: - norms['total_nodups'] = self._stat[filter]['total_nodups'] + if "total_nodups" in self._stat[filter]: + norms["total_nodups"] = self._stat[filter]["total_nodups"] for key, norm_factor in norms.items(): - out[filter][f'frac_{key}_in_cis_below_convergence_dist'] = { - strands: n_cis_pairs / norm_factor - for strands, n_cis_pairs in out[filter]['n_cis_pairs_below_convergence_dist'].items() + out[filter][f"frac_{key}_in_cis_below_convergence_dist"] = { + strands: n_cis_pairs / norm_factor + for strands, n_cis_pairs in out[filter][ + "n_cis_pairs_below_convergence_dist" + ].items() } - out[filter][f'frac_{key}_in_cis_below_convergence_dist_all_strands'] = sum( - list(out[filter][f'frac_{key}_in_cis_below_convergence_dist'].values())) + out[filter][f"frac_{key}_in_cis_below_convergence_dist_all_strands"] = ( + sum( + list( + out[filter][ + f"frac_{key}_in_cis_below_convergence_dist" + ].values() + ) + ) + ) - out[filter][f'frac_{key}_in_cis_above_convergence_dist_all_strands'] = ( - sum(list(n_cis_pairs_above_convergence_dist.values())) / norm_factor ) + out[filter][f"frac_{key}_in_cis_above_convergence_dist_all_strands"] = ( + sum(list(n_cis_pairs_above_convergence_dist.values())) / norm_factor + ) return out - def calculate_summaries(self): """calculate summary statistics (fraction of cis pairs at different cutoffs, complexity estimate) based on accumulated counts. 
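+            # Naive library complexity, estimated from total mapped pairs and
+            # total duplicates alone (no by-tile information is used here):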
-            self._stat[filter_name]["summary"][
-                "complexity_naive"
-            ] = estimate_library_complexity(
-                self._stat[filter_name]["total_mapped"], self._stat[filter_name]["total_dups"], 0
+            self._stat[filter_name]["summary"]["complexity_naive"] = (
+                estimate_library_complexity(
+                    self._stat[filter_name]["total_mapped"],
+                    self._stat[filter_name]["total_dups"],
+                    0,
+                )
             )
 
             if filter_name == "no_filter" and self._save_bytile_dups:
@@ -535,7 +577,6 @@ def calculate_summaries(self):
         self._summaries_calculated = True
 
-
     @classmethod
     def from_file(cls, file_handle, n_dist_bins_decade=N_DIST_BINS_DECADE_DEFAULT):
         """create instance of PairCounter from file
@@ -562,28 +603,33 @@ def from_file(cls, file_handle, n_dist_bins_decade=N_DIST_BINS_DECADE_DEFAULT):
                     "{} is not a valid stats file".format(file_handle.name)
                 )
             raw_stat[key_val_pair[0]] = parse_number(key_val_pair[1])
-
-        ## TODO: check if raw_stat does not contain any unknown keys 
+        ## TODO: check if raw_stat does not contain any unknown keys
 
         # Convert flat dict to nested dict
-        stat_from_file._stat[default_filter].update(flat_dict_to_nested(raw_stat, sep=cls._KEY_SEP))
+        stat_from_file._stat[default_filter].update(
+            flat_dict_to_nested(raw_stat, sep=cls._KEY_SEP)
+        )
 
-        stat_from_file._stat[default_filter]['chrom_freq'] = nested_dict_to_flat(
-            stat_from_file._stat[default_filter]['chrom_freq'], tuple_keys=True)
+        stat_from_file._stat[default_filter]["chrom_freq"] = nested_dict_to_flat(
+            stat_from_file._stat[default_filter]["chrom_freq"], tuple_keys=True
+        )
 
-        bin_to_left_val = lambda bin: int(bin.rstrip('+') if ('+' in bin) else bin.split('-')[0])
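+        # Distance bins are serialized as "start-end" ranges with a trailing
+        # "+" on the open-ended last bin; recover each bin's left edge: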
+        bin_to_left_val = lambda bin: int(
+            bin.rstrip("+") if ("+" in bin) else bin.split("-")[0]
+        )
 
-        stat_from_file._stat[default_filter]['dist_freq'] = {
+        stat_from_file._stat[default_filter]["dist_freq"] = {
             bin_to_left_val(k): v
-            for k,v in stat_from_file._stat[default_filter]['dist_freq'].items()
-            }
+            for k, v in stat_from_file._stat[default_filter]["dist_freq"].items()
+        }
 
-        stat_from_file._stat[default_filter]['dist_freq'] = swap_levels_nested_dict(
-            stat_from_file._stat[default_filter]['dist_freq'], 0, 1
+        stat_from_file._stat[default_filter]["dist_freq"] = swap_levels_nested_dict(
+            stat_from_file._stat[default_filter]["dist_freq"], 0, 1
         )
-
         return stat_from_file
 
     @classmethod
@@ -601,7 +645,9 @@ def from_yaml(cls, file_handle, n_dist_bins_decade=N_DIST_BINS_DECADE_DEFAULT):
         stat = yaml.safe_load(file_handle)
         stat_from_file = cls(
             n_dist_bins_decade=n_dist_bins_decade,
-            filters={key: val.get("filter_expression", "") for key, val in stat.items()}
+            filters={
+                key: val.get("filter_expression", "") for key, val in stat.items()
+            },
         )
 
         for key, filter in stat.items():
@@ -614,7 +660,6 @@ def from_yaml(cls, file_handle, n_dist_bins_decade=N_DIST_BINS_DECADE_DEFAULT):
         stat_from_file._stat = stat
         return stat_from_file
 
-
     def add_pair(
         self,
         chrom1,
@@ -679,13 +724,18 @@ def add_pair(
                 for dist_kb in [1, 2, 4, 10, 20, 40]:
                     if dist >= dist_kb * 1000:
                         self._stat[filter][f"cis_{dist_kb}kb+"] += 1
-
             else:
                 self._stat[filter]["trans"] += 1
         else:
             self._stat[filter]["total_single_sided_mapped"] += 1
-
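+            # If chrom1 is unmapped, only the right side (read2) aligned;
+            # otherwise only the left side (read1) did.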
+            if self.single_mapped_by_side:
+                if chrom1 == unmapped_chrom:
+                    self._stat[filter]["total_right_only_mapped"] += 1
+                else:
+                    self._stat[filter]["total_left_only_mapped"] += 1
 
     def add_pairs_from_dataframe(self, df, unmapped_chrom="!"):
         """Gather statistics for Hi-C pairs in a dataframe and add to the PairCounter.
@@ -739,6 +787,20 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!"):
             self._stat[key]["total_single_sided_mapped"] += int(
                 total_count - (mapped_count + unmapped_count)
             )
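+            # Vectorized version of the same per-side split as in add_pair: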
+            if self.single_mapped_by_side:
+                left_only_mapped_count = np.logical_and(
+                    df_filtered["chrom1"] != unmapped_chrom,
+                    df_filtered["chrom2"] == unmapped_chrom,
+                ).sum()
+                right_only_mapped_count = np.logical_and(
+                    df_filtered["chrom1"] == unmapped_chrom,
+                    df_filtered["chrom2"] != unmapped_chrom,
+                ).sum()
+                self._stat[key]["total_left_only_mapped"] += int(left_only_mapped_count)
+                self._stat[key]["total_right_only_mapped"] += int(
+                    right_only_mapped_count
+                )
 
         # Count the duplicates:
         if "duplicate" in df_mapped.columns:
@@ -834,7 +895,7 @@ def __add__(self, other, filter="no_filter"):
                     sum_stat._stat[filter][k] = (
                         self._stat[filter][k] + other._stat[filter][k]
                     )
-                
+
                 # sum nested dicts/arrays in a context dependent manner:
                 else:
                     if k in ["pair_types", "dedup"]:
@@ -848,7 +909,7 @@ def __add__(self, other, filter="no_filter"):
                         sum_stat._stat[filter][k] = sum_dicts(
                             self._stat[filter][k], other._stat[filter][k]
                         )
-                    
+
                     elif k == "chrom_freq":
                         # union list of keys (chr1,chr2) with potential duplicates:
                         union_keys_with_dups = list(self._stat[filter][k].keys()) + list(
@@ -863,7 +924,7 @@ def __add__(self, other, filter="no_filter"):
                             sum_stat._stat[filter][k][union_key] = self._stat[filter][
                                 k
                             ].get(union_key, 0) + other._stat[filter][k].get(union_key, 0)
-                    
+
                     elif k == "dist_freq":
                         for dirs in sum_stat[k]:
                             from functools import reduce
@@ -879,7 +940,7 @@ def __add__(self, other, filter="no_filter"):
                                 {},
                             )
                             # sum_stat[k][dirs] = self._stat[filter][k][dirs] + other._stat[filter][k][dirs]
-                    
+
                     elif k == "chromsizes":
                         if k in self._stat[filter] and k in other._stat[filter]:
                             if self._stat[filter][k] == other._stat[filter][k]:
@@ -933,7 +994,7 @@ def flatten(self, filter="no_filter"):
                 for i in range(len(self._dist_bins)):
                     for dirs, freqs in v.items():
                         dist = self._dist_bins[i]
-                        
+
                         # last bin is treated differently: "100000+" vs "1200-3000":
                         if i < len(self._dist_bins) - 1:
                             dist_next = self._dist_bins[i + 1]
@@ -945,24 +1006,36 @@ def flatten(self, filter="no_filter"):
                                 ["{}", "{}+", "{}"]
                             ).format(k, dist, dirs)
                         else:
-                            raise ValueError("There is a mismatch between dist_freq bins in the instance")
-
+                            raise ValueError(
+                                "There is a mismatch between dist_freq bins in the instance"
+                            )
+
                         # store key,value pair:
-                        try: 
+                        try:
                             flat_stat[formatted_key] = freqs[dist]
                         except:
                             # in some previous versions of stats, last bin was not reported, so we need to skip it now:
-                            if (dist not in freqs) and (i == len(self._dist_bins) - 1):
-                                flat_stat[formatted_key] = 0
+                            if (dist not in freqs) and (
+                                i == len(self._dist_bins) - 1
+                            ):
+                                flat_stat[formatted_key] = 0
                             else:
-                                raise ValueError(f"Error in {k} {dirs} {dist} {dist_next} {freqs}: source and destination bins do not match")
-
-                elif (k in ["pair_types", "dedup", "chromsizes", 'summary']) and v:
+                                raise ValueError(
+                                    f"Error in {k} {dirs} {dist} {dist_next} {freqs}: source and destination bins do not match"
+                                )
+
+                elif (k in ["pair_types", "dedup", "chromsizes", "summary"]) and v:
                     # 'pair_types' and 'dedup' are simple dicts inside,
                     # treat them the exact same way:
                     flat_stat.update(
-                        {k+self._KEY_SEP+k2 : v2 for k2,v2 in nested_dict_to_flat(v, sep=self._KEY_SEP).items()})
-
+                        {
+                            k + self._KEY_SEP + k2: v2
+                            for k2, v2 in nested_dict_to_flat(
+                                v, sep=self._KEY_SEP
+                            ).items()
+                        }
+                    )
+
                 elif (k == "chrom_freq") and v:
                     for (chrom1, chrom2), freq in v.items():
                         formatted_key = self._KEY_SEP.join(["{}", "{}", "{}"]).format(
@@ -970,7 +1043,10 @@ def flatten(self, filter="no_filter"):
                         )
                         # store key,value pair:
                         flat_stat[formatted_key] = freq
-
+                else:
+                    print(
+                        f"Can't flatten {k} field: not a trivial field, not implemented"
+                    )
 
         # return flattened dict
         return flat_stat
@@ -985,9 +1061,9 @@ def format_yaml(self, filter="no_filter"):
         # Storing statistics for each filter
         for filter_name in self.filters.keys():
             for k, v in self._stat[filter_name].items():
-                if (k == "chrom_freq"):
-                    v = {self._KEY_SEP.join(k2):v2 for k2, v2 in v.items()}
-                if v:
+                if k == "chrom_freq":
+                    v = {self._KEY_SEP.join(k2): v2 for k2, v2 in v.items()}
+                if v is not None:  # skip unset fields
                     formatted_stat[filter_name][k] = deepcopy(v)
         # return formatted dict
         formatted_stat = nested_dict_to_flat(formatted_stat, tuple_keys=True)
@@ -1031,7 +1107,6 @@ def save(self, outstream, yaml=False, filter="no_filter"):
             for k, v in data.items():
                 outstream.write("{}{}{}\n".format(k, self._SEP, v))
-
     def save_bytile_dups(self, outstream):
         """save bytile duplication counts to a tab-delimited text file.
 
         Parameters
@@ -1063,9 +1138,19 @@ def do_merge(output, files_to_merge, **kwargs):
         )
         # use a factory method to instantiate PairCounter
         if kwargs.get("yaml", False):
-            stat = PairCounter.from_yaml(f, n_dist_bins_decade=kwargs.get('n_dist_bins_decade', PairCounter.N_DIST_BINS_DECADE_DEFAULT))
+            stat = PairCounter.from_yaml(
+                f,
+                n_dist_bins_decade=kwargs.get(
+                    "n_dist_bins_decade", PairCounter.N_DIST_BINS_DECADE_DEFAULT
+                ),
+            )
         else:
-            stat = PairCounter.from_file(f, n_dist_bins_decade=kwargs.get('n_dist_bins_decade', PairCounter.N_DIST_BINS_DECADE_DEFAULT))
+            stat = PairCounter.from_file(
+                f,
+                n_dist_bins_decade=kwargs.get(
+                    "n_dist_bins_decade", PairCounter.N_DIST_BINS_DECADE_DEFAULT
+                ),
+            )
         stats.append(stat)
         f.close()
 
diff --git a/tests/test_stats.py b/tests/test_stats.py
index 8ead6420..a28f5e40 100644
--- a/tests/test_stats.py
+++ b/tests/test_stats.py
@@ -15,7 +15,15 @@ def test_mock_pairsam():
     mock_pairsam_path = os.path.join(testdir, "data", "mock.4stats.pairs")
     try:
         result = subprocess.check_output(
-            ["python", "-m", "pairtools", "stats", "--yaml", mock_pairsam_path],
+            [
+                "python",
+                "-m",
+                "pairtools",
+                "stats",
+                "--yaml",
+                "--single-mapped-by-side",
+                mock_pairsam_path,
+            ],
         ).decode("ascii")
     except subprocess.CalledProcessError as e:
         print(e.output)
@@ -32,6 +40,8 @@ def test_mock_pairsam():
 
     assert stats["no_filter"]["total"] == 9
     assert stats["no_filter"]["total_single_sided_mapped"] == 2
+    assert stats["no_filter"]["total_left_only_mapped"] == 0
+    assert stats["no_filter"]["total_right_only_mapped"] == 2
     assert stats["no_filter"]["total_mapped"] == 6
     assert stats["no_filter"]["total_dups"] == 1
     assert stats["no_filter"]["cis"] == 3
@@ -138,13 +148,14 @@ def test_merge_stats():
 from pairtools.lib.stats import PairCounter
 
+
 @pytest.fixture
 def pair_counter():
     counter = PairCounter(filters={"f1": "filter1", "f2": "filter2"})
     counter._dist_bins = np.array([1, 1000, 10000, 100000, 1000000])
     # Populate the counter with some sample data
     counter._stat["f1"]["dist_freq"] = {
-        "++": {1: 80, 1000: 80, 10000: 91, 100000: 95}, 
+        "++": {1: 80, 1000: 80, 10000: 91, 100000: 95},
         "--": {1: 100, 1000: 100, 10000: 100, 100000: 100},
         "-+": {1: 100, 1000: 100, 10000: 100, 100000: 100},
         "+-": {1: 120, 1000: 120, 10000: 109, 100000: 105},
     }
@@ -156,7 +167,7 @@ def pair_counter():
         "-+": {1: 210, 1000: 185, 10000: 165, 100000: 145},
         "+-": {1: 230, 1000: 195, 10000: 175, 100000: 155},
     }
-    
+
     return counter
 
 
@@ -183,10 +194,9 @@ def test_find_dist_freq_convergence_distance(pair_counter):
     assert f1_result["convergence_rel_diff_threshold"] == 0.1
     assert f1_result["convergence_dist"] == 10000
     assert f1_result["strands_w_max_convergence_dist"] == "++"
-
     # f2_result = result["f2"]
     # assert "convergence_dist" in f2_result
     # assert "strands_w_max_convergence_dist" in f2_result
     # assert "convergence_rel_diff_threshold" in f2_result
-    # Add more assertions for f2_result as needed
\ No newline at end of file
+    # Add more assertions for f2_result as needed