Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions src/finemo/data_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -707,7 +707,9 @@ def softmax(x: Float[ndarray, "4 W"], temp: float = 100) -> Float[ndarray, "4 W"
return exp / np.sum(exp, axis=0, keepdims=True)


def _motif_name_sort_key(data: Tuple[str, Any]) -> Union[Tuple[int, int], Tuple[int, str]]:
def _motif_name_sort_key(
data: Tuple[str, Any],
) -> Union[Tuple[int, int], Tuple[int, str]]:
"""Generate sort key for TF-MoDISco motif names.

This function creates a sort key that orders motifs by pattern number,
Expand Down Expand Up @@ -1168,7 +1170,11 @@ def load_hits(
Hit data with an additional 'count' column set to 1 for aggregation.
"""
hits_df = pl.scan_csv(
hits_path, separator="\t", quote_char=None, schema=schema
hits_path,
separator="\t",
quote_char=None,
schema=schema,
null_values=[".", "NA", "null", "NaN"],
).with_columns(pl.lit(1).alias("count"))

return hits_df if lazy else hits_df.collect()
Expand Down Expand Up @@ -1234,7 +1240,7 @@ def write_hits(
- hits.tsv: Complete hit data with all instances
- hits_unique.tsv: Deduplicated hits by genomic position and motif (excludes rows with NA chromosome coordinates)
- hits.bed: BED format file for genome browser visualization

Rows where the chromosome field is NA are filtered out during deduplication
to ensure that data_unique only contains well-defined genomic coordinates.
"""
Expand Down Expand Up @@ -1454,9 +1460,7 @@ def write_seqlet_confusion_df(seqlet_confusion_df: pl.DataFrame, out_path: str)


def write_report_data(
report_df: pl.DataFrame,
motifs: Dict[str, Dict[str, ndarray]],
out_dir: str
report_df: pl.DataFrame, motifs: Dict[str, Dict[str, ndarray]], out_dir: str
) -> None:
"""Write comprehensive motif report data including CWMs and metadata.

Expand Down
2 changes: 1 addition & 1 deletion src/finemo/postprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def collapse_hits(

chroms = hits_df["chr"].unique(maintain_order=True)

if not chroms.is_empty():
if not chroms.is_null().all():
chrom_to_id = {chrom: i for i, chrom in enumerate(chroms)}
# Transform coordinates for overlap computation
# Scale by 2 and adjust by overlap fraction to create effective overlap regions
Expand Down