Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added --skip-failed #905

Merged
merged 8 commits into from
Mar 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

## [Unreleased]

## [1.4.6-rc3] - 2025-03-10

- Added --skip-failed flag to callVariant, parseArriba, parseSTARFusion, and parseFusionCatcher.

- Added tally table to parseREDITools, parseCIRCExplorer, and parseRMATS.

## [1.4.6-rc2] - 2025-03-03

### Fixed
Expand Down
2 changes: 1 addition & 1 deletion moPepGen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from . import constant


__version__ = '1.4.6-rc2'
__version__ = '1.4.6-rc3'

## Error messages
ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron'
Expand Down
261 changes: 160 additions & 101 deletions moPepGen/cli/call_variant_peptide.py

Large diffs are not rendered by default.

61 changes: 59 additions & 2 deletions moPepGen/cli/parse_arriba.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,18 @@
[Arriba](https://github.com/suhrig/arriba) and saves as a GVF file. The GVF
file can be later used to call variant peptides using
[callVariant](call-variant.md)."""
from typing import List
from __future__ import annotations
from typing import TYPE_CHECKING
from pathlib import Path
import argparse
from moPepGen import get_logger, seqvar, parser, err
from moPepGen.cli import common


if TYPE_CHECKING:
from typing import List
from logging import Logger

INPUT_FILE_FORMATS = ['.tsv', '.txt']
OUTPUT_FILE_FORMATS = ['.gvf']

Expand Down Expand Up @@ -50,16 +55,50 @@ def add_subparser_parse_arriba(subparsers:argparse._SubParsersAction):
metavar='<choice>',
default='medium'
)
common.add_args_skip_failed(p)
common.add_args_source(p)
common.add_args_reference(p, proteome=False)
common.add_args_debug_level(p)
p.set_defaults(func=parse_arriba)
common.print_help_if_missing_args(p)
return p

class TallyTable():
    """ Tally of how the parsed Arriba records were handled.

    This class only stores the counters and reports them; the counters are
    plain public attributes that the calling code increments.

    Attributes:
        total (int): Number of records read.
        succeed (int): Number of records successfully processed.
        skipped (TallyTableSkipped): Counts of skipped records, by reason.
        logger (Logger): Logger that the summary is written to.
    """
    def __init__(self, logger:Logger):
        """ Constructor

        Args:
            logger (Logger): Logger used by `log()` to report the summary.
        """
        self.total:int = 0
        self.succeed:int = 0
        self.skipped:TallyTableSkipped = TallyTableSkipped()
        self.logger = logger

    def log(self):
        """ Write the tally summary to the logger at INFO level. The
        per-reason breakdown is only written when at least one record was
        skipped. """
        self.logger.info("Summary:")
        self.logger.info("Total records read: %i", self.total)
        self.logger.info("Records successfully processed: %i", self.succeed)
        self.logger.info("Records skipped: %i", self.skipped.total)
        if self.skipped.total > 0:
            self.logger.info("Out of those skipped,")
            self.logger.info(" Invalid gene ID: %i", self.skipped.invalid_gene_id)
            self.logger.info(" Invalid position: %i", self.skipped.invalid_position)
            self.logger.info(" Insufficient evidence: %i", self.skipped.insufficient_evidence)
            self.logger.info(" Antisense strand: %i", self.skipped.antisense_strand)

class TallyTableSkipped():
    """ Counters for skipped records, one per skip reason plus an overall
    total. All of them are plain integer attributes. """
    def __init__(self):
        """ Start every counter at zero. """
        # Overall number of skipped records.
        self.total:int = 0
        # Per-reason counters.
        self.antisense_strand:int = 0
        self.insufficient_evidence:int = 0
        self.invalid_position:int = 0
        self.invalid_gene_id:int = 0

def parse_arriba(args:argparse.Namespace) -> None:
""" Parse Arriba output and save it in GVF format. """
logger = get_logger()
tally = TallyTable(logger)
# unpack args
fusion = args.input_path
output_path:Path = args.output_path
Expand All @@ -82,17 +121,33 @@ def parse_arriba(args:argparse.Namespace) -> None:

with open(fusion, 'rt') as handle:
for record in parser.ArribaParser.parse(handle):
tally.total += 1
if not record.gene_id1 in anno.genes or not record.gene_id2 in anno.genes:
tally.skipped.invalid_gene_id += 1
tally.skipped.total += 1
continue
if not record.is_valid(min_split_read1, min_split_read2, min_confidence):
tally.skipped.insufficient_evidence += 1
tally.skipped.total += 1
continue
if record.transcript_on_antisense_strand(anno):
tally.skipped.antisense_strand += 1
tally.skipped.total += 1
continue
try:
var_records = record.convert_to_variant_records(anno, genome)
variants.extend(var_records)
tally.succeed += 1
except err.GeneNotFoundError:
tally.skipped.invalid_gene_id += 1
tally.skipped.total += 1
continue
variants.extend(var_records)
except:
if args.skip_failed:
tally.skipped.invalid_position += 1
tally.skipped.total += 1
continue
raise

logger.info('Arriba output %s loaded.', fusion)

Expand All @@ -110,3 +165,5 @@ def parse_arriba(args:argparse.Namespace) -> None:
seqvar.io.write(variants, output_path, metadata)

logger.info("Variants written to disk.")

tally.log()
72 changes: 58 additions & 14 deletions moPepGen/cli/parse_circexplorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,18 @@
[callVariant](call-variant.md). Noted that only known circRNA is supported (
\*_circular_known.txt) """
from __future__ import annotations
from typing import TYPE_CHECKING
import argparse
from typing import List, Dict
from pathlib import Path
from moPepGen import get_logger, circ, err
from moPepGen.parser import CIRCexplorerParser
from moPepGen.cli import common


if TYPE_CHECKING:
from typing import List, Dict
from logging import Logger

INPUT_FILE_FORMATS = ['.tsv', '.txt']
OUTPUT_FILE_FORMATS = ['.gvf']

Expand Down Expand Up @@ -74,16 +78,48 @@ def add_subparser_parse_circexplorer(subparsers:argparse._SubParsersAction):
default='-100,5',
metavar='<number>'
)
common.add_args_skip_failed(p)
common.add_args_source(p)
common.add_args_reference(p, genome=False, proteome=False)
common.add_args_debug_level(p)
p.set_defaults(func=parse_circexplorer)
common.print_help_if_missing_args(p)
return p

class TallyTable():
    """ Tally of how the parsed CIRCexplorer records were handled.

    This class only stores the counters and reports them; the counters are
    plain public attributes that the calling code increments.

    Attributes:
        total (int): Number of records read.
        succeed (int): Number of records successfully processed.
        skipped (TallyTableSkipped): Counts of skipped records, by reason.
        logger (Logger): Logger that the summary is written to.
    """
    def __init__(self, logger:Logger):
        """ Constructor

        Args:
            logger (Logger): Logger used by `log()` to report the summary.
        """
        self.total:int = 0
        self.succeed:int = 0
        self.skipped:TallyTableSkipped = TallyTableSkipped()
        self.logger = logger

    def log(self):
        """ Write the tally summary to the logger at INFO level. The
        per-reason breakdown is only written when at least one record was
        skipped. """
        self.logger.info("Summary:")
        self.logger.info("Total records read: %i", self.total)
        self.logger.info("Records successfully processed: %i", self.succeed)
        self.logger.info("Records skipped: %i", self.skipped.total)
        if self.skipped.total > 0:
            self.logger.info("Out of those skipped,")
            self.logger.info(" Invalid circRNA record: %i", self.skipped.invalid_record)
            self.logger.info(" Insufficient evidence: %i", self.skipped.insufficient_evidence)

class TallyTableSkipped():
    """ Counters for skipped records, one per skip reason plus an overall
    total. All of them are plain integer attributes. """
    def __init__(self):
        """ Start every counter at zero. """
        # Overall number of skipped records.
        self.total:int = 0
        # Per-reason counters.
        self.invalid_record:int = 0
        self.insufficient_evidence:int = 0
        # NOTE(review): the two counters below are not read by the tally
        # summary in this module -- confirm whether they are still needed.
        self.invalid_position:int = 0
        self.invalid_gene_id:int = 0

def parse_circexplorer(args:argparse.Namespace):
""" Parse circexplorer known circRNA results. """
logger = get_logger()
tally = TallyTable(logger)

input_path:Path = args.input_path
output_path:Path = args.output_path
Expand All @@ -104,11 +140,16 @@ def parse_circexplorer(args:argparse.Namespace):
circ_records:Dict[str, List[circ.CircRNAModel]] = {}

for record in CIRCexplorerParser.parse(input_path, args.circexplorer3):
tally.total += 1
if not args.circexplorer3:
if not record.is_valid(args.min_read_number):
tally.skipped.total += 1
tally.skipped.insufficient_evidence += 1
continue
elif not record.is_valid(args.min_read_number, args.min_fbr_circ, \
args.min_circ_score):
tally.skipped.total += 1
tally.skipped.insufficient_evidence += 1
continue
try:
circ_record = record.convert_to_circ_rna(anno, intron_start_range,
Expand All @@ -119,13 +160,17 @@ def parse_circexplorer(args:argparse.Namespace):
" Skipping it from parsing.",
record.name, record.isoform_name
)
tally.skipped.invalid_record += 1
tally.skipped.total += 1
continue
except err.IntronNotFoundError:
logger.warning(
"The CIRCexplorer record %s from transcript %s contains an unknown"
" intron. Skipping it from parsing.",
record.name, record.isoform_name
)
tally.skipped.invalid_record += 1
tally.skipped.total += 1
continue
except:
logger.error('Exception raised from record: %s', record.name)
Expand All @@ -135,21 +180,20 @@ def parse_circexplorer(args:argparse.Namespace):
circ_records[gene_id] = []
circ_records[gene_id].append(circ_record)

if not circ_records:
logger.warning('No variant record is saved.')
return
if circ_records:
genes_rank = anno.get_genes_rank()
ordered_keys = sorted(circ_records.keys(), key=lambda x:genes_rank[x])

genes_rank = anno.get_genes_rank()
ordered_keys = sorted(circ_records.keys(), key=lambda x:genes_rank[x])
records = []
for key in ordered_keys:
val = circ_records[key]
records.extend(val)

records = []
for key in ordered_keys:
val = circ_records[key]
records.extend(val)
metadata = common.generate_metadata(args)

metadata = common.generate_metadata(args)
with open(output_path, 'w') as handle:
circ.io.write(records, metadata, handle)

with open(output_path, 'w') as handle:
circ.io.write(records, metadata, handle)
logger.info("CircRNA records written to disk.")

logger.info("CircRNA records written to disk.")
tally.log()
60 changes: 55 additions & 5 deletions moPepGen/cli/parse_fusion_catcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,18 @@
[FusionCatcher](https://github.com/ndaniel/fusioncatcher) and save as a
GVF file. The GVF file can be later used to call variant peptides using
[callVariant](call-variant.md)."""
from typing import List
from __future__ import annotations
from typing import TYPE_CHECKING
from pathlib import Path
import argparse
from moPepGen import get_logger, seqvar, parser, err
from moPepGen.cli import common


if TYPE_CHECKING:
from typing import List
from logging import Logger

INPUT_FILE_FORMATS = ['.tsv', '.txt']
OUTPUT_FILE_FORMATS = ['.gvf']

Expand Down Expand Up @@ -42,16 +47,48 @@ def add_subparser_parse_fusion_catcher(subparsers:argparse._SubParsersAction):
default=5,
metavar='<number>'
)
common.add_args_skip_failed(p)
common.add_args_source(p)
common.add_args_reference(p, proteome=False)
common.add_args_debug_level(p)
p.set_defaults(func=parse_fusion_catcher)
common.print_help_if_missing_args(p)
return p

class TallyTable():
    """ Tally of how the parsed FusionCatcher records were handled.

    This class only stores the counters and reports them; the counters are
    plain public attributes that the calling code increments.

    Attributes:
        total (int): Number of records read.
        succeed (int): Number of records successfully processed.
        skipped (TallyTableSkipped): Counts of skipped records, by reason.
        logger (Logger): Logger that the summary is written to.
    """
    def __init__(self, logger:Logger):
        """ Constructor

        Args:
            logger (Logger): Logger used by `log()` to report the summary.
        """
        self.total:int = 0
        self.succeed:int = 0
        self.skipped:TallyTableSkipped = TallyTableSkipped()
        self.logger = logger

    def log(self):
        """ Write the tally summary to the logger at INFO level. The
        per-reason breakdown is only written when at least one record was
        skipped. """
        self.logger.info("Summary:")
        self.logger.info("Total records read: %i", self.total)
        self.logger.info("Records successfully processed: %i", self.succeed)
        self.logger.info("Records skipped: %i", self.skipped.total)
        if self.skipped.total > 0:
            self.logger.info("Out of those skipped,")
            self.logger.info(" Invalid gene ID: %i", self.skipped.invalid_gene_id)
            self.logger.info(" Invalid position: %i", self.skipped.invalid_position)
            self.logger.info(" Insufficient evidence: %i", self.skipped.insufficient_evidence)

class TallyTableSkipped():
    """ Counters for skipped records, one per skip reason plus an overall
    total. All of them are plain integer attributes. """
    def __init__(self):
        """ Start every counter at zero. """
        # Overall number of skipped records.
        self.total:int = 0
        # Per-reason counters.
        self.insufficient_evidence:int = 0
        self.invalid_position:int = 0
        self.invalid_gene_id:int = 0

def parse_fusion_catcher(args:argparse.Namespace) -> None:
""" Parse FusionCatcher output and save it in GVF format. """
logger = get_logger()
tally = TallyTable(logger)
# unpack args
fusion = args.input_path
output_path:Path = args.output_path
Expand All @@ -69,15 +106,26 @@ def parse_fusion_catcher(args:argparse.Namespace) -> None:
variants:List[seqvar.VariantRecord] = []

for record in parser.FusionCatcherParser.parse(fusion):
if record.counts_of_common_mapping_reads > args.max_common_mapping:
continue
if record.spanning_unique_reads < args.min_spanning_unique:
tally.total += 1
if record.counts_of_common_mapping_reads > args.max_common_mapping \
or record.spanning_unique_reads < args.min_spanning_unique:
tally.skipped.insufficient_evidence += 1
tally.skipped.total += 1
continue
try:
var_records = record.convert_to_variant_records(anno, genome)
variants.extend(var_records)
tally.succeed += 1
except err.GeneNotFoundError:
tally.skipped.invalid_gene_id += 1
tally.skipped.total += 1
continue
variants.extend(var_records)
except:
if args.skip_failed:
tally.skipped.total += 1
tally.skipped.invalid_position += 1
continue
raise

logger.info('FusionCatcher output %s loaded.', fusion)

Expand All @@ -95,3 +143,5 @@ def parse_fusion_catcher(args:argparse.Namespace) -> None:
seqvar.io.write(variants, output_path, metadata)

logger.info("Variants written to disk.")

tally.log()
Loading
Loading