diff --git a/CHANGELOG.md b/CHANGELOG.md index bd6bb75f..01143dde 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,12 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ## [Unreleased] +## [1.4.6-rc3] - 2025-03-10 + +- Added --skip-failed flag to callVariant, parseArriba, parserSTARFusion, parseFusionCatcher. + +- Added tally table to parseREDITools, parseCIRCExplorer, and parseRMATS + ## [1.4.6-rc2] - 2025-03-03 ### Fixed diff --git a/moPepGen/__init__.py b/moPepGen/__init__.py index d0a27169..7cb69223 100644 --- a/moPepGen/__init__.py +++ b/moPepGen/__init__.py @@ -8,7 +8,7 @@ from . import constant -__version__ = '1.4.6-rc2' +__version__ = '1.4.6-rc3' ## Error messages ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron' diff --git a/moPepGen/cli/call_variant_peptide.py b/moPepGen/cli/call_variant_peptide.py index b95fe9cb..af640338 100644 --- a/moPepGen/cli/call_variant_peptide.py +++ b/moPepGen/cli/call_variant_peptide.py @@ -32,6 +32,7 @@ INPUT_FILE_FORMATS = ['.gvf'] OUTPUT_FILE_FORMATS = ['.fasta', '.fa'] + # pylint: disable=W0212 def add_subparser_call_variant(subparsers:argparse._SubParsersAction): """ CLI for moPepGen callPeptides """ @@ -143,6 +144,7 @@ def add_subparser_call_variant(subparsers:argparse._SubParsersAction): default=1, metavar='<number>' ) + common.add_args_skip_failed(p) common.add_args_reference(p) common.add_args_cleavage(p) common.add_args_debug_level(p) @@ -158,6 +160,12 @@ def __init__(self, logger:Logger): """ constructor """ self.n_transcripts_total = 0 self.n_transcripts_processed = 0 + self.n_transcripts_invalid = 0 + self.n_transcripts_failed = { + 'variant': 0, + 'fusion': 0, + 'circRNA': 0 + } self.n_total_peptides = 0 self.n_valid_peptides = 0 self.logger = logger @@ -167,6 +175,11 @@ def log(self): self.logger.info("callVariant summary:") self.logger.info("Total transcripts with variants: %i", self.n_transcripts_total) self.logger.info("Total transcripts processed: %i", self.n_transcripts_processed) + self.logger.info("Number of invalid transcripts: %i", self.n_transcripts_invalid) + self.logger.info("Number of transcripts failed when calling for:") + self.logger.info(" Variant peptides: %i", self.n_transcripts_failed['variant']) + self.logger.info(" Fusion peptides: %i", self.n_transcripts_failed['fusion']) + self.logger.info(" circRNA peptides: %i", self.n_transcripts_failed['circRNA']) self.logger.info( "Total variant peptides generated (including redundant): %i", self.n_total_peptides @@ -300,16 +313,112 @@ def write_pgraphs(self, tx_id:str, pgraphs:TypePGraphs): with open(self.graph_output_dir/f"{tx_id}_circRNA_{var_id}_PVG.json", 'wt') as handle: json.dump(data, handle) -TypeDGraphs = Tuple[ - svgraph.ThreeFrameTVG, - Dict[str, svgraph.ThreeFrameTVG], - Dict[str, svgraph.ThreeFrameCVG] -] -TypePGraphs = Tuple[ - svgraph.PeptideVariantGraph, - Dict[str, svgraph.PeptideVariantGraph], - Dict[str, svgraph.PeptideVariantGraph] -] + def gather_data_for_call_variant(self, tx_id:str, pool:seqvar.VariantRecordPoolOnDisk): + """ Gather data for callVariant """ + ref = self.reference_data + tx_ids = [tx_id] + tx_model = ref.anno.transcripts[tx_id] + try: + variant_series = pool[tx_id] + except ValueError: + if self.args.skip_failed: + self.tally.n_transcripts_invalid += 1 + return None + raise + if variant_series.is_empty(): + return None + if self.noncanonical_transcripts and \ + not variant_series.has_any_noncanonical_transcripts(): + return None + tx_ids += variant_series.get_additional_transcripts() + + gene_seqs = {} + if variant_series.is_gene_sequence_needed(): + gene_id = tx_model.transcript.gene_id + _chrom = tx_model.transcript.chrom + gene_model = ref.anno.genes[gene_id] + gene_seq = gene_model.get_gene_sequence(ref.genome[_chrom]) + gene_seqs[gene_id] = gene_seq + + tx_seqs = {} + for _tx_id in tx_ids: + _tx_model = ref.anno.transcripts[_tx_id] + _chrom = _tx_model.transcript.chrom + tx_seqs[_tx_id] = _tx_model.get_transcript_sequence(ref.genome[_chrom]) + _gene_id = _tx_model.transcript.gene_id + if _gene_id not in gene_seqs: + _gene_model = ref.anno.genes[_gene_id] + _gene_seq = _gene_model.get_gene_sequence(ref.genome[_chrom]) + gene_seqs[_gene_id] = _gene_seq + gene_ids = list({ref.anno.transcripts[x].transcript.gene_id for x in tx_ids}) + + gene_models = {} + for gene_id in gene_ids: + dummy_gene_model = copy.deepcopy(ref.anno.genes[gene_id]) + dummy_gene_model.transcripts = [x for x in dummy_gene_model.transcripts + if x in tx_ids] + gene_models[gene_id] = dummy_gene_model + + dummy_anno = gtf.GenomicAnnotation( + genes=gene_models, + transcripts={tx_id: ref.anno.transcripts[tx_id] for tx_id in tx_ids}, + source=ref.anno.source + ) + reference_data = params.ReferenceData( + genome=None, + anno=dummy_anno, + canonical_peptides=set() # Not providing canonical peptide at this point. + ) + dummy_pool = seqvar.VariantRecordPool() + dummy_pool.anno = dummy_anno + dummy_pool.data[tx_id] = variant_series + for add_tx in tx_ids: + if add_tx != tx_id: + try: + dummy_pool[add_tx] = pool[add_tx] + except KeyError: + continue + + return { + 'tx_id': tx_id, + 'variant_series': variant_series, + 'tx_seqs': tx_seqs, + 'gene_seqs': gene_seqs, + 'reference_data': reference_data, + 'pool': dummy_pool, + 'cleavage_params': self.cleavage_params, + 'noncanonical_transcripts': self.noncanonical_transcripts, + 'max_adjacent_as_mnv': self.max_adjacent_as_mnv, + 'truncate_sec': self.truncate_sec, + 'w2f_reassignment': self.w2f_reassignment, + 'save_graph': self.graph_output_dir is not None, + 'timeout': self.args.timeout_seconds, + 'max_variants_per_node': tuple(self.args.max_variants_per_node), + 'additional_variants_per_misc': tuple(self.args.additional_variants_per_misc), + 'backsplicing_only': self.args.backsplicing_only, + 'coding_novel_orf': self.args.coding_novel_orf, + 'skip_failed': self.args.skip_failed + } + + +if TYPE_CHECKING: + TypeDGraphs = Tuple[ + svgraph.ThreeFrameTVG, + Dict[str, svgraph.ThreeFrameTVG], + Dict[str, svgraph.ThreeFrameCVG] + ] + TypePGraphs = Tuple[ + svgraph.PeptideVariantGraph, + Dict[str, svgraph.PeptideVariantGraph], + Dict[str, svgraph.PeptideVariantGraph] + ] + CallVariantOutput = Tuple[ + Dict[Seq, List[AnnotatedPeptideLabel]], + str, + TypeDGraphs, + TypePGraphs, + Tuple[bool,bool,bool] + ] # pylint: disable=unused-argument @common.timeout() def call_variant_peptides_wrapper(tx_id:str, @@ -326,9 +435,11 @@ def call_variant_peptides_wrapper(tx_id:str, backsplicing_only:bool, save_graph:bool, coding_novel_orf:bool, + skip_failed:bool, **kwargs - ) -> Tuple[Dict[Seq, List[AnnotatedPeptideLabel]], str, TypeDGraphs, TypePGraphs]: + ) -> CallVariantOutput: """ wrapper function to call variant peptides """ + # pylint: disable=W0702 logger = get_logger() peptide_anno:Dict[Seq, Dict[str, AnnotatedPeptideLabel]] = {} @@ -347,6 +458,7 @@ def add_peptide_anno(x:Dict[Seq, List[AnnotatedPeptideLabel]]): ) dgraphs:TypeDGraphs = (None, {}, {}) pgraphs:TypePGraphs = (None, {}, {}) + success_flags = (True, True, True) if variant_series.transcriptional: try: if not noncanonical_transcripts or \ @@ -368,8 +480,12 @@ def add_peptide_anno(x:Dict[Seq, List[AnnotatedPeptideLabel]]): pgraphs = (pgraph, pgraphs[1], pgraphs[2]) add_peptide_anno(peptide_map) except: - logger.error('Exception raised from %s', tx_id) - raise + if skip_failed: + logger.warning('Variant peptides calling failed from %s', tx_id) + success_flags = (False, success_flags[1], success_flags[2]) + else: + logger.error('Exception raised from %s', tx_id) + raise exclude_variant_types = ['Fusion', 'Insertion', 'Deletion', 'Substitution', 'circRNA'] for variant in variant_series.fusion: @@ -398,11 +514,18 @@ def add_peptide_anno(x:Dict[Seq, List[AnnotatedPeptideLabel]]): pgraphs[1][variant.id] = pgraph add_peptide_anno(peptide_map) except: - logger.error( - "Exception raised from fusion %s, donor: %s, accepter: %s", - variant.id, tx_id, variant.attrs['ACCEPTER_TRANSCRIPT_ID'] - ) - raise + if skip_failed: + logger.warning( + 'Variant peptides calling failed from %s with fusion: %s', + tx_id, variant.id + ) + success_flags = (success_flags[0], False, success_flags[2]) + else: + logger.error( + "Exception raised from fusion %s, donor: %s, accepter: %s", + variant.id, tx_id, variant.attrs['ACCEPTER_TRANSCRIPT_ID'] + ) + raise if main_peptides: denylist.update([str(x) for x in main_peptides]) @@ -417,15 +540,22 @@ def add_peptide_anno(x:Dict[Seq, List[AnnotatedPeptideLabel]]): ) except: - logger.error("Exception raised from %s", circ_model.id) - raise + if skip_failed: + logger.warning( + 'Variant peptides calling failed from %s with circRNA: %s', + tx_id, circ_model.id + ) + success_flags = (success_flags[0], success_flags[1], False) + else: + logger.error("Exception raised from %s", circ_model.id) + raise dgraphs[2][circ_model.id] = cgraph pgraphs[2][circ_model.id] = pgraph add_peptide_anno(peptide_map) peptide_anno = {k:list(v.values()) for k,v in peptide_anno.items()} - return peptide_anno, tx_id, dgraphs, pgraphs + return peptide_anno, tx_id, dgraphs, pgraphs, success_flags def caller_reducer(dispatch): """ wrapper for ParallelPool. Also reduces the complexity if the run is timed out. """ @@ -462,7 +592,6 @@ def call_variant_peptide(args:argparse.Namespace) -> None: common.print_start_message(args) caller.load_reference() caller.create_in_disk_variant_pool() - noncanonical_transcripts = caller.noncanonical_transcripts ref = caller.reference_data logger = get_logger() caller.logger = logger @@ -482,9 +611,6 @@ def call_variant_peptide(args:argparse.Namespace) -> None: logger.info('Variants sorted') caller.tally.n_transcripts_total = len(tx_sorted) - # Not providing canonical peptide pool to each task for now. - canonical_peptides = set() - if caller.threads > 1: process_pool = ParallelPool(ncpus=caller.threads) @@ -493,82 +619,9 @@ def call_variant_peptide(args:argparse.Namespace) -> None: dispatches = [] i = 0 for tx_id in tx_sorted: - tx_ids = [tx_id] - tx_model = ref.anno.transcripts[tx_id] - variant_series = pool[tx_id] - if variant_series.is_empty(): + dispatch = caller.gather_data_for_call_variant(tx_id, pool) + if not dispatch: continue - if noncanonical_transcripts and \ - not variant_series.has_any_noncanonical_transcripts(): - continue - tx_ids += variant_series.get_additional_transcripts() - - gene_seqs = {} - if variant_series.is_gene_sequence_needed(): - gene_id = tx_model.transcript.gene_id - _chrom = tx_model.transcript.chrom - gene_model = ref.anno.genes[gene_id] - gene_seq = gene_model.get_gene_sequence(ref.genome[_chrom]) - gene_seqs[gene_id] = gene_seq - - tx_seqs = {} - for _tx_id in tx_ids: - _tx_model = ref.anno.transcripts[_tx_id] - _chrom = _tx_model.transcript.chrom - tx_seqs[_tx_id] = _tx_model.get_transcript_sequence(ref.genome[_chrom]) - _gene_id = _tx_model.transcript.gene_id - if _gene_id not in gene_seqs: - _gene_model = ref.anno.genes[_gene_id] - _gene_seq = _gene_model.get_gene_sequence(ref.genome[_chrom]) - gene_seqs[_gene_id] = _gene_seq - gene_ids = list({ref.anno.transcripts[x].transcript.gene_id for x in tx_ids}) - - gene_models = {} - for gene_id in gene_ids: - dummy_gene_model = copy.deepcopy(ref.anno.genes[gene_id]) - dummy_gene_model.transcripts = [x for x in dummy_gene_model.transcripts - if x in tx_ids] - gene_models[gene_id] = dummy_gene_model - - dummy_anno = gtf.GenomicAnnotation( - genes=gene_models, - transcripts={tx_id:ref.anno.transcripts[tx_id] for tx_id in tx_ids}, - source=ref.anno.source - ) - reference_data = params.ReferenceData( - genome=None, - anno=dummy_anno, - canonical_peptides=canonical_peptides - ) - dummy_pool = seqvar.VariantRecordPool() - dummy_pool.anno = dummy_anno - dummy_pool.data[tx_id] = variant_series - for add_tx in tx_ids: - if add_tx != tx_id: - try: - dummy_pool[add_tx] = caller.variant_record_pool[add_tx] - except KeyError: - continue - - dispatch = { - 'tx_id': tx_id, - 'variant_series': variant_series, - 'tx_seqs': tx_seqs, - 'gene_seqs': gene_seqs, - 'reference_data': reference_data, - 'pool': dummy_pool, - 'cleavage_params': caller.cleavage_params, - 'noncanonical_transcripts': noncanonical_transcripts, - 'max_adjacent_as_mnv': caller.max_adjacent_as_mnv, - 'truncate_sec': caller.truncate_sec, - 'w2f_reassignment': caller.w2f_reassignment, - 'save_graph': caller.graph_output_dir is not None, - 'timeout': args.timeout_seconds, - 'max_variants_per_node': tuple(args.max_variants_per_node), - 'additional_variants_per_misc': tuple(args.additional_variants_per_misc), - 'backsplicing_only': args.backsplicing_only, - 'coding_novel_orf': args.coding_novel_orf - } dispatches.append(dispatch) caller.tally.n_transcripts_processed += 1 @@ -586,7 +639,7 @@ def call_variant_peptide(args:argparse.Namespace) -> None: results = [ caller_reducer(dispatches[0]) ] # pylint: disable=W0621 - for peptide_anno, tx_id, dgraphs, pgraphs in results: + for peptide_anno, tx_id, dgraphs, pgraphs, success_flags in results: caller.tally.n_total_peptides += len(peptide_anno) for peptide in peptide_anno: is_valid = peptide_table.is_valid( @@ -600,6 +653,12 @@ def call_variant_peptide(args:argparse.Namespace) -> None: if caller.graph_output_dir is not None: caller.write_dgraphs(tx_id, dgraphs) caller.write_pgraphs(tx_id, pgraphs) + if not success_flags[0]: + caller.tally.n_transcripts_failed['variant'] += 1 + if not success_flags[1]: + caller.tally.n_transcripts_failed['fusion'] += 1 + if not success_flags[2]: + caller.tally.n_transcripts_failed['circRNA'] += 1 dispatches = [] i += 1 diff --git a/moPepGen/cli/parse_arriba.py b/moPepGen/cli/parse_arriba.py index 39abf58f..4ffc1ded 100644 --- a/moPepGen/cli/parse_arriba.py +++ b/moPepGen/cli/parse_arriba.py @@ -2,13 +2,18 @@ [Arriba](https://github.com/suhrig/arriba) and saves as a GVF file. The GVF file can be later used to call variant peptides using [callVariant](call-variant.md).""" -from typing import List +from __future__ import annotations +from typing import TYPE_CHECKING from pathlib import Path import argparse from moPepGen import get_logger, seqvar, parser, err from moPepGen.cli import common +if TYPE_CHECKING: + from typing import List + from logging import Logger + INPUT_FILE_FORMATS = ['.tsv', '.txt'] OUTPUT_FILE_FORMATS = ['.gvf'] @@ -50,6 +55,7 @@ def add_subparser_parse_arriba(subparsers:argparse._SubParsersAction): metavar='<choice>', default='medium' ) + common.add_args_skip_failed(p) common.add_args_source(p) common.add_args_reference(p, proteome=False) common.add_args_debug_level(p) @@ -57,9 +63,42 @@ def add_subparser_parse_arriba(subparsers:argparse._SubParsersAction): common.print_help_if_missing_args(p) return p +class TallyTable(): + """ Tally table """ + def __init__(self, logger:Logger): + """ Constructor """ + self.total:int = 0 + self.succeed:int = 0 + self.skipped:TallyTableSkipped = TallyTableSkipped() + self.logger = logger + + def log(self): + """ Show tally results """ + self.logger.info("Summary:") + self.logger.info("Totally records read: %i", self.total) + self.logger.info("Records successfully processed: %i", self.succeed) + self.logger.info("Records skipped: %i", self.skipped.total) + if self.skipped.total > 0: + self.logger.info("Out of those skipped,") + self.logger.info(" Invalid gene ID: %i", self.skipped.invalid_gene_id) + self.logger.info(" Invalid position: %i", self.skipped.invalid_position) + self.logger.info(" Insufficient evidence: %i", self.skipped.insufficient_evidence) + self.logger.info(" Antisense strand: %i", self.skipped.antisense_strand) + +class TallyTableSkipped(): + """ Tally table for failed ones """ + def __init__(self): + """ constructor """ + self.invalid_gene_id:int = 0 + self.invalid_position:int = 0 + self.insufficient_evidence:int = 0 + self.antisense_strand:int = 0 + self.total:int = 0 + def parse_arriba(args:argparse.Namespace) -> None: """ Parse Arriba output and save it in GVF format. """ logger = get_logger() + tally = TallyTable(logger) # unpack args fusion = args.input_path output_path:Path = args.output_path @@ -82,17 +121,33 @@ def parse_arriba(args:argparse.Namespace) -> None: with open(fusion, 'rt') as handle: for record in parser.ArribaParser.parse(handle): + tally.total += 1 if not record.gene_id1 in anno.genes or not record.gene_id2 in anno.genes: + tally.skipped.invalid_gene_id += 1 + tally.skipped.total += 1 continue if not record.is_valid(min_split_read1, min_split_read2, min_confidence): + tally.skipped.insufficient_evidence += 1 + tally.skipped.total += 1 continue if record.transcript_on_antisense_strand(anno): + tally.skipped.antisense_strand += 1 + tally.skipped.total += 1 continue try: var_records = record.convert_to_variant_records(anno, genome) + variants.extend(var_records) + tally.succeed += 1 except err.GeneNotFoundError: + tally.skipped.invalid_gene_id += 1 + tally.skipped.total += 1 continue - variants.extend(var_records) + except: + if args.skip_failed: + tally.skipped.invalid_position += 1 + tally.skipped.total += 1 + continue + raise logger.info('Arriba output %s loaded.', fusion) @@ -110,3 +165,5 @@ def parse_arriba(args:argparse.Namespace) -> None: seqvar.io.write(variants, output_path, metadata) logger.info("Variants written to disk.") + + tally.log() diff --git a/moPepGen/cli/parse_circexplorer.py b/moPepGen/cli/parse_circexplorer.py index ad7adf3e..0060f551 100644 --- a/moPepGen/cli/parse_circexplorer.py +++ b/moPepGen/cli/parse_circexplorer.py @@ -4,14 +4,18 @@ [callVariant](call-variant.md). Noted that only known circRNA is supported ( \*_circular_known.txt) """ from __future__ import annotations +from typing import TYPE_CHECKING import argparse -from typing import List, Dict from pathlib import Path from moPepGen import get_logger, circ, err from moPepGen.parser import CIRCexplorerParser from moPepGen.cli import common +if TYPE_CHECKING: + from typing import List, Dict + from logging import Logger + INPUT_FILE_FORMATS = ['.tsv', '.txt'] OUTPUT_FILE_FORMATS = ['.gvf'] @@ -74,6 +78,7 @@ def add_subparser_parse_circexplorer(subparsers:argparse._SubParsersAction): default='-100,5', metavar='<number>' ) + common.add_args_skip_failed(p) common.add_args_source(p) common.add_args_reference(p, genome=False, proteome=False) common.add_args_debug_level(p) @@ -81,9 +86,40 @@ def add_subparser_parse_circexplorer(subparsers:argparse._SubParsersAction): common.print_help_if_missing_args(p) return p +class TallyTable(): + """ Tally table """ + def __init__(self, logger:Logger): + """ Constructor """ + self.total:int = 0 + self.succeed:int = 0 + self.skipped:TallyTableSkipped = TallyTableSkipped() + self.logger = logger + + def log(self): + """ Show tally results """ + self.logger.info("Summary:") + self.logger.info("Totally records read: %i", self.total) + self.logger.info("Records successfully processed: %i", self.succeed) + self.logger.info("Records skipped: %i", self.skipped.total) + if self.skipped.total > 0: + self.logger.info("Out of those skipped,") + self.logger.info(" Invalid circRNA record: %i", self.skipped.invalid_record) + self.logger.info(" Insufficient evidence: %i", self.skipped.insufficient_evidence) + +class TallyTableSkipped(): + """ Tally table for failed ones """ + def __init__(self): + """ constructor """ + self.invalid_gene_id:int = 0 + self.invalid_position:int = 0 + self.insufficient_evidence:int = 0 + self.invalid_record:int = 0 + self.total:int = 0 + def parse_circexplorer(args:argparse.Namespace): """ Parse circexplorer known circRNA results. """ logger = get_logger() + tally = TallyTable(logger) input_path:Path = args.input_path output_path:Path = args.output_path @@ -104,11 +140,16 @@ def parse_circexplorer(args:argparse.Namespace): circ_records:Dict[str, List[circ.CircRNAModel]] = {} for record in CIRCexplorerParser.parse(input_path, args.circexplorer3): + tally.total += 1 if not args.circexplorer3: if not record.is_valid(args.min_read_number): + tally.skipped.total += 1 + tally.skipped.insufficient_evidence += 1 continue elif not record.is_valid(args.min_read_number, args.min_fbr_circ, \ args.min_circ_score): + tally.skipped.total += 1 + tally.skipped.insufficient_evidence += 1 continue try: circ_record = record.convert_to_circ_rna(anno, intron_start_range, @@ -119,6 +160,8 @@ def parse_circexplorer(args:argparse.Namespace): " Skipping it from parsing.", record.name, record.isoform_name ) + tally.skipped.invalid_record += 1 + tally.skipped.total += 1 continue except err.IntronNotFoundError: logger.warning( @@ -126,6 +169,8 @@ def parse_circexplorer(args:argparse.Namespace): " intron. Skipping it from parsing.", record.name, record.isoform_name ) + tally.skipped.invalid_record += 1 + tally.skipped.total += 1 continue except: logger.error('Exception raised from record: %s', record.name) @@ -135,21 +180,20 @@ def parse_circexplorer(args:argparse.Namespace): circ_records[gene_id] = [] circ_records[gene_id].append(circ_record) - if not circ_records: - logger.warning('No variant record is saved.') - return + if circ_records: + genes_rank = anno.get_genes_rank() + ordered_keys = sorted(circ_records.keys(), key=lambda x:genes_rank[x]) - genes_rank = anno.get_genes_rank() - ordered_keys = sorted(circ_records.keys(), key=lambda x:genes_rank[x]) + records = [] + for key in ordered_keys: + val = circ_records[key] + records.extend(val) - records = [] - for key in ordered_keys: - val = circ_records[key] - records.extend(val) + metadata = common.generate_metadata(args) - metadata = common.generate_metadata(args) + with open(output_path, 'w') as handle: + circ.io.write(records, metadata, handle) - with open(output_path, 'w') as handle: - circ.io.write(records, metadata, handle) + logger.info("CircRNA records written to disk.") - logger.info("CircRNA records written to disk.") + tally.log() diff --git a/moPepGen/cli/parse_fusion_catcher.py b/moPepGen/cli/parse_fusion_catcher.py index c9f3ec42..1b71ecc4 100644 --- a/moPepGen/cli/parse_fusion_catcher.py +++ b/moPepGen/cli/parse_fusion_catcher.py @@ -2,13 +2,18 @@ [FusionCatcher](https://github.com/ndaniel/fusioncatcher) and save as a GVF file. The GVF file can be later used to call variant peptides using [callVariant](call-variant.md).""" -from typing import List +from __future__ import annotations +from typing import TYPE_CHECKING from pathlib import Path import argparse from moPepGen import get_logger, seqvar, parser, err from moPepGen.cli import common +if TYPE_CHECKING: + from typing import List + from logging import Logger + INPUT_FILE_FORMATS = ['.tsv', '.txt'] OUTPUT_FILE_FORMATS = ['.gvf'] @@ -42,6 +47,7 @@ def add_subparser_parse_fusion_catcher(subparsers:argparse._SubParsersAction): default=5, metavar='<number>' ) + common.add_args_skip_failed(p) common.add_args_source(p) common.add_args_reference(p, proteome=False) common.add_args_debug_level(p) @@ -49,9 +55,40 @@ def add_subparser_parse_fusion_catcher(subparsers:argparse._SubParsersAction): common.print_help_if_missing_args(p) return p +class TallyTable(): + """ Tally table """ + def __init__(self, logger:Logger): + """ Constructor """ + self.total:int = 0 + self.succeed:int = 0 + self.skipped:TallyTableSkipped = TallyTableSkipped() + self.logger = logger + + def log(self): + """ Show tally results """ + self.logger.info("Summary:") + self.logger.info("Totally records read: %i", self.total) + self.logger.info("Records successfully processed: %i", self.succeed) + self.logger.info("Records skipped: %i", self.skipped.total) + if self.skipped.total > 0: + self.logger.info("Out of those skipped,") + self.logger.info(" Invalid gene ID: %i", self.skipped.invalid_gene_id) + self.logger.info(" Invalid position: %i", self.skipped.invalid_position) + self.logger.info(" Insufficient evidence: %i", self.skipped.insufficient_evidence) + +class TallyTableSkipped(): + """ Tally table for failed ones """ + def __init__(self): + """ constructor """ + self.invalid_gene_id:int = 0 + self.invalid_position:int = 0 + self.insufficient_evidence:int = 0 + self.total:int = 0 + def parse_fusion_catcher(args:argparse.Namespace) -> None: """ Parse FusionCatcher output and save it in GVF format. """ logger = get_logger() + tally = TallyTable(logger) # unpack args fusion = args.input_path output_path:Path = args.output_path @@ -69,15 +106,26 @@ def parse_fusion_catcher(args:argparse.Namespace) -> None: variants:List[seqvar.VariantRecord] = [] for record in parser.FusionCatcherParser.parse(fusion): - if record.counts_of_common_mapping_reads > args.max_common_mapping: - continue - if record.spanning_unique_reads < args.min_spanning_unique: + tally.total += 1 + if record.counts_of_common_mapping_reads > args.max_common_mapping \ + or record.spanning_unique_reads < args.min_spanning_unique: + tally.skipped.insufficient_evidence += 1 + tally.skipped.total += 1 continue try: var_records = record.convert_to_variant_records(anno, genome) + variants.extend(var_records) + tally.succeed += 1 except err.GeneNotFoundError: + tally.skipped.invalid_gene_id += 1 + tally.skipped.total += 1 continue - variants.extend(var_records) + except: + if args.skip_failed: + tally.skipped.total += 1 + tally.skipped.invalid_position += 1 + continue + raise logger.info('FusionCatcher output %s loaded.', fusion) @@ -95,3 +143,5 @@ def parse_fusion_catcher(args:argparse.Namespace) -> None: seqvar.io.write(variants, output_path, metadata) logger.info("Variants written to disk.") + + tally.log() diff --git a/moPepGen/cli/parse_reditools.py b/moPepGen/cli/parse_reditools.py index b1542d8c..63ff9867 100644 --- a/moPepGen/cli/parse_reditools.py +++ b/moPepGen/cli/parse_reditools.py @@ -4,13 +4,17 @@ [callVariant](call-variant.md) """ from __future__ import annotations +from typing import TYPE_CHECKING import argparse from pathlib import Path -from typing import Dict, List from moPepGen import get_logger, seqvar, parser from moPepGen.cli import common +if TYPE_CHECKING: + from typing import Dict, List + from logging import Logger + INPUT_FILE_FORMATS = ['.tsv', '.txt'] OUTPUT_FILE_FORMATS = ['.gvf'] @@ -76,9 +80,26 @@ def add_subparser_parse_reditools(subparsers:argparse._SubParsersAction): common.print_help_if_missing_args(p) return p +class TallyTable(): + """ Tally table """ + def __init__(self, logger:Logger): + """ Constructor """ + self.total:int = 0 + self.succeed:int = 0 + self.skipped:int = 0 + self.logger = logger + + def log(self): + """ Show tally results """ + self.logger.info("Summary:") + self.logger.info("Totally records read: %i", self.total) + self.logger.info("Records successfully processed: %i", self.succeed) + self.logger.info("Records skipped: %i", self.skipped) + def parse_reditools(args:argparse.Namespace) -> None: """ Parse REDItools output and save it in the GVF format. """ logger = get_logger() + tally = TallyTable(logger) # unpack args table_file:Path = args.input_path output_path:Path = args.output_path @@ -102,6 +123,7 @@ def parse_reditools(args:argparse.Namespace) -> None: variants:Dict[str, List[seqvar.VariantRecord]] = {} for record in parser.REDItoolsParser.parse(table_file, transcript_id_column): + tally.total += 1 _vars = record.convert_to_variant_records( anno=anno, min_coverage_alt=min_coverage_alt, @@ -109,6 +131,10 @@ def parse_reditools(args:argparse.Namespace) -> None: min_coverage_rna=min_coverage_rna, min_coverage_dna=min_coverage_dna ) + if not _vars: + tally.skipped += 1 + else: + tally.succeed += 1 for variant in _vars: gene_id = variant.location.seqname if gene_id not in variants: @@ -139,3 +165,5 @@ def parse_reditools(args:argparse.Namespace) -> None: seqvar.io.write(all_records, output_path, metadata) logger.info('Variants written to disk.') + + tally.log() diff --git a/moPepGen/cli/parse_rmats.py b/moPepGen/cli/parse_rmats.py index 2b6c82f6..f0f9a246 100644 --- a/moPepGen/cli/parse_rmats.py +++ b/moPepGen/cli/parse_rmats.py @@ -7,14 +7,18 @@ [callVariant](call-variant.md) """ from __future__ import annotations +from typing import TYPE_CHECKING import argparse -from typing import Dict, Set from pathlib import Path from moPepGen import get_logger, seqvar from moPepGen.parser import RMATSParser from moPepGen.cli import common +if TYPE_CHECKING: + from typing import Dict, Set + from logging import Logger + INPUT_FILE_FORMATS = ['.tsv', '.txt'] OUTPUT_FILE_FORMATS = ['.gvf'] @@ -102,9 +106,27 @@ def add_subparser_parse_rmats(subparsers:argparse._SubParsersAction): common.print_help_if_missing_args(p) return p +class TallyTable(): + """ Tally table """ + def __init__(self, logger:Logger): + """ Constructor """ + self.total:int = 0 + self.succeed:int = 0 + self.skipped:int = 0 + self.logger = logger + + def log(self): + """ Show tally results """ + self.logger.info("Summary:") + self.logger.info("Totally records read: %i", self.total) + self.logger.info("Records successfully processed: %i", self.succeed) + self.logger.info("Records skipped: %i", self.skipped) + + def parse_rmats(args:argparse.Namespace) -> None: """ Parse rMATS results into TSV """ logger = get_logger() + tally = TallyTable(logger) skipped_exon = args.skipped_exon alternative_5 = args.alternative_5_splicing @@ -136,6 +158,7 @@ def parse_rmats(args:argparse.Namespace) -> None: if path: logger.info("Start parsing %s file %s", event_type, path) for record in RMATSParser.parse(path, event_type): + tally.total += 1 try: var_records = record.convert_to_variant_records( anno=anno, genome=genome, @@ -144,6 +167,10 @@ def parse_rmats(args:argparse.Namespace) -> None: except: logger.error(record.gene_id) raise + if var_records: + tally.succeed += 1 + else: + tally.skipped += 1 for var_record in var_records: tx_id = var_record.transcript_id if tx_id not in variants: @@ -168,3 +195,5 @@ def parse_rmats(args:argparse.Namespace) -> None: seqvar.io.write(variants_sorted, output_path, metadata) logger.info('Variants written to disk.') + + tally.log() diff --git a/moPepGen/cli/parse_star_fusion.py b/moPepGen/cli/parse_star_fusion.py index aa44909f..db3bd065 100644 --- a/moPepGen/cli/parse_star_fusion.py +++ b/moPepGen/cli/parse_star_fusion.py @@ -3,12 +3,16 @@ GVF file. The GVF file can be later used to call variant peptides using [callVariant](call-variant.md).""" from __future__ import annotations +from typing import TYPE_CHECKING import argparse -from typing import List from moPepGen import get_logger, seqvar, parser, err from moPepGen.cli import common +if TYPE_CHECKING: + from typing import List + from logging import Logger + INPUT_FILE_FORMATS = ['.tsv', '.txt'] OUTPUT_FILE_FORMATS = ['.gvf'] @@ -37,6 +41,7 @@ def add_subparser_parse_star_fusion(subparsers:argparse._SubParsersAction): default=5.0, metavar='<number>' ) + common.add_args_skip_failed(p) common.add_args_source(p) common.add_args_reference(p, proteome=False) common.add_args_debug_level(p) @@ -44,9 +49,40 @@ def add_subparser_parse_star_fusion(subparsers:argparse._SubParsersAction): common.print_help_if_missing_args(p) return p +class TallyTable(): + """ Tally table """ + def __init__(self, logger:Logger): + """ Constructor """ + self.total:int = 0 + self.succeed:int = 0 + self.skipped:TallyTableSkipped = TallyTableSkipped() + self.logger = logger + + def log(self): + """ Show tally results """ + self.logger.info("Summary:") + self.logger.info("Totally records read: %i", self.total) + self.logger.info("Records successfully processed: %i", self.succeed) + self.logger.info("Records skipped: %i", self.skipped.total) + if self.skipped.total > 0: + self.logger.info("Out of those skipped,") + self.logger.info(" Invalid gene ID: %i", self.skipped.invalid_gene_id) + self.logger.info(" Invalid position: %i", self.skipped.invalid_position) + self.logger.info(" Insufficient evidence: %i", self.skipped.insufficient_evidence) + +class TallyTableSkipped(): + """ Tally table for failed ones """ + def __init__(self): + """ constructor """ + self.invalid_gene_id:int = 0 + self.invalid_position:int = 0 + self.insufficient_evidence:int = 0 + self.total:int = 0 + def parse_star_fusion(args:argparse.Namespace) -> None: """ Parse the STAR-Fusion's output and save it in GVF format. """ logger = get_logger() + tally = TallyTable(logger) # unpack args fusion = args.input_path common.validate_file_format( @@ -64,13 +100,25 @@ def parse_star_fusion(args:argparse.Namespace) -> None: variants:List[seqvar.VariantRecord] = [] for record in parser.STARFusionParser.parse(fusion): + tally.total += 1 if record.est_j < args.min_est_j: + tally.skipped.insufficient_evidence += 1 + tally.skipped.total += 1 continue try: var_records = record.convert_to_variant_records(anno, genome) + variants.extend(var_records) + tally.succeed += 1 except err.GeneNotFoundError: + tally.skipped.invalid_gene_id += 1 + tally.skipped.total += 1 continue - variants.extend(var_records) + except: + if args.skip_failed: + tally.skipped.invalid_position += 1 + tally.skipped.total += 1 + continue + raise logger.info('STAR-Fusion output %s loaded.', fusion) @@ -88,3 +136,5 @@ def parse_star_fusion(args:argparse.Namespace) -> None: seqvar.io.write(variants, output_path, metadata) logger.info('Variant info written to disk.') + + tally.log() diff --git a/moPepGen/cli/parse_vep.py b/moPepGen/cli/parse_vep.py index eb7329d5..60edc0ed 100644 --- a/moPepGen/cli/parse_vep.py +++ b/moPepGen/cli/parse_vep.py @@ -59,7 +59,8 @@ def __init__(self, logger:Logger): def log(self): """ Show tally results """ - self.logger.info("Records successfully processed: %i", self.total) + self.logger.info("Totally records read: %i", self.total) + self.logger.info("Records successfully processed: %i", self.succeed) self.logger.info("Records failed: %i", self.failed.total) if self.failed.total > 0: self.logger.info("Out of those failed,") diff --git a/moPepGen/util/fuzz_test.py b/moPepGen/util/fuzz_test.py index 7941d7cb..84d00f2c 100644 --- a/moPepGen/util/fuzz_test.py +++ b/moPepGen/util/fuzz_test.py @@ -639,6 +639,7 @@ def call_variants(self): args.debug_level = 1 args.timeout_seconds = 300 args.coding_novel_orf = False + args.skip_failed = False call_variant_peptide(args=args) def brute_force(self): diff --git a/moPepGen/util/validate_variant_calling.py b/moPepGen/util/validate_variant_calling.py index 9c0e66bc..3513cdbe 100644 --- a/moPepGen/util/validate_variant_calling.py +++ b/moPepGen/util/validate_variant_calling.py @@ -128,6 +128,7 @@ def call_variant(gvf_files:Path, ref_dir:Path, output_fasta:Path, graph_output_d args.noncanonical_transcripts = False args.threads = 1 args.timeout_seconds = 900 + args.skip_failed = False call_variant_peptide(args=args) def call_brute_force(gvf_files:Path, ref_dir:Path, output_path:str, force:bool, diff --git a/test/integration/test_call_variant_peptides.py b/test/integration/test_call_variant_peptides.py index c02cb336..f57c83f3 100644 --- a/test/integration/test_call_variant_peptides.py +++ b/test/integration/test_call_variant_peptides.py @@ -3,6 +3,7 @@ from typing import List from pathlib import Path import sys +from unittest.mock import Mock import subprocess as sp from test.integration import TestCaseIntegration from test.unit import create_variant @@ -47,6 +48,7 @@ def create_base_args() -> argparse.Namespace: args.invalid_protein_as_noncoding = False args.threads = 1 args.timeout_seconds = 1800 + args.skip_failed = False return args class TestCallVariantPeptides(TestCaseIntegration): @@ -153,6 +155,33 @@ def test_call_variant_peptide_case1_sect_and_w2f(self): expected = {'vep_moPepGen.fasta', 'vep_moPepGen_peptide_table.txt'} self.assertEqual(files, expected) + def test_call_variant_peptide_skip_failed(self): + """ Test errors were handled by --skip-failed in callVariant """ + # pylint: disable=import-outside-toplevel + import importlib.util + spec = importlib.util.spec_from_file_location( + "moPepGen.cli.call_variant_peptide", + Path(__file__).parent.parent.parent/"moPepGen/cli/call_variant_peptide.py" + ) + call_variant_peptide_module = importlib.util.module_from_spec(spec) + sys.modules["moPepGen.cli.call_variant_peptide"] = call_variant_peptide_module + spec.loader.exec_module(call_variant_peptide_module) + + call_variant_peptide_module.call_peptide_main = Mock(side_effect=ValueError()) + + args = create_base_args() + args.input_path = [self.data_dir/'vep'/'vep_gSNP.gvf'] + args.output_path = self.work_dir/'vep_moPepGen.fasta' + args.genome_fasta = self.data_dir/'genome.fasta' + args.annotation_gtf = self.data_dir/'annotation.gtf' + args.proteome_fasta = self.data_dir/'translate.fasta' + + with self.assertRaises(ValueError): + call_variant_peptide_module.call_variant_peptide(args) + + args.skip_failed = True + call_variant_peptide_module.call_variant_peptide(args) + def test_call_variant_peptide_case2(self): """ Test variant peptide calling with fusion """ args = create_base_args() diff --git a/test/integration/test_parse_arriba.py b/test/integration/test_parse_arriba.py index 5df7e2ec..6876df64 100644 --- a/test/integration/test_parse_arriba.py +++ b/test/integration/test_parse_arriba.py @@ -2,14 +2,15 @@ import argparse import subprocess as sp import sys +from unittest import mock from test.integration import TestCaseIntegration from moPepGen import cli class TestParseArriba(TestCaseIntegration): """ Test cases for moPepGen parseSTARFusion """ - def test_parse_arriba(self): - """ Test parseArriba """ + def create_base_args(self) -> argparse.Namespace: + """ Create base args """ args = argparse.Namespace() args.command = 'parseArriba' args.input_path = self.data_dir/'fusion/arriba.txt' @@ -24,6 +25,12 @@ def test_parse_arriba(self): args.min_split_read2 = 1 args.min_confidence = 'medium' args.quiet = False + args.skip_failed = False + return args + + def test_parse_arriba(self): + """ Test parseArriba """ + args = self.create_base_args() cli.parse_arriba(args) files = {str(file.name) for file in self.work_dir.glob('*')} expected = {'arriba.gvf'} @@ -47,3 +54,16 @@ def test_parse_arriba_cli(self): print(cmd) print(res.stderr.decode('utf-8')) raise + + @mock.patch( + "moPepGen.parser.ArribaParser.ArribaRecord.convert_to_variant_records", + new=mock.MagicMock(side_effect=ValueError()) + ) + def test_parse_arriba_skip_failed(self): + """ Test parseArriba with skip failed """ + args = self.create_base_args() + with self.assertRaises(ValueError): + cli.parse_arriba(args) + + args.skip_failed = True + cli.parse_arriba(args) diff --git a/test/integration/test_parse_fusion_catcher.py b/test/integration/test_parse_fusion_catcher.py index 61442ca4..d15a030b 100644 --- a/test/integration/test_parse_fusion_catcher.py +++ b/test/integration/test_parse_fusion_catcher.py @@ -3,6 +3,7 @@ from pathlib import Path import subprocess as sp import sys +from unittest import mock from test.unit import load_references from test.integration import TestCaseIntegration from moPepGen import cli, parser @@ -54,11 +55,10 @@ def test_fusioncatcher_parser(self): right_seq = gene2_seq.seq[_start2:_end2] self.assertEqual(str(right_seq), fusion_seq[1]) - def test_parse_fusion_catcher(self): - """ Test parseFusionCatcher """ + def create_base_args(self) -> argparse.Namespace: + """ Create base args """ args = argparse.Namespace() args.command = 'parseFusionCatcher' - args.input_path = self.data_dir/'fusion/fusion_catcher.txt' args.source = 'Fusion' args.index_dir = None args.genome_fasta = self.data_dir/'genome.fasta' @@ -68,9 +68,30 @@ def test_parse_fusion_catcher(self): args.output_path = self.work_dir/'fusion_catcher.gvf' args.max_common_mapping = 0 args.min_spanning_unique = 5 + args.skip_failed = False args.quiet = True + return args + + def test_parse_fusion_catcher(self): + """ Test parseFusionCatcher """ + args = self.create_base_args() + args.input_path = self.data_dir/'fusion/fusion_catcher.txt' cli.parse_fusion_catcher(args) files = {str(file.name) for file in self.work_dir.glob('*')} expected = {'fusion_catcher.gvf'} self.assertEqual(files, expected) self.assert_gvf_order(args.output_path, args.annotation_gtf) + + @mock.patch( + "moPepGen.parser.FusionCatcherParser.FusionCatcherRecord.convert_to_variant_records", + new=mock.MagicMock(side_effect=ValueError()) + ) + def test_parse_fusion_catcher_skip_failed(self): + """ Test parseFusionCatcher with --skip-failed """ + args = self.create_base_args() + args.input_path = self.data_dir/'fusion/fusion_catcher.txt' + with self.assertRaises(ValueError): + cli.parse_fusion_catcher(args) + + args.skip_failed = True + cli.parse_fusion_catcher(args) diff --git a/test/integration/test_parse_star_fusion.py b/test/integration/test_parse_star_fusion.py index 0507f5ce..aded4686 100644 --- a/test/integration/test_parse_star_fusion.py +++ b/test/integration/test_parse_star_fusion.py @@ -2,6 +2,7 @@ import argparse import subprocess as sp import sys +from unittest import mock from test.integration import TestCaseIntegration from moPepGen import cli, seqvar from moPepGen.cli.common import load_references @@ -27,12 +28,11 @@ def test_parse_star_fusion_cli(self): print(res.stderr.decode('utf-8')) raise - def test_star_fusion_record_case1(self): - """ Test parseSTARFusion """ + def create_base_args(self) -> argparse.Namespace: + """ Create base args """ args = argparse.Namespace() args.command = 'parseSTARFusion' args.source = 'Fusion' - args.input_path = self.data_dir/'fusion/star_fusion.txt' args.index_dir = None args.genome_fasta = self.data_dir/'genome.fasta' args.annotation_gtf = self.data_dir/'annotation.gtf' @@ -40,6 +40,13 @@ def test_star_fusion_record_case1(self): args.output_path = self.work_dir/'star_fusion.gvf' args.min_est_j = 3.0 args.quiet = True + args.skip_failed =False + return args + + def test_star_fusion_record_case1(self): + """ Test parseSTARFusion """ + args = self.create_base_args() + args.input_path = self.data_dir/'fusion/star_fusion.txt' cli.parse_star_fusion(args) files = {str(file.name) for file in self.work_dir.glob('*')} expected = {'star_fusion.gvf'} @@ -64,19 +71,23 @@ def test_star_fusion_record_case1(self): def test_parse_star_fusion_case1(self): """ test parseSTARFusion case1 """ - args = argparse.Namespace() - args.command = 'parseSTARFusion' + args = self.create_base_args() args.input_path = self.data_dir/'fusion/star_fusion.txt' - args.source = 'Fusion' - args.index_dir = None - args.genome_fasta = self.data_dir/'genome.fasta' - args.annotation_gtf = self.data_dir/'annotation.gtf' - args.reference_source = None - args.output_path = self.work_dir/'star_fusion.gvf' - args.min_est_j = 3.0 - args.quiet = True cli.parse_star_fusion(args) files = {str(file.name) for file in self.work_dir.glob('*')} expected = {'star_fusion.gvf'} self.assertEqual(files, expected) self.assert_gvf_order(args.output_path, args.annotation_gtf) + + @mock.patch( + "moPepGen.parser.STARFusionParser.STARFusionRecord.convert_to_variant_records", + new=mock.MagicMock(side_effect=ValueError()) + ) + def test_parse_star_fusion_skip_failed(self): + """ test parseSTARFusion case1 """ + args = self.create_base_args() + args.input_path = self.data_dir/'fusion/star_fusion.txt' + with self.assertRaises(ValueError): + cli.parse_star_fusion(args) + args.skip_failed = True + cli.parse_star_fusion(args)