Skip to content

Commit 1f1637d

Browse files
committed
feat (moPepGen): Added tally table to parseREDITools, parseCIRCExplorer, and parseRMATS
1 parent 7dc5dae commit 1f1637d

7 files changed

+134
-32
lines changed

CHANGELOG.md

+2
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
1414

1515
- Added --skip-failed flag to callVariant, parseArriba, parseSTARFusion, parseFusionCatcher.
1616

17+
- Added tally table to parseREDITools, parseCIRCexplorer, and parseRMATS.
18+
1719
## [1.4.6-rc2] - 2025-03-03
1820

1921
### Fixed

moPepGen/cli/parse_circexplorer.py

+58-14
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,18 @@
44
[callVariant](call-variant.md). Noted that only known circRNA is supported (
55
\*_circular_known.txt) """
66
from __future__ import annotations
7+
from typing import TYPE_CHECKING
78
import argparse
8-
from typing import List, Dict
99
from pathlib import Path
1010
from moPepGen import get_logger, circ, err
1111
from moPepGen.parser import CIRCexplorerParser
1212
from moPepGen.cli import common
1313

1414

15+
if TYPE_CHECKING:
16+
from typing import List, Dict
17+
from logging import Logger
18+
1519
INPUT_FILE_FORMATS = ['.tsv', '.txt']
1620
OUTPUT_FILE_FORMATS = ['.gvf']
1721

@@ -74,16 +78,48 @@ def add_subparser_parse_circexplorer(subparsers:argparse._SubParsersAction):
7478
default='-100,5',
7579
metavar='<number>'
7680
)
81+
common.add_args_skip_failed(p)
7782
common.add_args_source(p)
7883
common.add_args_reference(p, genome=False, proteome=False)
7984
common.add_args_debug_level(p)
8085
p.set_defaults(func=parse_circexplorer)
8186
common.print_help_if_missing_args(p)
8287
return p
8388

89+
class TallyTable():
    """ Counters describing how the CIRCexplorer records were handled.

    The counters are plain attributes that the caller increments while it
    iterates over the input; `log` reports their current values through the
    logger handed to the constructor.
    """
    def __init__(self, logger:Logger):
        """ Keep the logger and start every counter from zero.

        Args:
            logger: Object whose `info` method receives the summary lines.
        """
        self.logger = logger
        # Number of records read from the input.
        self.total:int = 0
        # Number of records counted as processed successfully.
        self.succeed:int = 0
        # Per-reason breakdown of the records that were skipped.
        self.skipped:TallyTableSkipped = TallyTableSkipped()

    def log(self):
        """ Report the tally through `logger.info`.

        The overall counts are always written. The per-reason breakdown is
        only written when at least one record was skipped.
        """
        report = self.logger.info
        skipped = self.skipped

        report("Summary:")
        report("Totally records read: %i", self.total)
        report("Records successfully processed: %i", self.succeed)
        report("Records skipped: %i", skipped.total)

        # Nothing was skipped, so there is no breakdown to show.
        if not skipped.total > 0:
            return

        report("Out of those skipped,")
        report(" Invalid circRNA record: %i", skipped.invalid_record)
        report(" Insufficient evidence: %i", skipped.insufficient_evidence)
108+
109+
class TallyTableSkipped():
    """ Counters for skipped records, broken down by reason.

    Every counter starts at zero and is meant to be incremented by the caller.
    `total` is a separate counter: it is not derived from the per-reason
    counters, so the caller has to bump it alongside them.
    """
    def __init__(self):
        """ Start every counter from zero. """
        # Overall number of skipped records, maintained by the caller.
        self.total:int = 0
        # Records rejected by the validity check on read support.
        self.insufficient_evidence:int = 0
        # Records that could not be converted into a circRNA model.
        self.invalid_record:int = 0
        # NOTE(review): the two counters below are never incremented or
        # reported in the code visible here -- confirm they are still needed.
        self.invalid_gene_id:int = 0
        self.invalid_position:int = 0
118+
84119
def parse_circexplorer(args:argparse.Namespace):
85120
""" Parse circexplorer known circRNA results. """
86121
logger = get_logger()
122+
tally = TallyTable(logger)
87123

88124
input_path:Path = args.input_path
89125
output_path:Path = args.output_path
@@ -104,11 +140,16 @@ def parse_circexplorer(args:argparse.Namespace):
104140
circ_records:Dict[str, List[circ.CircRNAModel]] = {}
105141

106142
for record in CIRCexplorerParser.parse(input_path, args.circexplorer3):
143+
tally.total += 1
107144
if not args.circexplorer3:
108145
if not record.is_valid(args.min_read_number):
146+
tally.skipped.total += 1
147+
tally.skipped.insufficient_evidence += 1
109148
continue
110149
elif not record.is_valid(args.min_read_number, args.min_fbr_circ, \
111150
args.min_circ_score):
151+
tally.skipped.total += 1
152+
tally.skipped.insufficient_evidence += 1
112153
continue
113154
try:
114155
circ_record = record.convert_to_circ_rna(anno, intron_start_range,
@@ -119,13 +160,17 @@ def parse_circexplorer(args:argparse.Namespace):
119160
" Skipping it from parsing.",
120161
record.name, record.isoform_name
121162
)
163+
tally.skipped.invalid_record += 1
164+
tally.skipped.total += 1
122165
continue
123166
except err.IntronNotFoundError:
124167
logger.warning(
125168
"The CIRCexplorer record %s from transcript %s contains an unknown"
126169
" intron. Skipping it from parsing.",
127170
record.name, record.isoform_name
128171
)
172+
tally.skipped.invalid_record += 1
173+
tally.skipped.total += 1
129174
continue
130175
except:
131176
logger.error('Exception raised from record: %s', record.name)
@@ -135,21 +180,20 @@ def parse_circexplorer(args:argparse.Namespace):
135180
circ_records[gene_id] = []
136181
circ_records[gene_id].append(circ_record)
137182

138-
if not circ_records:
139-
logger.warning('No variant record is saved.')
140-
return
183+
if circ_records:
184+
genes_rank = anno.get_genes_rank()
185+
ordered_keys = sorted(circ_records.keys(), key=lambda x:genes_rank[x])
141186

142-
genes_rank = anno.get_genes_rank()
143-
ordered_keys = sorted(circ_records.keys(), key=lambda x:genes_rank[x])
187+
records = []
188+
for key in ordered_keys:
189+
val = circ_records[key]
190+
records.extend(val)
144191

145-
records = []
146-
for key in ordered_keys:
147-
val = circ_records[key]
148-
records.extend(val)
192+
metadata = common.generate_metadata(args)
149193

150-
metadata = common.generate_metadata(args)
194+
with open(output_path, 'w') as handle:
195+
circ.io.write(records, metadata, handle)
151196

152-
with open(output_path, 'w') as handle:
153-
circ.io.write(records, metadata, handle)
197+
logger.info("CircRNA records written to disk.")
154198

155-
logger.info("CircRNA records written to disk.")
199+
tally.log()

moPepGen/cli/parse_reditools.py

+29-1
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,17 @@
44
[callVariant](call-variant.md)
55
"""
66
from __future__ import annotations
7+
from typing import TYPE_CHECKING
78
import argparse
89
from pathlib import Path
9-
from typing import Dict, List
1010
from moPepGen import get_logger, seqvar, parser
1111
from moPepGen.cli import common
1212

1313

14+
if TYPE_CHECKING:
15+
from typing import Dict, List
16+
from logging import Logger
17+
1418
INPUT_FILE_FORMATS = ['.tsv', '.txt']
1519
OUTPUT_FILE_FORMATS = ['.gvf']
1620

@@ -76,9 +80,26 @@ def add_subparser_parse_reditools(subparsers:argparse._SubParsersAction):
7680
common.print_help_if_missing_args(p)
7781
return p
7882

83+
class TallyTable():
    """ Counters describing how the REDItools records were handled.

    The counters are plain integer attributes that the caller increments; `log`
    reports their current values through the logger given to the constructor.
    """
    def __init__(self, logger:Logger):
        """ Keep the logger and start every counter from zero.

        Args:
            logger: Object whose `info` method receives the summary lines.
        """
        self.logger = logger
        # Records read, records that yielded at least one variant, and records
        # that yielded none.
        self.total:int = 0
        self.succeed:int = 0
        self.skipped:int = 0

    def log(self):
        """ Report the three counters through `logger.info`. """
        report = self.logger.info
        report("Summary:")
        lines = (
            ("Totally records read: %i", self.total),
            ("Records successfully processed: %i", self.succeed),
            ("Records skipped: %i", self.skipped),
        )
        for template, count in lines:
            report(template, count)
98+
7999
def parse_reditools(args:argparse.Namespace) -> None:
80100
""" Parse REDItools output and save it in the GVF format. """
81101
logger = get_logger()
102+
tally = TallyTable(logger)
82103
# unpack args
83104
table_file:Path = args.input_path
84105
output_path:Path = args.output_path
@@ -102,13 +123,18 @@ def parse_reditools(args:argparse.Namespace) -> None:
102123
variants:Dict[str, List[seqvar.VariantRecord]] = {}
103124

104125
for record in parser.REDItoolsParser.parse(table_file, transcript_id_column):
126+
tally.total += 1
105127
_vars = record.convert_to_variant_records(
106128
anno=anno,
107129
min_coverage_alt=min_coverage_alt,
108130
min_frequency_alt=min_frequency_alt,
109131
min_coverage_rna=min_coverage_rna,
110132
min_coverage_dna=min_coverage_dna
111133
)
134+
if not _vars:
135+
tally.skipped += 1
136+
else:
137+
tally.succeed += 1
112138
for variant in _vars:
113139
gene_id = variant.location.seqname
114140
if gene_id not in variants:
@@ -139,3 +165,5 @@ def parse_reditools(args:argparse.Namespace) -> None:
139165
seqvar.io.write(all_records, output_path, metadata)
140166

141167
logger.info('Variants written to disk.')
168+
169+
tally.log()

moPepGen/cli/parse_rmats.py

+30-1
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,18 @@
77
[callVariant](call-variant.md)
88
"""
99
from __future__ import annotations
10+
from typing import TYPE_CHECKING
1011
import argparse
11-
from typing import Dict, Set
1212
from pathlib import Path
1313
from moPepGen import get_logger, seqvar
1414
from moPepGen.parser import RMATSParser
1515
from moPepGen.cli import common
1616

1717

18+
if TYPE_CHECKING:
19+
from typing import Dict, Set
20+
from logging import Logger
21+
1822
INPUT_FILE_FORMATS = ['.tsv', '.txt']
1923
OUTPUT_FILE_FORMATS = ['.gvf']
2024

@@ -102,9 +106,27 @@ def add_subparser_parse_rmats(subparsers:argparse._SubParsersAction):
102106
common.print_help_if_missing_args(p)
103107
return p
104108

109+
class TallyTable():
    """ Counters describing how the rMATS records were handled.

    The counters are plain integer attributes that the caller increments; `log`
    reports their current values through the logger given to the constructor.
    """
    def __init__(self, logger:Logger):
        """ Keep the logger and start every counter from zero.

        Args:
            logger: Object whose `info` method receives the summary lines.
        """
        self.logger = logger
        # Records read, records that yielded at least one variant record, and
        # records that yielded none.
        self.total:int = 0
        self.succeed:int = 0
        self.skipped:int = 0

    def log(self):
        """ Report the three counters through `logger.info`. """
        sink = self.logger
        sink.info("Summary:")
        summary = [
            ("Totally records read: %i", self.total),
            ("Records successfully processed: %i", self.succeed),
            ("Records skipped: %i", self.skipped),
        ]
        for message, value in summary:
            sink.info(message, value)
124+
125+
105126
def parse_rmats(args:argparse.Namespace) -> None:
106127
""" Parse rMATS results into TSV """
107128
logger = get_logger()
129+
tally = TallyTable(logger)
108130

109131
skipped_exon = args.skipped_exon
110132
alternative_5 = args.alternative_5_splicing
@@ -136,6 +158,7 @@ def parse_rmats(args:argparse.Namespace) -> None:
136158
if path:
137159
logger.info("Start parsing %s file %s", event_type, path)
138160
for record in RMATSParser.parse(path, event_type):
161+
tally.total += 1
139162
try:
140163
var_records = record.convert_to_variant_records(
141164
anno=anno, genome=genome,
@@ -144,6 +167,10 @@ def parse_rmats(args:argparse.Namespace) -> None:
144167
except:
145168
logger.error(record.gene_id)
146169
raise
170+
if var_records:
171+
tally.succeed += 1
172+
else:
173+
tally.skipped += 1
147174
for var_record in var_records:
148175
tx_id = var_record.transcript_id
149176
if tx_id not in variants:
@@ -168,3 +195,5 @@ def parse_rmats(args:argparse.Namespace) -> None:
168195
seqvar.io.write(variants_sorted, output_path, metadata)
169196

170197
logger.info('Variants written to disk.')
198+
199+
tally.log()

test/integration/test_parse_arriba.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import argparse
33
import subprocess as sp
44
import sys
5-
from unittest.mock import Mock
5+
from unittest import mock
66
from test.integration import TestCaseIntegration
77
from moPepGen import cli
88

@@ -55,12 +55,12 @@ def test_parse_arriba_cli(self):
5555
print(res.stderr.decode('utf-8'))
5656
raise
5757

58+
@mock.patch(
59+
"moPepGen.parser.ArribaParser.ArribaRecord.convert_to_variant_records",
60+
new=mock.MagicMock(side_effect=ValueError())
61+
)
5862
def test_parse_arriba_skip_failed(self):
5963
""" Test parseArriba with skip failed """
60-
from moPepGen import parser
61-
parser.ArribaParser.ArribaRecord.convert_to_variant_records = Mock(
62-
side_effect=ValueError()
63-
)
6464
args = self.create_base_args()
6565
with self.assertRaises(ValueError):
6666
cli.parse_arriba(args)

test/integration/test_parse_fusion_catcher.py

+5-6
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from pathlib import Path
44
import subprocess as sp
55
import sys
6-
from unittest.mock import Mock
6+
from unittest import mock
77
from test.unit import load_references
88
from test.integration import TestCaseIntegration
99
from moPepGen import cli, parser
@@ -81,13 +81,12 @@ def test_parse_fusion_catcher(self):
8181
self.assertEqual(files, expected)
8282
self.assert_gvf_order(args.output_path, args.annotation_gtf)
8383

84+
@mock.patch(
85+
"moPepGen.parser.FusionCatcherParser.FusionCatcherRecord.convert_to_variant_records",
86+
new=mock.MagicMock(side_effect=ValueError())
87+
)
8488
def test_parse_fusion_catcher_skip_failed(self):
8589
""" Test parseFusionCatcher with --skip-failed """
86-
from moPepGen import parser
87-
parser.FusionCatcherParser.FusionCatcherRecord.convert_to_variant_records = Mock(
88-
side_effect=ValueError()
89-
)
90-
9190
args = self.create_base_args()
9291
args.input_path = self.data_dir/'fusion/fusion_catcher.txt'
9392
with self.assertRaises(ValueError):

test/integration/test_parse_star_fusion.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import argparse
33
import subprocess as sp
44
import sys
5-
from unittest.mock import Mock
5+
from unittest import mock
66
from test.integration import TestCaseIntegration
77
from moPepGen import cli, seqvar
88
from moPepGen.cli.common import load_references
@@ -79,12 +79,12 @@ def test_parse_star_fusion_case1(self):
7979
self.assertEqual(files, expected)
8080
self.assert_gvf_order(args.output_path, args.annotation_gtf)
8181

82+
@mock.patch(
83+
"moPepGen.parser.STARFusionParser.STARFusionRecord.convert_to_variant_records",
84+
new=mock.MagicMock(side_effect=ValueError())
85+
)
8286
def test_parse_star_fusion_skip_failed(self):
8387
""" test parseSTARFusion case1 """
84-
from moPepGen import parser
85-
parser.STARFusionParser.STARFusionRecord.convert_to_variant_records = Mock(
86-
side_effect=ValueError()
87-
)
8888
args = self.create_base_args()
8989
args.input_path = self.data_dir/'fusion/star_fusion.txt'
9090
with self.assertRaises(ValueError):

0 commit comments

Comments
 (0)