diff --git a/flye/polishing/alignment.py b/flye/polishing/alignment.py index c7ad4422c..0a5949cdd 100644 --- a/flye/polishing/alignment.py +++ b/flye/polishing/alignment.py @@ -241,7 +241,7 @@ def _run_minimap(reference_file, reads_files, num_proc, reads_type, out_file): elif reads_type in ["nano-raw", "nano-corrected"]: mode = "map-ont" elif reads_type == "nano-nano_hq": - mode = "map-ont" + mode = "lr:hq" extra_args = ["-k", "17"] cmdline = [MINIMAP_BIN, "'" + reference_file + "'"] diff --git a/lib/minimap2/.github/workflows/ci.yaml b/lib/minimap2/.github/workflows/ci.yaml new file mode 100644 index 000000000..628acea0c --- /dev/null +++ b/lib/minimap2/.github/workflows/ci.yaml @@ -0,0 +1,21 @@ +name: CI + +on: + push: + branches: + - master + pull_request: + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + compiler: [gcc, clang] + + steps: + - name: Checkout minimap2 + uses: actions/checkout@v2 + + - name: Compile with ${{ matrix.compiler }} + run: make CC=${{ matrix.compiler }} diff --git a/lib/minimap2/.gitignore b/lib/minimap2/.gitignore new file mode 100644 index 000000000..d5a6dd6b5 --- /dev/null +++ b/lib/minimap2/.gitignore @@ -0,0 +1,8 @@ +.cproject +.project +.*.swp +*.a +*.o +*.dSYM +minimap2 +mappy.c diff --git a/lib/minimap2/.gitmodules b/lib/minimap2/.gitmodules new file mode 100644 index 000000000..a80f848dc --- /dev/null +++ b/lib/minimap2/.gitmodules @@ -0,0 +1,3 @@ +[submodule "lib/simde"] + path = lib/simde + url = https://github.com/nemequ/simde.git diff --git a/lib/minimap2/LICENSE 2.txt b/lib/minimap2/LICENSE 2.txt new file mode 100644 index 000000000..1a06f6490 --- /dev/null +++ b/lib/minimap2/LICENSE 2.txt @@ -0,0 +1,24 @@ +The MIT License + +Copyright (c) 2018- Dana-Farber Cancer Institute + 2017-2018 Broad Institute, Inc. 
+ +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/lib/minimap2/Makefile b/lib/minimap2/Makefile index 90b1cc4c0..17b13b64b 100644 --- a/lib/minimap2/Makefile +++ b/lib/minimap2/Makefile @@ -30,12 +30,12 @@ endif ifneq ($(asan),) CFLAGS+=-fsanitize=address - LIBS+=-fsanitize=address + LIBS+=-fsanitize=address -ldl endif ifneq ($(tsan),) CFLAGS+=-fsanitize=thread - LIBS+=-fsanitize=thread + LIBS+=-fsanitize=thread -ldl endif .PHONY:all extra clean depend diff --git a/lib/minimap2/NEWS 2.md b/lib/minimap2/NEWS 2.md new file mode 100644 index 000000000..5de4ea629 --- /dev/null +++ b/lib/minimap2/NEWS 2.md @@ -0,0 +1,821 @@ +Release 2.24-r1122 (26 December 2021) +------------------------------------- + +This release improves alignment around long poorly aligned regions. Older +minimap2 may chain through such regions in rare cases which may result in +missing alignments later. The issue has become worse since the change of +the chaining algorithm in v2.19.
v2.23 implements an incomplete remedy. This +release provides a better solution with a X-drop-like heuristic and by enabling +two-bandwidth chaining in the assembly mode. + +(2.24: 26 December 2021, r1122) + + + +Release 2.23-r1111 (18 November 2021) +------------------------------------- + +Notable changes: + + * Bugfix: fixed missing alignments around long inversions (#806 and #816). + This bug affected v2.19 through v2.22. + + * Improvement: avoid extremely long mapping time for pathologic reads with + highly repeated k-mers not in the reference (#771). Use --q-occ-frac=0 + to disable the new heuristic. + + * Change: use --cap-kalloc=1g by default. + +(2.23: 18 November 2021, r1111) + + + +Release 2.22-r1101 (7 August 2021) +---------------------------------- + +When choosing the best alignment, this release uses logarithm gap penalty and +query-specific mismatch penalty. It improves the sensitivity to long INDELs in +repetitive regions. + +Other notable changes: + + * Bugfix: fixed an indirect memory leak that may waste a large amount of + memory given highly repetitive reference such as a 16S RNA database (#749). + All versions of minimap2 have this issue. + + * New feature: added --cap-kalloc to reduce the peak memory. This option is + not enabled by default but may become the default in future releases. + +Known issue: + + * Minimap2 may take a long time to map a read (#771). So far it is not clear + if this happens to v2.18 and earlier versions. + +(2.22: 7 August 2021, r1101) + + + +Release 2.21-r1071 (6 July 2021) +-------------------------------- + +This release fixed a regression in short-read mapping introduced in v2.19 +(#776). It also fixed invalid comparisons of uninitialized variables, though +these are harmless (#752). Long-read alignment should be identical to v2.20. 
+ +(2.21: 6 July 2021, r1071) + + + +Release 2.20-r1061 (27 May 2021) +-------------------------------- + +This release fixed a bug in the Python module and improves the command-line +compatibility with v2.18. In v2.19, if `-r` is specified with an `asm*` preset, +users would get alignments more fragmented than v2.18. This could be an issue +for existing pipelines specifying `-r`. This release resolves this issue. + +(2.20: 27 May 2021, r1061) + + + +Release 2.19-r1057 (26 May 2021) +-------------------------------- + +This release includes a few important improvements backported from unimap: + + * Improvement: more contiguous alignment through long INDELs. This is enabled + by the minigraph chaining algorithm. All `asm*` presets now use the new + algorithm. They can find INDELs up to 100kb and may be faster for + chromosome-long contigs. The default mode and `map*` presets use this + algorithm to replace the long-join heuristic. + + * Improvement: better alignment in highly repetitive regions by rescuing + high-occurrence seeds. If the distance between two adjacent seeds is too + large, attempt to choose a fraction of high-occurrence seeds in-between. + Minimap2 now produces fewer clippings and alignment break points in long + satellite regions. + + * Improvement: allow to specify an interval of k-mer occurrences with `-U`. + For repeat-rich genomes, the automatic k-mer occurrence threshold determined + by `-f` may be too large and makes alignment impractically slow. The new + option protects against such cases. Enabled for `asm*` and `map-hifi`. + + * New feature: added the `map-hifi` preset for mapping PacBio High-Fidelity + (HiFi) reads. + + * Change to the default: apply `--cap-sw-mem=100m` for genomic alignment. + + * Bugfix: minimap2 could not generate an index file with `-xsr` (#734). + +This release represents the most significant algorithmic change since v2.1 in +2017.
With features backported from unimap, minimap2 now has similar power to +unimap for contig alignment. Unimap will remain an experimental project and is +no longer recommended over minimap2. Sorry for reverting the recommendation in +short time. + +(2.19: 26 May 2021, r1057) + + + +Release 2.18-r1015 (9 April 2021) +--------------------------------- + +This release fixes multiple rare bugs in minimap2 and adds additional +functionality to paftools.js. + +Changes to minimap2: + + * Bugfix: a rare segfault caused by an off-by-one error (#489) + + * Bugfix: minimap2 segfaulted due to an uninitialized variable (#622 and #625). + + * Bugfix: minimap2 parsed spaces as field separators in BED (#721). This led + to issues when the BED name column contains spaces. + + * Bugfix: minimap2 `--split-prefix` did not work with long reference names + (#394). + + * Bugfix: option `--junc-bonus` didn't work (#513) + + * Bugfix: minimap2 didn't return 1 on I/O errors (#532) + + * Bugfix: the `de:f` tag (sequence divergence) could be negative if there were + ambiguous bases + + * Bugfix: fixed two undefined behaviors caused by calling memcpy() on + zero-length blocks (#443) + + * Bugfix: there were duplicated SAM @SQ lines if option `--split-prefix` is in + use (#400 and #527) + + * Bugfix: option -K had to be smaller than 2 billion (#491). This was caused + by a 32-bit integer overflow. + + * Improvement: optionally compile against SIMDe (#597). Minimap2 should work + with IBM POWER CPUs, though this has not been tested. To compile with SIMDe, + please use `make -f Makefile.simde`. + + * Improvement: more informative error message for I/O errors (#454) and for + FASTQ parsing errors (#510) + + * Improvement: abort given malformatted RG line (#541) + + * Improvement: better formula to estimate the `dv:f` tag (approximate sequence + divergence). See DOI:10.1101/2021.01.15.426881. + + * New feature: added the `--mask-len` option to fine control the removal of + redundant hits (#659).
The default behavior is unchanged. + +Changes to mappy: + + * Bugfix: mappy caused segmentation fault if the reference index is not + present (#413). + + * Bugfix: fixed a memory leak via 238b6bb3 + + * Change: always require Cython to compile the mappy module (#723). Older + mappy packages at PyPI bundled the C source code generated by Cython such + that end users did not need to install Cython to compile mappy. However, as + Python 3.9 is breaking backward compatibility, older mappy does not work + with Python 3.9 anymore. We have to add this Cython dependency as a + workaround. + +Changes to paftools.js: + + * Bugfix: the "part10-" line from asmgene was wrong (#581) + + * Improvement: compatibility with GTF files from GenBank (#422) + + * New feature: asmgene also checks missing multi-copy genes + + * New feature: added the misjoin command to evaluate large-scale misjoins and + megabase-long inversions. + +Although given the many bug fixes and minor improvements, the core algorithm +stays the same. This version of minimap2 produces nearly identical alignments +to v2.17 except very rare corner cases. + +Now unimap is recommended over minimap2 for aligning long contigs against a +reference genome. It often takes less wall-clock time and is much more +sensitive to long insertions and deletions. + +(2.18: 9 April 2021, r1015) + + + +Release 2.17-r941 (4 May 2019) +------------------------------ + +Changes since the last release: + + * Fixed flawed CIGARs like `5I6D7I` (#392). + + * Bugfix: TLEN should be 0 when either end is unmapped (#373 and #365). + + * Bugfix: mappy is unable to write index (#372). + + * Added option `--junc-bed` to load known gene annotations in the BED12 + format. Minimap2 prefers annotated junctions over novel junctions (#197 and + #348). GTF can be converted to BED12 with `paftools.js gff2bed`. + + * Added option `--sam-hit-only` to suppress unmapped hits in SAM (#377). + + * Added preset `splice:hq` for high-quality CCS or mRNA sequences. 
It applies + better scoring and improves the sensitivity to small exons. This preset may + introduce false small introns, but the overall accuracy should be higher. + +This version produces nearly identical alignments to v2.16, except for CIGARs +affected by the bug mentioned above. + +(2.17: 5 May 2019, r941) + + + +Release 2.16-r922 (28 February 2019) +------------------------------------ + +This release is 50% faster for mapping ultra-long nanopore reads at comparable +accuracy. For short-read mapping, long-read overlapping and ordinary long-read +mapping, the performance and accuracy remain similar. This speedup is achieved +with a new heuristic to limit the number of chaining iterations (#324). Users +can disable the heuristic by increasing a new option `--max-chain-iter` to a +huge number. + +Other changes to minimap2: + + * Implemented option `--paf-no-hit` to output unmapped query sequences in PAF. + The strand and reference name columns are both `*` at an unmapped line. The + hidden option is available in earlier minimap2 but had a different 2-column + output format instead of PAF. + + * Fixed a bug that leads to wrongly calculated `de` tags when ambiguous bases + are involved (#309). This bug only affects v2.15. + + * Fixed a bug when parsing command-line option `--splice` (#344). This bug was + introduced in v2.13. + + * Fixed two division-by-zero cases (#326). They don't affect final alignments + because the results of the divisions are not used in either case. + + * Added an option `-o` to output alignments to a specified file. It is still + recommended to use UNIX pipes for on-the-fly conversion or compression. + + * Output a new `rl` tag to give the length of query regions harboring + repetitive seeds. + +Changes to paftools.js: + + * Added a new option to convert the MD tag to the long form of the cs tag. + +Changes to mappy: + + * Added the `mappy.Aligner.seq_names` method to return sequence names (#312).
+ +For NA12878 ultra-long reads, this release changes the alignments of <0.1% of +reads in comparison to v2.15. All these reads have highly fragmented alignments +and are likely to be problematic anyway. For shorter or well aligned reads, +this release should produce mostly identical alignments to v2.15. + +(2.16: 28 February 2019, r922) + + + +Release 2.15-r905 (10 January 2019) +----------------------------------- + +Changes to minimap2: + + * Fixed a rare segmentation fault when option -H is in use (#307). This may + happen when there are very long homopolymers towards the 5'-end of a read. + + * Fixed wrong CIGARs when option --eqx is used (#266). + + * Fixed a typo in the base encoding table (#264). This should have no + practical effect. + + * Fixed a typo in the example code (#265). + + * Improved the C++ compatibility by removing "register" (#261). However, + minimap2 still can't be compiled in the pedantic C++ mode (#306). + + * Output a new "de" tag for gap-compressed sequence divergence. + +Changes to paftools.js: + + * Added "asmgene" to evaluate the completeness of an assembly by measuring the + uniquely mapped single-copy genes. This command learns the idea of BUSCO. + + * Added "vcfpair" to call a phased VCF from phased whole-genome assemblies. An + earlier version of this script is used to produce the ground truth for the + syndip benchmark [PMID:30013044]. + +This release produces identical alignment coordinates and CIGARs in comparison +to v2.14. Users are advised to upgrade due to the several bug fixes. + +(2.15: 10 January 2019, r905) + + + +Release 2.14-r883 (5 November 2018) +----------------------------------- + +Notable changes: + + * Fixed two minor bugs caused by typos (#254 and #266). + + * Fixed a bug that made minimap2 abort when --eqx was used together with --MD + or --cs (#257). + + * Added --cap-sw-mem to cap the size of DP matrices (#259). Base alignment may + take a lot of memory in the splicing mode.
This may lead to issues when we + run minimap2 on a cluster with a hard memory limit. The new option avoids + unlimited memory usage at the cost of missing a few long introns. + + * Conforming to C99 and C11 when possible (#261). + + * Warn about malformatted FASTA or FASTQ (#252 and #255). + +This release occasionally produces base alignments different from v2.13. The +overall alignment accuracy remains similar. + +(2.14: 5 November 2018, r883) + + + +Release 2.13-r850 (11 October 2018) +----------------------------------- + +Changes to minimap2: + + * Fixed wrongly formatted SAM when -L is in use (#231 and #233). + + * Fixed an integer overflow in rare cases. + + * Added --hard-mask-level to fine control split alignments (#244). + + * Made --MD work with spliced alignment (#139). + + * Replaced musl's getopt with ketopt for portability. + + * Log peak memory usage on exit. + +This release should produce alignments identical to v2.12 and v2.11. + +(2.13: 11 October 2018, r850) + + + +Release 2.12-r827 (6 August 2018) +--------------------------------- + +Changes to minimap2: + + * Added option --split-prefix to write proper alignments (correct mapping + quality and clustered query sequences) given a multi-part index (#141 and + #189; mostly by @hasindu2008). + + * Fixed a memory leak when option -y is in use. + +Changes to mappy: + + * Support the MD/cs tag (#183 and #203). + + * Allow mappy to index a single sequence, to add extra flags and to change the + scoring system. + +Minimap2 should produce alignments identical to v2.11. + +(2.12: 6 August 2018, r827) + + + +Release 2.11-r797 (20 June 2018) +-------------------------------- + +Changes to minimap2: + + * Improved alignment accuracy in low-complexity regions for SV calling. Thank + @armintoepfer for multiple offline examples. + + * Added option --eqx to encode sequence match/mismatch with the =/X CIGAR + operators (#156, #157 and #175).
+ + * When compiled with VC++, minimap2 generated wrong alignments due to a + comparison between a signed integer and an unsigned integer (#184). Also + fixed warnings reported by "clang -Wextra". + + * Fixed incorrect anchor filtering due to a missing 64- to 32-bit cast. + + * Fixed incorrect mapping quality for inversions (#148). + + * Fixed incorrect alignment involving ambiguous bases (#155). + + * Fixed incorrect presets: option `-r 2000` is intended to be used with + ava-ont, not ava-pb. The bug was introduced in 2.10. + + * Fixed a bug when --for-only/--rev-only is used together with --sr or + --heap-sort=yes (#166). + + * Fixed option -Y that was not working in the previous releases. + + * Added option --lj-min-ratio to fine control the alignment of long gaps + found by the "long-join" heuristic (#128). + + * Exposed `mm_idx_is_idx`, `mm_idx_load` and `mm_idx_dump` C APIs (#177). + Also fixed a bug when indexing without reference names (this feature is not + exposed to the command line). + +Changes to mappy: + + * Added `__version__` (#165). + + * Exposed the maximum fragment length parameter to mappy (#174). + +Changes to paftools: + + * Don't crash when there is no "cg" tag (#153). + + * Fixed wrong coverage report by "paftools.js call" (#145). + +This version may produce slightly different base-level alignment. The overall +alignment statistics should remain similar. + +(2.11: 20 June 2018, r797) + + + +Release 2.10-r761 (27 March 2018) +--------------------------------- + +Changes to minimap2: + + * Optionally output the MD tag for compatibility with existing tools (#63, + #118 and #137). + + * Use SSE compiler flags more precisely to prevent compiling errors on certain + machines (#127). + + * Added option --min-occ-floor to set a minimum occurrence threshold. Presets + intended for assembly-to-reference alignment set this option to 100. This + option alleviates issues with regions having high copy numbers (#107). 
+ + * Exit with non-zero code on file writing errors (e.g. disk full; #103 and + #132). + + * Added option -y to copy FASTA/FASTQ comments in query sequences to the + output (#136). + + * Added the asm20 preset for alignments between genomes at 5-10% sequence + divergence. + + * Changed the band-width in the ava-ont preset from 500 to 2000. Oxford + Nanopore reads may contain long deletion sequencing errors that break + chaining. + +Changes to mappy, the Python binding: + + * Fixed a typo in Align.seq() (#126). + +Changes to paftools.js, the companion script: + + * Command sam2paf now converts the MD tag to cs. + + * Support VCF output for assembly-to-reference variant calling (#109). + +This version should produce identical alignment for read overlapping, RNA-seq +read mapping, and genomic read mapping. We have also added a cook book to show +the various uses of minimap2 on real datasets. Please see cookbook.md in the +minimap2 source code directory. + +(2.10: 27 March 2018, r761) + + + +Release 2.9-r720 (23 February 2018) +----------------------------------- + +This release fixed multiple minor bugs. + +* Fixed two bugs that lead to incorrect inversion alignment. Also improved the + sensitivity to small inversions by using double Z-drop cutoff (#112). + +* Fixed an issue that may cause the end of a query sequence unmapped (#104). + +* Added a mappy API to retrieve sequences from the index (#126) and to reverse + complement DNA sequences. Fixed a bug where the `best_n` parameter did not + work (#117). + +* Avoided segmentation fault given incorrect FASTQ input (#111). + +* Combined all auxiliary javascripts to paftools.js. Fixed several bugs in + these scripts at the same time. + +(2.9: 24 February 2018, r720) + + + +Release 2.8-r672 (1 February 2018) +---------------------------------- + +Notable changes in this release include: + + * Speed up short-read alignment by ~10%.
The overall mapping accuracy stays + the same, but the output alignments are not always identical to v2.7 due to + unstable sorting employed during chaining. Long-read alignment is not + affected by this change as the speedup is short-read specific. + + * Mappy now supports paired-end short-read alignment (#87). Please see + python/README.rst for details. + + * Added option --for-only and --rev-only to perform alignment against the + forward or the reverse strand of the reference genome only (#91). + + * Alleviated the issue with undesired diagonal alignment in the self mapping + mode (#10). Even if the output is not ideal, it should not interfere with + other alignments. Fully resolving the issue is intricate and may require + additional heuristic thresholds. + + * Enhanced error checking against incorrect input (#92 and #96). + +For long query sequences, minimap2 should output identical alignments to v2.7. + +(2.8: 1 February 2018, r672) + + + +Release 2.7-r654 (9 January 2018) +--------------------------------- + +This release fixed a bug in the splice mode and added a few minor features: + + * Fixed a bug that occasionally takes an intron as a long deletion in the + splice mode. This was caused by wrong backtracking at the last CIGAR + operator. The current fix eliminates the error, but it is not optimal in + that it often produces a wrong junction when the last operator is an intron. + A future version of minimap2 may improve upon this. + + * Support high-end ARM CPUs that implement the NEON instruction set (#81). + This enables minimap2 to work on Raspberry Pi 3 and Odroid XU4. + + * Added a C API to construct a minimizer index from a set of C strings (#80). + + * Check scoring specified on the command line (#79). Due to the 8-bit limit, + excessively large score penalties fail minimap2. + +For genomic sequences, minimap2 should give identical alignments to v2.6. 
+ +(2.7: 9 January 2018, r654) + + + +Release 2.6-r623 (12 December 2017) +----------------------------------- + +This release adds several features and fixes two minor bugs: + + * Optionally build an index without sequences. This helps to reduce the + peak memory for read overlapping and is automatically applied when + base-level alignment is not requested. + + * Approximately estimate per-base sequence divergence (i.e. 1-identity) + without performing base-level alignment, using a MashMap-like method. The + estimate is written to a new dv:f tag. + + * Reduced the number of tiny terminal exons in RNA-seq alignment. The current + setting is conservative. Increase --end-seed-pen to drop more such exons. + + * Reduced the peak memory when aligning long query sequences. + + * Fixed a bug that is caused by HPC minimizers longer than 256bp. This should + have no effect in practice, but it is recommended to rebuild HPC indices if + possible. + + * Fixed a bug when identifying identical hits (#71). This should only affect + artifactual reference consisting of near identical sequences. + +For genomic sequences, minimap2 should give nearly identical alignments to +v2.5, except the new dv:f tag. + +(2.6: 12 December 2017, r623) + + + +Release 2.5-r572 (11 November 2017) +----------------------------------- + +This release fixes several bugs and brings a couple of minor improvements: + + * Fixed a severe bug that leads to incorrect mapping coordinates in rare + corner cases. + + * Fixed underestimated mapping quality for chimeric alignments when the whole + query sequence contain many repetitive minimizers, and for chimeric + alignments caused by Z-drop. + + * Fixed two bugs in Python binding: incorrect strand field (#57) and incorrect + sequence names for Python3 (#55). + + * Improved mapping accuracy for highly overlapping paired ends. + + * Added option -Y to use soft clipping for supplementary alignments (#56). 
+ +(2.5: 11 November 2017, r572) + + + +Release 2.4-r555 (6 November 2017) +---------------------------------- + +As is planned, this release focuses on fine tuning the base algorithm. Notable +changes include + + * Changed the mapping quality scale to match the scale of BWA-MEM. This makes + minimap2 and BWA-MEM achieve similar sensitivity-specificity balance on real + short-read data. + + * Improved the accuracy of splice alignment by modeling one additional base + close to the GT-AG signal. This model is used by default with `-x splice`. + For SIRV control data, however, it is recommended to add `--splice-flank=no` + to disable this feature as the SIRV splice signals are slightly different. + + * Tuned the parameters for Nanopore Direct RNA reads. The recommended command + line is `-axsplice -k14 -uf` (#46). + + * Fixed a segmentation fault when aligning PacBio reads (#47 and #48). This + bug is very rare but it affects all versions of minimap2. It is also + recommended to re-index reference genomes created with `map-pb`. For human, + two minimizers in an old index are wrong. + + * Changed option `-L` in sync with the final decision of hts-specs: a fake + CIGAR takes the form of `SN`. Note that `-L` only enables + future tools to recognize long CIGARs. It is not possible for older tools to + work with such alignments in BAM (#43 and #51). + + * Fixed a tiny issue whereby minimap2 may waste 8 bytes per candidate + alignment. + +The minimap2 technical note hosted at arXiv has also been updated to reflect +recent changes. + +(2.4: 6 November 2017, r555) + + + +Release 2.3-r531 (22 October 2017) +---------------------------------- + +This release comes with many improvements and bug fixes: + + * The **sr** preset now supports paired-end short-read alignment. Minimap2 is + 3-4 times as fast as BWA-MEM, but is slightly less accurate on simulated + reads.
+ + * Meticulous improvements to assembly-to-assembly alignment (special thanks to + Alexey Gurevich from the QUAST team): a) apply a small penalty to matches + between ambiguous bases; b) reduce missing alignments due to spurious + overlaps; c) introduce the short form of the `cs` tag, an improvement to the + SAM MD tag. + + * Make sure gaps are always left-aligned. + + * Recognize `U` bases from Oxford Nanopore Direct RNA-seq (#33). + + * Fixed slightly wrong chaining score. Fixed slightly inaccurate coordinates + for split alignment. + + * Fixed multiple reported bugs: 1) wrong reference name for inversion + alignment (#30); 2) redundant SQ lines when multiple query files are + specified (#39); 3) non-functioning option `-K` (#36). + +This release has implemented all the major features I planned five months ago, +with the addition of spliced long-read alignment. The next couple of releases +will focus on fine tuning of the base algorithms. + +(2.3: 22 October 2017, r531) + + + +Release 2.2-r409 (17 September 2017) +------------------------------------ + +This is a feature release. It improves single-end short-read alignment and +comes with Python bindings. Detailed changes include: + + * Added the **sr** preset for single-end short-read alignment. In this mode, + minimap2 runs faster than BWA-MEM, but is slightly less accurate on + simulated data sets. Paired-end alignment is not supported as of now. + + * Improved mapping quality estimate with more accurate identification of + repetitive hits. This mainly helps short-read alignment. + + * Implemented **mappy**, a Python binding for minimap2, which is available + from PyPI and can be installed with `pip install --user mappy`. Python users + can perform read alignment without the minimap2 executable. + + * Restructured the indexing APIs and documented key minimap2 APIs in the + header file minimap.h. Updated example.c with the new APIs. Old APIs still + work but may become deprecated in future. 
+ +This release may output alignments different from the previous version, though +the overall alignment statistics, such as the number of aligned bases and long +gaps, remain close. + +(2.2: 17 September 2017, r409) + + + +Release 2.1.1-r341 (6 September 2017) +------------------------------------- + +This is a maintenance release that is expected to output identical alignment to +v2.1. Detailed changes include: + + * Support CPU dispatch. By default, minimap2 is compiled with both SSE2 and + SSE4 based implementation of alignment and automatically chooses the right + one at runtime. This avoids unexpected errors on older CPUs (#21). + + * Improved Windows support as is requested by Oxford Nanopore (#19). Minimap2 + now avoids variable-length stacked arrays, eliminates alloca(), ships with + getopt_long() and provides timing functions implemented with Windows APIs. + + * Fixed a potential segmentation fault when specifying -k/-w/-H with + multi-part index (#23). + + * Fixed two memory leaks in example.c + +(2.1.1: 6 September 2017, r341) + + + +Release 2.1-r311 (25 August 2017) +--------------------------------- + +This release adds spliced alignment for long noisy RNA-seq reads. On a SMRT +Iso-Seq and a Oxford Nanopore data sets, minimap2 appears to outperform +traditional mRNA aligners. For DNA alignment, this release gives almost +identical output to v2.0. Other changes include: + + * Added option `-R` to set the read group header line in SAM. + + * Optionally output the `cs:Z` tag in PAF to encode both the query and the + reference sequences in the alignment. + + * Fixed an issue where DP alignment uses excessive memory. + +The minimap2 technical report has been updated with more details and the +evaluation of spliced alignment: + + * Li, H. (2017). Minimap2: fast pairwise alignment for long nucleotide + sequences. [arXiv:1708.01492v2](https://arxiv.org/abs/1708.01492v2). 
+ +(2.1: 25 August 2017, r311) + + + +Release 2.0-r275 (8 August 2017) +-------------------------------- + +This release is identical to version 2.0rc1, except the version number. It is +described and evaluated in the following technical report: + + * Li, H. (2017). Minimap2: fast pairwise alignment for long DNA sequences. + [arXiv:1708.01492v1](https://arxiv.org/abs/1708.01492v1). + +(2.0: 8 August 2017, r275) + + + +Release 2.0rc1-r232 (30 July 2017) +---------------------------------- + +This release improves the accuracy of long-read alignment and added several +minor features. + + * Improved mapping quality estimate for short alignments containing few seed + hits. + + * Fixed a minor bug that affects the chaining accuracy towards the ends of a + chain. Changed the gap cost for chaining to reduce false seeding. + + * Skip potentially wrong seeding and apply dynamic programming more frequently. + This slightly increases run time, but greatly reduces false long gaps. + + * Perform local alignment at Z-drop break point to recover potential inversion + alignment. Output the SA tag in the SAM format. Added scripts to evaluate + mapping accuracy for reads simulated with pbsim. + +This release completes features intended for v2.0. No major features will be +added to the master branch before the final v2.0. + +(2.0rc1: 30 July 2017, r232) + + + +Release r191 (19 July 2017) +--------------------------- + +This is the first public release of minimap2, an aligner for long reads and +assemblies. This release has a few issues and is generally not recommended for +production uses. + +(19 July 2017, r191) diff --git a/lib/minimap2/NEWS.md b/lib/minimap2/NEWS.md index 5de4ea629..d72e8ba15 100644 --- a/lib/minimap2/NEWS.md +++ b/lib/minimap2/NEWS.md @@ -1,3 +1,88 @@ +Release 2.27-r1193 (12 March 2024) +---------------------------------- + +Notable changes to minimap2: + + * New feature: added the `lr:hq` preset for accurate long reads at ~1% error + rate. 
This was suggested by Oxford Nanopore developers (#1127). It is not + clear if this preset also works well for PacBio HiFi reads. + + * New feature: added the `map-iclr` preset for Illumina Complete Long Reads + (#1069), provided by Illumina developers. + + * New feature: added option `-b` to specify mismatch penalty for base + transitions (i.e. A-to-G or C-to-T changes). + + * New feature: added option `--ds` to generate a new `ds:Z` tag that + indicates uncertainty in INDEL positions. It is an extension to `cs`. The + `mgutils-es6.js` script in minigraph parses `ds`. + + * Bugfix: avoided a NULL pointer dereference (#1154). This would not have an + effect on most systems but would still be good to fix. + + * Bugfix: reverted the value of `ms:i` to pre-2.22 versions (#1146). This was + an oversight. See fcd4df2 for details. + +Notable changes to paftools.js and mappy: + + * New feature: expose `bw_long` to mappy's Aligner class (#1124). + + * Bugfix: fixed several compatibility issues with k8 v1.0 (#1161 and #1166). + Subcommands "call", "pbsim2fq" and "mason2fq" were not working with v1.0. + +Minimap2 should output identical alignments to v2.26, except the ms tag. + +(2.27: 12 March 2024, r1193) + + + +Release 2.26-r1175 (29 April 2023) +---------------------------------- + +Fixed the broken Python package. This is the only change. + +(2.26: 29 April 2023, r1175) + + + +Release 2.25-r1173 (25 April 2023) +---------------------------------- + +Notable changes: + + * Improvement: use the miniprot splice model for RNA-seq alignment by default. + This model considers non-GT-AG splice sites and leads to slightly higher + (<0.1%) accuracy and sensitivity on real human data. + + * Change: increased the default `-I` to `8G` such that minimap2 would create a + uni-part index for a pair of mammalian genomes. This change may increase the + memory for all-vs-all read overlap alignment given large datasets. 
+ + * New feature: output the sequences in secondary alignments with option + `--secondary-seq` (#687). + + * Bugfix: --rmq was not parsed correctly (#1010) + + * Bugfix: possibly incorrect coordinate when applying end bonus to the target + sequence (#1025). This is a ksw2 bug. It does not affect minimap2 as + minimap2 is not using the affected feature. + + * Improvement: incorporated several changes for better compatibility with + Windows (#1051) and for minimap2 integration at Oxford Nanopore Technologies + (#1048 and #1033). + + * Improvement: output the HD-line in SAM output (#1019). + + * Improvement: check minimap2 index file in mappy to prevent segmentation + fault for certain indices (#1008). + +For genomic sequences, minimap2 should give identical output to v2.24. +Long-read RNA-seq alignment may occasionally differ from previous versions. + +(2.25: 25 April 2023, r1173) + + + Release 2.24-r1122 (26 December 2021) ------------------------------------- diff --git a/lib/minimap2/README.md b/lib/minimap2/README.md index 8ed1ea6ba..056dd5d72 100644 --- a/lib/minimap2/README.md +++ b/lib/minimap2/README.md @@ -15,7 +15,7 @@ cd minimap2 && make ./minimap2 -ax map-pb ref.fa pacbio.fq.gz > aln.sam # PacBio CLR genomic reads ./minimap2 -ax map-ont ref.fa ont.fq.gz > aln.sam # Oxford Nanopore genomic reads ./minimap2 -ax map-hifi ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.19 or later) -./minimap2 -ax asm20 ref.fa pacbio-ccs.fq.gz > aln.sam # PacBio HiFi/CCS genomic reads (v2.18 or earlier) +./minimap2 -ax lr:hq ref.fa ont-Q20.fq.gz > aln.sam # Nanopore Q20 genomic reads (v2.27 or later) ./minimap2 -ax sr ref.fa read1.fa read2.fa > aln.sam # short genomic paired-end reads ./minimap2 -ax splice ref.fa rna-reads.fa > aln.sam # spliced long reads (strand unknown) ./minimap2 -ax splice -uf -k14 ref.fa reads.fa > aln.sam # noisy Nanopore Direct RNA-seq @@ -74,8 +74,8 @@ Detailed evaluations are available from the [minimap2 paper][doi] or the 
Minimap2 is optimized for x86-64 CPUs. You can acquire precompiled binaries from the [release page][release] with: ```sh -curl -L https://github.com/lh3/minimap2/releases/download/v2.24/minimap2-2.24_x64-linux.tar.bz2 | tar -jxvf - -./minimap2-2.24_x64-linux/minimap2 +curl -L https://github.com/lh3/minimap2/releases/download/v2.27/minimap2-2.27_x64-linux.tar.bz2 | tar -jxvf - +./minimap2-2.27_x64-linux/minimap2 ``` If you want to compile from the source, you need to have a C compiler, GNU make and zlib development files installed. Then type `make` in the source code @@ -139,12 +139,15 @@ parameters at the same time. The default setting is the same as `map-ont`. ```sh minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio CLR reads minimap2 -ax map-ont ref.fa ont-reads.fq > aln.sam # for Oxford Nanopore reads +minimap2 -ax map-iclr ref.fa iclr-reads.fq > aln.sam # for Illumina Complete Long Reads ``` The difference between `map-pb` and `map-ont` is that `map-pb` uses homopolymer-compressed (HPC) minimizers as seeds, while `map-ont` uses ordinary -minimizers as seeds. Emperical evaluation suggests HPC minimizers improve +minimizers as seeds. Empirical evaluation suggests HPC minimizers improve performance and sensitivity when aligning PacBio CLR reads, but hurt when aligning -Nanopore reads. +Nanopore reads. `map-iclr` uses an adjusted alignment scoring matrix that +accounts for the low overall error rate in the reads, with transversion errors +being less frequent than transitions. #### Map long mRNA/cDNA reads diff --git a/lib/minimap2/align 2.c b/lib/minimap2/align 2.c new file mode 100644 index 000000000..ddbb0bd3f --- /dev/null +++ b/lib/minimap2/align 2.c @@ -0,0 +1,1020 @@ +#include +#include +#include +#include +#include "minimap.h" +#include "mmpriv.h" +#include "ksw2.h" + +static void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t sc_ambi) +{ + int i, j; + a = a < 0? -a : a; + b = b > 0? -b : b; + sc_ambi = sc_ambi > 0? 
-sc_ambi : sc_ambi; + for (i = 0; i < m - 1; ++i) { + for (j = 0; j < m - 1; ++j) + mat[i * m + j] = i == j? a : b; + mat[i * m + m - 1] = sc_ambi; + } + for (j = 0; j < m; ++j) + mat[(m - 1) * m + j] = sc_ambi; +} + +static inline void mm_seq_rev(uint32_t len, uint8_t *seq) +{ + uint32_t i; + uint8_t t; + for (i = 0; i < len>>1; ++i) + t = seq[i], seq[i] = seq[len - 1 - i], seq[len - 1 - i] = t; +} + +static inline void update_max_zdrop(int32_t score, int i, int j, int32_t *max, int *max_i, int *max_j, int e, int *max_zdrop, int pos[2][2]) +{ + if (score < *max) { + int li = i - *max_i; + int lj = j - *max_j; + int diff = li > lj? li - lj : lj - li; + int z = *max - score - diff * e; + if (z > *max_zdrop) { + *max_zdrop = z; + pos[0][0] = *max_i, pos[0][1] = i; + pos[1][0] = *max_j, pos[1][1] = j; + } + } else *max = score, *max_i = i, *max_j = j; +} + +static int mm_test_zdrop(void *km, const mm_mapopt_t *opt, const uint8_t *qseq, const uint8_t *tseq, uint32_t n_cigar, uint32_t *cigar, const int8_t *mat) +{ + uint32_t k; + int32_t score = 0, max = INT32_MIN, max_i = -1, max_j = -1, i = 0, j = 0, max_zdrop = 0; + int pos[2][2] = {{-1, -1}, {-1, -1}}, q_len, t_len; + + // find the score and the region where score drops most along diagonal + for (k = 0, score = 0; k < n_cigar; ++k) { + uint32_t l, op = cigar[k]&0xf, len = cigar[k]>>4; + if (op == MM_CIGAR_MATCH) { + for (l = 0; l < len; ++l) { + score += mat[tseq[i + l] * 5 + qseq[j + l]]; + update_max_zdrop(score, i+l, j+l, &max, &max_i, &max_j, opt->e, &max_zdrop, pos); + } + i += len, j += len; + } else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL || op == MM_CIGAR_N_SKIP) { + score -= opt->q + opt->e * len; + if (op == MM_CIGAR_INS) j += len; + else i += len; + update_max_zdrop(score, i, j, &max, &max_i, &max_j, opt->e, &max_zdrop, pos); + } + } + + // test if there is an inversion in the most dropped region + q_len = pos[1][1] - pos[1][0], t_len = pos[0][1] - pos[0][0]; + if 
(!(opt->flag&(MM_F_SPLICE|MM_F_SR|MM_F_FOR_ONLY|MM_F_REV_ONLY)) && max_zdrop > opt->zdrop_inv && q_len < opt->max_gap && t_len < opt->max_gap) { + uint8_t *qseq2; + void *qp; + int q_off, t_off; + qseq2 = (uint8_t*)kmalloc(km, q_len); + for (i = 0; i < q_len; ++i) { + int c = qseq[pos[1][1] - i - 1]; + qseq2[i] = c >= 4? 4 : 3 - c; + } + qp = ksw_ll_qinit(km, 2, q_len, qseq2, 5, mat); + score = ksw_ll_i16(qp, t_len, tseq + pos[0][0], opt->q, opt->e, &q_off, &t_off); + kfree(km, qseq2); + kfree(km, qp); + if (score >= opt->min_chain_score * opt->a && score >= opt->min_dp_max) + return 2; // there is a potential inversion + } + return max_zdrop > opt->zdrop? 1 : 0; +} + +static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, int *qshift, int *tshift) +{ + mm_extra_t *p = r->p; + int32_t toff = 0, qoff = 0, to_shrink = 0; + uint32_t k; + *qshift = *tshift = 0; + if (p->n_cigar <= 1) return; + for (k = 0; k < p->n_cigar; ++k) { // indel left alignment + uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4; + if (len == 0) to_shrink = 1; + if (op == MM_CIGAR_MATCH) { + toff += len, qoff += len; + } else if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) { + if (k > 0 && k < p->n_cigar - 1 && (p->cigar[k-1]&0xf) == 0 && (p->cigar[k+1]&0xf) == 0) { + int l, prev_len = p->cigar[k-1] >> 4; + if (op == MM_CIGAR_INS) { + for (l = 0; l < prev_len; ++l) + if (qseq[qoff - 1 - l] != qseq[qoff + len - 1 - l]) + break; + } else { + for (l = 0; l < prev_len; ++l) + if (tseq[toff - 1 - l] != tseq[toff + len - 1 - l]) + break; + } + if (l > 0) + p->cigar[k-1] -= l<<4, p->cigar[k+1] += l<<4, qoff -= l, toff -= l; + if (l == prev_len) to_shrink = 1; + } + if (op == MM_CIGAR_INS) qoff += len; + else toff += len; + } else if (op == MM_CIGAR_N_SKIP) { + toff += len; + } + } + assert(qoff == r->qe - r->qs && toff == r->re - r->rs); + for (k = 0; k < p->n_cigar - 2; ++k) { // fix CIGAR like 5I6D7I + if ((p->cigar[k]&0xf) > 0 && (p->cigar[k]&0xf) + (p->cigar[k+1]&0xf) == 
3) { + uint32_t l, s[3] = {0,0,0}; + for (l = k; l < p->n_cigar; ++l) { // count number of adjacent I and D + uint32_t op = p->cigar[l]&0xf; + if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL || p->cigar[l]>>4 == 0) + s[op] += p->cigar[l] >> 4; + else break; + } + if (s[1] > 0 && s[2] > 0 && l - k > 2) { // turn to a single I and a single D + p->cigar[k] = s[1]<<4|MM_CIGAR_INS; + p->cigar[k+1] = s[2]<<4|MM_CIGAR_DEL; + for (k += 2; k < l; ++k) + p->cigar[k] &= 0xf; + to_shrink = 1; + } + k = l; + } + } + if (to_shrink) { // squeeze out zero-length operations + int32_t l = 0; + for (k = 0; k < p->n_cigar; ++k) // squeeze out zero-length operations + if (p->cigar[k]>>4 != 0) + p->cigar[l++] = p->cigar[k]; + p->n_cigar = l; + for (k = l = 0; k < p->n_cigar; ++k) // merge two adjacent operations if they are the same + if (k == p->n_cigar - 1 || (p->cigar[k]&0xf) != (p->cigar[k+1]&0xf)) + p->cigar[l++] = p->cigar[k]; + else p->cigar[k+1] += p->cigar[k]>>4<<4; // add length to the next CIGAR operator + p->n_cigar = l; + } + if ((p->cigar[0]&0xf) == MM_CIGAR_INS || (p->cigar[0]&0xf) == MM_CIGAR_DEL) { // get rid of leading I or D + int32_t l = p->cigar[0] >> 4; + if ((p->cigar[0]&0xf) == MM_CIGAR_INS) { + if (r->rev) r->qe -= l; + else r->qs += l; + *qshift = l; + } else r->rs += l, *tshift = l; + --p->n_cigar; + memmove(p->cigar, p->cigar + 1, p->n_cigar * 4); + } +} + +static void mm_update_cigar_eqx(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq) // written by @armintoepfer +{ + uint32_t n_EQX = 0; + uint32_t k, l, m, cap, toff = 0, qoff = 0, n_M = 0; + mm_extra_t *p; + if (r->p == 0) return; + for (k = 0; k < r->p->n_cigar; ++k) { + uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4; + if (op == MM_CIGAR_MATCH) { + while (len > 0) { + for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {} // run of "="; TODO: N<=>N is converted to "=" + if (l > 0) { ++n_EQX; len -= l; toff += l; qoff += l; } + + for (l = 0; l < len && qseq[qoff + l] != tseq[toff + 
l]; ++l) {} // run of "X" + if (l > 0) { ++n_EQX; len -= l; toff += l; qoff += l; } + } + ++n_M; + } else if (op == MM_CIGAR_INS) { + qoff += len; + } else if (op == MM_CIGAR_DEL) { + toff += len; + } else if (op == MM_CIGAR_N_SKIP) { + toff += len; + } + } + // update in-place if we can + if (n_EQX == n_M) { + for (k = 0; k < r->p->n_cigar; ++k) { + uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4; + if (op == MM_CIGAR_MATCH) r->p->cigar[k] = len << 4 | MM_CIGAR_EQ_MATCH; + } + return; + } + // allocate new storage + cap = r->p->n_cigar + (n_EQX - n_M) + sizeof(mm_extra_t); + kroundup32(cap); + p = (mm_extra_t*)calloc(cap, 4); + memcpy(p, r->p, sizeof(mm_extra_t)); + p->capacity = cap; + // update cigar while copying + toff = qoff = m = 0; + for (k = 0; k < r->p->n_cigar; ++k) { + uint32_t op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4; + if (op == MM_CIGAR_MATCH) { + while (len > 0) { + // match + for (l = 0; l < len && qseq[qoff + l] == tseq[toff + l]; ++l) {} + if (l > 0) p->cigar[m++] = l << 4 | MM_CIGAR_EQ_MATCH; + len -= l; + toff += l, qoff += l; + // mismatch + for (l = 0; l < len && qseq[qoff + l] != tseq[toff + l]; ++l) {} + if (l > 0) p->cigar[m++] = l << 4 | MM_CIGAR_X_MISMATCH; + len -= l; + toff += l, qoff += l; + } + continue; + } else if (op == MM_CIGAR_INS) { + qoff += len; + } else if (op == MM_CIGAR_DEL) { + toff += len; + } else if (op == MM_CIGAR_N_SKIP) { + toff += len; + } + p->cigar[m++] = r->p->cigar[k]; + } + p->n_cigar = m; + free(r->p); + r->p = p; +} + +static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int is_eqx, int log_gap) +{ + uint32_t k, l; + int32_t qshift, tshift, toff = 0, qoff = 0; + double s = 0.0, max = 0.0; + mm_extra_t *p = r->p; + if (p == 0) return; + mm_fix_cigar(r, qseq, tseq, &qshift, &tshift); + qseq += qshift, tseq += tshift; // qseq and tseq may be shifted due to the removal of leading I/D + r->blen = r->mlen = 0; + for (k = 0; k < 
p->n_cigar; ++k) { + uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4; + if (op == MM_CIGAR_MATCH) { + int n_ambi = 0, n_diff = 0; + for (l = 0; l < len; ++l) { + int cq = qseq[qoff + l], ct = tseq[toff + l]; + if (ct > 3 || cq > 3) ++n_ambi; + else if (ct != cq) ++n_diff; + s += mat[ct * 5 + cq]; + if (s < 0) s = 0; + else max = max > s? max : s; + } + r->blen += len - n_ambi, r->mlen += len - (n_ambi + n_diff), p->n_ambi += n_ambi; + toff += len, qoff += len; + } else if (op == MM_CIGAR_INS) { + int n_ambi = 0; + for (l = 0; l < len; ++l) + if (qseq[qoff + l] > 3) ++n_ambi; + r->blen += len - n_ambi, p->n_ambi += n_ambi; + if (log_gap) s -= q + (double)e * mg_log2(1.0 + len); + else s -= q + e; + if (s < 0) s = 0; + qoff += len; + } else if (op == MM_CIGAR_DEL) { + int n_ambi = 0; + for (l = 0; l < len; ++l) + if (tseq[toff + l] > 3) ++n_ambi; + r->blen += len - n_ambi, p->n_ambi += n_ambi; + if (log_gap) s -= q + (double)e * mg_log2(1.0 + len); + else s -= q + e; + if (s < 0) s = 0; + toff += len; + } else if (op == MM_CIGAR_N_SKIP) { + toff += len; + } + } + p->dp_max = (int32_t)(max + .499); + assert(qoff == r->qe - r->qs && toff == r->re - r->rs); + if (is_eqx) mm_update_cigar_eqx(r, qseq, tseq); // NB: it has to be called here as changes to qseq and tseq are not returned +} + +static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) // TODO: this calls the libc realloc() +{ + mm_extra_t *p; + if (n_cigar == 0) return; + if (r->p == 0) { + uint32_t capacity = n_cigar + sizeof(mm_extra_t)/4; + kroundup32(capacity); + r->p = (mm_extra_t*)calloc(capacity, 4); + r->p->capacity = capacity; + } else if (r->p->n_cigar + n_cigar + sizeof(mm_extra_t)/4 > r->p->capacity) { + r->p->capacity = r->p->n_cigar + n_cigar + sizeof(mm_extra_t)/4; + kroundup32(r->p->capacity); + r->p = (mm_extra_t*)realloc(r->p, r->p->capacity * 4); + } + p = r->p; + if (p->n_cigar > 0 && (p->cigar[p->n_cigar-1]&0xf) == (cigar[0]&0xf)) { // same CIGAR op at the boundary + 
p->cigar[p->n_cigar-1] += cigar[0]>>4<<4; + if (n_cigar > 1) memcpy(p->cigar + p->n_cigar, cigar + 1, (n_cigar - 1) * 4); + p->n_cigar += n_cigar - 1; + } else { + memcpy(p->cigar + p->n_cigar, cigar, n_cigar * 4); + p->n_cigar += n_cigar; + } +} + +static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint8_t *qseq, int tlen, const uint8_t *tseq, const uint8_t *junc, const int8_t *mat, int w, int end_bonus, int zdrop, int flag, ksw_extz_t *ez) +{ + if (mm_dbg_flag & MM_DBG_PRINT_ALN_SEQ) { + int i; + fprintf(stderr, "===> q=(%d,%d), e=(%d,%d), bw=%d, flag=%d, zdrop=%d <===\n", opt->q, opt->q2, opt->e, opt->e2, w, flag, opt->zdrop); + for (i = 0; i < tlen; ++i) fputc("ACGTN"[tseq[i]], stderr); + fputc('\n', stderr); + for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr); + fputc('\n', stderr); + } + if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) { + ksw_reset_extz(ez); + ez->zdropped = 1; + } else if (opt->flag & MM_F_SPLICE) + ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, zdrop, opt->junc_bonus, flag, junc, ez); + else if (opt->q == opt->q2 && opt->e == opt->e2) + ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, zdrop, end_bonus, flag, ez); + else + ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, w, zdrop, end_bonus, flag, ez); + if (mm_dbg_flag & MM_DBG_PRINT_ALN_SEQ) { + int i; + fprintf(stderr, "score=%d, cigar=", ez->score); + for (i = 0; i < ez->n_cigar; ++i) + fprintf(stderr, "%d%c", ez->cigar[i]>>4, MM_CIGAR_STR[ez->cigar[i]&0xf]); + fprintf(stderr, "\n"); + } +} + +static inline int mm_get_hplen_back(const mm_idx_t *mi, uint32_t rid, uint32_t x) +{ + int64_t i, off0 = mi->seq[rid].offset, off = off0 + x; + int c = mm_seq4_get(mi->S, off); + for (i = off - 1; i >= off0; --i) + if (mm_seq4_get(mi->S, i) != c) break; + return (int)(off - i); +} + +static inline void mm_adjust_minier(const mm_idx_t *mi, uint8_t 
*const qseq0[2], mm128_t *a, int32_t *r, int32_t *q) +{ + if (mi->flag & MM_I_HPC) { + const uint8_t *qseq = qseq0[a->x>>63]; + int i, c; + *q = (int32_t)a->y; + for (i = *q - 1, c = qseq[*q]; i > 0; --i) + if (qseq[i] != c) break; + *q = i + 1; + c = mm_get_hplen_back(mi, a->x<<1>>33, (int32_t)a->x); + *r = (int32_t)a->x + 1 - c; + } else { + *r = (int32_t)a->x - (mi->k>>1); + *q = (int32_t)a->y - (mi->k>>1); + } +} + +static int *collect_long_gaps(void *km, int as1, int cnt1, mm128_t *a, int min_gap, int *n_) +{ + int i, n, *K; + *n_ = 0; + for (i = 1, n = 0; i < cnt1; ++i) { // count the number of gaps longer than min_gap + int gap = ((int32_t)a[as1 + i].y - a[as1 + i - 1].y) - ((int32_t)a[as1 + i].x - a[as1 + i - 1].x); + if (gap < -min_gap || gap > min_gap) ++n; + } + if (n <= 1) return 0; + K = (int*)kmalloc(km, n * sizeof(int)); + for (i = 1, n = 0; i < cnt1; ++i) { // store the positions of long gaps + int gap = ((int32_t)a[as1 + i].y - a[as1 + i - 1].y) - ((int32_t)a[as1 + i].x - a[as1 + i - 1].x); + if (gap < -min_gap || gap > min_gap) + K[n++] = i; + } + *n_ = n; + return K; +} + +static void mm_filter_bad_seeds(void *km, int as1, int cnt1, mm128_t *a, int min_gap, int diff_thres, int max_ext_len, int max_ext_cnt) +{ + int max_st, max_en, n, i, k, max, *K; + K = collect_long_gaps(km, as1, cnt1, a, min_gap, &n); + if (K == 0) return; + max = 0, max_st = max_en = -1; + for (k = 0;; ++k) { // traverse long gaps + int gap, l, n_ins = 0, n_del = 0, qs, rs, max_diff = 0, max_diff_l = -1; + if (k == n || k >= max_en) { + if (max_en > 0) + for (i = K[max_st]; i < K[max_en]; ++i) + a[as1 + i].y |= MM_SEED_IGNORE; + max = 0, max_st = max_en = -1; + if (k == n) break; + } + i = K[k]; + gap = ((int32_t)a[as1 + i].y - (int32_t)a[as1 + i - 1].y) - (int32_t)(a[as1 + i].x - a[as1 + i - 1].x); + if (gap > 0) n_ins += gap; + else n_del += -gap; + qs = (int32_t)a[as1 + i - 1].y; + rs = (int32_t)a[as1 + i - 1].x; + for (l = k + 1; l < n && l <= k + max_ext_cnt; ++l) { + int 
j = K[l], diff; + if ((int32_t)a[as1 + j].y - qs > max_ext_len || (int32_t)a[as1 + j].x - rs > max_ext_len) break; + gap = ((int32_t)a[as1 + j].y - (int32_t)a[as1 + j - 1].y) - (int32_t)(a[as1 + j].x - a[as1 + j - 1].x); + if (gap > 0) n_ins += gap; + else n_del += -gap; + diff = n_ins + n_del - abs(n_ins - n_del); + if (max_diff < diff) + max_diff = diff, max_diff_l = l; + } + if (max_diff > diff_thres && max_diff > max) + max = max_diff, max_st = k, max_en = max_diff_l; + } + kfree(km, K); +} + +static void mm_filter_bad_seeds_alt(void *km, int as1, int cnt1, mm128_t *a, int min_gap, int max_ext) +{ + int n, k, *K; + K = collect_long_gaps(km, as1, cnt1, a, min_gap, &n); + if (K == 0) return; + for (k = 0; k < n;) { + int i = K[k], l; + int gap1 = ((int32_t)a[as1 + i].y - (int32_t)a[as1 + i - 1].y) - ((int32_t)a[as1 + i].x - (int32_t)a[as1 + i - 1].x); + int re1 = (int32_t)a[as1 + i].x; + int qe1 = (int32_t)a[as1 + i].y; + gap1 = gap1 > 0? gap1 : -gap1; + for (l = k + 1; l < n; ++l) { + int j = K[l], gap2, q_span_pre, rs2, qs2, m; + if ((int32_t)a[as1 + j].y - qe1 > max_ext || (int32_t)a[as1 + j].x - re1 > max_ext) break; + gap2 = ((int32_t)a[as1 + j].y - (int32_t)a[as1 + j - 1].y) - (int32_t)(a[as1 + j].x - a[as1 + j - 1].x); + q_span_pre = a[as1 + j - 1].y >> 32 & 0xff; + rs2 = (int32_t)a[as1 + j - 1].x + q_span_pre; + qs2 = (int32_t)a[as1 + j - 1].y + q_span_pre; + m = rs2 - re1 < qs2 - qe1? rs2 - re1 : qs2 - qe1; + gap2 = gap2 > 0? 
gap2 : -gap2; + if (m > gap1 + gap2) break; + re1 = (int32_t)a[as1 + j].x; + qe1 = (int32_t)a[as1 + j].y; + gap1 = gap2; + } + if (l > k + 1) { + int j, end = K[l - 1]; + for (j = K[k]; j < end; ++j) + a[as1 + j].y |= MM_SEED_IGNORE; + a[as1 + end].y |= MM_SEED_LONG_JOIN; + } + k = l; + } + kfree(km, K); +} + +static void mm_fix_bad_ends(const mm_reg1_t *r, const mm128_t *a, int bw, int min_match, int32_t *as, int32_t *cnt) +{ + int32_t i, l, m; + *as = r->as, *cnt = r->cnt; + if (r->cnt < 3) return; + m = l = a[r->as].y >> 32 & 0xff; + for (i = r->as + 1; i < r->as + r->cnt - 1; ++i) { + int32_t lq, lr, min, max; + int32_t q_span = a[i].y >> 32 & 0xff; + if (a[i].y & MM_SEED_LONG_JOIN) break; + lr = (int32_t)a[i].x - (int32_t)a[i-1].x; + lq = (int32_t)a[i].y - (int32_t)a[i-1].y; + min = lr < lq? lr : lq; + max = lr > lq? lr : lq; + if (max - min > l >> 1) *as = i; + l += min; + m += min < q_span? min : q_span; + if (l >= bw << 1 || (m >= min_match && m >= bw) || m >= r->mlen >> 1) break; + } + *cnt = r->as + r->cnt - *as; + m = l = a[r->as + r->cnt - 1].y >> 32 & 0xff; + for (i = r->as + r->cnt - 2; i > *as; --i) { + int32_t lq, lr, min, max; + int32_t q_span = a[i+1].y >> 32 & 0xff; + if (a[i+1].y & MM_SEED_LONG_JOIN) break; + lr = (int32_t)a[i+1].x - (int32_t)a[i].x; + lq = (int32_t)a[i+1].y - (int32_t)a[i].y; + min = lr < lq? lr : lq; + max = lr > lq? lr : lq; + if (max - min > l >> 1) *cnt = i + 1 - *as; + l += min; + m += min < q_span? 
min : q_span; + if (l >= bw << 1 || (m >= min_match && m >= bw) || m >= r->mlen >> 1) break; + } +} + +static void mm_max_stretch(const mm_reg1_t *r, const mm128_t *a, int32_t *as, int32_t *cnt) +{ + int32_t i, score, max_score, len, max_i, max_len; + + *as = r->as, *cnt = r->cnt; + if (r->cnt < 2) return; + + max_score = -1, max_i = -1, max_len = 0; + score = a[r->as].y >> 32 & 0xff, len = 1; + for (i = r->as + 1; i < r->as + r->cnt; ++i) { + int32_t lq, lr, q_span; + q_span = a[i].y >> 32 & 0xff; + lr = (int32_t)a[i].x - (int32_t)a[i-1].x; + lq = (int32_t)a[i].y - (int32_t)a[i-1].y; + if (lq == lr) { + score += lq < q_span? lq : q_span; + ++len; + } else { + if (score > max_score) + max_score = score, max_len = len, max_i = i - len; + score = q_span, len = 1; + } + } + if (score > max_score) + max_score = score, max_len = len, max_i = i - len; + *as = max_i, *cnt = max_len; +} + +static int mm_seed_ext_score(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, const int8_t mat[25], int qlen, uint8_t *qseq0[2], const mm128_t *a) +{ + uint8_t *qseq, *tseq; + int q_span = a->y>>32&0xff, qs, qe, rs, re, rid, score, q_off, t_off, ext_len = opt->anchor_ext_len; + void *qp; + rid = a->x<<1>>33; + re = (uint32_t)a->x + 1, rs = re - q_span; + qe = (uint32_t)a->y + 1, qs = qe - q_span; + rs = rs - ext_len > 0? rs - ext_len : 0; + qs = qs - ext_len > 0? qs - ext_len : 0; + re = re + ext_len < (int32_t)mi->seq[rid].len? re + ext_len : mi->seq[rid].len; + qe = qe + ext_len < qlen? 
qe + ext_len : qlen; + tseq = (uint8_t*)kmalloc(km, re - rs); + if (opt->flag & MM_F_QSTRAND) { + qseq = qseq0[0] + qs; + mm_idx_getseq2(mi, a->x>>63, rid, rs, re, tseq); + } else { + qseq = qseq0[a->x>>63] + qs; + mm_idx_getseq(mi, rid, rs, re, tseq); + } + qp = ksw_ll_qinit(km, 2, qe - qs, qseq, 5, mat); + score = ksw_ll_i16(qp, re - rs, tseq, opt->q, opt->e, &q_off, &t_off); + kfree(km, tseq); + kfree(km, qp); + return score; +} + +static void mm_fix_bad_ends_splice(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, const mm_reg1_t *r, const int8_t mat[25], int qlen, uint8_t *qseq0[2], const mm128_t *a, int *as1, int *cnt1) +{ // this assumes a very crude k-mer based mode; it is not necessary to use a good model just for filtering bounary exons + int score; + double log_gap; + *as1 = r->as, *cnt1 = r->cnt; + if (r->cnt < 3) return; + log_gap = log((int32_t)a[r->as + 1].x - (int32_t)a[r->as].x); + if ((a[r->as].y>>32&0xff) < log_gap + opt->anchor_ext_shift) { + score = mm_seed_ext_score(km, opt, mi, mat, qlen, qseq0, &a[r->as]); + if ((double)score / mat[0] < log_gap + opt->anchor_ext_shift) // a more exact format is "score < log_4(gap) + shift" + ++(*as1), --(*cnt1); + } + log_gap = log((int32_t)a[r->as + r->cnt - 1].x - (int32_t)a[r->as + r->cnt - 2].x); + if ((a[r->as + r->cnt - 1].y>>32&0xff) < log_gap + opt->anchor_ext_shift) { + score = mm_seed_ext_score(km, opt, mi, mat, qlen, qseq0, &a[r->as + r->cnt - 1]); + if ((double)score / mat[0] < log_gap + opt->anchor_ext_shift) + --(*cnt1); + } +} + +static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], mm_reg1_t *r, mm_reg1_t *r2, int n_a, mm128_t *a, ksw_extz_t *ez, int splice_flag) +{ + int is_sr = !!(opt->flag & MM_F_SR), is_splice = !!(opt->flag & MM_F_SPLICE); + int32_t rid = a[r->as].x<<1>>33, rev = a[r->as].x>>63, as1, cnt1; + uint8_t *tseq, *qseq, *junc; + int32_t i, l, bw, bw_long, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0; + int32_t rs, re, 
qs, qe; + int32_t rs1, qs1, re1, qe1; + int8_t mat[25]; + + if (is_sr) assert(!(mi->flag & MM_I_HPC)); // HPC won't work with SR because with HPC we can't easily tell if there is a gap + + r2->cnt = 0; + if (r->cnt == 0) return; + ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi); + bw = (int)(opt->bw * 1.5 + 1.); + bw_long = (int)(opt->bw_long * 1.5 + 1.); + if (bw_long < bw) bw_long = bw; + + if (is_sr && !(mi->flag & MM_I_HPC)) { + mm_max_stretch(r, a, &as1, &cnt1); + rs = (int32_t)a[as1].x + 1 - (int32_t)(a[as1].y>>32&0xff); + qs = (int32_t)a[as1].y + 1 - (int32_t)(a[as1].y>>32&0xff); + re = (int32_t)a[as1+cnt1-1].x + 1; + qe = (int32_t)a[as1+cnt1-1].y + 1; + } else { + if (!(opt->flag & MM_F_NO_END_FLT)) { + if (is_splice) + mm_fix_bad_ends_splice(km, opt, mi, r, mat, qlen, qseq0, a, &as1, &cnt1); + else + mm_fix_bad_ends(r, a, opt->bw, opt->min_chain_score * 2, &as1, &cnt1); + } else as1 = r->as, cnt1 = r->cnt; + mm_filter_bad_seeds(km, as1, cnt1, a, 10, 40, opt->max_gap>>1, 10); + mm_filter_bad_seeds_alt(km, as1, cnt1, a, 30, opt->max_gap>>1); + mm_adjust_minier(mi, qseq0, &a[as1], &rs, &qs); + mm_adjust_minier(mi, qseq0, &a[as1 + cnt1 - 1], &re, &qe); + } + assert(cnt1 > 0); + + if (is_splice) { + if (splice_flag & MM_F_SPLICE_FOR) extra_flag |= rev? KSW_EZ_SPLICE_REV : KSW_EZ_SPLICE_FOR; + if (splice_flag & MM_F_SPLICE_REV) extra_flag |= rev? KSW_EZ_SPLICE_FOR : KSW_EZ_SPLICE_REV; + if (opt->flag & MM_F_SPLICE_FLANK) extra_flag |= KSW_EZ_SPLICE_FLANK; + } + + /* Look for the start and end of regions to perform DP. This sounds easy + * but is in fact tricky. Excessively small regions lead to unnecessary + * clippings and lose alignable sequences. Excessively large regions + * occasionally lead to large overlaps between two chains and may cause + * loss of alignments in corner cases. */ + if (is_sr) { + qs0 = 0, qe0 = qlen; + l = qs; + l += l * opt->a + opt->end_bonus > opt->q? (l * opt->a + opt->end_bonus - opt->q) / opt->e : 0; + rs0 = rs - l > 0? 
rs - l : 0; + l = qlen - qe; + l += l * opt->a + opt->end_bonus > opt->q? (l * opt->a + opt->end_bonus - opt->q) / opt->e : 0; + re0 = re + l < (int32_t)mi->seq[rid].len? re + l : mi->seq[rid].len; + } else { + // compute rs0 and qs0 + rs0 = (int32_t)a[r->as].x + 1 - (int32_t)(a[r->as].y>>32&0xff); + qs0 = (int32_t)a[r->as].y + 1 - (int32_t)(a[r->as].y>>32&0xff); + if (rs0 < 0) rs0 = 0; // this may happen when HPC is in use + assert(qs0 >= 0); // this should never happen, or it is logic error + rs1 = qs1 = 0; + for (i = r->as - 1, l = 0; i >= 0 && a[i].x>>32 == a[r->as].x>>32; --i) { // inspect nearby seeds + int32_t x = (int32_t)a[i].x + 1 - (int32_t)(a[i].y>>32&0xff); + int32_t y = (int32_t)a[i].y + 1 - (int32_t)(a[i].y>>32&0xff); + if (x < rs0 && y < qs0) { + if (++l > opt->min_cnt) { + l = rs0 - x > qs0 - y? rs0 - x : qs0 - y; + rs1 = rs0 - l, qs1 = qs0 - l; + if (rs1 < 0) rs1 = 0; // not strictly necessary; better have this guard for explicit + break; + } + } + } + if (qs > 0 && rs > 0) { + l = qs < opt->max_gap? qs : opt->max_gap; + qs1 = qs1 > qs - l? qs1 : qs - l; + qs0 = qs0 < qs1? qs0 : qs1; // at least include qs0 + l += l * opt->a > opt->q? (l * opt->a - opt->q) / opt->e : 0; + l = l < opt->max_gap? l : opt->max_gap; + l = l < rs? l : rs; + rs1 = rs1 > rs - l? rs1 : rs - l; + rs0 = rs0 < rs1? rs0 : rs1; + rs0 = rs0 < rs? rs0 : rs; + } else rs0 = rs, qs0 = qs; + // compute re0 and qe0 + re0 = (int32_t)a[r->as + r->cnt - 1].x + 1; + qe0 = (int32_t)a[r->as + r->cnt - 1].y + 1; + re1 = mi->seq[rid].len, qe1 = qlen; + for (i = r->as + r->cnt, l = 0; i < n_a && a[i].x>>32 == a[r->as].x>>32; ++i) { // inspect nearby seeds + int32_t x = (int32_t)a[i].x + 1; + int32_t y = (int32_t)a[i].y + 1; + if (x > re0 && y > qe0) { + if (++l > opt->min_cnt) { + l = x - re0 > y - qe0? x - re0 : y - qe0; + re1 = re0 + l, qe1 = qe0 + l; + break; + } + } + } + if (qe < qlen && re < (int32_t)mi->seq[rid].len) { + l = qlen - qe < opt->max_gap? 
qlen - qe : opt->max_gap; + qe1 = qe1 < qe + l? qe1 : qe + l; + qe0 = qe0 > qe1? qe0 : qe1; // at least include qe0 + l += l * opt->a > opt->q? (l * opt->a - opt->q) / opt->e : 0; + l = l < opt->max_gap? l : opt->max_gap; + l = l < (int32_t)mi->seq[rid].len - re? l : mi->seq[rid].len - re; + re1 = re1 < re + l? re1 : re + l; + re0 = re0 > re1? re0 : re1; + } else re0 = re, qe0 = qe; + } + if (a[r->as].y & MM_SEED_SELF) { + int max_ext = r->qs > r->rs? r->qs - r->rs : r->rs - r->qs; + if (r->rs - rs0 > max_ext) rs0 = r->rs - max_ext; + if (r->qs - qs0 > max_ext) qs0 = r->qs - max_ext; + max_ext = r->qe > r->re? r->qe - r->re : r->re - r->qe; + if (re0 - r->re > max_ext) re0 = r->re + max_ext; + if (qe0 - r->qe > max_ext) qe0 = r->qe + max_ext; + } + + assert(re0 > rs0); + tseq = (uint8_t*)kmalloc(km, re0 - rs0); + junc = (uint8_t*)kmalloc(km, re0 - rs0); + + if (qs > 0 && rs > 0) { // left extension; probably the condition can be changed to "qs > qs0 && rs > rs0" + if (opt->flag & MM_F_QSTRAND) { + qseq = &qseq0[0][qs0]; + mm_idx_getseq2(mi, rev, rid, rs0, rs, tseq); + } else { + qseq = &qseq0[rev][qs0]; + mm_idx_getseq(mi, rid, rs0, rs, tseq); + } + mm_idx_bed_junc(mi, rid, rs0, rs, junc); + mm_seq_rev(qs - qs0, qseq); + mm_seq_rev(rs - rs0, tseq); + mm_seq_rev(rs - rs0, junc); + mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, junc, mat, bw, opt->end_bonus, r->split_inv? opt->zdrop_inv : opt->zdrop, extra_flag|KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez); + if (ez->n_cigar > 0) { + mm_append_cigar(r, ez->n_cigar, ez->cigar); + r->p->dp_score += ez->max; + } + rs1 = rs - (ez->reach_end? ez->mqe_t + 1 : ez->max_t + 1); + qs1 = qs - (ez->reach_end? qs - qs0 : ez->max_q + 1); + mm_seq_rev(qs - qs0, qseq); + } else rs1 = rs, qs1 = qs; + re1 = rs, qe1 = qs; + assert(qs1 >= 0 && rs1 >= 0); + + for (i = is_sr? 
cnt1 - 1 : 1; i < cnt1; ++i) { // gap filling + if ((a[as1+i].y & (MM_SEED_IGNORE|MM_SEED_TANDEM)) && i != cnt1 - 1) continue; + if (is_sr && !(mi->flag & MM_I_HPC)) { + re = (int32_t)a[as1 + i].x + 1; + qe = (int32_t)a[as1 + i].y + 1; + } else mm_adjust_minier(mi, qseq0, &a[as1 + i], &re, &qe); + re1 = re, qe1 = qe; + if (i == cnt1 - 1 || (a[as1+i].y&MM_SEED_LONG_JOIN) || (qe - qs >= opt->min_ksw_len && re - rs >= opt->min_ksw_len)) { + int j, bw1 = bw_long, zdrop_code; + if (a[as1+i].y & MM_SEED_LONG_JOIN) + bw1 = qe - qs > re - rs? qe - qs : re - rs; + // perform alignment + if (opt->flag & MM_F_QSTRAND) { + qseq = &qseq0[0][qs]; + mm_idx_getseq2(mi, rev, rid, rs, re, tseq); + } else { + qseq = &qseq0[rev][qs]; + mm_idx_getseq(mi, rid, rs, re, tseq); + } + mm_idx_bed_junc(mi, rid, rs, re, junc); + if (is_sr) { // perform ungapped alignment + assert(qe - qs == re - rs); + ksw_reset_extz(ez); + for (j = 0, ez->score = 0; j < qe - qs; ++j) { + if (qseq[j] >= 4 || tseq[j] >= 4) ez->score += opt->e2; + else ez->score += qseq[j] == tseq[j]? opt->a : -opt->b; + } + ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, MM_CIGAR_MATCH, qe - qs); + } else { // perform normal gapped alignment + mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, junc, mat, bw1, -1, opt->zdrop, extra_flag|KSW_EZ_APPROX_MAX, ez); // first pass: with approximate Z-drop + } + // test Z-drop and inversion Z-drop + if ((zdrop_code = mm_test_zdrop(km, opt, qseq, tseq, ez->n_cigar, ez->cigar, mat)) != 0) + mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, junc, mat, bw1, -1, zdrop_code == 2? opt->zdrop_inv : opt->zdrop, extra_flag, ez); // second pass: lift approximate + // update CIGAR + if (ez->n_cigar > 0) + mm_append_cigar(r, ez->n_cigar, ez->cigar); + if (ez->zdropped) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle. 
+ if (!r->p) { + assert(ez->n_cigar == 0); + uint32_t capacity = sizeof(mm_extra_t)/4; + kroundup32(capacity); + r->p = (mm_extra_t*)calloc(capacity, 4); + r->p->capacity = capacity; + } + for (j = i - 1; j >= 0; --j) + if ((int32_t)a[as1 + j].x <= rs + ez->max_t) + break; + dropped = 1; + if (j < 0) j = 0; + r->p->dp_score += ez->max; + re1 = rs + (ez->max_t + 1); + qe1 = qs + (ez->max_q + 1); + if (cnt1 - (j + 1) >= opt->min_cnt) { + mm_split_reg(r, r2, as1 + j + 1 - r->as, qlen, a, !!(opt->flag&MM_F_QSTRAND)); + if (zdrop_code == 2) r2->split_inv = 1; + } + break; + } else r->p->dp_score += ez->score; + rs = re, qs = qe; + } + } + + if (!dropped && qe < qe0 && re < re0) { // right extension + if (opt->flag & MM_F_QSTRAND) { + qseq = &qseq0[0][qe]; + mm_idx_getseq2(mi, rev, rid, re, re0, tseq); + } else { + qseq = &qseq0[rev][qe]; + mm_idx_getseq(mi, rid, re, re0, tseq); + } + mm_idx_bed_junc(mi, rid, re, re0, junc); + mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, junc, mat, bw, opt->end_bonus, opt->zdrop, extra_flag|KSW_EZ_EXTZ_ONLY, ez); + if (ez->n_cigar > 0) { + mm_append_cigar(r, ez->n_cigar, ez->cigar); + r->p->dp_score += ez->max; + } + re1 = re + (ez->reach_end? ez->mqe_t + 1 : ez->max_t + 1); + qe1 = qe + (ez->reach_end? 
qe0 - qe : ez->max_q + 1); + } + assert(qe1 <= qlen); + + r->rs = rs1, r->re = re1; + if (!rev || (opt->flag & MM_F_QSTRAND)) r->qs = qs1, r->qe = qe1; + else r->qs = qlen - qe1, r->qe = qlen - qs1; + + assert(re1 - rs1 <= re0 - rs0); + if (r->p) { + if (opt->flag & MM_F_QSTRAND) { + mm_idx_getseq2(mi, r->rev, rid, rs1, re1, tseq); + qseq = &qseq0[0][qs1]; + } else { + mm_idx_getseq(mi, rid, rs1, re1, tseq); + qseq = &qseq0[r->rev][qs1]; + } + mm_update_extra(r, qseq, tseq, mat, opt->q, opt->e, opt->flag & MM_F_EQX, !(opt->flag & MM_F_SR)); + if (rev && r->p->trans_strand) + r->p->trans_strand ^= 3; // flip to the read strand + } + + kfree(km, tseq); + kfree(km, junc); +} + +static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], const mm_reg1_t *r1, const mm_reg1_t *r2, mm_reg1_t *r_inv, ksw_extz_t *ez) +{ // NB: this doesn't work with the qstrand mode + int tl, ql, score, ret = 0, q_off, t_off; + uint8_t *tseq, *qseq; + int8_t mat[25]; + void *qp; + + memset(r_inv, 0, sizeof(mm_reg1_t)); + if (!(r1->split&1) || !(r2->split&2)) return 0; + if (r1->id != r1->parent && r1->parent != MM_PARENT_TMP_PRI) return 0; + if (r2->id != r2->parent && r2->parent != MM_PARENT_TMP_PRI) return 0; + if (r1->rid != r2->rid || r1->rev != r2->rev) return 0; + ql = r1->rev? r1->qs - r2->qe : r2->qs - r1->qe; + tl = r2->rs - r1->re; + if (ql < opt->min_chain_score || ql > opt->max_gap) return 0; + if (tl < opt->min_chain_score || tl > opt->max_gap) return 0; + + ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi); + tseq = (uint8_t*)kmalloc(km, tl); + mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq); + qseq = r1->rev? 
&qseq0[0][r2->qe] : &qseq0[1][qlen - r2->qs]; + + mm_seq_rev(ql, qseq); + mm_seq_rev(tl, tseq); + qp = ksw_ll_qinit(km, 2, ql, qseq, 5, mat); + score = ksw_ll_i16(qp, tl, tseq, opt->q, opt->e, &q_off, &t_off); + kfree(km, qp); + mm_seq_rev(ql, qseq); + mm_seq_rev(tl, tseq); + if (score < opt->min_dp_max) goto end_align1_inv; + q_off = ql - (q_off + 1), t_off = tl - (t_off + 1); + mm_align_pair(km, opt, ql - q_off, qseq + q_off, tl - t_off, tseq + t_off, 0, mat, (int)(opt->bw * 1.5), -1, opt->zdrop, KSW_EZ_EXTZ_ONLY, ez); + if (ez->n_cigar == 0) goto end_align1_inv; // should never be here + mm_append_cigar(r_inv, ez->n_cigar, ez->cigar); + r_inv->p->dp_score = ez->max; + r_inv->id = -1; + r_inv->parent = MM_PARENT_UNSET; + r_inv->inv = 1; + r_inv->rev = !r1->rev; + r_inv->rid = r1->rid; + r_inv->div = -1.0f; + if (r_inv->rev == 0) { + r_inv->qs = r2->qe + q_off; + r_inv->qe = r_inv->qs + ez->max_q + 1; + } else { + r_inv->qe = r2->qs - q_off; + r_inv->qs = r_inv->qe - (ez->max_q + 1); + } + r_inv->rs = r1->re + t_off; + r_inv->re = r_inv->rs + ez->max_t + 1; + mm_update_extra(r_inv, &qseq[q_off], &tseq[t_off], mat, opt->q, opt->e, opt->flag & MM_F_EQX, !(opt->flag & MM_F_SR)); + ret = 1; +end_align1_inv: + kfree(km, tseq); + return ret; +} + +static inline mm_reg1_t *mm_insert_reg(const mm_reg1_t *r, int i, int *n_regs, mm_reg1_t *regs) +{ + regs = (mm_reg1_t*)realloc(regs, (*n_regs + 1) * sizeof(mm_reg1_t)); + if (i + 1 != *n_regs) + memmove(®s[i + 2], ®s[i + 1], sizeof(mm_reg1_t) * (*n_regs - i - 1)); + regs[i + 1] = *r; + ++*n_regs; + return regs; +} + +static inline void mm_count_gaps(const mm_reg1_t *r, int32_t *n_gap_, int32_t *n_gapo_) +{ + uint32_t i; + int32_t n_gapo = 0, n_gap = 0; + *n_gap_ = *n_gapo_ = -1; + if (r->p == 0) return; + for (i = 0; i < r->p->n_cigar; ++i) { + int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4; + if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) + ++n_gapo, n_gap += len; + } + *n_gap_ = n_gap, *n_gapo_ = n_gapo; +} + 
+double mm_event_identity(const mm_reg1_t *r) +{ + int32_t n_gap, n_gapo; + if (r->p == 0) return -1.0f; + mm_count_gaps(r, &n_gap, &n_gapo); + return (double)r->mlen / (r->blen + r->p->n_ambi - n_gap + n_gapo); +} + +static int32_t mm_recal_max_dp(const mm_reg1_t *r, double b2, int32_t match_sc) +{ + uint32_t i; + int32_t n_gap = 0, n_gapo = 0, n_mis; + double gap_cost = 0.0; + if (r->p == 0) return -1; + for (i = 0; i < r->p->n_cigar; ++i) { + int32_t op = r->p->cigar[i] & 0xf, len = r->p->cigar[i] >> 4; + if (op == MM_CIGAR_INS || op == MM_CIGAR_DEL) { + gap_cost += b2 + (double)mg_log2(1.0 + len); + ++n_gapo, n_gap += len; + } + } + n_mis = r->blen + r->p->n_ambi - r->mlen - n_gap; + return (int32_t)(match_sc * (r->mlen - b2 * n_mis - gap_cost) + .499); +} + +void mm_update_dp_max(int qlen, int n_regs, mm_reg1_t *regs, float frac, int a, int b) +{ + int32_t max = -1, max2 = -1, i, max_i = -1; + double div, b2; + if (n_regs < 2) return; + for (i = 0; i < n_regs; ++i) { + mm_reg1_t *r = ®s[i]; + if (r->p == 0) continue; + if (r->p->dp_max > max) max2 = max, max = r->p->dp_max, max_i = i; + else if (r->p->dp_max > max2) max2 = r->p->dp_max; + } + if (max_i < 0 || max < 0 || max2 < 0) return; + if (regs[max_i].qe - regs[max_i].qs < (double)qlen * frac) return; + if (max2 < (double)max * frac) return; + div = 1. 
- mm_event_identity(®s[max_i]); + if (div < 0.02) div = 0.02; + b2 = 0.5 / div; // max value: 25 + if (b2 * a < b) b2 = (double)a / b; + for (i = 0; i < n_regs; ++i) { + mm_reg1_t *r = ®s[i]; + if (r->p == 0) continue; + r->p->dp_max = mm_recal_max_dp(r, b2, a); + if (r->p->dp_max < 0) r->p->dp_max = 0; + } +} + +mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a) +{ + extern unsigned char seq_nt4_table[256]; + int32_t i, n_regs = *n_regs_, n_a; + uint8_t *qseq0[2]; + ksw_extz_t ez; + + // encode the query sequence + qseq0[0] = (uint8_t*)kmalloc(km, qlen * 2); + qseq0[1] = qseq0[0] + qlen; + for (i = 0; i < qlen; ++i) { + qseq0[0][i] = seq_nt4_table[(uint8_t)qstr[i]]; + qseq0[1][qlen - 1 - i] = qseq0[0][i] < 4? 3 - qseq0[0][i] : 4; + } + + // align through seed hits + n_a = mm_squeeze_a(km, n_regs, regs, a); + memset(&ez, 0, sizeof(ksw_extz_t)); + for (i = 0; i < n_regs; ++i) { + mm_reg1_t r2; + if ((opt->flag&MM_F_SPLICE) && (opt->flag&MM_F_SPLICE_FOR) && (opt->flag&MM_F_SPLICE_REV)) { // then do two rounds of alignments for both strands + mm_reg1_t s[2], s2[2]; + int which, trans_strand; + s[0] = s[1] = regs[i]; + mm_align1(km, opt, mi, qlen, qseq0, &s[0], &s2[0], n_a, a, &ez, MM_F_SPLICE_FOR); + mm_align1(km, opt, mi, qlen, qseq0, &s[1], &s2[1], n_a, a, &ez, MM_F_SPLICE_REV); + if (s[0].p->dp_score > s[1].p->dp_score) which = 0, trans_strand = 1; + else if (s[0].p->dp_score < s[1].p->dp_score) which = 1, trans_strand = 2; + else trans_strand = 3, which = (qlen + s[0].p->dp_score) & 1; // randomly choose a strand, effectively + if (which == 0) { + regs[i] = s[0], r2 = s2[0]; + free(s[1].p); + } else { + regs[i] = s[1], r2 = s2[1]; + free(s[0].p); + } + regs[i].p->trans_strand = trans_strand; + } else { // one round of alignment + mm_align1(km, opt, mi, qlen, qseq0, ®s[i], &r2, n_a, a, &ez, opt->flag); + if (opt->flag&MM_F_SPLICE) + regs[i].p->trans_strand = 
opt->flag&MM_F_SPLICE_FOR? 1 : 2; + } + if (r2.cnt > 0) regs = mm_insert_reg(&r2, i, &n_regs, regs); + if (i > 0 && regs[i].split_inv && !(opt->flag & MM_F_NO_INV)) { + if (mm_align1_inv(km, opt, mi, qlen, qseq0, ®s[i-1], ®s[i], &r2, &ez)) { + regs = mm_insert_reg(&r2, i, &n_regs, regs); + ++i; // skip the inserted INV alignment + } + } + } + *n_regs_ = n_regs; + kfree(km, qseq0[0]); + kfree(km, ez.cigar); + mm_filter_regs(opt, qlen, n_regs_, regs); + if (!(opt->flag&MM_F_SR) && !opt->split_prefix && qlen >= opt->rank_min_len) { + mm_update_dp_max(qlen, *n_regs_, regs, opt->rank_frac, opt->a, opt->b); + mm_filter_regs(opt, qlen, n_regs_, regs); + } + mm_hit_sort(km, n_regs_, regs, opt->alt_drop); + return regs; +} diff --git a/lib/minimap2/align.c b/lib/minimap2/align.c index ddbb0bd3f..a1d90ea64 100644 --- a/lib/minimap2/align.c +++ b/lib/minimap2/align.c @@ -21,6 +21,18 @@ static void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t sc mat[(m - 1) * m + j] = sc_ambi; } +static void ksw_gen_ts_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t transition, int8_t sc_ambi) +{ + assert(m == 5); + ksw_gen_simple_mat(m, mat, a, b, sc_ambi); + if (transition == 0 || transition == b) return; + transition = transition > 0? 
-transition : transition; + mat[0 * m + 2] = transition; // A->G + mat[1 * m + 3] = transition; // C->T + mat[2 * m + 0] = transition; // G->A + mat[3 * m + 1] = transition; // T->C +} + static inline void mm_seq_rev(uint32_t len, uint8_t *seq) { uint32_t i; @@ -283,7 +295,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts toff += len; } } - p->dp_max = (int32_t)(max + .499); + p->dp_max = p->dp_max0 = (int32_t)(max + .499); assert(qoff == r->qe - r->qs && toff == r->re - r->rs); if (is_eqx) mm_update_cigar_eqx(r, qseq, tseq); // NB: it has to be called here as changes to qseq and tseq are not returned } @@ -323,12 +335,16 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr); fputc('\n', stderr); } + if (opt->transition != 0 && opt->b != opt->transition) + flag |= KSW_EZ_GENERIC_SC; if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) { ksw_reset_extz(ez); ez->zdropped = 1; - } else if (opt->flag & MM_F_SPLICE) - ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, zdrop, opt->junc_bonus, flag, junc, ez); - else if (opt->q == opt->q2 && opt->e == opt->e2) + } else if (opt->flag & MM_F_SPLICE) { + int flag_tmp = flag; + if (!(opt->flag & MM_F_SPLICE_OLD)) flag_tmp |= KSW_EZ_SPLICE_CMPLX; + ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, zdrop, opt->junc_bonus, flag_tmp, junc, ez); + } else if (opt->q == opt->q2 && opt->e == opt->e2) ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, zdrop, end_bonus, flag, ez); else ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, w, zdrop, end_bonus, flag, ez); @@ -584,7 +600,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int r2->cnt = 0; if (r->cnt == 0) return; - ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi); + 
ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi); bw = (int)(opt->bw * 1.5 + 1.); bw_long = (int)(opt->bw_long * 1.5 + 1.); if (bw_long < bw) bw_long = bw; @@ -842,7 +858,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i if (ql < opt->min_chain_score || ql > opt->max_gap) return 0; if (tl < opt->min_chain_score || tl > opt->max_gap) return 0; - ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi); + ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi); tseq = (uint8_t*)kmalloc(km, tl); mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq); qseq = r1->rev? &qseq0[0][r2->qe] : &qseq0[1][qlen - r2->qs]; diff --git a/lib/minimap2/bseq 2.c b/lib/minimap2/bseq 2.c new file mode 100644 index 000000000..075a17dc6 --- /dev/null +++ b/lib/minimap2/bseq 2.c @@ -0,0 +1,169 @@ +#include +#include +#include +#include +#define __STDC_LIMIT_MACROS +#include "bseq.h" +#include "kvec.h" +#include "kseq.h" +KSEQ_INIT2(, gzFile, gzread) + +unsigned char seq_comp_table[256] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 'T', 'V', 'G', 'H', 'E', 'F', 'C', 'D', 'I', 'J', 'M', 'L', 'K', 'N', 'O', + 'P', 'Q', 'Y', 'S', 'A', 'A', 'B', 'W', 'X', 'R', 'Z', 91, 92, 93, 94, 95, + 96, 't', 'v', 'g', 'h', 'e', 'f', 'c', 'd', 'i', 'j', 'm', 'l', 'k', 'n', 'o', + 'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 
202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255 +}; + +#define CHECK_PAIR_THRES 1000000 + +struct mm_bseq_file_s { + gzFile fp; + kseq_t *ks; + mm_bseq1_t s; +}; + +mm_bseq_file_t *mm_bseq_open(const char *fn) +{ + mm_bseq_file_t *fp; + gzFile f; + f = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(0, "r"); + if (f == 0) return 0; + fp = (mm_bseq_file_t*)calloc(1, sizeof(mm_bseq_file_t)); + fp->fp = f; + fp->ks = kseq_init(fp->fp); + return fp; +} + +void mm_bseq_close(mm_bseq_file_t *fp) +{ + kseq_destroy(fp->ks); + gzclose(fp->fp); + free(fp); +} + +static inline char *kstrdup(const kstring_t *s) +{ + char *t; + t = (char*)malloc(s->l + 1); + memcpy(t, s->s, s->l + 1); + return t; +} + +static inline void kseq2bseq(kseq_t *ks, mm_bseq1_t *s, int with_qual, int with_comment) +{ + int i; + if (ks->name.l == 0) + fprintf(stderr, "[WARNING]\033[1;31m empty sequence name in the input.\033[0m\n"); + s->name = kstrdup(&ks->name); + s->seq = kstrdup(&ks->seq); + for (i = 0; i < (int)ks->seq.l; ++i) // convert U to T + if (s->seq[i] == 'u' || s->seq[i] == 'U') + --s->seq[i]; + s->qual = with_qual && ks->qual.l? kstrdup(&ks->qual) : 0; + s->comment = with_comment && ks->comment.l? 
kstrdup(&ks->comment) : 0; + s->l_seq = ks->seq.l; +} + +mm_bseq1_t *mm_bseq_read3(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int with_comment, int frag_mode, int *n_) +{ + int64_t size = 0; + int ret; + kvec_t(mm_bseq1_t) a = {0,0,0}; + kseq_t *ks = fp->ks; + *n_ = 0; + if (fp->s.seq) { + kv_resize(mm_bseq1_t, 0, a, 256); + kv_push(mm_bseq1_t, 0, a, fp->s); + size = fp->s.l_seq; + memset(&fp->s, 0, sizeof(mm_bseq1_t)); + } + while ((ret = kseq_read(ks)) >= 0) { + mm_bseq1_t *s; + assert(ks->seq.l <= INT32_MAX); + if (a.m == 0) kv_resize(mm_bseq1_t, 0, a, 256); + kv_pushp(mm_bseq1_t, 0, a, &s); + kseq2bseq(ks, s, with_qual, with_comment); + size += s->l_seq; + if (size >= chunk_size) { + if (frag_mode && a.a[a.n-1].l_seq < CHECK_PAIR_THRES) { + while ((ret = kseq_read(ks)) >= 0) { + kseq2bseq(ks, &fp->s, with_qual, with_comment); + if (mm_qname_same(fp->s.name, a.a[a.n-1].name)) { + kv_push(mm_bseq1_t, 0, a, fp->s); + memset(&fp->s, 0, sizeof(mm_bseq1_t)); + } else break; + } + } + break; + } + } + if (ret < -1) { + if (a.n) fprintf(stderr, "[WARNING]\033[1;31m failed to parse the FASTA/FASTQ record next to '%s'. Continue anyway.\033[0m\n", a.a[a.n-1].name); + else fprintf(stderr, "[WARNING]\033[1;31m failed to parse the first FASTA/FASTQ record. 
Continue anyway.\033[0m\n"); + } + *n_ = a.n; + return a.a; +} + +mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int frag_mode, int *n_) +{ + return mm_bseq_read3(fp, chunk_size, with_qual, 0, frag_mode, n_); +} + +mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int *n_) +{ + return mm_bseq_read2(fp, chunk_size, with_qual, 0, n_); +} + +mm_bseq1_t *mm_bseq_read_frag2(int n_fp, mm_bseq_file_t **fp, int64_t chunk_size, int with_qual, int with_comment, int *n_) +{ + int i; + int64_t size = 0; + kvec_t(mm_bseq1_t) a = {0,0,0}; + *n_ = 0; + if (n_fp < 1) return 0; + while (1) { + int n_read = 0; + for (i = 0; i < n_fp; ++i) + if (kseq_read(fp[i]->ks) >= 0) + ++n_read; + if (n_read < n_fp) { + if (n_read > 0) + fprintf(stderr, "[W::%s]\033[1;31m query files have different number of records; extra records skipped.\033[0m\n", __func__); + break; // some file reaches the end + } + if (a.m == 0) kv_resize(mm_bseq1_t, 0, a, 256); + for (i = 0; i < n_fp; ++i) { + mm_bseq1_t *s; + kv_pushp(mm_bseq1_t, 0, a, &s); + kseq2bseq(fp[i]->ks, s, with_qual, with_comment); + size += s->l_seq; + } + if (size >= chunk_size) break; + } + *n_ = a.n; + return a.a; +} + +mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int64_t chunk_size, int with_qual, int *n_) +{ + return mm_bseq_read_frag2(n_fp, fp, chunk_size, with_qual, 0, n_); +} + +int mm_bseq_eof(mm_bseq_file_t *fp) +{ + return (ks_eof(fp->ks->f) && fp->s.seq == 0); +} diff --git a/lib/minimap2/bseq 2.h b/lib/minimap2/bseq 2.h new file mode 100644 index 000000000..c0bdc6330 --- /dev/null +++ b/lib/minimap2/bseq 2.h @@ -0,0 +1,64 @@ +#ifndef MM_BSEQ_H +#define MM_BSEQ_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct mm_bseq_file_s; +typedef struct mm_bseq_file_s mm_bseq_file_t; + +typedef struct { + int l_seq, rid; + char *name, *seq, *qual, *comment; +} mm_bseq1_t; + +mm_bseq_file_t *mm_bseq_open(const char *fn); +void 
mm_bseq_close(mm_bseq_file_t *fp); +mm_bseq1_t *mm_bseq_read3(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int with_comment, int frag_mode, int *n_); +mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int frag_mode, int *n_); +mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int64_t chunk_size, int with_qual, int *n_); +mm_bseq1_t *mm_bseq_read_frag2(int n_fp, mm_bseq_file_t **fp, int64_t chunk_size, int with_qual, int with_comment, int *n_); +mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int64_t chunk_size, int with_qual, int *n_); +int mm_bseq_eof(mm_bseq_file_t *fp); + +extern unsigned char seq_nt4_table[256]; +extern unsigned char seq_comp_table[256]; + +static inline int mm_qname_len(const char *s) +{ + int l; + l = strlen(s); + return l >= 3 && s[l-1] >= '0' && s[l-1] <= '9' && s[l-2] == '/'? l - 2 : l; +} + +static inline int mm_qname_same(const char *s1, const char *s2) +{ + int l1, l2; + l1 = mm_qname_len(s1); + l2 = mm_qname_len(s2); + return (l1 == l2 && strncmp(s1, s2, l1) == 0); +} + +static inline void mm_revcomp_bseq(mm_bseq1_t *s) +{ + int i, t, l = s->l_seq; + for (i = 0; i < l>>1; ++i) { + t = s->seq[l - i - 1]; + s->seq[l - i - 1] = seq_comp_table[(uint8_t)s->seq[i]]; + s->seq[i] = seq_comp_table[t]; + } + if (l&1) s->seq[l>>1] = seq_comp_table[(uint8_t)s->seq[l>>1]]; + if (s->qual) + for (i = 0; i < l>>1; ++i) + t = s->qual[l - i - 1], s->qual[l - i - 1] = s->qual[i], s->qual[i] = t; +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/lib/minimap2/chain.c b/lib/minimap2/chain.c deleted file mode 100644 index a2f7ac5bf..000000000 --- a/lib/minimap2/chain.c +++ /dev/null @@ -1,164 +0,0 @@ -#include -#include -#include -#include "minimap.h" -#include "mmpriv.h" -#include "kalloc.h" - -static const char LogTable256[256] = { -#define LT(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n - -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, - LT(4), LT(5), LT(5), LT(6), LT(6), LT(6), LT(6), - 
LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7) -}; - -static inline int ilog2_32(uint32_t v) -{ - uint32_t t, tt; - if ((tt = v>>16)) return (t = tt>>8) ? 24 + LogTable256[t] : 16 + LogTable256[tt]; - return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v]; -} - -mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km) -{ // TODO: make sure this works when n has more than 32 bits - int32_t k, *f, *p, *t, *v, n_u, n_v; - int64_t i, j, st = 0; - uint64_t *u, *u2, sum_qspan = 0; - float avg_qspan; - mm128_t *b, *w; - - if (_u) *_u = 0, *n_u_ = 0; - if (n == 0 || a == 0) { - kfree(km, a); - return 0; - } - f = (int32_t*)kmalloc(km, n * 4); - p = (int32_t*)kmalloc(km, n * 4); - t = (int32_t*)kmalloc(km, n * 4); - v = (int32_t*)kmalloc(km, n * 4); - memset(t, 0, n * 4); - - for (i = 0; i < n; ++i) sum_qspan += a[i].y>>32&0xff; - avg_qspan = (float)sum_qspan / n; - - // fill the score and backtrack arrays - for (i = 0; i < n; ++i) { - uint64_t ri = a[i].x; - int64_t max_j = -1; - int32_t qi = (int32_t)a[i].y, q_span = a[i].y>>32&0xff; // NB: only 8 bits of span is used!!! - int32_t max_f = q_span, n_skip = 0, min_d; - int32_t sidi = (a[i].y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT; - while (st < i && ri > a[st].x + max_dist_x) ++st; - if (i - st > max_iter) st = i - max_iter; - for (j = i - 1; j >= st; --j) { - int64_t dr = ri - a[j].x; - int32_t dq = qi - (int32_t)a[j].y, dd, sc, log_dd, gap_cost; - int32_t sidj = (a[j].y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT; - if ((sidi == sidj && dr == 0) || dq <= 0) continue; // don't skip if an anchor is used by multiple segments; see below - if ((sidi == sidj && dq > max_dist_y) || dq > max_dist_x) continue; - dd = dr > dq? 
dr - dq : dq - dr; - if (sidi == sidj && dd > bw) continue; - if (n_segs > 1 && !is_cdna && sidi == sidj && dr > max_dist_y) continue; - min_d = dq < dr? dq : dr; - sc = min_d > q_span? q_span : dq < dr? dq : dr; - log_dd = dd? ilog2_32(dd) : 0; - gap_cost = 0; - if (is_cdna || sidi != sidj) { - int c_log, c_lin; - c_lin = (int)(dd * .01 * avg_qspan); - c_log = log_dd; - if (sidi != sidj && dr == 0) ++sc; // possibly due to overlapping paired ends; give a minor bonus - else if (dr > dq || sidi != sidj) gap_cost = c_lin < c_log? c_lin : c_log; - else gap_cost = c_lin + (c_log>>1); - } else gap_cost = (int)(dd * .01 * avg_qspan) + (log_dd>>1); - sc -= (int)((double)gap_cost * gap_scale + .499); - sc += f[j]; - if (sc > max_f) { - max_f = sc, max_j = j; - if (n_skip > 0) --n_skip; - } else if (t[j] == i) { - if (++n_skip > max_skip) - break; - } - if (p[j] >= 0) t[p[j]] = i; - } - f[i] = max_f, p[i] = max_j; - v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak - } - - // find the ending positions of chains - memset(t, 0, n * 4); - for (i = 0; i < n; ++i) - if (p[i] >= 0) t[p[i]] = 1; - for (i = n_u = 0; i < n; ++i) - if (t[i] == 0 && v[i] >= min_sc) - ++n_u; - if (n_u == 0) { - kfree(km, a); kfree(km, f); kfree(km, p); kfree(km, t); kfree(km, v); - return 0; - } - u = (uint64_t*)kmalloc(km, n_u * 8); - for (i = n_u = 0; i < n; ++i) { - if (t[i] == 0 && v[i] >= min_sc) { - j = i; - while (j >= 0 && f[j] < v[j]) j = p[j]; // find the peak that maximizes f[] - if (j < 0) j = i; // TODO: this should really be assert(j>=0) - u[n_u++] = (uint64_t)f[j] << 32 | j; - } - } - radix_sort_64(u, u + n_u); - for (i = 0; i < n_u>>1; ++i) { // reverse, s.t. 
the highest scoring chain is the first - uint64_t t = u[i]; - u[i] = u[n_u - i - 1], u[n_u - i - 1] = t; - } - - // backtrack - memset(t, 0, n * 4); - for (i = n_v = k = 0; i < n_u; ++i) { // starting from the highest score - int32_t n_v0 = n_v, k0 = k; - j = (int32_t)u[i]; - do { - v[n_v++] = j; - t[j] = 1; - j = p[j]; - } while (j >= 0 && t[j] == 0); - if (j < 0) { - if (n_v - n_v0 >= min_cnt) u[k++] = u[i]>>32<<32 | (n_v - n_v0); - } else if ((int32_t)(u[i]>>32) - f[j] >= min_sc) { - if (n_v - n_v0 >= min_cnt) u[k++] = ((u[i]>>32) - f[j]) << 32 | (n_v - n_v0); - } - if (k0 == k) n_v = n_v0; // no new chain added, reset - } - *n_u_ = n_u = k, *_u = u; // NB: note that u[] may not be sorted by score here - - // free temporary arrays - kfree(km, f); kfree(km, p); kfree(km, t); - - // write the result to b[] - b = (mm128_t*)kmalloc(km, n_v * sizeof(mm128_t)); - for (i = 0, k = 0; i < n_u; ++i) { - int32_t k0 = k, ni = (int32_t)u[i]; - for (j = 0; j < ni; ++j) - b[k] = a[v[k0 + (ni - j - 1)]], ++k; - } - kfree(km, v); - - // sort u[] and a[] by a[].x, such that adjacent chains may be joined (required by mm_join_long) - w = (mm128_t*)kmalloc(km, n_u * sizeof(mm128_t)); - for (i = k = 0; i < n_u; ++i) { - w[i].x = b[k].x, w[i].y = (uint64_t)k<<32|i; - k += (int32_t)u[i]; - } - radix_sort_128x(w, w + n_u); - u2 = (uint64_t*)kmalloc(km, n_u * 8); - for (i = k = 0; i < n_u; ++i) { - int32_t j = (int32_t)w[i].y, n = (int32_t)u[j]; - u2[i] = u[j]; - memcpy(&a[k], &b[w[i].y>>32], n * sizeof(mm128_t)); - k += n; - } - if (n_u) memcpy(u, u2, n_u * 8); - if (k) memcpy(b, a, k * sizeof(mm128_t)); // write _a_ to _b_ and deallocate _a_ because _a_ is oversized, sometimes a lot - kfree(km, a); kfree(km, w); kfree(km, u2); - return b; -} diff --git a/lib/minimap2/code_of_conduct.md b/lib/minimap2/code_of_conduct.md new file mode 100644 index 000000000..b7175c1fb --- /dev/null +++ b/lib/minimap2/code_of_conduct.md @@ -0,0 +1,30 @@ +## Contributor Code of Conduct + +As contributors 
and maintainers of this project, we pledge to respect all +people who contribute through reporting issues, posting feature requests, +updating documentation, submitting pull requests or patches, and other +activities. + +We are committed to making participation in this project a harassment-free +experience for everyone, regardless of level of experience, gender, gender +identity and expression, sexual orientation, disability, personal appearance, +body size, race, age, or religion. + +Examples of unacceptable behavior by participants include the use of sexual +language or imagery, derogatory comments or personal attacks, trolling, public +or private harassment, insults, or other unprofessional conduct. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct. Project maintainers or +contributors who do not follow the Code of Conduct may be removed from the +project team. + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by opening an issue or contacting the maintainer via email. + +This Code of Conduct is adapted from the [Contributor Covenant][cc], [version +1.0.0][v1]. 
+ +[cc]: http://contributor-covenant.org/ +[v1]: http://contributor-covenant.org/version/1/0/0/ diff --git a/lib/minimap2/cookbook 2.md b/lib/minimap2/cookbook 2.md new file mode 100644 index 000000000..8b3c1053a --- /dev/null +++ b/lib/minimap2/cookbook 2.md @@ -0,0 +1,243 @@ +## Table of Contents + +- [Introduction & Installation](#intro) +- [Mapping Genomic Reads](#map-reads) + * [Mapping long reads](#map-pb) + * [Mapping Illumina paired-end reads](#map-sr) + * [Evaluating mapping accuracy with simulated reads (for developers)](#mapeval) +- [Mapping Long RNA-seq Reads](#map-rna) + * [Mapping Nanopore 2D cDNA reads](#map-ont-cdna-2d) + * [Mapping Nanopore direct-RNA reads](#map-direct-rna) + * [Mapping PacBio Iso-seq reads](#map-iso-seq) +- [Full-Genome Alignment](#genome-aln) + * [Intra-species assembly alignment](#asm-to-ref) + * [Cross-species full-genome alignment](#x-species) + * [Eyeballing alignment](#view-aln) + * [Calling variants from assembly-to-reference alignment](#asm-var) + * [Constructing self-homology map](#hom-map) + * [Lift Over (for developers)](#liftover) +- [Read Overlap](#read-overlap) + * [Long-read overlap](#long-read-overlap) + * [Evaluating overlap sensitivity (for developers)](#ov-eval) + +## Introduction & Installation + +This cookbook walks you through a variety of applications of minimap2 and its +companion script `paftools.js`. All data here are freely available from the +minimap2 release page at version tag [v2.10][v2.10]. Some examples only work +with v2.10 or later. + +To acquire the data used in this cookbook and to install minimap2 and paftools, +please follow the command lines below: +```sh +# install minimap2 executables +curl -L https://github.com/lh3/minimap2/releases/download/v2.24/minimap2-2.24_x64-linux.tar.bz2 | tar jxf - +cp minimap2-2.24_x64-linux/{minimap2,k8,paftools.js} . 
# copy executables +export PATH="$PATH:"`pwd` # put the current directory on PATH +# download example datasets +curl -L https://github.com/lh3/minimap2/releases/download/v2.10/cookbook-data.tgz | tar zxf - +``` + +## Mapping Genomic Reads + +### Mapping long reads +```sh +minimap2 -ax map-pb -t4 ecoli_ref.fa ecoli_p6_25x_canu.fa > mapped.sam +``` +Alternatively, you can create a minimap2 index first and then map: +```sh +minimap2 -x map-pb -d ecoli-pb.mmi ecoli_ref.fa # create an index +minimap2 -ax map-pb ecoli-pb.mmi ecoli_p6_25x_canu.fa > mapped.sam +``` +This will save you a couple of minutes when you map against the human genome. +**HOWEVER**, key algorithm parameters such as the k-mer length and window +size can't be changed after indexing. Minimap2 will give you a warning if +parameters used in a pre-built index don't match parameters on the command +line. **Please always make sure you are using an intended pre-built index.** + +### Mapping Illumina paired-end reads: +```sh +minimap2 -ax sr -t4 ecoli_ref.fa ecoli_mason_1.fq ecoli_mason_2.fq > mapped-sr.sam +``` + +### Evaluating mapping accuracy with simulated reads (for developers) +```sh +minimap2 -ax sr ecoli_ref.fa ecoli_mason_1.fq ecoli_mason_2.fq | paftools.js mapeval - +``` +The output is: +``` +Q 60 19712 0 0.000000000 19712 +Q 0 282 219 0.010953286 19994 +U 6 +``` +where a `U`-line gives the number of unmapped reads (for SAM input only); a +`Q`-line gives: + +1. Mapping quality (mapQ) threshold +2. Number of mapped reads between this threshold and the previous mapQ threshold. +3. Number of wrong mappings in the same mapQ interval +4. Accumulative mapping error rate +5. Accumulative number of mappings + +For `paftools.js mapeval` to work, you need to encode the true read positions +in read names in the right format. For [pbsim2][pbsim] and [mason2][mason2], we +provide scripts to generate the right format. 
Simulated reads in this cookbook +were created with the following command lines: +```sh +# in the pbsim2 source code directory: +src/pbsim --depth 1 --length-min 5000 --length-mean 20000 --accuracy-mean 0.95 --hmm_model data/R94.model ../ecoli_ref.fa +paftools.js pbsim2fq ../ecoli_ref.fa.fai sd_0001.maf > ../ecoli_pbsim.fa + +# mason2 simulation +mason_simulator --illumina-prob-mismatch-scale 2.5 -ir ecoli_ref.fa -n 10000 -o tmp-l.fq -or tmp-r.fq -oa tmp.sam +paftools.js mason2fq tmp.sam | seqtk seq -1 > ecoli_mason_1.fq +paftools.js mason2fq tmp.sam | seqtk seq -2 > ecoli_mason_2.fq +``` + + + +## Mapping Long RNA-seq Reads + +### Mapping Nanopore 2D cDNA reads +```sh +minimap2 -ax splice SIRV_E2.fa SIRV_ont-cdna.fa > aln.sam +``` +You can compare the alignment to the true annotations with: +```sh +paftools.js junceval SIRV_E2C.gtf aln.sam +``` +It gives the percentage of introns found in the annotation. For SIRV data, it +is possible to achieve higher junction accuracy with +```sh +minimap2 -ax splice --splice-flank=no SIRV_E2.fa SIRV_ont-cdna.fa | paftools.js junceval SIRV_E2C.gtf +``` +This is because minimap2 models one additional evolutionarily conserved base +around a canonical junction, but SIRV doesn't honor this signal. Option +`--splice-flank=no` asks minimap2 not to model this additional base. + +In the output a tag `ts:A:+` indicates that the read strand is the same as the +transcript strand; `ts:A:-` indicates the read strand is opposite to the +transcript strand. This tag is inferred from the GT-AG signal and is thus only +available to spliced reads. + +### Mapping Nanopore direct-RNA reads +```sh +minimap2 -ax splice -k14 -uf SIRV_E2.fa SIRV_ont-drna.fa > aln.sam +``` +Direct-RNA reads are noisier, so we use a shorter k-mer for improved +sensitivity. Here, option `-uf` forces minimap2 to map reads to the forward +transcript strand only because direct-RNA reads are stranded. Again, applying +`--splice-flank=no` helps junction accuracy for SIRV data.
+ +### Mapping PacBio Iso-seq reads +```sh +minimap2 -ax splice -uf -C5 SIRV_E2.fa SIRV_iso-seq.fq > aln.sam +``` +Option `-C5` reduces the penalty on non-canonical splicing sites. It helps +to align such sites correctly for data with low error rate such as Iso-seq +reads and traditional cDNAs. On this example, minimap2 makes one junction +error. Applying `--splice-flank=no` fixes this alignment error. + +Note that the command line above is optimized for the final Iso-seq reads. +PacBio's Iso-seq pipeline produces intermediate sequences at varying quality. +For example, some intermediate reads are not stranded. For these reads, option +`-uf` will lead to more errors. Please revise the minimap2 command line +accordingly. + + + +## Full-Genome Alignment + +### Intra-species assembly alignment +```sh +# option "--cs" is recommended as paftools.js may need it +minimap2 -cx asm5 --cs ecoli_ref.fa ecoli_canu.fa > ecoli_canu.paf +``` +Here `ecoli_canu.fa` is the Canu assembly of `ecoli_p6_25x_canu.fa`. This +command line outputs alignments in the [PAF format][paf]. Use `-a` instead of +`-c` to get output in the SAM format. + +### Cross-species full-genome alignment +```sh +minimap2 -cx asm20 --cs ecoli_ref.fa ecoli_O104:H4.fa > ecoli_O104:H4.paf +sort -k6,6 -k8,8n ecoli_O104:H4.paf | paftools.js call -f ecoli_ref.fa -L10000 -l1000 - > out.vcf +``` +Minimap2 has three presets for full-genome alignment: "asm5" for sequence +divergence below 1%, "asm10" for divergence around a couple of percent and +"asm20" for divergence not more than 10%. In theory, with the right setting, +minimap2 should work for sequence pairs with sequence divergence up to ~15%, +but this has not been carefully evaluated. + +### Eyeballing alignment +```sh +# option "--cs" required; minimap2-r741 or higher required for the "asm20" preset +minimap2 -cx asm20 --cs ecoli_ref.fa ecoli_O104:H4.fa | paftools.js view - | less -S +``` +This prints the alignment in a BLAST-like format. 
+ +### Calling variants from assembly-to-reference alignment +```sh +# don't forget the "--cs" option; otherwise it doesn't work +minimap2 -cx asm5 --cs ecoli_ref.fa ecoli_canu.fa \ + | sort -k6,6 -k8,8n \ + | paftools.js call -f ecoli_ref.fa - > out.vcf +``` +Without option `-f`, `paftools.js call` outputs in a custom format. In this +format, lines starting with `R` give the regions covered by one contig only. +This information is not available in the VCF output. + +### Constructing self-homology map +```sh +minimap2 -DP -k19 -w19 -m200 ecoli_ref.fa ecoli_ref.fa > out.paf +``` +Option `-D` asks minimap2 to ignore anchors from perfect self match and `-P` +outputs all chains. For large genomes, we don't recommend performing base-level +alignment (with `-c`, `-a` or `--cs`) when `-P` is applied. This is because +base-alignment is slow and occasionally gives wrong alignments close to the +diagonal of a dotter plot. For E. coli, though, base-alignment is still fast. + +### Lift over (for developers) +```sh +minimap2 -cx asm5 --cs ecoli_ref.fa ecoli_canu.fa > ecoli_canu.paf +echo -e 'tig00000001\t200000\t300000' | paftools.js liftover ecoli_canu.paf - +``` +This lifts over a region on query sequences to one or multiple regions on +reference sequences. Note that this paftools.js command may not be efficient +enough to lift millions of regions. + + + +## Read Overlap + +### Long read overlap +```sh +# For pacbio reads: +minimap2 -x ava-pb ecoli_p6_25x_canu.fa ecoli_p6_25x_canu.fa > overlap.paf +# For Nanopore reads (ava-ont also works with PacBio but not as good): +minimap2 -x ava-ont -r 10000 ecoli_p6_25x_canu.fa ecoli_p6_25x_canu.fa > overlap.paf +# If you have miniasm installed: +miniasm -f ecoli_p6_25x_canu.fa overlap.paf > asm.gfa +``` +Here we explicitly applied `-r 10000`. We are considering setting this as the +default for the `ava-ont` mode as this seems to improve the contiguity for +nanopore read assembly (Loman, personal communication).
+ +*Minimap2 doesn't work well with short-read overlap.* + +### Evaluating overlap sensitivity (for developers) + +```sh +# read to reference mapping +minimap2 -cx map-pb ecoli_ref.fa ecoli_p6_25x_canu.fa > to-ref.paf +# evaluate overlap sensitivity +sort -k6,6 -k8,8n to-ref.paf | paftools.js ov-eval - overlap.paf +``` +You can see that for PacBio reads, minimap2 achieves higher overlap sensitivity +with `-x ava-pb` (99% vs 93% with `-x ava-ont`). + + + +[pbsim]: https://github.com/yukiteruono/pbsim2 +[mason2]: https://github.com/seqan/seqan/tree/master/apps/mason2 +[paf]: https://github.com/lh3/miniasm/blob/master/PAF.md +[v2.10]: https://github.com/lh3/minimap2/releases/tag/v2.10 diff --git a/lib/minimap2/cookbook.md b/lib/minimap2/cookbook.md index 8b3c1053a..c337d0503 100644 --- a/lib/minimap2/cookbook.md +++ b/lib/minimap2/cookbook.md @@ -31,8 +31,8 @@ To acquire the data used in this cookbook and to install minimap2 and paftools, please follow the command lines below: ```sh # install minimap2 executables -curl -L https://github.com/lh3/minimap2/releases/download/v2.24/minimap2-2.24_x64-linux.tar.bz2 | tar jxf - -cp minimap2-2.24_x64-linux/{minimap2,k8,paftools.js} . # copy executables +curl -L https://github.com/lh3/minimap2/releases/download/v2.27/minimap2-2.27_x64-linux.tar.bz2 | tar jxf - +cp minimap2-2.27_x64-linux/{minimap2,k8,paftools.js} . 
# copy executables export PATH="$PATH:"`pwd` # put the current directory on PATH # download example datasets curl -L https://github.com/lh3/minimap2/releases/download/v2.10/cookbook-data.tgz | tar zxf - diff --git a/lib/minimap2/format.c b/lib/minimap2/format.c index f2d489988..d00a96d40 100644 --- a/lib/minimap2/format.c +++ b/lib/minimap2/format.c @@ -139,10 +139,48 @@ int mm_write_sam_hdr(const mm_idx_t *idx, const char *rg, const char *ver, int a return ret; } -static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int no_iden, int write_tag) +static void write_indel_ds(kstring_t *str, int64_t len, const uint8_t *seq, int64_t ll, int64_t lr) // write an indel to ds; adapted from minigraph { - int i, q_off, t_off; - if (write_tag) mm_sprintf_lite(s, "\tcs:Z:"); + int64_t i; + if (ll + lr >= len) { + mm_sprintf_lite(str, "["); + for (i = 0; i < len; ++i) + mm_sprintf_lite(str, "%c", "acgtn"[seq[i]]); + mm_sprintf_lite(str, "]"); + } else { + int64_t k = 0; + if (ll > 0) { + mm_sprintf_lite(str, "["); + for (i = 0; i < ll; ++i) + mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]); + mm_sprintf_lite(str, "]"); + k += ll; + } + for (i = 0; i < len - lr - ll; ++i) + mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]); + k += len - lr - ll; + if (lr > 0) { + mm_sprintf_lite(str, "["); + for (i = 0; i < lr; ++i) + mm_sprintf_lite(str, "%c", "acgtn"[seq[k+i]]); + mm_sprintf_lite(str, "]"); + } + } +} + +static void write_cs_ds_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp, int no_iden, int is_ds, int write_tag) +{ + int i, q_off, t_off, q_len = 0, t_len = 0; + if (write_tag) mm_sprintf_lite(s, "\t%cs:Z:", is_ds? 
'd' : 'c'); + for (i = 0; i < (int)r->p->n_cigar; ++i) { + int op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4; + if (op == MM_CIGAR_MATCH || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH) + q_len += len, t_len += len; + else if (op == MM_CIGAR_INS) + q_len += len; + else if (op == MM_CIGAR_DEL || op == MM_CIGAR_N_SKIP) + t_len += len; + } for (i = q_off = t_off = 0; i < (int)r->p->n_cigar; ++i) { int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4; assert((op >= MM_CIGAR_MATCH && op <= MM_CIGAR_N_SKIP) || op == MM_CIGAR_EQ_MATCH || op == MM_CIGAR_X_MISMATCH); @@ -168,14 +206,42 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq } q_off += len, t_off += len; } else if (op == MM_CIGAR_INS) { - for (j = 0, tmp[len] = 0; j < len; ++j) - tmp[j] = "acgtn"[qseq[q_off + j]]; - mm_sprintf_lite(s, "+%s", tmp); + if (is_ds) { + int z, ll, lr, y = q_off; + for (z = 1; z <= len; ++z) + if (y - z < 0 || qseq[y + len - z] != qseq[y - z]) + break; + lr = z - 1; + for (z = 0; z < len; ++z) + if (y + len + z >= q_len || qseq[y + len + z] != qseq[y + z]) + break; + ll = z; + mm_sprintf_lite(s, "+"); + write_indel_ds(s, len, &qseq[y], ll, lr); + } else { + for (j = 0, tmp[len] = 0; j < len; ++j) + tmp[j] = "acgtn"[qseq[q_off + j]]; + mm_sprintf_lite(s, "+%s", tmp); + } q_off += len; } else if (op == MM_CIGAR_DEL) { - for (j = 0, tmp[len] = 0; j < len; ++j) - tmp[j] = "acgtn"[tseq[t_off + j]]; - mm_sprintf_lite(s, "-%s", tmp); + if (is_ds) { + int z, ll, lr, x = t_off; + for (z = 1; z <= len; ++z) + if (x - z < 0 || tseq[x + len - z] != tseq[x - z]) + break; + lr = z - 1; + for (z = 0; z < len; ++z) + if (x + len + z >= t_len || tseq[x + z] != tseq[x + len + z]) + break; + ll = z; + mm_sprintf_lite(s, "-"); + write_indel_ds(s, len, &tseq[x], ll, lr); + } else { + for (j = 0, tmp[len] = 0; j < len; ++j) + tmp[j] = "acgtn"[tseq[t_off + j]]; + mm_sprintf_lite(s, "-%s", tmp); + } t_off += len; } else { // intron assert(len >= 2); @@ -218,7 
+284,7 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq assert(t_off == r->re - r->rs && q_off == r->qe - r->qs); } -static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD, int write_tag, int is_qstrand) +static void write_cs_ds_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD, int is_ds, int write_tag, int is_qstrand) { extern unsigned char seq_nt4_table[256]; int i; @@ -244,8 +310,8 @@ static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_ } } } - if (is_MD) write_MD_core(s, tseq, qseq, r, tmp, write_tag); - else write_cs_core(s, tseq, qseq, r, tmp, no_iden, write_tag); + if (is_MD == 1) write_MD_core(s, tseq, qseq, r, tmp, write_tag); + else write_cs_ds_core(s, tseq, qseq, r, tmp, no_iden, is_ds, write_tag); kfree(km, qseq); kfree(km, tseq); kfree(km, tmp); } @@ -256,7 +322,7 @@ int mm_gen_cs_or_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, cons str.s = *buf, str.l = 0, str.m = *max_len; t.l_seq = strlen(seq); t.seq = (char*)seq; - write_cs_or_MD(km, &str, mi, &t, r, no_iden, is_MD, 0, is_qstrand); + write_cs_ds_or_MD(km, &str, mi, &t, r, no_iden, is_MD, 0, 0, is_qstrand); *max_len = str.m; *buf = str.s; return str.l; @@ -278,7 +344,7 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r) if (r->id == r->parent) type = r->inv? 'I' : 'P'; else type = r->inv? 
'i' : 'S'; if (r->p) { - mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->blen - r->mlen + r->p->n_ambi, r->p->dp_max, r->p->dp_score, r->p->n_ambi); + mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->blen - r->mlen + r->p->n_ambi, r->p->dp_max0, r->p->dp_score, r->p->n_ambi); if (r->p->trans_strand == 1 || r->p->trans_strand == 2) mm_sprintf_lite(s, "\tts:A:%c", "?+-?"[r->p->trans_strand]); } @@ -326,8 +392,8 @@ void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const for (k = 0; k < r->p->n_cigar; ++k) mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, MM_CIGAR_STR[r->p->cigar[k]&0xf]); } - if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD))) - write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1, !!(opt_flag&MM_F_QSTRAND)); + if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_DS|MM_F_OUT_MD))) + write_cs_ds_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, !!(opt_flag&MM_F_OUT_DS), 1, !!(opt_flag&MM_F_QSTRAND)); if ((opt_flag & MM_F_COPY_COMMENT) && t->comment) mm_sprintf_lite(s, "\t%s", t->comment); } @@ -370,7 +436,7 @@ static void write_sam_cigar(kstring_t *s, int sam_flag, int in_tag, int qlen, co clip_len[0] = r->rev? qlen - r->qe : r->qs; clip_len[1] = r->rev? r->qs : qlen - r->qe; if (in_tag) { - int clip_char = ((sam_flag&0x800 || (sam_flag&0x100 && opt_flag&MM_F_SECONDARY_SEQ)) && + int clip_char = (((sam_flag&0x800) || ((sam_flag&0x100) && (opt_flag&MM_F_SECONDARY_SEQ))) && !(opt_flag&MM_F_SOFTCLIP)) ? 
5 : 4; mm_sprintf_lite(s, "\tCG:B:I"); if (clip_len[0]) mm_sprintf_lite(s, ",%u", clip_len[0]<<4|clip_char); @@ -378,7 +444,7 @@ static void write_sam_cigar(kstring_t *s, int sam_flag, int in_tag, int qlen, co mm_sprintf_lite(s, ",%u", r->p->cigar[k]); if (clip_len[1]) mm_sprintf_lite(s, ",%u", clip_len[1]<<4|clip_char); } else { - int clip_char = ((sam_flag&0x800 || (sam_flag&0x100 && opt_flag&MM_F_SECONDARY_SEQ)) && + int clip_char = (((sam_flag&0x800) || ((sam_flag&0x100) && (opt_flag&MM_F_SECONDARY_SEQ))) && !(opt_flag&MM_F_SOFTCLIP)) ? 'H' : 'S'; assert(clip_len[0] < qlen && clip_len[1] < qlen); if (clip_len[0]) mm_sprintf_lite(s, "%d%c", clip_len[0], clip_char); @@ -535,8 +601,8 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se } } } - if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD))) - write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, 1, 0); + if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_DS|MM_F_OUT_MD))) + write_cs_ds_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD, !!(opt_flag&MM_F_OUT_DS), 1, 0); if (cigar_in_tag) write_sam_cigar(s, flag, 1, t->l_seq, r, opt_flag); } diff --git a/lib/minimap2/index.c b/lib/minimap2/index.c index c9bd01f65..cd8b40e4d 100644 --- a/lib/minimap2/index.c +++ b/lib/minimap2/index.c @@ -192,6 +192,7 @@ int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f) if (f <= 0.) 
return INT32_MAX; for (i = 0; i < 1<b; ++i) if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h); + if (n == 0) return INT32_MAX; a = (uint32_t*)malloc(n * 4); for (i = n = 0; i < 1<b; ++i) { idxhash_t *h = (idxhash_t*)mi->B[i].h; @@ -358,7 +359,7 @@ static void *worker_pipeline(void *shared, int step, void *in) for (i = 0; i < s->n_seq; ++i) { mm_bseq1_t *t = &s->seq[i]; if (t->l_seq > 0) - mm_sketch2(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->flag&MM_I_HPC, p->mi->flag&MM_I_SYNCMER, &s->a); + mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->flag&MM_I_HPC, &s->a); else if (mm_verbose >= 2) fprintf(stderr, "[WARNING] the length database sequence '%s' is 0\n", t->name); free(t->seq); free(t->name); @@ -446,7 +447,7 @@ mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const cha sum_len += p->len; if (p->len > 0) { a.n = 0; - mm_sketch2(0, s, p->len, w, k, i, is_hpc, 0, &a); // TODO: mm_idx_str() doesn't support syncmer + mm_sketch(0, s, p->len, w, k, i, is_hpc, &a); mm_idx_add(mi, a.n, a.a); } } diff --git a/lib/minimap2/kalloc 2.c b/lib/minimap2/kalloc 2.c new file mode 100644 index 000000000..849955290 --- /dev/null +++ b/lib/minimap2/kalloc 2.c @@ -0,0 +1,205 @@ +#include +#include +#include +#include "kalloc.h" + +/* In kalloc, a *core* is a large chunk of contiguous memory. Each core is + * associated with a master header, which keeps the size of the current core + * and the pointer to next core. Kalloc allocates small *blocks* of memory from + * the cores and organizes free memory blocks in a circular single-linked list. + * + * In the following diagram, "@" stands for the header of a free block (of type + * header_t), "#" for the header of an allocated block (of type size_t), "-" + * for free memory, and "+" for allocated memory. + * + * master This region is core 1. master This region is core 2. 
+ * | | + * *@-------#++++++#++++++++++++@-------- *@----------#++++++++++++#+++++++@------------ + * | | | | + * p=p->ptr->ptr->ptr->ptr p->ptr p->ptr->ptr p->ptr->ptr->ptr + */ +typedef struct header_t { + size_t size; + struct header_t *ptr; +} header_t; + +typedef struct { + void *par; + size_t min_core_size; + header_t base, *loop_head, *core_head; /* base is a zero-sized block always kept in the loop */ +} kmem_t; + +static void panic(const char *s) +{ + fprintf(stderr, "%s\n", s); + abort(); +} + +void *km_init2(void *km_par, size_t min_core_size) +{ + kmem_t *km; + km = (kmem_t*)kcalloc(km_par, 1, sizeof(kmem_t)); + km->par = km_par; + km->min_core_size = min_core_size > 0? min_core_size : 0x80000; + return (void*)km; +} + +void *km_init(void) { return km_init2(0, 0); } + +void km_destroy(void *_km) +{ + kmem_t *km = (kmem_t*)_km; + void *km_par; + header_t *p, *q; + if (km == NULL) return; + km_par = km->par; + for (p = km->core_head; p != NULL;) { + q = p->ptr; + kfree(km_par, p); + p = q; + } + kfree(km_par, km); +} + +static header_t *morecore(kmem_t *km, size_t nu) +{ + header_t *q; + size_t bytes, *p; + nu = (nu + 1 + (km->min_core_size - 1)) / km->min_core_size * km->min_core_size; /* the first +1 for core header */ + bytes = nu * sizeof(header_t); + q = (header_t*)kmalloc(km->par, bytes); + if (!q) panic("[morecore] insufficient memory"); + q->ptr = km->core_head, q->size = nu, km->core_head = q; + p = (size_t*)(q + 1); + *p = nu - 1; /* the size of the free block; -1 because the first unit is used for the core header */ + kfree(km, p + 1); /* initialize the new "core"; NB: the core header is not looped. 
*/ + return km->loop_head; +} + +void kfree(void *_km, void *ap) /* kfree() also adds a new core to the circular list */ +{ + header_t *p, *q; + kmem_t *km = (kmem_t*)_km; + + if (!ap) return; + if (km == NULL) { + free(ap); + return; + } + p = (header_t*)((size_t*)ap - 1); + p->size = *((size_t*)ap - 1); + /* Find the pointer that points to the block to be freed. The following loop can stop on two conditions: + * + * a) "p>q && pptr": @------#++++++++#+++++++@------- @---------------#+++++++@------- + * (can also be in | | | -> | | + * two cores) q p q->ptr q q->ptr + * + * @-------- #+++++++++@-------- @-------- @------------------ + * | | | -> | | + * q p q->ptr q q->ptr + * + * b) "q>=q->ptr && (p>q || pptr)": @-------#+++++ @--------#+++++++ @-------#+++++ @---------------- + * | | | -> | | + * q->ptr q p q->ptr q + * + * #+++++++@----- #++++++++@------- @------------- #++++++++@------- + * | | | -> | | + * p q->ptr q q->ptr q + */ + for (q = km->loop_head; !(p > q && p < q->ptr); q = q->ptr) + if (q >= q->ptr && (p > q || p < q->ptr)) break; + if (p + p->size == q->ptr) { /* two adjacent blocks, merge p and q->ptr (the 2nd and 4th cases) */ + p->size += q->ptr->size; + p->ptr = q->ptr->ptr; + } else if (p + p->size > q->ptr && q->ptr >= p) { + panic("[kfree] The end of the allocated block enters a free block."); + } else p->ptr = q->ptr; /* backup q->ptr */ + + if (q + q->size == p) { /* two adjacent blocks, merge q and p (the other two cases) */ + q->size += p->size; + q->ptr = p->ptr; + km->loop_head = q; + } else if (q + q->size > p && p >= q) { + panic("[kfree] The end of a free block enters the allocated block."); + } else km->loop_head = p, q->ptr = p; /* in two cores, cannot be merged; create a new block in the list */ +} + +void *kmalloc(void *_km, size_t n_bytes) +{ + kmem_t *km = (kmem_t*)_km; + size_t n_units; + header_t *p, *q; + + if (n_bytes == 0) return 0; + if (km == NULL) return malloc(n_bytes); + n_units = (n_bytes + sizeof(size_t) + 
sizeof(header_t) - 1) / sizeof(header_t); /* header+n_bytes requires at least this number of units */ + + if (!(q = km->loop_head)) /* the first time when kmalloc() is called, intialize it */ + q = km->loop_head = km->base.ptr = &km->base; + for (p = q->ptr;; q = p, p = p->ptr) { /* search for a suitable block */ + if (p->size >= n_units) { /* p->size if the size of current block. This line means the current block is large enough. */ + if (p->size == n_units) q->ptr = p->ptr; /* no need to split the block */ + else { /* split the block. NB: memory is allocated at the end of the block! */ + p->size -= n_units; /* reduce the size of the free block */ + p += p->size; /* p points to the allocated block */ + *(size_t*)p = n_units; /* set the size */ + } + km->loop_head = q; /* set the end of chain */ + return (size_t*)p + 1; + } + if (p == km->loop_head) { /* then ask for more "cores" */ + if ((p = morecore(km, n_units)) == 0) return 0; + } + } +} + +void *kcalloc(void *_km, size_t count, size_t size) +{ + kmem_t *km = (kmem_t*)_km; + void *p; + if (size == 0 || count == 0) return 0; + if (km == NULL) return calloc(count, size); + p = kmalloc(km, count * size); + memset(p, 0, count * size); + return p; +} + +void *krealloc(void *_km, void *ap, size_t n_bytes) // TODO: this can be made more efficient in principle +{ + kmem_t *km = (kmem_t*)_km; + size_t cap, *p, *q; + + if (n_bytes == 0) { + kfree(km, ap); return 0; + } + if (km == NULL) return realloc(ap, n_bytes); + if (ap == NULL) return kmalloc(km, n_bytes); + p = (size_t*)ap - 1; + cap = (*p) * sizeof(header_t) - sizeof(size_t); + if (cap >= n_bytes) return ap; /* TODO: this prevents shrinking */ + q = (size_t*)kmalloc(km, n_bytes); + memcpy(q, ap, cap); + kfree(km, ap); + return q; +} + +void km_stat(const void *_km, km_stat_t *s) +{ + kmem_t *km = (kmem_t*)_km; + header_t *p; + memset(s, 0, sizeof(km_stat_t)); + if (km == NULL || km->loop_head == NULL) return; + for (p = km->loop_head;; p = p->ptr) { + 
s->available += p->size * sizeof(header_t); + if (p->size != 0) ++s->n_blocks; /* &kmem_t::base is always one of the cores. It is zero-sized. */ + if (p->ptr > p && p + p->size > p->ptr) + panic("[km_stat] The end of a free block enters another free block."); + if (p->ptr == km->loop_head) break; + } + for (p = km->core_head; p != NULL; p = p->ptr) { + size_t size = p->size * sizeof(header_t); + ++s->n_cores; + s->capacity += size; + s->largest = s->largest > size? s->largest : size; + } +} diff --git a/lib/minimap2/kalloc 2.h b/lib/minimap2/kalloc 2.h new file mode 100644 index 000000000..93bff5e25 --- /dev/null +++ b/lib/minimap2/kalloc 2.h @@ -0,0 +1,76 @@ +#ifndef _KALLOC_H_ +#define _KALLOC_H_ + +#include /* for size_t */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + size_t capacity, available, n_blocks, n_cores, largest; +} km_stat_t; + +void *kmalloc(void *km, size_t size); +void *krealloc(void *km, void *ptr, size_t size); +void *kcalloc(void *km, size_t count, size_t size); +void kfree(void *km, void *ptr); + +void *km_init(void); +void *km_init2(void *km_par, size_t min_core_size); +void km_destroy(void *km); +void km_stat(const void *_km, km_stat_t *s); + +#ifdef __cplusplus +} +#endif + +#define KMALLOC(km, ptr, len) ((ptr) = (__typeof__(ptr))kmalloc((km), (len) * sizeof(*(ptr)))) +#define KCALLOC(km, ptr, len) ((ptr) = (__typeof__(ptr))kcalloc((km), (len), sizeof(*(ptr)))) +#define KREALLOC(km, ptr, len) ((ptr) = (__typeof__(ptr))krealloc((km), (ptr), (len) * sizeof(*(ptr)))) + +#define KEXPAND(km, a, m) do { \ + (m) = (m) >= 4? 
(m) + ((m)>>1) : 16; \ + KREALLOC((km), (a), (m)); \ + } while (0) + +#ifndef klib_unused +#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) +#define klib_unused __attribute__ ((__unused__)) +#else +#define klib_unused +#endif +#endif /* klib_unused */ + +#define KALLOC_POOL_INIT2(SCOPE, name, kmptype_t) \ + typedef struct { \ + size_t cnt, n, max; \ + kmptype_t **buf; \ + void *km; \ + } kmp_##name##_t; \ + SCOPE kmp_##name##_t *kmp_init_##name(void *km) { \ + kmp_##name##_t *mp; \ + KCALLOC(km, mp, 1); \ + mp->km = km; \ + return mp; \ + } \ + SCOPE void kmp_destroy_##name(kmp_##name##_t *mp) { \ + size_t k; \ + for (k = 0; k < mp->n; ++k) kfree(mp->km, mp->buf[k]); \ + kfree(mp->km, mp->buf); kfree(mp->km, mp); \ + } \ + SCOPE kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { \ + ++mp->cnt; \ + if (mp->n == 0) return (kmptype_t*)kcalloc(mp->km, 1, sizeof(kmptype_t)); \ + return mp->buf[--mp->n]; \ + } \ + SCOPE void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \ + --mp->cnt; \ + if (mp->n == mp->max) KEXPAND(mp->km, mp->buf, mp->max); \ + mp->buf[mp->n++] = p; \ + } + +#define KALLOC_POOL_INIT(name, kmptype_t) \ + KALLOC_POOL_INIT2(static inline klib_unused, name, kmptype_t) + +#endif diff --git a/lib/minimap2/kalloc.c b/lib/minimap2/kalloc.c index 849955290..f5de41ad6 100644 --- a/lib/minimap2/kalloc.c +++ b/lib/minimap2/kalloc.c @@ -40,7 +40,8 @@ void *km_init2(void *km_par, size_t min_core_size) kmem_t *km; km = (kmem_t*)kcalloc(km_par, 1, sizeof(kmem_t)); km->par = km_par; - km->min_core_size = min_core_size > 0? min_core_size : 0x80000; + if (km_par) km->min_core_size = min_core_size > 0? min_core_size : ((kmem_t*)km_par)->min_core_size - 2; + else km->min_core_size = min_core_size > 0? 
min_core_size : 0x80000; return (void*)km; } @@ -183,6 +184,16 @@ void *krealloc(void *_km, void *ap, size_t n_bytes) // TODO: this can be made mo return q; } +void *krelocate(void *km, void *ap, size_t n_bytes) +{ + void *p; + if (km == 0 || ap == 0) return ap; + p = kmalloc(km, n_bytes); + memcpy(p, ap, n_bytes); + kfree(km, ap); + return p; +} + void km_stat(const void *_km, km_stat_t *s) { kmem_t *km = (kmem_t*)_km; @@ -203,3 +214,11 @@ void km_stat(const void *_km, km_stat_t *s) s->largest = s->largest > size? s->largest : size; } } + +void km_stat_print(const void *km) +{ + km_stat_t st; + km_stat(km, &st); + fprintf(stderr, "[km_stat] cap=%ld, avail=%ld, largest=%ld, n_core=%ld, n_block=%ld\n", + st.capacity, st.available, st.largest, st.n_blocks, st.n_cores); +} diff --git a/lib/minimap2/kalloc.h b/lib/minimap2/kalloc.h index 93bff5e25..437867238 100644 --- a/lib/minimap2/kalloc.h +++ b/lib/minimap2/kalloc.h @@ -13,6 +13,7 @@ typedef struct { void *kmalloc(void *km, size_t size); void *krealloc(void *km, void *ptr, size_t size); +void *krelocate(void *km, void *ap, size_t n_bytes); void *kcalloc(void *km, size_t count, size_t size); void kfree(void *km, void *ptr); @@ -20,11 +21,21 @@ void *km_init(void); void *km_init2(void *km_par, size_t min_core_size); void km_destroy(void *km); void km_stat(const void *_km, km_stat_t *s); +void km_stat_print(const void *km); #ifdef __cplusplus } #endif +#define Kmalloc(km, type, cnt) ((type*)kmalloc((km), (cnt) * sizeof(type))) +#define Kcalloc(km, type, cnt) ((type*)kcalloc((km), (cnt), sizeof(type))) +#define Krealloc(km, type, ptr, cnt) ((type*)krealloc((km), (ptr), (cnt) * sizeof(type))) + +#define Kexpand(km, type, a, m) do { \ + (m) = (m) >= 4? 
(m) + ((m)>>1) : 16; \ + (a) = Krealloc(km, type, (a), (m)); \ + } while (0) + #define KMALLOC(km, ptr, len) ((ptr) = (__typeof__(ptr))kmalloc((km), (len) * sizeof(*(ptr)))) #define KCALLOC(km, ptr, len) ((ptr) = (__typeof__(ptr))kcalloc((km), (len), sizeof(*(ptr)))) #define KREALLOC(km, ptr, len) ((ptr) = (__typeof__(ptr))krealloc((km), (ptr), (len) * sizeof(*(ptr)))) @@ -50,7 +61,7 @@ void km_stat(const void *_km, km_stat_t *s); } kmp_##name##_t; \ SCOPE kmp_##name##_t *kmp_init_##name(void *km) { \ kmp_##name##_t *mp; \ - KCALLOC(km, mp, 1); \ + mp = Kcalloc(km, kmp_##name##_t, 1); \ mp->km = km; \ return mp; \ } \ @@ -66,7 +77,7 @@ void km_stat(const void *_km, km_stat_t *s); } \ SCOPE void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \ --mp->cnt; \ - if (mp->n == mp->max) KEXPAND(mp->km, mp->buf, mp->max); \ + if (mp->n == mp->max) Kexpand(mp->km, kmptype_t*, mp->buf, mp->max); \ mp->buf[mp->n++] = p; \ } diff --git a/lib/minimap2/kdq 2.h b/lib/minimap2/kdq 2.h new file mode 100644 index 000000000..8ae5c971d --- /dev/null +++ b/lib/minimap2/kdq 2.h @@ -0,0 +1,132 @@ +#ifndef __AC_KDQ_H +#define __AC_KDQ_H + +#include +#include +#include +#include "kalloc.h" + +#define __KDQ_TYPE(type) \ + typedef struct { \ + uint64_t front:58, bits:6, count, mask; \ + type *a; \ + void *km; \ + } kdq_##type##_t; + +#define kdq_t(type) kdq_##type##_t +#define kdq_size(q) ((q)->count) +#define kdq_first(q) ((q)->a[(q)->front]) +#define kdq_last(q) ((q)->a[((q)->front + (q)->count - 1) & (q)->mask]) +#define kdq_at(q, i) ((q)->a[((q)->front + (i)) & (q)->mask]) + +#define __KDQ_IMPL(type, SCOPE) \ + SCOPE kdq_##type##_t *kdq_init_##type(void *km) \ + { \ + kdq_##type##_t *q; \ + q = (kdq_##type##_t*)kcalloc(km, 1, sizeof(kdq_##type##_t)); \ + q->bits = 2, q->mask = (1ULL<bits) - 1; \ + q->a = (type*)kmalloc(km, (1<bits) * sizeof(type)); \ + q->km = km; \ + return q; \ + } \ + SCOPE void kdq_destroy_##type(kdq_##type##_t *q) \ + { \ + if (q == 0) return; \ + kfree(q->km, 
q->a); kfree(q->km, q); \ + } \ + SCOPE int kdq_resize_##type(kdq_##type##_t *q, int new_bits) \ + { \ + size_t new_size = 1ULL<bits; \ + if (new_size < q->count) { /* not big enough */ \ + int i; \ + for (i = 0; i < 64; ++i) \ + if (1ULL< q->count) break; \ + new_bits = i, new_size = 1ULL<bits) return q->bits; /* unchanged */ \ + if (new_bits > q->bits) q->a = (type*)krealloc(q->km, q->a, (1ULL<front + q->count <= old_size) { /* unwrapped */ \ + if (q->front + q->count > new_size) /* only happens for shrinking */ \ + memmove(q->a, q->a + new_size, (q->front + q->count - new_size) * sizeof(type)); \ + } else { /* wrapped */ \ + memmove(q->a + (new_size - (old_size - q->front)), q->a + q->front, (old_size - q->front) * sizeof(type)); \ + q->front = new_size - (old_size - q->front); \ + } \ + q->bits = new_bits, q->mask = (1ULL<bits) - 1; \ + if (new_bits < q->bits) q->a = (type*)krealloc(q->km, q->a, (1ULL<bits; \ + } \ + SCOPE type *kdq_pushp_##type(kdq_##type##_t *q) \ + { \ + if (q->count == 1ULL<bits) kdq_resize_##type(q, q->bits + 1); \ + return &q->a[((q->count++) + q->front) & (q)->mask]; \ + } \ + SCOPE void kdq_push_##type(kdq_##type##_t *q, type v) \ + { \ + if (q->count == 1ULL<bits) kdq_resize_##type(q, q->bits + 1); \ + q->a[((q->count++) + q->front) & (q)->mask] = v; \ + } \ + SCOPE type *kdq_unshiftp_##type(kdq_##type##_t *q) \ + { \ + if (q->count == 1ULL<bits) kdq_resize_##type(q, q->bits + 1); \ + ++q->count; \ + q->front = q->front? q->front - 1 : (1ULL<bits) - 1; \ + return &q->a[q->front]; \ + } \ + SCOPE void kdq_unshift_##type(kdq_##type##_t *q, type v) \ + { \ + type *p; \ + p = kdq_unshiftp_##type(q); \ + *p = v; \ + } \ + SCOPE type *kdq_pop_##type(kdq_##type##_t *q) \ + { \ + return q->count? 
&q->a[((--q->count) + q->front) & q->mask] : 0; \ + } \ + SCOPE type *kdq_shift_##type(kdq_##type##_t *q) \ + { \ + type *d = 0; \ + if (q->count == 0) return 0; \ + d = &q->a[q->front++]; \ + q->front &= q->mask; \ + --q->count; \ + return d; \ + } + +#define KDQ_INIT2(type, SCOPE) \ + __KDQ_TYPE(type) \ + __KDQ_IMPL(type, SCOPE) + +#ifndef klib_unused +#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) +#define klib_unused __attribute__ ((__unused__)) +#else +#define klib_unused +#endif +#endif /* klib_unused */ + +#define KDQ_INIT(type) KDQ_INIT2(type, static inline klib_unused) + +#define KDQ_DECLARE(type) \ + __KDQ_TYPE(type) \ + kdq_##type##_t *kdq_init_##type(); \ + void kdq_destroy_##type(kdq_##type##_t *q); \ + int kdq_resize_##type(kdq_##type##_t *q, int new_bits); \ + type *kdq_pushp_##type(kdq_##type##_t *q); \ + void kdq_push_##type(kdq_##type##_t *q, type v); \ + type *kdq_unshiftp_##type(kdq_##type##_t *q); \ + void kdq_unshift_##type(kdq_##type##_t *q, type v); \ + type *kdq_pop_##type(kdq_##type##_t *q); \ + type *kdq_shift_##type(kdq_##type##_t *q); + +#define kdq_init(type, km) kdq_init_##type(km) +#define kdq_destroy(type, q) kdq_destroy_##type(q) +#define kdq_resize(type, q, new_bits) kdq_resize_##type(q, new_bits) +#define kdq_pushp(type, q) kdq_pushp_##type(q) +#define kdq_push(type, q, v) kdq_push_##type(q, v) +#define kdq_pop(type, q) kdq_pop_##type(q) +#define kdq_unshiftp(type, q) kdq_unshiftp_##type(q) +#define kdq_unshift(type, q, v) kdq_unshift_##type(q, v) +#define kdq_shift(type, q) kdq_shift_##type(q) + +#endif diff --git a/lib/minimap2/ketopt 2.h b/lib/minimap2/ketopt 2.h new file mode 100644 index 000000000..8ae181179 --- /dev/null +++ b/lib/minimap2/ketopt 2.h @@ -0,0 +1,120 @@ +#ifndef KETOPT_H +#define KETOPT_H + +#include /* for strchr() and strncmp() */ + +#define ko_no_argument 0 +#define ko_required_argument 1 +#define ko_optional_argument 2 + +typedef struct { + int ind; /* equivalent to 
optind */ + int opt; /* equivalent to optopt */ + char *arg; /* equivalent to optarg */ + int longidx; /* index of a long option; or -1 if short */ + /* private variables not intended for external uses */ + int i, pos, n_args; +} ketopt_t; + +typedef struct { + char *name; /* long option name without the leading "--" */ + int has_arg; /* ketopt() only tests this against 1 (ko_required_argument) */ + int val; /* value ketopt() returns when this option matches */ +} ko_longopt_t; + +static ketopt_t KETOPT_INIT = { 1, 0, 0, -1, 1, 0, 0 }; /* ind=1, opt=0, arg=0, longidx=-1, i=1, pos=0, n_args=0 */ + +static void ketopt_permute(char *argv[], int j, int n) /* move argv[j] over n elements to the left */ +{ + int k; + char *p = argv[j]; + for (k = 0; k < n; ++k) + argv[j - k] = argv[j - k - 1]; + argv[j - k] = p; +} + +/** + * Parse command-line options and arguments + * + * This function has a similar interface to GNU's getopt_long(). Each call + * parses one option and returns the option name. s->arg points to the option + * argument if present. The function returns -1 when all command-line arguments + * are parsed. In this case, s->ind is the index of the first non-option + * argument. + * + * @param s status; shall be initialized to KETOPT_INIT on the first call + * @param argc length of argv[] + * @param argv list of command-line arguments; argv[0] is ignored + * @param permute non-zero to move options ahead of non-option arguments + * @param ostr option string + * @param longopts long options + * + * @return ASCII for a short option; ko_longopt_t::val for a long option; -1 if + * argv[] is fully processed; '?'
for an unknown option or an ambiguous + * long option; ':' if an option argument is missing + */ +static int ketopt(ketopt_t *s, int argc, char *argv[], int permute, const char *ostr, const ko_longopt_t *longopts) +{ + int opt = -1, i0, j; + if (permute) { /* skip non-option arguments, counting them in s->n_args */ + while (s->i < argc && (argv[s->i][0] != '-' || argv[s->i][1] == '\0')) + ++s->i, ++s->n_args; + } + s->arg = 0, s->longidx = -1, i0 = s->i; + if (s->i >= argc || argv[s->i][0] != '-' || argv[s->i][1] == '\0') { /* end of argv[], or a non-option argument */ + s->ind = s->i - s->n_args; + return -1; + } + if (argv[s->i][0] == '-' && argv[s->i][1] == '-') { /* "--" or a long option */ + if (argv[s->i][2] == '\0') { /* a bare "--" */ + ketopt_permute(argv, s->i, s->n_args); + ++s->i, s->ind = s->i - s->n_args; + return -1; + } + s->opt = 0, opt = '?', s->pos = -1; + if (longopts) { /* parse long options */ + int k, n_exact = 0, n_partial = 0; + const ko_longopt_t *o = 0, *o_exact = 0, *o_partial = 0; + for (j = 2; argv[s->i][j] != '\0' && argv[s->i][j] != '='; ++j) {} /* find the end of the option name */ + for (k = 0; longopts[k].name != 0; ++k) /* longopts[] ends at the first entry whose name is 0 */ + if (strncmp(&argv[s->i][2], longopts[k].name, j - 2) == 0) { + if (longopts[k].name[j - 2] == 0) ++n_exact, o_exact = &longopts[k]; + else ++n_partial, o_partial = &longopts[k]; + } + if (n_exact > 1 || (n_exact == 0 && n_partial > 1)) return '?'; /* ambiguous; NOTE(review): returns without advancing s->i, so the next call sees the same argument */ + o = n_exact == 1? o_exact : n_partial == 1?
o_partial : 0; + if (o) { + s->opt = opt = o->val, s->longidx = o - longopts; + if (argv[s->i][j] == '=') s->arg = &argv[s->i][j + 1]; /* "--name=value": the argument follows '=' */ + if (o->has_arg == 1 && argv[s->i][j] == '\0') { /* required argument is the next argv[] element */ + if (s->i < argc - 1) s->arg = argv[++s->i]; + else opt = ':'; /* missing option argument */ + } + } + } + } else { /* a short option */ + char *p; + if (s->pos == 0) s->pos = 1; /* index 0 holds the leading '-' */ + opt = s->opt = argv[s->i][s->pos++]; + p = strchr((char*)ostr, opt); + if (p == 0) { + opt = '?'; /* unknown option */ + } else if (p[1] == ':') { /* ':' after the letter in ostr: the option takes an argument */ + if (argv[s->i][s->pos] == 0) { + if (s->i < argc - 1) s->arg = argv[++s->i]; + else opt = ':'; /* missing option argument */ + } else s->arg = &argv[s->i][s->pos]; + s->pos = -1; + } + } + if (s->pos < 0 || argv[s->i][s->pos] == 0) { /* finished with this argv[] element */ + ++s->i, s->pos = 0; + if (s->n_args > 0) /* permute */ + for (j = i0; j < s->i; ++j) + ketopt_permute(argv, j, s->n_args); + } + s->ind = s->i - s->n_args; + return opt; +} + +#endif diff --git a/lib/minimap2/khash 2.h b/lib/minimap2/khash 2.h new file mode 100644 index 000000000..6373a9358 --- /dev/null +++ b/lib/minimap2/khash 2.h @@ -0,0 +1,615 @@ +/* The MIT License + + Copyright (c) 2008, 2009, 2011 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + An example: + +#include "khash.h" +KHASH_MAP_INIT_INT(32, char) +int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; +} +*/ + +/* + 2013-05-02 (0.2.8): + + * Use quadratic probing. When the capacity is power of 2, stepping function + i*(i+1)/2 guarantees to traverse each bucket. It is better than double + hashing on cache performance and is more robust than linear probing. + + In theory, double hashing should be more robust than quadratic probing. + However, my implementation is probably not for large hash tables, because + the second hash function is closely tied to the first hash function, + which reduce the effectiveness of double hashing. + + Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php + + 2011-12-29 (0.2.7): + + * Minor code clean up; no actual effect. + + 2011-09-16 (0.2.6): + + * The capacity is a power of 2. This seems to dramatically improve the + speed for simple keys. Thank Zilong Tan for the suggestion. Reference: + + - http://code.google.com/p/ulib/ + - http://nothings.org/computer/judy/ + + * Allow to optionally use linear probing which usually has better + performance for random input. Double hashing is still the default as it + is more robust to certain non-random input. + + * Added Wang's integer hash function (not used by default). This hash + function is more robust to certain non-random input. 
+ + 2011-02-14 (0.2.5): + + * Allow to declare global functions. + + 2009-09-26 (0.2.4): + + * Improve portability + + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. + + 2008-08-31 (0.1.1): + + * Added destructor +*/ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. + */ + +#define AC_VERSION_KHASH_H "0.2.8" + +#include +#include +#include +#include "kalloc.h" + +/* compiler specific configuration */ + +#if UINT_MAX == 0xffffffffu +typedef unsigned int khint32_t; +#elif ULONG_MAX == 0xffffffffu +typedef unsigned long khint32_t; +#endif + +#if ULONG_MAX == ULLONG_MAX +typedef unsigned long khint64_t; +#else +typedef unsigned long long khint64_t; +#endif + +#ifndef kh_inline +#ifdef _MSC_VER +#define kh_inline __inline +#else +#define kh_inline inline +#endif +#endif /* kh_inline */ + +#ifndef klib_unused +#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) +#define klib_unused __attribute__ ((__unused__)) +#else +#define klib_unused +#endif +#endif /* klib_unused */ + +typedef khint32_t khint_t; +typedef khint_t khiter_t; + +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) /* 2 flag bits per bucket, 16 buckets per flag word; bit 1 = empty */ +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) /* bit 0 = deleted */ +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) /* non-zero if empty or deleted */ +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) + +#define __ac_fsize(m) ((m) < 16?
1 : (m)>>4) + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) /* round x up to a power of 2, in place */ +#endif + +static const double __ac_HASH_UPPER = 0.77; /* kh_resize() sets upper_bound = n_buckets * this factor */ + +#define __KHASH_TYPE(name, khkey_t, khval_t) \ + typedef struct kh_##name##_s { \ + khint_t n_buckets, size, n_occupied, upper_bound; /* size: live keys; n_occupied: live + deleted buckets */ \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; + +#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ + extern kh_##name##_t *kh_init_##name(void); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khint_t x); + +#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + SCOPE kh_##name##_t *kh_init_##name(void) { \ + return (kh_##name##_t*)kcalloc(0, 1, sizeof(kh_##name##_t)); \ + } \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + kfree(0, (void *)h->keys); kfree(0, h->flags); \ + kfree(0, (void *)h->vals); \ + kfree(0, h); \ + } \ + } \ + SCOPE void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); /* 0xaa sets bit 1 (empty) of every 2-bit flag */ \ + h->size = h->n_occupied = 0; \ + } \ + } \ + SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t k, i, last, mask, step = 0; \ + mask = h->n_buckets - 1; \ + k = __hash_func(key); i = k & mask; \ + last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + i = (i + (++step)) & mask; /* the step grows by 1 on every probe */ \ + if (i == last) return h->n_buckets; /* back at the first bucket: key is absent */ \ + } \ + return __ac_iseither(h->flags, i)?
h->n_buckets : i; \ + } else return 0; /* n_buckets == 0 here, so 0 equals kh_end(h) */ \ + } \ + SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ + khint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + kroundup32(new_n_buckets); /* the bucket count is kept a power of 2 */ \ + if (new_n_buckets < 4) new_n_buckets = 4; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ + new_flags = (khint32_t*)kmalloc(0, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (!new_flags) return -1; \ + memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ + khkey_t *new_keys = (khkey_t*)krealloc(0, (void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (!new_keys) { kfree(0, new_flags); return -1; } \ + h->keys = new_keys; \ + if (kh_is_map) { \ + khval_t *new_vals = (khval_t*)krealloc(0, (void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + if (!new_vals) { kfree(0, new_flags); return -1; } \ + h->vals = new_vals; \ + } \ + } /* otherwise shrink */ \ + } \ + } \ + if (j) { /* rehashing is needed */ \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { /* bucket j holds a live key */ \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + khint_t new_mask; \ + new_mask = new_n_buckets - 1; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khint_t k, i, step = 0; \ + k = __hash_func(key); \ + i = k & new_mask; \ + while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i]
= val; val = tmp; } \ + __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ + h->keys = (khkey_t*)krealloc(0, (void *)h->keys, new_n_buckets * sizeof(khkey_t)); /* NOTE(review): unlike the expand path, the krealloc() results here are not checked */ \ + if (kh_is_map) h->vals = (khval_t*)krealloc(0, (void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + kfree(0, h->flags); /* free the working space */ \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; /* only live keys were rehashed, so no deleted buckets remain */ \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + return 0; \ + } \ + SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ + if (h->n_buckets > (h->size<<1)) { \ + if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ + *ret = -1; return h->n_buckets; \ + } \ + } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ + *ret = -1; return h->n_buckets; \ + } \ + } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ + { \ + khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \ + x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ + if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ + else { \ + last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; /* remember a deleted bucket that can be reused */ \ + i = (i + (++step)) & mask; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ + else x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { /* not present at all */ \ + h->keys[x] = key; \ +
__ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; /* a never-used bucket becomes occupied */ \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; /* reusing a deleted bucket: n_occupied already counts it */ \ + *ret = 2; \ + } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ + return x; \ + } \ + SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { /* ignore kh_end(h) and empty or deleted buckets */ \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; /* n_occupied is left unchanged */ \ + } \ + } + +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_PROTOTYPES(name, khkey_t, khval_t) + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +/* --- BEGIN OF HASH FUNCTIONS --- */ + +/*! @function + @abstract Integer hash function + @param key The integer [khint32_t] + @return The hash value [khint_t] + */ +#define kh_int_hash_func(key) (khint32_t)(key) +/*! @function + @abstract Integer comparison function + */ +#define kh_int_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract 64-bit integer hash function + @param key The integer [khint64_t] + @return The hash value [khint_t] + */ +#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) +/*! @function + @abstract 64-bit integer comparison function + */ +#define kh_int64_hash_equal(a, b) ((a) == (b)) +/*!
@function + @abstract const char* hash function + @param s Pointer to a null terminated string + @return The hash value + */ +static kh_inline khint_t __ac_X31_hash_string(const char *s) +{ + khint_t h = (khint_t)*s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; /* h = 31*h + next character */ + return h; +} +/*! @function + @abstract Another interface to const char* hash function + @param key Pointer to a null terminated string [const char*] + @return The hash value [khint_t] + */ +#define kh_str_hash_func(key) __ac_X31_hash_string(key) +/*! @function + @abstract Const char* comparison function + */ +#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) + +static kh_inline khint_t __ac_Wang_hash(khint_t key) /* integer mixing hash built from shifts, adds and xors */ +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} +#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key) /* alternative to kh_int_hash_func, which the KHASH_*_INIT_INT macros use */ + +/* --- END OF HASH FUNCTIONS --- */ + +/* Other convenient macros... */ + +/*! + @abstract Type of the hash table. + @param name Name of the hash table [symbol] + */ +#define khash_t(name) kh_##name##_t + +/*! @function + @abstract Initiate a hash table. + @param name Name of the hash table [symbol] + @return Pointer to the hash table [khash_t(name)*] + */ +#define kh_init(name) kh_init_##name() + +/*! @function + @abstract Destroy a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_destroy(name, h) kh_destroy_##name(h) + +/*! @function + @abstract Reset a hash table without deallocating memory. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_clear(name, h) kh_clear_##name(h) + +/*! @function + @abstract Resize a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param s Requested number of buckets; rounded up to a power of 2, minimum 4 [khint_t] + */ +#define kh_resize(name, h, s) kh_resize_##name(h, s) + +/*!
@function + @abstract Insert a key to the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @param r Extra return code: -1 if the operation failed; + 0 if the key is present in the hash table; + 1 if the bucket is empty (never used); 2 if the element in + the bucket has been deleted [int*] + @return Iterator to the inserted element [khint_t] + */ +#define kh_put(name, h, k, r) kh_put_##name(h, k, r) + +/*! @function + @abstract Retrieve a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] + */ +#define kh_get(name, h, k) kh_get_##name(h, k) + +/*! @function + @abstract Remove a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Iterator to the element to be deleted [khint_t] + */ +#define kh_del(name, h, k) kh_del_##name(h, k) + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) + +/*! @function + @abstract Get key given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x]) + +/*! @function + @abstract Get value given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->vals[x]) + +/*! 
@function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) ((h)->vals[x]) + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The end iterator, equal to the number of buckets [khint_t] + */ +#define kh_end(h) ((h)->n_buckets) + +/*! @function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->size) + +/*! @function + @abstract Get the number of buckets in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of buckets in the hash table [khint_t] + */ +#define kh_n_buckets(h) ((h)->n_buckets) + +/*! @function + @abstract Iterate over the entries in the hash table, skipping empty and deleted buckets + @param h Pointer to the hash table [khash_t(name)*] + @param kvar Variable to which key will be assigned + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (kvar) = kh_key(h,__i); \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/*! @function + @abstract Iterate over the values in the hash table, skipping empty and deleted buckets + @param h Pointer to the hash table [khash_t(name)*] + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach_value(h, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/* More convenient interfaces */ + +/*!
@function + @abstract Instantiate a hash set containing integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash set containing 64-bit integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash set containing const char* keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*!
@function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#endif /* __AC_KHASH_H */ diff --git a/lib/minimap2/krmq 2.h b/lib/minimap2/krmq 2.h new file mode 100644 index 000000000..8fa1cceed --- /dev/null +++ b/lib/minimap2/krmq 2.h @@ -0,0 +1,474 @@ +/* The MIT License + + Copyright (c) 2019 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +/* An example: + +#include +#include +#include +#include "krmq.h" + +struct my_node { + char key; + KRMQ_HEAD(struct my_node) head; +}; +#define my_cmp(p, q) (((q)->key < (p)->key) - ((p)->key < (q)->key)) +KRMQ_INIT(my, struct my_node, head, my_cmp) + +int main(void) { + const char *str = "MNOLKQOPHIA"; // from wiki, except a duplicate + struct my_node *root = 0; + int i, l = strlen(str); + for (i = 0; i < l; ++i) { // insert in the input order + struct my_node *q, *p = malloc(sizeof(*p)); + p->key = str[i]; + q = krmq_insert(my, &root, p, 0); + if (p != q) free(p); // if already present, free + } + krmq_itr_t(my) itr; + krmq_itr_first(my, root, &itr); // place at first + do { // traverse + const struct my_node *p = krmq_at(&itr); + putchar(p->key); + free((void*)p); // free node + } while (krmq_itr_next(my, &itr)); + putchar('\n'); + return 0; +} +*/ + +#ifndef KRMQ_H +#define KRMQ_H + +#ifdef __STRICT_ANSI__ +#define inline __inline__ +#endif + +#define KRMQ_MAX_DEPTH 64 /* capacity of the fixed-size path[] and stack[] arrays used while walking the tree */ + +#define krmq_size(head, p) ((p)? (p)->head.size : 0) /* subtree size, 0 for a null node */ +#define krmq_size_child(head, q, i) ((q)->head.p[(i)]?
(q)->head.p[(i)]->head.size : 0) + +#define KRMQ_HEAD(__type) \ + struct { \ + __type *p[2], *s; /* p[0]/p[1]: child taken when the key compares below/above this node; s: node that is smallest by __lt2 in this subtree */ \ + signed char balance; /* balance factor */ \ + unsigned size; /* #elements in subtree */ \ + } + +#define __KRMQ_FIND(suf, __scope, __type, __head, __cmp) \ + __scope __type *krmq_find_##suf(const __type *root, const __type *x, unsigned *cnt_) { /* returns the node equal to x, or 0; *cnt_ receives the number of nodes <= x */ \ + const __type *p = root; \ + unsigned cnt = 0; \ + while (p != 0) { \ + int cmp; \ + cmp = __cmp(x, p); \ + if (cmp >= 0) cnt += krmq_size_child(__head, p, 0) + 1; /* p and its p[0] subtree are all <= x */ \ + if (cmp < 0) p = p->__head.p[0]; \ + else if (cmp > 0) p = p->__head.p[1]; \ + else break; \ + } \ + if (cnt_) *cnt_ = cnt; \ + return (__type*)p; \ + } \ + __scope __type *krmq_interval_##suf(const __type *root, const __type *x, __type **lower, __type **upper) { /* *lower/*upper: last node on the search path that compares below/above x, or the match itself */ \ + const __type *p = root, *l = 0, *u = 0; \ + while (p != 0) { \ + int cmp; \ + cmp = __cmp(x, p); \ + if (cmp < 0) u = p, p = p->__head.p[0]; \ + else if (cmp > 0) l = p, p = p->__head.p[1]; \ + else { l = u = p; break; } \ + } \ + if (lower) *lower = (__type*)l; \ + if (upper) *upper = (__type*)u; \ + return (__type*)p; \ + } + +#define __KRMQ_RMQ(suf, __scope, __type, __head, __cmp, __lt2) \ + __scope __type *krmq_rmq_##suf(const __type *root, const __type *lo, const __type *up) { /* CLOSED interval */ \ + const __type *p = root, *path[2][KRMQ_MAX_DEPTH], *min; \ + int plen[2] = {0, 0}, pcmp[2][KRMQ_MAX_DEPTH], i, cmp, lca; \ + if (root == 0) return 0; \ + while (p) { /* record the search path for lo */ \ + cmp = __cmp(lo, p); \ + path[0][plen[0]] = p, pcmp[0][plen[0]++] = cmp; \ + if (cmp < 0) p = p->__head.p[0]; \ + else if (cmp > 0) p = p->__head.p[1]; \ + else break; \ + } \ + p = root; \ + while (p) { /* record the search path for up */ \ + cmp = __cmp(up, p); \ + path[1][plen[1]] = p, pcmp[1][plen[1]++] = cmp; \ + if (cmp < 0) p = p->__head.p[0]; \ + else if (cmp > 0) p = p->__head.p[1]; \ + else break; \ + } \ + for (i = 0; i < plen[0] && i < plen[1]; ++i) /* find the LCA */ \ + if (path[0][i] == path[1][i] && pcmp[0][i] <= 0 && pcmp[1][i] >= 0) \ + break; \
+ if (i == plen[0] || i == plen[1]) return 0; /* no elements in the closed interval */ \ + lca = i, min = path[0][lca]; /* first common node that lies inside [lo, up] */ \ + for (i = lca + 1; i < plen[0]; ++i) { \ + if (pcmp[0][i] <= 0) { /* lo <= node: the node and its p[1] subtree are candidates */ \ + if (__lt2(path[0][i], min)) min = path[0][i]; \ + if (path[0][i]->__head.p[1] && __lt2(path[0][i]->__head.p[1]->__head.s, min)) \ + min = path[0][i]->__head.p[1]->__head.s; \ + } \ + } \ + for (i = lca + 1; i < plen[1]; ++i) { \ + if (pcmp[1][i] >= 0) { /* node <= up: the node and its p[0] subtree are candidates */ \ + if (__lt2(path[1][i], min)) min = path[1][i]; \ + if (path[1][i]->__head.p[0] && __lt2(path[1][i]->__head.p[0]->__head.s, min)) \ + min = path[1][i]->__head.p[0]->__head.s; \ + } \ + } \ + return (__type*)min; \ + } + +#define __KRMQ_ROTATE(suf, __type, __head, __lt2) \ + /* set p->s to the smallest, by __lt2, of p, q->s and r->s; a null q or r is skipped */ \ + static inline void krmq_update_min_##suf(__type *p, const __type *q, const __type *r) { \ + p->__head.s = !q || __lt2(p, q->__head.s)? p : q->__head.s; \ + p->__head.s = !r || __lt2(p->__head.s, r->__head.s)? p->__head.s : r->__head.s; \ + } \ + /* one rotation: (a,(b,c)q)p => ((a,b)p,c)q */ \ + static inline __type *krmq_rotate1_##suf(__type *p, int dir) { /* dir=0 to left; dir=1 to right */ \ + int opp = 1 - dir; /* opposite direction */ \ + __type *q = p->__head.p[opp], *s = p->__head.s; \ + unsigned size_p = p->__head.size; \ + p->__head.size -= q->__head.size - krmq_size_child(__head, q, dir); \ + q->__head.size = size_p; \ + krmq_update_min_##suf(p, p->__head.p[dir], q->__head.p[dir]); \ + q->__head.s = s; \ + p->__head.p[opp] = q->__head.p[dir]; \ + q->__head.p[dir] = p; \ + return q; \ + } \ + /* two consecutive rotations: (a,((b,c)r,d)q)p => ((a,b)p,(c,d)q)r */ \ + static inline __type *krmq_rotate2_##suf(__type *p, int dir) { \ + int b1, opp = 1 - dir; \ + __type *q = p->__head.p[opp], *r = q->__head.p[dir], *s = p->__head.s; \ + unsigned size_x_dir = krmq_size_child(__head, r, dir); \ + r->__head.size = p->__head.size; \ + p->__head.size -= q->__head.size - size_x_dir; \ + q->__head.size -= size_x_dir + 1; \ + krmq_update_min_##suf(p,
p->__head.p[dir], r->__head.p[dir]); \ + krmq_update_min_##suf(q, q->__head.p[opp], r->__head.p[opp]); \ + r->__head.s = s; \ + p->__head.p[opp] = r->__head.p[dir]; \ + r->__head.p[dir] = p; \ + q->__head.p[dir] = r->__head.p[opp]; \ + r->__head.p[opp] = q; \ + b1 = dir == 0? +1 : -1; \ + if (r->__head.balance == b1) q->__head.balance = 0, p->__head.balance = -b1; \ + else if (r->__head.balance == 0) q->__head.balance = p->__head.balance = 0; \ + else q->__head.balance = b1, p->__head.balance = 0; \ + r->__head.balance = 0; \ + return r; \ + } + +#define __KRMQ_INSERT(suf, __scope, __type, __head, __cmp, __lt2) \ + __scope __type *krmq_insert_##suf(__type **root_, __type *x, unsigned *cnt_) { \ + unsigned char stack[KRMQ_MAX_DEPTH]; \ + __type *path[KRMQ_MAX_DEPTH]; \ + __type *bp, *bq; \ + __type *p, *q, *r = 0; /* _r_ is potentially the new root */ \ + int i, which = 0, top, b1, path_len; \ + unsigned cnt = 0; \ + bp = *root_, bq = 0; \ + /* find the insertion location */ \ + for (p = bp, q = bq, top = path_len = 0; p; q = p, p = p->__head.p[which]) { \ + int cmp; \ + cmp = __cmp(x, p); \ + if (cmp >= 0) cnt += krmq_size_child(__head, p, 0) + 1; \ + if (cmp == 0) { \ + if (cnt_) *cnt_ = cnt; \ + return p; /* an equal node already exists: return it, x is not linked in */ \ + } \ + if (p->__head.balance != 0) \ + bq = q, bp = p, top = 0; /* bp: deepest node on the path with a non-zero balance; bq: its parent */ \ + stack[top++] = which = (cmp > 0); \ + path[path_len++] = p; \ + } \ + if (cnt_) *cnt_ = cnt; \ + x->__head.balance = 0, x->__head.size = 1, x->__head.p[0] = x->__head.p[1] = 0, x->__head.s = x; \ + if (q == 0) *root_ = x; \ + else q->__head.p[which] = x; \ + if (bp == 0) return x; \ + for (i = 0; i < path_len; ++i) ++path[i]->__head.size; /* every node on the search path gains one descendant */ \ + for (i = path_len - 1; i >= 0; --i) { \ + krmq_update_min_##suf(path[i], path[i]->__head.p[0], path[i]->__head.p[1]); \ + if (path[i]->__head.s != x) break; /* stop once x is not the subtree minimum */ \ + } \ + for (p = bp, top = 0; p != x; p = p->__head.p[stack[top]], ++top) /* update balance factors */ \ + if (stack[top] == 0) --p->__head.balance; \ + else ++p->__head.balance; \ + if
(bp->__head.balance > -2 && bp->__head.balance < 2) return x; /* no re-balance needed */ \ + /* re-balance */ \ + which = (bp->__head.balance < 0); \ + b1 = which == 0? +1 : -1; \ + q = bp->__head.p[1 - which]; \ + if (q->__head.balance == b1) { \ + r = krmq_rotate1_##suf(bp, which); \ + q->__head.balance = bp->__head.balance = 0; \ + } else r = krmq_rotate2_##suf(bp, which); \ + if (bq == 0) *root_ = r; \ + else bq->__head.p[bp != bq->__head.p[0]] = r; \ + return x; \ + } + +#define __KRMQ_ERASE(suf, __scope, __type, __head, __cmp, __lt2) \ + __scope __type *krmq_erase_##suf(__type **root_, const __type *x, unsigned *cnt_) { \ + __type *p, *path[KRMQ_MAX_DEPTH], fake; \ + unsigned char dir[KRMQ_MAX_DEPTH]; \ + int i, d = 0, cmp; \ + unsigned cnt = 0; \ + fake = **root_, fake.__head.p[0] = *root_, fake.__head.p[1] = 0; \ + if (cnt_) *cnt_ = 0; \ + if (x) { \ + for (cmp = -1, p = &fake; cmp; cmp = __cmp(x, p)) { \ + int which = (cmp > 0); \ + if (cmp > 0) cnt += krmq_size_child(__head, p, 0) + 1; \ + dir[d] = which; \ + path[d++] = p; \ + p = p->__head.p[which]; \ + if (p == 0) { \ + if (cnt_) *cnt_ = 0; \ + return 0; \ + } \ + } \ + cnt += krmq_size_child(__head, p, 0) + 1; /* because p==x is not counted */ \ + } else { \ + for (p = &fake, cnt = 1; p; p = p->__head.p[0]) \ + dir[d] = 0, path[d++] = p; \ + p = path[--d]; \ + } \ + if (cnt_) *cnt_ = cnt; \ + for (i = 1; i < d; ++i) --path[i]->__head.size; \ + if (p->__head.p[1] == 0) { /* ((1,.)2,3)4 => (1,3)4; p=2 */ \ + path[d-1]->__head.p[dir[d-1]] = p->__head.p[0]; \ + } else { \ + __type *q = p->__head.p[1]; \ + if (q->__head.p[0] == 0) { /* ((1,2)3,4)5 => ((1)2,4)5; p=3,q=2 */ \ + q->__head.p[0] = p->__head.p[0]; \ + q->__head.balance = p->__head.balance; \ + path[d-1]->__head.p[dir[d-1]] = q; \ + path[d] = q, dir[d++] = 1; \ + q->__head.size = p->__head.size - 1; \ + } else { /* ((1,((.,2)3,4)5)6,7)8 => ((1,(2,4)5)3,7)8; p=6 */ \ + __type *r; \ + int e = d++; /* backup _d_ */\ + for (;;) { \ + dir[d] = 0; \ + 
path[d++] = q; \ + r = q->__head.p[0]; \ + if (r->__head.p[0] == 0) break; \ + q = r; \ + } \ + r->__head.p[0] = p->__head.p[0]; \ + q->__head.p[0] = r->__head.p[1]; \ + r->__head.p[1] = p->__head.p[1]; \ + r->__head.balance = p->__head.balance; \ + path[e-1]->__head.p[dir[e-1]] = r; \ + path[e] = r, dir[e] = 1; \ + for (i = e + 1; i < d; ++i) --path[i]->__head.size; \ + r->__head.size = p->__head.size - 1; \ + } \ + } \ + for (i = d - 1; i >= 0; --i) /* not sure why adding condition "path[i]->__head.s==p" doesn't work */ \ + krmq_update_min_##suf(path[i], path[i]->__head.p[0], path[i]->__head.p[1]); \ + while (--d > 0) { \ + __type *q = path[d]; \ + int which, other, b1 = 1, b2 = 2; \ + which = dir[d], other = 1 - which; \ + if (which) b1 = -b1, b2 = -b2; \ + q->__head.balance += b1; \ + if (q->__head.balance == b1) break; \ + else if (q->__head.balance == b2) { \ + __type *r = q->__head.p[other]; \ + if (r->__head.balance == -b1) { \ + path[d-1]->__head.p[dir[d-1]] = krmq_rotate2_##suf(q, which); \ + } else { \ + path[d-1]->__head.p[dir[d-1]] = krmq_rotate1_##suf(q, which); \ + if (r->__head.balance == 0) { \ + r->__head.balance = -b1; \ + q->__head.balance = b1; \ + break; \ + } else r->__head.balance = q->__head.balance = 0; \ + } \ + } \ + } \ + *root_ = fake.__head.p[0]; \ + return p; \ + } + +#define krmq_free(__type, __head, __root, __free) do { \ + __type *_p, *_q; \ + for (_p = __root; _p; _p = _q) { \ + if (_p->__head.p[0] == 0) { \ + _q = _p->__head.p[1]; \ + __free(_p); \ + } else { \ + _q = _p->__head.p[0]; \ + _p->__head.p[0] = _q->__head.p[1]; \ + _q->__head.p[1] = _p; \ + } \ + } \ + } while (0) + +#define __KRMQ_ITR(suf, __scope, __type, __head, __cmp) \ + struct krmq_itr_##suf { \ + const __type *stack[KRMQ_MAX_DEPTH], **top; \ + }; \ + __scope void krmq_itr_first_##suf(const __type *root, struct krmq_itr_##suf *itr) { \ + const __type *p; \ + for (itr->top = itr->stack - 1, p = root; p; p = p->__head.p[0]) \ + *++itr->top = p; \ + } \ + __scope 
int krmq_itr_find_##suf(const __type *root, const __type *x, struct krmq_itr_##suf *itr) { \ + const __type *p = root; \ + itr->top = itr->stack - 1; \ + while (p != 0) { \ + int cmp; \ + *++itr->top = p; \ + cmp = __cmp(x, p); \ + if (cmp < 0) p = p->__head.p[0]; \ + else if (cmp > 0) p = p->__head.p[1]; \ + else break; \ + } \ + return p? 1 : 0; \ + } \ + __scope int krmq_itr_next_bidir_##suf(struct krmq_itr_##suf *itr, int dir) { \ + const __type *p; \ + if (itr->top < itr->stack) return 0; \ + dir = !!dir; \ + p = (*itr->top)->__head.p[dir]; \ + if (p) { /* go down */ \ + for (; p; p = p->__head.p[!dir]) \ + *++itr->top = p; \ + return 1; \ + } else { /* go up */ \ + do { \ + p = *itr->top--; \ + } while (itr->top >= itr->stack && p == (*itr->top)->__head.p[dir]); \ + return itr->top < itr->stack? 0 : 1; \ + } \ + } \ + +/** + * Insert a node to the tree + * + * @param suf name suffix used in KRMQ_INIT() + * @param proot pointer to the root of the tree (in/out: root may change) + * @param x node to insert (in) + * @param cnt number of nodes smaller than or equal to _x_; can be NULL (out) + * + * @return _x_ if not present in the tree, or the node equal to x. 
+ */
+#define krmq_insert(suf, proot, x, cnt) krmq_insert_##suf(proot, x, cnt)
+
+/**
+ * Find a node in the tree
+ *
+ * @param suf     name suffix used in KRMQ_INIT()
+ * @param root    root of the tree
+ * @param x       node value to find (in)
+ * @param cnt     number of nodes smaller than or equal to _x_; can be NULL (out)
+ *
+ * @return node equal to _x_ if present, or NULL if absent
+ */
+#define krmq_find(suf, root, x, cnt) krmq_find_##suf(root, x, cnt)
+#define krmq_interval(suf, root, x, lower, upper) krmq_interval_##suf(root, x, lower, upper) /* NOTE(review): presumably reports the nodes bracketing _x_ via lower/upper -- confirm against krmq_interval_<suf>() */
+#define krmq_rmq(suf, root, lo, up) krmq_rmq_##suf(root, lo, up) /* minimum node in a closed interval, NULL if it holds no node; bounds presumably lo..up -- confirm */
+
+/**
+ * Delete a node from the tree
+ *
+ * @param suf     name suffix used in KRMQ_INIT()
+ * @param proot   pointer to the root of the tree (in/out: root may change)
+ * @param x       node value to delete; if NULL, delete the first node (in)
+ * @param cnt     rank of the removed node, counted before removal (1 for the
+ *                first node); 0 if nothing was removed; can be NULL (out)
+ *
+ * @return node removed from the tree if present, or NULL if absent
+ */
+#define krmq_erase(suf, proot, x, cnt) krmq_erase_##suf(proot, x, cnt)
+#define krmq_erase_first(suf, proot) krmq_erase_##suf(proot, 0, 0)
+
+#define krmq_itr_t(suf) struct krmq_itr_##suf
+
+/**
+ * Place the iterator at the smallest object
+ *
+ * @param suf     name suffix used in KRMQ_INIT()
+ * @param root    root of the tree
+ * @param itr     iterator
+ */
+#define krmq_itr_first(suf, root, itr) krmq_itr_first_##suf(root, itr)
+
+/**
+ * Place the iterator at the object equal to the query, if there is one
+ *
+ * @param suf     name suffix used in KRMQ_INIT()
+ * @param root    root of the tree
+ * @param x       query (in)
+ * @param itr     iterator (out)
+ *
+ * @return 1 if found; 0 otherwise. On a miss the iterator is left at the last
+ *         node visited on the search path, which may be smaller or larger than
+ *         the query; krmq_at(itr) is NULL only when the tree is empty
+ */
+#define krmq_itr_find(suf, root, x, itr) krmq_itr_find_##suf(root, x, itr)
+
+/**
+ * Move to the next object in order (krmq_itr_prev moves to the previous one)
+ *
+ * @param suf     name suffix used in KRMQ_INIT()
+ * @param itr     iterator (modified)
+ *
+ * @return 1 if there is a next object; 0 otherwise
+ */
+#define krmq_itr_next(suf, itr) krmq_itr_next_bidir_##suf(itr, 1)
+#define krmq_itr_prev(suf, itr) krmq_itr_next_bidir_##suf(itr, 0)
+
+/**
+ * Return the pointer at the iterator
+ *
+ * @param itr     iterator
+ *
+ * @return pointer if present; NULL otherwise
+ */
+#define krmq_at(itr) ((itr)->top < (itr)->stack? 0 : *(itr)->top)
+
+/* Instantiate the whole API for one node type; __scope is pasted in front of each non-inline function definition */
+#define KRMQ_INIT2(suf, __scope, __type, __head, __cmp, __lt2) \
+	__KRMQ_FIND(suf, __scope, __type, __head, __cmp) \
+	__KRMQ_RMQ(suf, __scope, __type, __head, __cmp, __lt2) \
+	__KRMQ_ROTATE(suf, __type, __head, __lt2) \
+	__KRMQ_INSERT(suf, __scope, __type, __head, __cmp, __lt2) \
+	__KRMQ_ERASE(suf, __scope, __type, __head, __cmp, __lt2) \
+	__KRMQ_ITR(suf, __scope, __type, __head, __cmp)
+
+/* Same as KRMQ_INIT2() with an empty __scope */
+#define KRMQ_INIT(suf, __type, __head, __cmp, __lt2) \
+	KRMQ_INIT2(suf,, __type, __head, __cmp, __lt2)
+
+#endif
diff --git a/lib/minimap2/kseq 2.h b/lib/minimap2/kseq 2.h
new file mode 100644
index 000000000..15d798321
--- /dev/null
+++ b/lib/minimap2/kseq 2.h
@@ -0,0 +1,256 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2009, 2011 Attractive Chaos
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Last Modified: 05MAR2012 */
+
+#ifndef AC_KSEQ_H
+#define AC_KSEQ_H
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
+#define KS_SEP_TAB   1 // isspace() && !' '
+#define KS_SEP_LINE  2 // line separator: "\n" (Unix) or "\r\n" (Windows)
+#define KS_SEP_MAX   2
+
+#ifndef klib_unused
+#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
+#define klib_unused __attribute__ ((__unused__))
+#else
+#define klib_unused
+#endif
+#endif /* klib_unused */
+
+#define __KS_TYPE(type_t) /* buffered input stream over a handle of type type_t; [begin,end) is the unread part of buf */ \
+	typedef struct __kstream_t { \
+		int begin, end; \
+		int is_eof:2, bufsize:30; \
+		type_t f; \
+		unsigned char *buf; \
+	} kstream_t;
+
+#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
+#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
+
+#define __KS_BASIC(SCOPE, type_t, __bufsize) /* ks_init()/ks_destroy(): allocate and release a stream with a __bufsize-byte buffer */ \
+	SCOPE kstream_t *ks_init(type_t f) \
+	{ \
+		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
+		ks->f = f; ks->bufsize = __bufsize; \
+		ks->buf = (unsigned char*)malloc(__bufsize); \
+		return ks; \
+	} \
+	SCOPE void ks_destroy(kstream_t *ks) \
+	{ \
+		if (!ks) return; \
+		free(ks->buf); \
+		free(ks); \
+	}
+
+#define __KS_INLINED(__read) /* ks_getc(): next byte or -1 at end of input; ks_getuntil(): ks_getuntil2() without append */ \
+	static inline klib_unused int ks_getc(kstream_t *ks) \
+	{ \
+		if (ks->is_eof && ks->begin >= ks->end) return -1; \
+		if (ks->begin >= ks->end) { \
+			ks->begin = 0; \
+			ks->end = __read(ks->f, ks->buf, ks->bufsize); \
+			if (ks->end < ks->bufsize) ks->is_eof = 1; \
+			if (ks->end <= 0) return -1; /* 0: end of input; negative: __read failed, treated as end of input instead of returning a stale byte */ \
+		} \
+		return (int)ks->buf[ks->begin++]; \
+	} \
+	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+	size_t l, m;
+	char *s;
+} kstring_t;
+#endif
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#define __KS_GETUNTIL(SCOPE, __read) /* ks_getuntil2(): copy bytes up to the next delimiter into str; returns str->l, or -1 if the stream was already exhausted */ \
+	SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
+	{ \
+		if (dret) *dret = 0; \
+		str->l = append? str->l : 0; \
+		if (ks->begin >= ks->end && ks->is_eof) return -1; \
+		for (;;) { \
+			int i; \
+			if (ks->begin >= ks->end) { \
+				if (!ks->is_eof) { \
+					ks->begin = 0; \
+					ks->end = __read(ks->f, ks->buf, ks->bufsize); \
+					if (ks->end < ks->bufsize) ks->is_eof = 1; \
+					if (ks->end <= 0) break; /* end of input, or __read failed */ \
+				} else break; \
+			} \
+			if (delimiter == KS_SEP_LINE) { \
+				for (i = ks->begin; i < ks->end; ++i) \
+					if (ks->buf[i] == '\n') break; \
+			} else if (delimiter > KS_SEP_MAX) { \
+				for (i = ks->begin; i < ks->end; ++i) \
+					if (ks->buf[i] == delimiter) break; \
+			} else if (delimiter == KS_SEP_SPACE) { \
+				for (i = ks->begin; i < ks->end; ++i) \
+					if (isspace(ks->buf[i])) break; \
+			} else if (delimiter == KS_SEP_TAB) { \
+				for (i = ks->begin; i < ks->end; ++i) \
+					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
+			} else i = 0; /* never come to here! */ \
+			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
+				str->m = str->l + (i - ks->begin) + 1; \
+				kroundup32(str->m); \
+				str->s = (char*)realloc(str->s, str->m); \
+			} \
+			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
+			str->l = str->l + (i - ks->begin); \
+			ks->begin = i + 1; \
+			if (i < ks->end) { \
+				if (dret) *dret = ks->buf[i]; \
+				break; \
+			} \
+		} \
+		if (str->s == 0) { \
+			str->m = 1; \
+			str->s = (char*)calloc(1, 1); \
+		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
+		str->s[str->l] = '\0'; \
+		return str->l; \
+	}
+
+#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \
+	__KS_TYPE(type_t) \
+	__KS_BASIC(SCOPE, type_t, __bufsize) \
+	__KS_GETUNTIL(SCOPE, __read) \
+	__KS_INLINED(__read)
+
+#define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize)
+
+#define KSTREAM_DECLARE(type_t, __read) \
+	__KS_TYPE(type_t) \
+	extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \
+	extern kstream_t *ks_init(type_t f); \
+	extern void ks_destroy(kstream_t *ks); \
+	__KS_INLINED(__read)
+
+/******************
+ * FASTA/Q parser *
+ ******************/
+
+#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
+
+#define __KSEQ_BASIC(SCOPE, type_t) /* kseq_init()/kseq_destroy(): allocate and release a parser together with its stream */ \
+	SCOPE kseq_t *kseq_init(type_t fd) \
+	{ \
+		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
+		s->f = ks_init(fd); \
+		return s; \
+	} \
+	SCOPE void kseq_destroy(kseq_t *ks) \
+	{ \
+		if (!ks) return; \
+		free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
+		ks_destroy(ks->f); \
+		free(ks); \
+	}
+
+/* Return value:
+   >=0  length of the sequence (normal)
+   -1   end-of-file
+   -2   truncated quality string
+ */
+#define __KSEQ_READ(SCOPE) \
+	SCOPE int kseq_read(kseq_t *seq) \
+	{ \
+		int c; \
+		kstream_t *ks = seq->f; \
+		if (seq->last_char == 0) { /* then jump to the next header line */ \
+			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+			if (c == -1) return -1; /* end of file */ \
+			seq->last_char = c; \
+		} /* else: the first header char has been read in the previous call */ \
+		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
+		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
+		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
+		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
+			seq->seq.m = 256; \
+			seq->seq.s = (char*)malloc(seq->seq.m); \
+		} \
+		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
+			if (c == '\n') continue; /* skip empty lines */ \
+			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
+			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
+		} \
+		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
+		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
+			seq->seq.m = seq->seq.l + 2; \
+			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
+			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+		} \
+		seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
+		if (c != '+') return seq->seq.l; /* FASTA */ \
+		if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
+			seq->qual.m = seq->seq.m; \
+			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+		} \
+		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
+		if (c == -1) return -2; /* error: no quality string */ \
+		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
+		seq->last_char = 0; /* we have not come to the next header line */ \
+		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
+		return seq->seq.l; \
+	}
+
+#define __KSEQ_TYPE(type_t) /* one parsed record plus the stream it is read from; last_char holds a header char already consumed */ \
+	typedef struct { \
+		kstring_t name, comment, seq, qual; \
+		int last_char; \
+		kstream_t *f; \
+	} kseq_t;
+
+#define KSEQ_INIT2(SCOPE, type_t, __read) \
+	KSTREAM_INIT2(SCOPE, type_t, __read, 16384) \
+	__KSEQ_TYPE(type_t) \
+	__KSEQ_BASIC(SCOPE, type_t) \
+	__KSEQ_READ(SCOPE)
+
+#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
+
+#define KSEQ_DECLARE(type_t) \
+	__KS_TYPE(type_t) \
+	__KSEQ_TYPE(type_t) \
+	extern kseq_t *kseq_init(type_t fd); \
+	void kseq_destroy(kseq_t *ks); \
+	int kseq_read(kseq_t *seq);
+
+#endif
diff --git a/lib/minimap2/ksort 2.h b/lib/minimap2/ksort 2.h
new file mode 100644
index 000000000..d7599d146
--- /dev/null
+++ b/lib/minimap2/ksort 2.h
@@ -0,0 +1,153 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2011 Attractive Chaos
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+// This is a simplified version of ksort.h
+
+#ifndef AC_KSORT_H
+#define AC_KSORT_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+typedef struct {
+	void *left, *right;
+	int depth;
+} ks_isort_stack_t;
+
+#define KSORT_SWAP(type_t, a, b) { type_t t=(a); (a)=(b); (b)=t; }
+
+#define KSORT_INIT(name, type_t, __sort_lt) /* defines heap helpers and k-th-smallest selection for type_t, ordered by __sort_lt(a, b) */ \
+	void ks_heapdown_##name(size_t i, size_t n, type_t l[]) /* sift l[i] down within the first n elements */ \
+	{ \
+		size_t k = i; \
+		type_t tmp = l[i]; \
+		while ((k = (k << 1) + 1) < n) { \
+			if (k != n - 1 && __sort_lt(l[k], l[k+1])) ++k; \
+			if (__sort_lt(l[k], tmp)) break; \
+			l[i] = l[k]; i = k; \
+		} \
+		l[i] = tmp; \
+	} \
+	void ks_heapmake_##name(size_t lsize, type_t l[]) \
+	{ \
+		size_t i; \
+		for (i = (lsize >> 1) - 1; i != (size_t)(-1); --i) /* unsigned countdown: the loop ends when i wraps past zero */ \
+			ks_heapdown_##name(i, lsize, l); \
+	} \
+	type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) /* returns the element of rank kk (0-based); reorders arr in place */ \
+	{ \
+		type_t *low, *high, *k, *ll, *hh, *mid; \
+		low = arr; high = arr + n - 1; k = arr + kk; \
+		for (;;) { \
+			if (high <= low) return *k; \
+			if (high == low + 1) { \
+				if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+				return *k; \
+			} \
+			mid = low + (high - low) / 2; \
+			if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
+			if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+			if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \
+			KSORT_SWAP(type_t, *mid, *(low+1)); \
+			ll = low + 1; hh = high; \
+			for (;;) { \
+				do ++ll; while (__sort_lt(*ll, *low)); \
+				do --hh; while (__sort_lt(*low, *hh)); \
+				if (hh < ll) break; \
+				KSORT_SWAP(type_t, *ll, *hh); \
+			} \
+			KSORT_SWAP(type_t, *low, *hh); \
+			if (hh <= k) low = ll; \
+			if (hh >= k) high = hh - 1; \
+		} \
+	} \
+
+#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
+
+#define ks_lt_generic(a, b) ((a) < (b))
+#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
+
+typedef const char *ksstr_t;
+
+#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
+#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)
+
+#define RS_MIN_SIZE 64
+#define RS_MAX_BITS 8
+
+#define KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key) /* defines radix_sort_<name>(): sort rstype_t items by the integer key rskey(item), which is sizeof_key bytes wide */ \
+	typedef struct { \
+		rstype_t *b, *e; \
+	} rsbucket_##name##_t; \
+	void rs_insertsort_##name(rstype_t *beg, rstype_t *end) \
+	{ \
+		rstype_t *i; \
+		for (i = beg + 1; i < end; ++i) \
+			if (rskey(*i) < rskey(*(i - 1))) { \
+				rstype_t *j, tmp = *i; \
+				for (j = i; j > beg && rskey(tmp) < rskey(*(j-1)); --j) \
+					*j = *(j - 1); \
+				*j = tmp; \
+			} \
+	} \
+	void rs_sort_##name(rstype_t *beg, rstype_t *end, int n_bits, int s) /* bucket [beg,end) by the n_bits key bits starting at bit s, then recurse on lower bits */ \
+	{ \
+		rstype_t *i; \
+		int size = 1<<n_bits, m = size - 1; \
+		rsbucket_##name##_t *k, b[1<<RS_MAX_BITS], *be = b + size; \
+		assert(n_bits <= RS_MAX_BITS); /* b[] holds at most 1<<RS_MAX_BITS buckets */ \
+		for (k = b; k != be; ++k) k->b = k->e = beg; \
+		for (i = beg; i != end; ++i) ++b[rskey(*i)>>s&m].e; /* count the items of each bucket */ \
+		for (k = b + 1; k != be; ++k) /* turn the counts into [b,e) ranges */ \
+			k->e += (k-1)->e - beg, k->b = (k-1)->e; \
+		for (k = b; k != be;) { /* permute in place: move each item into its bucket */ \
+			if (k->b != k->e) { \
+				rsbucket_##name##_t *l; \
+				if ((l = b + (rskey(*k->b)>>s&m)) != k) { \
+					rstype_t tmp = *k->b, swap; \
+					do { \
+						swap = tmp; tmp = *l->b; *l->b++ = swap; \
+						l = b + (rskey(tmp)>>s&m); \
+					} while (l != k); \
+					*k->b++ = tmp; \
+				} else ++k->b; \
+			} else ++k; \
+		} \
+		for (b->b = beg, k = b + 1; k != be; ++k) k->b = (k-1)->e; /* restore the bucket starts consumed above */ \
+		if (s) { \
+			s = s > n_bits? s - n_bits : 0; \
+			for (k = b; k != be; ++k) \
+				if (k->e - k->b > RS_MIN_SIZE) rs_sort_##name(k->b, k->e, n_bits, s); \
+				else if (k->e - k->b > 1) rs_insertsort_##name(k->b, k->e); \
+		} \
+	} \
+	void radix_sort_##name(rstype_t *beg, rstype_t *end) \
+	{ \
+		if (end - beg <= RS_MIN_SIZE) rs_insertsort_##name(beg, end); \
+		else rs_sort_##name(beg, end, RS_MAX_BITS, (sizeof_key - 1) * RS_MAX_BITS); \
+	}
+
+#endif
diff --git a/lib/minimap2/ksw2.h b/lib/minimap2/ksw2.h
index cbd1ddc4c..1f94c6f63 100644
--- a/lib/minimap2/ksw2.h
+++ b/lib/minimap2/ksw2.h
@@ -15,6 +15,7 @@
 #define KSW_EZ_SPLICE_FOR 0x100
 #define KSW_EZ_SPLICE_REV 0x200
 #define KSW_EZ_SPLICE_FLANK 0x400
+#define KSW_EZ_SPLICE_CMPLX 0x800
 
 // The subset of CIGAR operators used by ksw code.
 // Use MM_CIGAR_* from minimap.h if you need the full list.
diff --git a/lib/minimap2/ksw2_extd2_sse 2.c b/lib/minimap2/ksw2_extd2_sse 2.c new file mode 100644 index 000000000..162e9e264 --- /dev/null +++ b/lib/minimap2/ksw2_extd2_sse 2.c @@ -0,0 +1,402 @@ +#include +#include +#include +#include "ksw2.h" + +#ifdef __SSE2__ +#ifdef USE_SIMDE +#include +#else +#include +#endif + +#ifdef KSW_SSE2_ONLY +#undef __SSE4_1__ +#endif + +#ifdef __SSE4_1__ +#ifdef USE_SIMDE +#include +#else +#include +#endif +#endif + +#ifdef KSW_CPU_DISPATCH +#ifdef __SSE4_1__ +void ksw_extd2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez) +#else +void ksw_extd2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez) +#endif +#else +void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez) +#endif // ~KSW_CPU_DISPATCH +{ +#define __dp_code_block1 \ + z = _mm_load_si128(&s[t]); \ + xt1 = _mm_load_si128(&x[t]); /* xt1 <- x[r-1][t..t+15] */ \ + tmp = _mm_srli_si128(xt1, 15); /* tmp <- x[r-1][t+15] */ \ + xt1 = _mm_or_si128(_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \ + x1_ = tmp; \ + vt1 = _mm_load_si128(&v[t]); /* vt1 <- v[r-1][t..t+15] */ \ + tmp = _mm_srli_si128(vt1, 15); /* tmp <- v[r-1][t+15] */ \ + vt1 = _mm_or_si128(_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \ + v1_ = tmp; \ + a = _mm_add_epi8(xt1, vt1); /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \ + ut = _mm_load_si128(&u[t]); /* ut <- u[t..t+15] */ \ + b = _mm_add_epi8(_mm_load_si128(&y[t]), ut); /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */ \ + x2t1= _mm_load_si128(&x2[t]); 
\ + tmp = _mm_srli_si128(x2t1, 15); \ + x2t1= _mm_or_si128(_mm_slli_si128(x2t1, 1), x21_); \ + x21_= tmp; \ + a2= _mm_add_epi8(x2t1, vt1); \ + b2= _mm_add_epi8(_mm_load_si128(&y2[t]), ut); + +#define __dp_code_block2 \ + _mm_store_si128(&u[t], _mm_sub_epi8(z, vt1)); /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \ + _mm_store_si128(&v[t], _mm_sub_epi8(z, ut)); /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \ + tmp = _mm_sub_epi8(z, q_); \ + a = _mm_sub_epi8(a, tmp); \ + b = _mm_sub_epi8(b, tmp); \ + tmp = _mm_sub_epi8(z, q2_); \ + a2= _mm_sub_epi8(a2, tmp); \ + b2= _mm_sub_epi8(b2, tmp); + + int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, min_sc, long_thres, long_diff; + int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX); + int32_t *H = 0, H0 = 0, last_H0_t = 0; + uint8_t *qr, *sf, *mem, *mem2 = 0; + __m128i q_, q2_, qe_, qe2_, zero_, sc_mch_, sc_mis_, m1_, sc_N_; + __m128i *u, *v, *x, *y, *x2, *y2, *s, *p = 0; + + ksw_reset_extz(ez); + if (m <= 1 || qlen <= 0 || tlen <= 0) return; + + if (q2 + e2 < q + e) t = q, q = q2, q2 = t, t = e, e = e2, e2 = t; // make sure q+e no larger than q2+e2 + + zero_ = _mm_set1_epi8(0); + q_ = _mm_set1_epi8(q); + q2_ = _mm_set1_epi8(q2); + qe_ = _mm_set1_epi8(q + e); + qe2_ = _mm_set1_epi8(q2 + e2); + sc_mch_ = _mm_set1_epi8(mat[0]); + sc_mis_ = _mm_set1_epi8(mat[1]); + sc_N_ = mat[m*m-1] == 0? _mm_set1_epi8(-e2) : _mm_set1_epi8(mat[m*m-1]); + m1_ = _mm_set1_epi8(m - 1); // wildcard + + if (w < 0) w = tlen > qlen? tlen : qlen; + wl = wr = w; + tlen_ = (tlen + 15) / 16; + n_col_ = qlen < tlen? qlen : tlen; + n_col_ = ((n_col_ < w + 1? n_col_ : w + 1) + 15) / 16 + 1; + qlen_ = (qlen + 15) / 16; + for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) { + max_sc = max_sc > mat[t]? max_sc : mat[t]; + min_sc = min_sc < mat[t]? min_sc : mat[t]; + } + if (-min_sc > 2 * (q + e)) return; // otherwise, we won't see any mismatches + + long_thres = e != e2? 
(q2 - q) / (e - e2) - 1 : 0; + if (q2 + e2 + long_thres * e2 > q + e + long_thres * e) + ++long_thres; + long_diff = long_thres * (e - e2) - (q2 - q) - e2; + + mem = (uint8_t*)kcalloc(km, tlen_ * 8 + qlen_ + 1, 16); + u = (__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned + v = u + tlen_, x = v + tlen_, y = x + tlen_, x2 = y + tlen_, y2 = x2 + tlen_; + s = y2 + tlen_, sf = (uint8_t*)(s + tlen_), qr = sf + tlen_ * 16; + memset(u, -q - e, tlen_ * 16); + memset(v, -q - e, tlen_ * 16); + memset(x, -q - e, tlen_ * 16); + memset(y, -q - e, tlen_ * 16); + memset(x2, -q2 - e2, tlen_ * 16); + memset(y2, -q2 - e2, tlen_ * 16); + if (!approx_max) { + H = (int32_t*)kmalloc(km, tlen_ * 16 * 4); + for (t = 0; t < tlen_ * 16; ++t) H[t] = KSW_NEG_INF; + } + if (with_cigar) { + mem2 = (uint8_t*)kmalloc(km, ((size_t)(qlen + tlen - 1) * n_col_ + 1) * 16); + p = (__m128i*)(((size_t)mem2 + 15) >> 4 << 4); + off = (int*)kmalloc(km, (qlen + tlen - 1) * sizeof(int) * 2); + off_end = off + qlen + tlen - 1; + } + + for (t = 0; t < qlen; ++t) qr[t] = query[qlen - 1 - t]; + memcpy(sf, target, tlen); + + for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) { + int st = 0, en = tlen - 1, st0, en0, st_, en_; + int8_t x1, x21, v1; + uint8_t *qrr = qr + (qlen - 1 - r); + int8_t *u8 = (int8_t*)u, *v8 = (int8_t*)v, *x8 = (int8_t*)x, *x28 = (int8_t*)x2; + __m128i x1_, x21_, v1_; + // find the boundaries + if (st < r - qlen + 1) st = r - qlen + 1; + if (en > r) en = r; + if (st < (r-wr+1)>>1) st = (r-wr+1)>>1; // take the ceil + if (en > (r+wl)>>1) en = (r+wl)>>1; // take the floor + if (st > en) { + ez->zdropped = 1; + break; + } + st0 = st, en0 = en; + st = st / 16 * 16, en = (en + 16) / 16 * 16 - 1; + // set boundary conditions + if (st > 0) { + if (st - 1 >= last_st && st - 1 <= last_en) { + x1 = x8[st - 1], x21 = x28[st - 1], v1 = v8[st - 1]; // (r-1,s-1) calculated in the last round + } else { + x1 = -q - e, x21 = -q2 - e2; + v1 = -q - e; + } + } else { + x1 = -q - e, x21 = 
-q2 - e2; + v1 = r == 0? -q - e : r < long_thres? -e : r == long_thres? long_diff : -e2; + } + if (en >= r) { + ((int8_t*)y)[r] = -q - e, ((int8_t*)y2)[r] = -q2 - e2; + u8[r] = r == 0? -q - e : r < long_thres? -e : r == long_thres? long_diff : -e2; + } + // loop fission: set scores first + if (!(flag & KSW_EZ_GENERIC_SC)) { + for (t = st0; t <= en0; t += 16) { + __m128i sq, st, tmp, mask; + sq = _mm_loadu_si128((__m128i*)&sf[t]); + st = _mm_loadu_si128((__m128i*)&qrr[t]); + mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_)); + tmp = _mm_cmpeq_epi8(sq, st); +#ifdef __SSE4_1__ + tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp); + tmp = _mm_blendv_epi8(tmp, sc_N_, mask); +#else + tmp = _mm_or_si128(_mm_andnot_si128(tmp, sc_mis_), _mm_and_si128(tmp, sc_mch_)); + tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp), _mm_and_si128(mask, sc_N_)); +#endif + _mm_storeu_si128((__m128i*)((int8_t*)s + t), tmp); + } + } else { + for (t = st0; t <= en0; ++t) + ((uint8_t*)s)[t] = mat[sf[t] * m + qrr[t]]; + } + // core loop + x1_ = _mm_cvtsi32_si128((uint8_t)x1); + x21_ = _mm_cvtsi32_si128((uint8_t)x21); + v1_ = _mm_cvtsi32_si128((uint8_t)v1); + st_ = st / 16, en_ = en / 16; + assert(en_ - st_ + 1 <= n_col_); + if (!with_cigar) { // score only + for (t = st_; t <= en_; ++t) { + __m128i z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp; + __dp_code_block1; +#ifdef __SSE4_1__ + z = _mm_max_epi8(z, a); + z = _mm_max_epi8(z, b); + z = _mm_max_epi8(z, a2); + z = _mm_max_epi8(z, b2); + z = _mm_min_epi8(z, sc_mch_); + __dp_code_block2; // save u[] and v[]; update a, b, a2 and b2 + _mm_store_si128(&x[t], _mm_sub_epi8(_mm_max_epi8(a, zero_), qe_)); + _mm_store_si128(&y[t], _mm_sub_epi8(_mm_max_epi8(b, zero_), qe_)); + _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_max_epi8(a2, zero_), qe2_)); + _mm_store_si128(&y2[t], _mm_sub_epi8(_mm_max_epi8(b2, zero_), qe2_)); +#else + tmp = _mm_cmpgt_epi8(a, z); + z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a)); + tmp = _mm_cmpgt_epi8(b, 
z); + z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b)); + tmp = _mm_cmpgt_epi8(a2, z); + z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2)); + tmp = _mm_cmpgt_epi8(b2, z); + z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b2)); + tmp = _mm_cmplt_epi8(sc_mch_, z); + z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z)); + __dp_code_block2; + tmp = _mm_cmpgt_epi8(a, zero_); + _mm_store_si128(&x[t], _mm_sub_epi8(_mm_and_si128(tmp, a), qe_)); + tmp = _mm_cmpgt_epi8(b, zero_); + _mm_store_si128(&y[t], _mm_sub_epi8(_mm_and_si128(tmp, b), qe_)); + tmp = _mm_cmpgt_epi8(a2, zero_); + _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_and_si128(tmp, a2), qe2_)); + tmp = _mm_cmpgt_epi8(b2, zero_); + _mm_store_si128(&y2[t], _mm_sub_epi8(_mm_and_si128(tmp, b2), qe2_)); +#endif + } + } else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment + __m128i *pr = p + (size_t)r * n_col_ - st_; + off[r] = st, off_end[r] = en; + for (t = st_; t <= en_; ++t) { + __m128i d, z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp; + __dp_code_block1; +#ifdef __SSE4_1__ + d = _mm_and_si128(_mm_cmpgt_epi8(a, z), _mm_set1_epi8(1)); // d = a > z? 1 : 0 + z = _mm_max_epi8(z, a); + d = _mm_blendv_epi8(d, _mm_set1_epi8(2), _mm_cmpgt_epi8(b, z)); // d = b > z? 2 : d + z = _mm_max_epi8(z, b); + d = _mm_blendv_epi8(d, _mm_set1_epi8(3), _mm_cmpgt_epi8(a2, z)); // d = a2 > z? 3 : d + z = _mm_max_epi8(z, a2); + d = _mm_blendv_epi8(d, _mm_set1_epi8(4), _mm_cmpgt_epi8(b2, z)); // d = a2 > z? 
3 : d + z = _mm_max_epi8(z, b2); + z = _mm_min_epi8(z, sc_mch_); +#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8() + tmp = _mm_cmpgt_epi8(a, z); + d = _mm_and_si128(tmp, _mm_set1_epi8(1)); + z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a)); + tmp = _mm_cmpgt_epi8(b, z); + d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(2))); + z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b)); + tmp = _mm_cmpgt_epi8(a2, z); + d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(3))); + z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2)); + tmp = _mm_cmpgt_epi8(b2, z); + d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(4))); + z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b2)); + tmp = _mm_cmplt_epi8(sc_mch_, z); + z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z)); +#endif + __dp_code_block2; + tmp = _mm_cmpgt_epi8(a, zero_); + _mm_store_si128(&x[t], _mm_sub_epi8(_mm_and_si128(tmp, a), qe_)); + d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0 + tmp = _mm_cmpgt_epi8(b, zero_); + _mm_store_si128(&y[t], _mm_sub_epi8(_mm_and_si128(tmp, b), qe_)); + d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0 + tmp = _mm_cmpgt_epi8(a2, zero_); + _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_and_si128(tmp, a2), qe2_)); + d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0 + tmp = _mm_cmpgt_epi8(b2, zero_); + _mm_store_si128(&y2[t], _mm_sub_epi8(_mm_and_si128(tmp, b2), qe2_)); + d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x40))); // d = b > 0? 
1<<6 : 0 + _mm_store_si128(&pr[t], d); + } + } else { // gap right-alignment + __m128i *pr = p + (size_t)r * n_col_ - st_; + off[r] = st, off_end[r] = en; + for (t = st_; t <= en_; ++t) { + __m128i d, z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp; + __dp_code_block1; +#ifdef __SSE4_1__ + d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), _mm_set1_epi8(1)); // d = z > a? 0 : 1 + z = _mm_max_epi8(z, a); + d = _mm_blendv_epi8(_mm_set1_epi8(2), d, _mm_cmpgt_epi8(z, b)); // d = z > b? d : 2 + z = _mm_max_epi8(z, b); + d = _mm_blendv_epi8(_mm_set1_epi8(3), d, _mm_cmpgt_epi8(z, a2)); // d = z > a2? d : 3 + z = _mm_max_epi8(z, a2); + d = _mm_blendv_epi8(_mm_set1_epi8(4), d, _mm_cmpgt_epi8(z, b2)); // d = z > b2? d : 4 + z = _mm_max_epi8(z, b2); + z = _mm_min_epi8(z, sc_mch_); +#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8() + tmp = _mm_cmpgt_epi8(z, a); + d = _mm_andnot_si128(tmp, _mm_set1_epi8(1)); + z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a)); + tmp = _mm_cmpgt_epi8(z, b); + d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(2))); + z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b)); + tmp = _mm_cmpgt_epi8(z, a2); + d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(3))); + z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a2)); + tmp = _mm_cmpgt_epi8(z, b2); + d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(4))); + z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b2)); + tmp = _mm_cmplt_epi8(sc_mch_, z); + z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z)); +#endif + __dp_code_block2; + tmp = _mm_cmpgt_epi8(zero_, a); + _mm_store_si128(&x[t], _mm_sub_epi8(_mm_andnot_si128(tmp, a), qe_)); + d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x08))); // d = a > 0? 
1<<3 : 0 + tmp = _mm_cmpgt_epi8(zero_, b); + _mm_store_si128(&y[t], _mm_sub_epi8(_mm_andnot_si128(tmp, b), qe_)); + d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0 + tmp = _mm_cmpgt_epi8(zero_, a2); + _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_andnot_si128(tmp, a2), qe2_)); + d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0 + tmp = _mm_cmpgt_epi8(zero_, b2); + _mm_store_si128(&y2[t], _mm_sub_epi8(_mm_andnot_si128(tmp, b2), qe2_)); + d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x40))); // d = b > 0? 1<<6 : 0 + _mm_store_si128(&pr[t], d); + } + } + if (!approx_max) { // find the exact max with a 32-bit score array + int32_t max_H, max_t; + // compute H[], max_H and max_t + if (r > 0) { + int32_t HH[4], tt[4], en1 = st0 + (en0 - st0) / 4 * 4, i; + __m128i max_H_, max_t_; + max_H = H[en0] = en0 > 0? H[en0-1] + u8[en0] : H[en0] + v8[en0]; // special casing the last element + max_t = en0; + max_H_ = _mm_set1_epi32(max_H); + max_t_ = _mm_set1_epi32(max_t); + for (t = st0; t < en1; t += 4) { // this implements: H[t]+=v8[t]-qe; if(H[t]>max_H) max_H=H[t],max_t=t; + __m128i H1, tmp, t_; + H1 = _mm_loadu_si128((__m128i*)&H[t]); + t_ = _mm_setr_epi32(v8[t], v8[t+1], v8[t+2], v8[t+3]); + H1 = _mm_add_epi32(H1, t_); + _mm_storeu_si128((__m128i*)&H[t], H1); + t_ = _mm_set1_epi32(t); + tmp = _mm_cmpgt_epi32(H1, max_H_); +#ifdef __SSE4_1__ + max_H_ = _mm_blendv_epi8(max_H_, H1, tmp); + max_t_ = _mm_blendv_epi8(max_t_, t_, tmp); +#else + max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_)); + max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_)); +#endif + } + _mm_storeu_si128((__m128i*)HH, max_H_); + _mm_storeu_si128((__m128i*)tt, max_t_); + for (i = 0; i < 4; ++i) + if (max_H < HH[i]) max_H = HH[i], max_t = tt[i] + i; + for (; t < en0; ++t) { // for the rest of values that haven't been computed with SSE + H[t] += (int32_t)v8[t]; + if (H[t] > max_H) 
+ max_H = H[t], max_t = t; + } + } else H[0] = v8[0] - qe, max_H = H[0], max_t = 0; // special casing r==0 + // update ez + if (en0 == tlen - 1 && H[en0] > ez->mte) + ez->mte = H[en0], ez->mte_q = r - en; + if (r - st0 == qlen - 1 && H[st0] > ez->mqe) + ez->mqe = H[st0], ez->mqe_t = st0; + if (ksw_apply_zdrop(ez, 1, max_H, r, max_t, zdrop, e2)) break; + if (r == qlen + tlen - 2 && en0 == tlen - 1) + ez->score = H[tlen - 1]; + } else { // find approximate max; Z-drop might be inaccurate, too. + if (r > 0) { + if (last_H0_t >= st0 && last_H0_t <= en0 && last_H0_t + 1 >= st0 && last_H0_t + 1 <= en0) { + int32_t d0 = v8[last_H0_t]; + int32_t d1 = u8[last_H0_t + 1]; + if (d0 > d1) H0 += d0; + else H0 += d1, ++last_H0_t; + } else if (last_H0_t >= st0 && last_H0_t <= en0) { + H0 += v8[last_H0_t]; + } else { + ++last_H0_t, H0 += u8[last_H0_t]; + } + } else H0 = v8[0] - qe, last_H0_t = 0; + if ((flag & KSW_EZ_APPROX_DROP) && ksw_apply_zdrop(ez, 1, H0, r, last_H0_t, zdrop, e2)) break; + if (r == qlen + tlen - 2 && en0 == tlen - 1) + ez->score = H0; + } + last_st = st, last_en = en; + //for (t = st0; t <= en0; ++t) printf("(%d,%d)\t(%d,%d,%d,%d)\t%d\n", r, t, ((int8_t*)u)[t], ((int8_t*)v)[t], ((int8_t*)x)[t], ((int8_t*)y)[t], H[t]); // for debugging + } + kfree(km, mem); + if (!approx_max) kfree(km, H); + if (with_cigar) { // backtrack + int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR); + if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY)) { + ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar); + } else if (!ez->zdropped && (flag&KSW_EZ_EXTZ_ONLY) && ez->mqe + end_bonus > (int)ez->max) { + ez->reach_end = 1; + ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->mqe_t, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar); + } else if (ez->max_t >= 0 && ez->max_q >= 0) { + ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, 
&ez->n_cigar, &ez->cigar); + } + kfree(km, mem2); kfree(km, off); + } +} +#endif // __SSE2__ diff --git a/lib/minimap2/ksw2_extd2_sse.c b/lib/minimap2/ksw2_extd2_sse.c index 162e9e264..8f96eb31d 100644 --- a/lib/minimap2/ksw2_extd2_sse.c +++ b/lib/minimap2/ksw2_extd2_sse.c @@ -358,7 +358,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin } else H[0] = v8[0] - qe, max_H = H[0], max_t = 0; // special casing r==0 // update ez if (en0 == tlen - 1 && H[en0] > ez->mte) - ez->mte = H[en0], ez->mte_q = r - en; + ez->mte = H[en0], ez->mte_q = r - en0; if (r - st0 == qlen - 1 && H[st0] > ez->mqe) ez->mqe = H[st0], ez->mqe_t = st0; if (ksw_apply_zdrop(ez, 1, max_H, r, max_t, zdrop, e2)) break; diff --git a/lib/minimap2/ksw2_exts2_sse.c b/lib/minimap2/ksw2_exts2_sse.c index 4157e3820..746778e2c 100644 --- a/lib/minimap2/ksw2_exts2_sse.c +++ b/lib/minimap2/ksw2_exts2_sse.c @@ -71,6 +71,7 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin ksw_reset_extz(ez); if (m <= 1 || qlen <= 0 || tlen <= 0 || q2 <= q + e) return; + assert((flag & KSW_EZ_SPLICE_FOR) == 0 || (flag & KSW_EZ_SPLICE_REV) == 0); // can't be both set zero_ = _mm_set1_epi8(0); q_ = _mm_set1_epi8(q); @@ -118,55 +119,93 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin // set the donor and acceptor arrays. TODO: this assumes 0/1/2/3 encoding! if (flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) { - int semi_cost = flag&KSW_EZ_SPLICE_FLANK? -noncan/2 : 0; // GTr or yAG is worth 0.5 bit; see PMID:18688272 - memset(donor, -noncan, tlen_ * 16); - memset(acceptor, -noncan, tlen_ * 16); + const int sp0[4] = { 8, 15, 21, 30 }; + int sp[4]; + if (flag & KSW_EZ_SPLICE_CMPLX) { + for (t = 0; t < 4; ++t) + sp[t] = (int)((double)sp0[t] / 3. + .499); + } else { + sp[0] = flag&KSW_EZ_SPLICE_FLANK? 
noncan / 2 : 0; + sp[1] = sp[2] = sp[3] = noncan; + } + memset(donor, -sp[3], tlen_ * 16); + memset(acceptor, -sp[3], tlen_ * 16); if (!(flag & KSW_EZ_REV_CIGAR)) { for (t = 0; t < tlen - 4; ++t) { - int can_type = 0; // type of canonical site: 0=none, 1=GT/AG only, 2=GTr/yAG - if ((flag & KSW_EZ_SPLICE_FOR) && target[t+1] == 2 && target[t+2] == 3) can_type = 1; // GTr... - if ((flag & KSW_EZ_SPLICE_REV) && target[t+1] == 1 && target[t+2] == 3) can_type = 1; // CTr... - if (can_type && (target[t+3] == 0 || target[t+3] == 2)) can_type = 2; - if (can_type) ((int8_t*)donor)[t] = can_type == 2? 0 : semi_cost; + int z = 3; + if (flag & KSW_EZ_SPLICE_FOR) { + if (target[t+1] == 2 && target[t+2] == 3) // |GT. + z = target[t+3] == 0 || target[t+3] == 2? -1 : 0; // |GTr or not + else if (target[t+1] == 2 && target[t+2] == 1) z = 1; // |GC. + else if (target[t+1] == 0 && target[t+2] == 3) z = 2; // |AT. + } else if (flag & KSW_EZ_SPLICE_REV) { + if (target[t+1] == 1 && target[t+2] == 3) // |CT. (revcomp of .AG|) + z = target[t+3] == 0 || target[t+3] == 2? -1 : 0; + else if (target[t+1] == 2 && target[t+2] == 3) z = 2; // |GT. (revcomp of .AC|) + } + ((int8_t*)donor)[t] = z < 0? 0 : -sp[z]; } - if (junc) - for (t = 0; t < tlen - 1; ++t) - if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t+1]&1)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t+1]&8))) - ((int8_t*)donor)[t] += junc_bonus; for (t = 2; t < tlen; ++t) { - int can_type = 0; - if ((flag & KSW_EZ_SPLICE_FOR) && target[t-1] == 0 && target[t] == 2) can_type = 1; // ...yAG - if ((flag & KSW_EZ_SPLICE_REV) && target[t-1] == 0 && target[t] == 1) can_type = 1; // ...yAC - if (can_type && (target[t-2] == 1 || target[t-2] == 3)) can_type = 2; - if (can_type) ((int8_t*)acceptor)[t] = can_type == 2? 0 : semi_cost; + int z = 3; + if (flag & KSW_EZ_SPLICE_FOR) { + if (target[t-1] == 0 && target[t] == 2) // .AG| + z = target[t-2] == 1 || target[t-2] == 3? 
-1 : 0; // yAG| or not + else if (target[t-1] == 0 && target[t] == 1) z = 2; // .AC| + } else if (flag & KSW_EZ_SPLICE_REV) { + if (target[t-1] == 0 && target[t] == 1) // .AC| (revcomp of |GT.) + z = target[t-2] == 1 || target[t-2] == 3? -1 : 0; // yAC| or not + else if (target[t-1] == 2 && target[t] == 1) z = 1; // .GC| (revcomp of |GC.) + else if (target[t-1] == 0 && target[t] == 3) z = 2; // .AT| (revcomp of |AT.) + } + ((int8_t*)acceptor)[t] = z < 0? 0 : -sp[z]; } - if (junc) - for (t = 0; t < tlen; ++t) - if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t]&2)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t]&4))) - ((int8_t*)acceptor)[t] += junc_bonus; } else { for (t = 0; t < tlen - 4; ++t) { - int can_type = 0; // type of canonical site: 0=none, 1=GT/AG only, 2=GTr/yAG - if ((flag & KSW_EZ_SPLICE_FOR) && target[t+1] == 2 && target[t+2] == 0) can_type = 1; // GAy... - if ((flag & KSW_EZ_SPLICE_REV) && target[t+1] == 1 && target[t+2] == 0) can_type = 1; // CAy... - if (can_type && (target[t+3] == 1 || target[t+3] == 3)) can_type = 2; - if (can_type) ((int8_t*)donor)[t] = can_type == 2? 0 : semi_cost; + int z = 3; + if (flag & KSW_EZ_SPLICE_FOR) { + if (target[t+1] == 2 && target[t+2] == 0) // |GA. (rev of .AG|) + z = target[t+3] == 1 || target[t+3] == 3? -1 : 0; + else if (target[t+1] == 1 && target[t+2] == 0) z = 2; // |CA. (rev of .AC|) + } else if (flag & KSW_EZ_SPLICE_REV) { + if (target[t+1] == 1 && target[t+2] == 0) // |CA. (comp of |GT.) + z = target[t+3] == 1 || target[t+3] == 3? -1 : 0; + else if (target[t+1] == 1 && target[t+2] == 2) z = 1; // |CG. (comp of |GC.) + else if (target[t+1] == 3 && target[t+2] == 0) z = 2; // |TA. (comp of |AT.) + } + ((int8_t*)donor)[t] = z < 0? 
0 : -sp[z]; } - if (junc) - for (t = 0; t < tlen - 1; ++t) - if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t+1]&2)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t+1]&4))) - ((int8_t*)donor)[t] += junc_bonus; for (t = 2; t < tlen; ++t) { - int can_type = 0; - if ((flag & KSW_EZ_SPLICE_FOR) && target[t-1] == 3 && target[t] == 2) can_type = 1; // ...rTG - if ((flag & KSW_EZ_SPLICE_REV) && target[t-1] == 3 && target[t] == 1) can_type = 1; // ...rTC - if (can_type && (target[t-2] == 0 || target[t-2] == 2)) can_type = 2; - if (can_type) ((int8_t*)acceptor)[t] = can_type == 2? 0 : semi_cost; + int z = 3; + if (flag & KSW_EZ_SPLICE_FOR) { + if (target[t-1] == 3 && target[t] == 2) // .TG| (rev of |GT.) + z = target[t-2] == 0 || target[t-2] == 2? -1 : 0; + else if (target[t-1] == 1 && target[t] == 2) z = 1; // .CG| (rev of |GC.) + else if (target[t-1] == 3 && target[t] == 0) z = 2; // .TA| (rev of |AT.) + } else if (flag & KSW_EZ_SPLICE_REV) { + if (target[t-1] == 3 && target[t] == 1) // .TC| (comp of .AG|) + z = target[t-2] == 0 || target[t-2] == 2? -1 : 0; + else if (target[t-1] == 3 && target[t] == 2) z = 2; // .TG| (comp of .AC|) + } + ((int8_t*)acceptor)[t] = z < 0? 
0 : -sp[z]; } - if (junc) - for (t = 0; t < tlen; ++t) - if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t]&1)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t]&8))) - ((int8_t*)acceptor)[t] += junc_bonus; + } + } + + if (junc) { + if (!(flag & KSW_EZ_REV_CIGAR)) { + for (t = 0; t < tlen - 1; ++t) + if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t+1]&1)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t+1]&8))) + ((int8_t*)donor)[t] += junc_bonus; + for (t = 0; t < tlen; ++t) + if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t]&2)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t]&4))) + ((int8_t*)acceptor)[t] += junc_bonus; + } else { + for (t = 0; t < tlen - 1; ++t) + if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t+1]&2)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t+1]&4))) + ((int8_t*)donor)[t] += junc_bonus; + for (t = 0; t < tlen; ++t) + if (((flag & KSW_EZ_SPLICE_FOR) && (junc[t]&1)) || ((flag & KSW_EZ_SPLICE_REV) && (junc[t]&8))) + ((int8_t*)acceptor)[t] += junc_bonus; } } @@ -376,7 +415,7 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin } else H[0] = v8[0] - qe, max_H = H[0], max_t = 0; // special casing r==0 // update ez if (en0 == tlen - 1 && H[en0] > ez->mte) - ez->mte = H[en0], ez->mte_q = r - en; + ez->mte = H[en0], ez->mte_q = r - en0; if (r - st0 == qlen - 1 && H[st0] > ez->mqe) ez->mqe = H[st0], ez->mqe_t = st0; if (ksw_apply_zdrop(ez, 1, max_H, r, max_t, zdrop, 0)) break; diff --git a/lib/minimap2/ksw2_extz2_sse.c b/lib/minimap2/ksw2_extz2_sse.c index ad1913140..a2154fe4f 100644 --- a/lib/minimap2/ksw2_extz2_sse.c +++ b/lib/minimap2/ksw2_extz2_sse.c @@ -269,7 +269,7 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin } else H[0] = v8[0] - qe - qe, max_H = H[0], max_t = 0; // special casing r==0 // update ez if (en0 == tlen - 1 && H[en0] > ez->mte) - ez->mte = H[en0], ez->mte_q = r - en; + ez->mte = H[en0], ez->mte_q = r - en0; if (r - st0 == qlen - 1 && H[st0] > ez->mqe) ez->mqe = H[st0], ez->mqe_t = st0; if (ksw_apply_zdrop(ez, 
1, max_H, r, max_t, zdrop, e)) break; diff --git a/lib/minimap2/kthread 2.c b/lib/minimap2/kthread 2.c new file mode 100644 index 000000000..ffdf9408c --- /dev/null +++ b/lib/minimap2/kthread 2.c @@ -0,0 +1,159 @@ +#include +#include +#include +#include +#include "kthread.h" + +#if (defined(WIN32) || defined(_WIN32)) && defined(_MSC_VER) +#define __sync_fetch_and_add(ptr, addend) _InterlockedExchangeAdd((void*)ptr, addend) +#endif + +/************ + * kt_for() * + ************/ + +struct kt_for_t; + +typedef struct { /* per-thread state of kt_for() */ + struct kt_for_t *t; /* job descriptor shared by all workers */ + long i; /* next task index this worker will claim */ +} ktf_worker_t; + +typedef struct kt_for_t { + int n_threads; + long n; /* number of tasks; a claimed index >= n means no work is left */ + ktf_worker_t *w; /* one entry per thread */ + void (*func)(void*,long,int); /* invoked as func(data, task index, worker index) */ + void *data; +} kt_for_t; + +static inline long steal_work(kt_for_t *t) /* claim one task from the worker whose cursor is smallest; returns the claimed index, or -1 when it is >= t->n */ +{ + int i, min_i = -1; + long k, min = LONG_MAX; + for (i = 0; i < t->n_threads; ++i) + if (min > t->w[i].i) min = t->w[i].i, min_i = i; + k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads); /* advance that worker's cursor by one stride of n_threads */ + return k >= t->n? -1 : k; +} + +static void *ktf_worker(void *data) /* thread entry point: drain this worker's own strided tasks, then steal from the others until steal_work() reports -1 */ +{ + ktf_worker_t *w = (ktf_worker_t*)data; + long i; + for (;;) { + i = __sync_fetch_and_add(&w->i, w->t->n_threads); + if (i >= w->t->n) break; + w->t->func(w->t->data, i, w - w->t->w); /* w - w->t->w is this worker's position in the worker array */ + } + while ((i = steal_work(w->t)) >= 0) + w->t->func(w->t->data, i, w - w->t->w); + pthread_exit(0); +} + +void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n) /* call func(data, j, worker index) once for every j in [0, n); with n_threads <= 1 everything runs in the calling thread with worker index 0 */ +{ + if (n_threads > 1) { + int i; + kt_for_t t; + pthread_t *tid; + t.func = func, t.data = data, t.n_threads = n_threads, t.n = n; + t.w = (ktf_worker_t*)calloc(n_threads, sizeof(ktf_worker_t)); /* NOTE(review): neither calloc() result is checked */ + tid = (pthread_t*)calloc(n_threads, sizeof(pthread_t)); + for (i = 0; i < n_threads; ++i) + t.w[i].t = &t, t.w[i].i = i; /* worker i starts at task i */ + for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker, &t.w[i]); /* NOTE(review): pthread_create() return value is ignored */ + for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0); /* t lives on this stack frame, so all workers are joined before returning */ + free(tid); free(t.w); + } else { + long j; + for (j = 0; j < n; ++j) func(data, j, 0); + } +} + +/***************** + * kt_pipeline() * +
*****************/ + +struct ktp_t; + +typedef struct { /* per-thread state of kt_pipeline() */ + struct ktp_t *pl; + int64_t index; /* ticket of the item being processed; workers holding lower tickets go first */ + int step; /* pipeline step this worker runs next */ + void *data; /* value returned by the previous step, passed to the next one */ +} ktp_worker_t; + +typedef struct ktp_t { + void *shared; + void *(*func)(void*, int, void*); /* invoked as func(shared, step, data from the previous step) */ + int64_t index; /* next ticket to hand out */ + int n_workers, n_steps; + ktp_worker_t *workers; + pthread_mutex_t mutex; + pthread_cond_t cv; +} ktp_t; + +static void *ktp_worker(void *data) /* thread entry point of kt_pipeline() */ +{ + ktp_worker_t *w = (ktp_worker_t*)data; + ktp_t *p = w->pl; + while (w->step < p->n_steps) { + // test whether we can kick off the job with this worker + pthread_mutex_lock(&p->mutex); + for (;;) { + int i; + // test whether another worker is doing the same step + for (i = 0; i < p->n_workers; ++i) { + if (w == &p->workers[i]) continue; // ignore itself + if (p->workers[i].step <= w->step && p->workers[i].index < w->index) + break; + } + if (i == p->n_workers) break; // no workers with smaller indices are doing w->step or the previous steps + pthread_cond_wait(&p->cv, &p->mutex); + } + pthread_mutex_unlock(&p->mutex); + + // working on w->step + w->data = p->func(p->shared, w->step, w->step? w->data : 0); // for the first step, input is NULL + + // update step and let other workers know + pthread_mutex_lock(&p->mutex); + w->step = w->step == p->n_steps - 1 || w->data? 
(w->step + 1) % p->n_steps : p->n_steps; /* advance, wrapping to step 0 after the last step; a NULL result before the last step sets step to n_steps, which ends the outer loop */ + if (w->step == 0) w->index = p->index++; /* starting a new item: take a fresh ticket */ + pthread_cond_broadcast(&p->cv); + pthread_mutex_unlock(&p->mutex); + } + pthread_exit(0); +} + +void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps) /* run func(shared_data, step, data) through n_steps steps using n_threads workers (raised to 1 if smaller); returns after every worker has exited */ +{ + ktp_t aux; + pthread_t *tid; + int i; + + if (n_threads < 1) n_threads = 1; + aux.n_workers = n_threads; + aux.n_steps = n_steps; + aux.func = func; + aux.shared = shared_data; + aux.index = 0; + pthread_mutex_init(&aux.mutex, 0); + pthread_cond_init(&aux.cv, 0); + + aux.workers = (ktp_worker_t*)calloc(n_threads, sizeof(ktp_worker_t)); /* NOTE(review): calloc() results here and below are not checked */ + for (i = 0; i < n_threads; ++i) { + ktp_worker_t *w = &aux.workers[i]; + w->step = 0; w->pl = &aux; w->data = 0; + w->index = aux.index++; /* initial tickets are 0..n_threads-1 */ + } + + tid = (pthread_t*)calloc(n_threads, sizeof(pthread_t)); + for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktp_worker, &aux.workers[i]); /* NOTE(review): pthread_create() return value is ignored */ + for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0); /* aux lives on this stack frame, so all workers are joined before it goes away */ + free(tid); free(aux.workers); + + pthread_mutex_destroy(&aux.mutex); + pthread_cond_destroy(&aux.cv); +} diff --git a/lib/minimap2/kthread 2.h b/lib/minimap2/kthread 2.h new file mode 100644 index 000000000..c3cd165e5 --- /dev/null +++ b/lib/minimap2/kthread 2.h @@ -0,0 +1,15 @@ +#ifndef KTHREAD_H +#define KTHREAD_H + +#ifdef __cplusplus +extern "C" { +#endif + +void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n); /* func receives (data, task index, worker index) */ +void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps); /* func receives (shared_data, step, output of the previous step) */ + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/lib/minimap2/lchain 2.c b/lib/minimap2/lchain 2.c new file mode 100644 index 000000000..244d301e4 --- /dev/null +++ b/lib/minimap2/lchain 2.c @@ -0,0 +1,368 @@ +#include +#include +#include +#include +#include "mmpriv.h" +#include "kalloc.h" +#include "krmq.h" + +static int64_t mg_chain_bk_end(int32_t max_drop, const mm128_t *z, const int32_t *f, const int64_t *p, int32_t *t, int64_t k) /* follow p[] back from anchor z[k].y and return the predecessor index at which the chain starting there should be cut */ +{ + int64_t i = z[k].y, end_i
= -1, max_i = i; + int32_t max_s = 0; + if (i < 0 || t[i] != 0) return i; /* no anchor, or it is already marked in t[] */ + do { + int32_t s; + t[i] = 2; /* temporary mark; cleared by the loop after this one */ + end_i = i = p[i]; + s = i < 0? z[k].x : (int32_t)z[k].x - f[i]; /* score of the part of the chain that follows predecessor i */ + if (s > max_s) max_s = s, max_i = i; + else if (max_s - s > max_drop) break; /* score fell more than max_drop below the best seen */ + } while (i >= 0 && t[i] == 0); + for (i = z[k].y; i >= 0 && i != end_i; i = p[i]) // reset modified t[] + t[i] = 0; + return max_i; +} + +uint64_t *mg_chain_backtrack(void *km, int64_t n, const int32_t *f, const int64_t *p, int32_t *v, int32_t *t, int32_t min_cnt, int32_t min_sc, int32_t max_drop, int32_t *n_u_, int32_t *n_v_) /* turn score array f[] and predecessor array p[] into chains: returns u[] (one score<<32|count entry per chain, *n_u_ entries) and writes the chained anchor indices to v[] (*n_v_ entries); t[] is scratch of length n; returns 0 when no f[i] reaches min_sc */ +{ + mm128_t *z; + uint64_t *u; + int64_t i, k, n_z, n_v; + int32_t n_u; + + *n_u_ = *n_v_ = 0; + for (i = 0, n_z = 0; i < n; ++i) // precompute n_z + if (f[i] >= min_sc) ++n_z; + if (n_z == 0) return 0; + KMALLOC(km, z, n_z); + for (i = 0, k = 0; i < n; ++i) // populate z[] + if (f[i] >= min_sc) z[k].x = f[i], z[k++].y = i; + radix_sort_128x(z, z + n_z); /* presumably ascending by x (the score), so the loops below start from the best candidate; confirm against the sort helper */ + + memset(t, 0, n * 4); /* 4 is the size of one int32_t element of t[] */ + for (k = n_z - 1, n_v = n_u = 0; k >= 0; --k) { // precompute n_u + if (t[z[k].y] == 0) { + int64_t n_v0 = n_v, end_i; + int32_t sc; + end_i = mg_chain_bk_end(max_drop, z, f, p, t, k); + for (i = z[k].y; i != end_i; i = p[i]) + ++n_v, t[i] = 1; /* t[i] = 1 marks the anchor as used by a chain */ + sc = i < 0? z[k].x : (int32_t)z[k].x - f[i]; + if (sc >= min_sc && n_v > n_v0 && n_v - n_v0 >= min_cnt) + ++n_u; + else n_v = n_v0; /* chain rejected: roll back the anchor count (the t[] marks stay) */ + } + } + KMALLOC(km, u, n_u); + memset(t, 0, n * 4); + for (k = n_z - 1, n_v = n_u = 0; k >= 0; --k) { // populate u[] + if (t[z[k].y] == 0) { + int64_t n_v0 = n_v, end_i; + int32_t sc; + end_i = mg_chain_bk_end(max_drop, z, f, p, t, k); + for (i = z[k].y; i != end_i; i = p[i]) + v[n_v++] = i, t[i] = 1; /* anchors are recorded from the chain end backwards */ + sc = i < 0? 
z[k].x : (int32_t)z[k].x - f[i]; + if (sc >= min_sc && n_v > n_v0 && n_v - n_v0 >= min_cnt) + u[n_u++] = (uint64_t)sc << 32 | (n_v - n_v0); /* high 32 bits: chain score; low 32 bits: number of anchors */ + else n_v = n_v0; + } + } + kfree(km, z); + assert(n_v < INT32_MAX); /* n_v is narrowed to int32_t on the next line */ + *n_u_ = n_u, *n_v_ = n_v; + return u; +} + +static mm128_t *compact_a(void *km, int32_t n_u, uint64_t *u, int32_t n_v, int32_t *v, mm128_t *a) /* gather the anchors listed in v[] into a new array of n_v entries, order the chains (and u[]) by the x of their first anchor, free v[] and a[], and return the new array */ +{ + mm128_t *b, *w; + uint64_t *u2; + int64_t i, j, k; + + // write the result to b[] + KMALLOC(km, b, n_v); + for (i = 0, k = 0; i < n_u; ++i) { + int32_t k0 = k, ni = (int32_t)u[i]; + for (j = 0; j < ni; ++j) + b[k++] = a[v[k0 + (ni - j - 1)]]; /* copies each chain's v[] entries in reverse order */ + } + kfree(km, v); + + // sort u[] and a[] by the target position, such that adjacent chains may be joined + KMALLOC(km, w, n_u); + for (i = k = 0; i < n_u; ++i) { + w[i].x = b[k].x, w[i].y = (uint64_t)k<<32|i; /* key: x of the chain's first anchor; payload: offset in b[] and chain index */ + k += (int32_t)u[i]; + } + radix_sort_128x(w, w + n_u); + KMALLOC(km, u2, n_u); + for (i = k = 0; i < n_u; ++i) { + int32_t j = (int32_t)w[i].y, n = (int32_t)u[j]; + u2[i] = u[j]; + memcpy(&a[k], &b[w[i].y>>32], n * sizeof(mm128_t)); + k += n; + } + memcpy(u, u2, n_u * 8); /* 8 is the size of one uint64_t element of u[] */ + memcpy(b, a, k * sizeof(mm128_t)); // write _a_ to _b_ and deallocate _a_ because _a_ is oversized, sometimes a lot + kfree(km, a); kfree(km, w); kfree(km, u2); + return b; +} + +static inline int32_t comput_sc(const mm128_t *ai, const mm128_t *aj, int32_t max_dist_x, int32_t max_dist_y, int32_t bw, float chn_pen_gap, float chn_pen_skip, int is_cdna, int n_seg) /* score gained by chaining anchor ai after aj; INT32_MIN means the pair must not be chained */ +{ + int32_t dq = (int32_t)ai->y - (int32_t)aj->y, dr, dd, dg, q_span, sc; + int32_t sidi = (ai->y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT; + int32_t sidj = (aj->y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT; + if (dq <= 0 || dq > max_dist_x) return INT32_MIN; /* NOTE(review): dq comes from the y fields but is bounded by max_dist_x here; confirm the x/y naming is intended */ + dr = (int32_t)(ai->x - aj->x); + if (sidi == sidj && (dr == 0 || dq > max_dist_y)) return INT32_MIN; + dd = dr > dq? dr - dq : dq - dr; /* dd = |dr - dq| */ + if (sidi == sidj && dd > bw) return INT32_MIN; + if (n_seg > 1 && !is_cdna && sidi == sidj && dr > max_dist_y) return INT32_MIN; + dg = dr < dq? 
dr : dq; + q_span = aj->y>>32&0xff; + sc = q_span < dg? q_span : dg; + if (dd || dg > q_span) { + float lin_pen, log_pen; + lin_pen = chn_pen_gap * (float)dd + chn_pen_skip * (float)dg; + log_pen = dd >= 1? mg_log2(dd + 1) : 0.0f; // mg_log2() only works for dd>=2 + if (is_cdna || sidi != sidj) { + if (sidi != sidj && dr == 0) ++sc; // possibly due to overlapping paired ends; give a minor bonus + else if (dr > dq || sidi != sidj) sc -= (int)(lin_pen < log_pen? lin_pen : log_pen); // deletion or jump between paired ends + else sc -= (int)(lin_pen + .5f * log_pen); + } else sc -= (int)(lin_pen + .5f * log_pen); + } + return sc; +} + +/* Input: + * a[].x: rev<<63 | tid<<32 | tpos + * a[].y: flags<<40 | q_span<<32 | q_pos + * Output: + * n_u: #chains + * u[]: score<<32 | #anchors (sum of lower 32 bits of u[] is the returned length of a[]) + * input a[] is deallocated on return + */ +mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip, + int is_cdna, int n_seg, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km) +{ // TODO: make sure this works when n has more than 32 bits + int32_t *f, *t, *v, n_u, n_v, mmax_f = 0, max_drop = bw; /* mmax_f is updated in the loop but never read in this function */ + int64_t *p, i, j, max_ii, st = 0, n_iter = 0; /* n_iter is only incremented, never read in this function */ + uint64_t *u; + + if (_u) *_u = 0, *n_u_ = 0; /* NOTE(review): _u is NULL-checked here but *_u and *n_u_ are written unconditionally after backtracking */ + if (n == 0 || a == 0) { + kfree(km, a); + return 0; + } + if (max_dist_x < bw) max_dist_x = bw; + if (max_dist_y < bw && !is_cdna) max_dist_y = bw; + if (is_cdna) max_drop = INT32_MAX; + KMALLOC(km, p, n); + KMALLOC(km, f, n); + KMALLOC(km, v, n); + KCALLOC(km, t, n); /* t[] must start zeroed: it is compared against the anchor index i below */ + + // fill the score and backtrack arrays + for (i = 0, max_ii = -1; i < n; ++i) { + int64_t max_j = -1, end_j; + int32_t max_f = a[i].y>>32&0xff, n_skip = 0; /* a chain of anchor i alone scores its q_span */ + while (st < i && (a[i].x>>32 != a[st].x>>32 || a[i].x > a[st].x + max_dist_x)) ++st; /* drop candidates whose upper 32 bits of x differ or that lie more than max_dist_x behind */ + if (i - st > max_iter) st = i - max_iter; /* look back at most max_iter anchors */ + for (j = i - 1; j >= st; --j) { + int32_t sc; + sc = comput_sc(&a[i], &a[j], max_dist_x, 
max_dist_y, bw, chn_pen_gap, chn_pen_skip, is_cdna, n_seg); + ++n_iter; + if (sc == INT32_MIN) continue; /* pair rejected by comput_sc() */ + sc += f[j]; + if (sc > max_f) { + max_f = sc, max_j = j; + if (n_skip > 0) --n_skip; + } else if (t[j] == (int32_t)i) { /* j was tagged for this i as the predecessor of an anchor already visited */ + if (++n_skip > max_skip) + break; + } + if (p[j] >= 0) t[p[j]] = i; /* tag j's predecessor with the current anchor index */ + } + end_j = j; /* first index the loop above did not examine (st - 1 if it ran to completion) */ + if (max_ii < 0 || a[i].x - a[max_ii].x > (int64_t)max_dist_x) { /* cached best anchor is missing or too far back: rescan the window for the highest f[] */ + int32_t max = INT32_MIN; + max_ii = -1; + for (j = i - 1; j >= st; --j) + if (max < f[j]) max = f[j], max_ii = j; + } + if (max_ii >= 0 && max_ii < end_j) { /* the loop above stopped before reaching max_ii, so try it explicitly */ + int32_t tmp; + tmp = comput_sc(&a[i], &a[max_ii], max_dist_x, max_dist_y, bw, chn_pen_gap, chn_pen_skip, is_cdna, n_seg); + if (tmp != INT32_MIN && max_f < tmp + f[max_ii]) + max_f = tmp + f[max_ii], max_j = max_ii; + } + f[i] = max_f, p[i] = max_j; + v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak + if (max_ii < 0 || (a[i].x - a[max_ii].x <= (int64_t)max_dist_x && f[max_ii] < f[i])) + max_ii = i; + if (mmax_f < max_f) mmax_f = max_f; + } + + u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, max_drop, &n_u, &n_v); /* v[] and t[] are handed over as output and scratch buffers */ + *n_u_ = n_u, *_u = u; // NB: note that u[] may not be sorted by score here + kfree(km, p); kfree(km, f); kfree(km, t); + if (n_u == 0) { + kfree(km, a); kfree(km, v); + return 0; + } + return compact_a(km, n_u, u, n_v, v, a); /* compact_a() frees v[] and a[] */ +} + +typedef struct lc_elem_s { /* tree node used by the RMQ-based chaining code that follows */ + int32_t y; + int64_t i; + double pri; + KRMQ_HEAD(struct lc_elem_s) head; +} lc_elem_t; + +#define lc_elem_cmp(a, b) ((a)->y < (b)->y? -1 : (a)->y > (b)->y? 
1 : ((a)->i > (b)->i) - ((a)->i < (b)->i)) +#define lc_elem_lt2(a, b) ((a)->pri < (b)->pri) +KRMQ_INIT(lc_elem, lc_elem_t, head, lc_elem_cmp, lc_elem_lt2) + +KALLOC_POOL_INIT(rmq, lc_elem_t) + +static inline int32_t comput_sc_simple(const mm128_t *ai, const mm128_t *aj, float chn_pen_gap, float chn_pen_skip, int32_t *exact, int32_t *width) /* score gained by chaining ai after aj, with no distance filtering; always writes *width = |dr - dq| and, when exact is non-NULL, writes whether the pair has no gap and dg <= q_span */ +{ + int32_t dq = (int32_t)ai->y - (int32_t)aj->y, dr, dd, dg, q_span, sc; + dr = (int32_t)(ai->x - aj->x); + *width = dd = dr > dq? dr - dq : dq - dr; + dg = dr < dq? dr : dq; /* dg = min(dr, dq) */ + q_span = aj->y>>32&0xff; + sc = q_span < dg? q_span : dg; + if (exact) *exact = (dd == 0 && dg <= q_span); + if (dd || dq > q_span) { /* NOTE(review): comput_sc() tests dg > q_span at this point; confirm that dq is intended here */ + float lin_pen, log_pen; + lin_pen = chn_pen_gap * (float)dd + chn_pen_skip * (float)dg; + log_pen = dd >= 1? mg_log2(dd + 1) : 0.0f; // mg_log2() only works for dd>=2 + sc -= (int)(lin_pen + .5f * log_pen); + } + return sc; +} + +mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip, + int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km) /* chain anchors a[0..n-1] using range-min-query trees keyed on the y field; same outputs as mg_chain_backtrack() followed by compact_a(): returns the compacted anchors, sets *n_u_ and *_u, and frees the input a[] */ +{ + int32_t *f,*t, *v, n_u, n_v, mmax_f = 0, max_rmq_size = 0, max_drop = bw; /* mmax_f and max_rmq_size are updated in the loop but never read in this function */ + int64_t *p, i, i0, st = 0, st_inner = 0; + uint64_t *u; + lc_elem_t *root = 0, *root_inner = 0; + void *mem_mp = 0; + kmp_rmq_t *mp; + + if (_u) *_u = 0, *n_u_ = 0; /* NOTE(review): _u is NULL-checked here but *_u and *n_u_ are written unconditionally after backtracking */ + if (n == 0 || a == 0) { + kfree(km, a); + return 0; + } + if (max_dist < bw) max_dist = bw; + if (max_dist_inner <= 0 || max_dist_inner >= max_dist) max_dist_inner = 0; /* 0 disables the inner tree */ + KMALLOC(km, p, n); + KMALLOC(km, f, n); + KCALLOC(km, t, n); + KMALLOC(km, v, n); + mem_mp = km_init2(km, 0x10000); + mp = kmp_init_rmq(mem_mp); /* pool that supplies and recycles the tree nodes */ + + // fill the score and backtrack arrays + for (i = i0 = 0; i < n; ++i) { + int64_t max_j = -1; + int32_t q_span = a[i].y>>32&0xff, max_f = q_span; + lc_elem_t s, *q, *r, lo, hi; + // add in-range anchors + if (i0 < i && a[i0].x != a[i].x) { /* anchors i0..i-1 are inserted only once a[i].x differs from a[i0].x */ + int64_t j; + for (j = i0; j < i; ++j) { + q = kmp_alloc_rmq(mp); + q->y = 
(int32_t)a[j].y, q->i = j, q->pri = -(f[j] + 0.5 * chn_pen_gap * ((int32_t)a[j].x + (int32_t)a[j].y)); /* pri is negated, so the smallest pri corresponds to the largest f[j] plus position term */ + krmq_insert(lc_elem, &root, q, 0); + if (max_dist_inner > 0) { + r = kmp_alloc_rmq(mp); + *r = *q; /* the inner tree holds its own copy of the node */ + krmq_insert(lc_elem, &root_inner, r, 0); + } + } + i0 = i; + } + // get rid of active chains out of range + while (st < i && (a[i].x>>32 != a[st].x>>32 || a[i].x > a[st].x + max_dist || krmq_size(head, root) > cap_rmq_size)) { + s.y = (int32_t)a[st].y, s.i = st; + if ((q = krmq_find(lc_elem, root, &s, 0)) != 0) { + q = krmq_erase(lc_elem, &root, q, 0); + kmp_free_rmq(mp, q); + } + ++st; + } + if (max_dist_inner > 0) { // similar to the block above, but applied to the inner tree + while (st_inner < i && (a[i].x>>32 != a[st_inner].x>>32 || a[i].x > a[st_inner].x + max_dist_inner || krmq_size(head, root_inner) > cap_rmq_size)) { + s.y = (int32_t)a[st_inner].y, s.i = st_inner; + if ((q = krmq_find(lc_elem, root_inner, &s, 0)) != 0) { + q = krmq_erase(lc_elem, &root_inner, q, 0); + kmp_free_rmq(mp, q); + } + ++st_inner; + } + } + // RMQ + lo.i = INT32_MAX, lo.y = (int32_t)a[i].y - max_dist; + hi.i = 0, hi.y = (int32_t)a[i].y; + if ((q = krmq_rmq(lc_elem, root, &lo, &hi)) != 0) { + int32_t sc, exact, width, n_skip = 0; + int64_t j = q->i; + assert(q->y >= lo.y && q->y <= hi.y); + sc = f[j] + comput_sc_simple(&a[i], &a[j], chn_pen_gap, chn_pen_skip, &exact, &width); + if (width <= bw && sc > max_f) max_f = sc, max_j = j; + if (!exact && root_inner && (int32_t)a[i].y > 0) { /* candidate is not an exact extension: also walk the inner tree */ + lc_elem_t *lo, *hi; /* these pointers shadow the outer lo/hi structs */ + s.y = (int32_t)a[i].y - 1, s.i = n; + krmq_interval(lc_elem, root_inner, &s, &lo, &hi); + if (lo) { + const lc_elem_t *q; /* shadows the outer q */ + int32_t width, n_rmq_iter = 0; /* width shadows the outer one; n_rmq_iter is only incremented, never read */ + krmq_itr_t(lc_elem) itr; + krmq_itr_find(lc_elem, root_inner, lo, &itr); + while ((q = krmq_at(&itr)) != 0) { + if (q->y < (int32_t)a[i].y - max_dist_inner) break; + ++n_rmq_iter; + j = q->i; + sc = f[j] + comput_sc_simple(&a[i], &a[j], chn_pen_gap, chn_pen_skip, 0, &width); + if (width <= bw) { + if (sc > max_f) { + max_f 
= sc, max_j = j; + if (n_skip > 0) --n_skip; + } else if (t[j] == (int32_t)i) { /* j was tagged for this i as the predecessor of an anchor already visited */ + if (++n_skip > max_chn_skip) + break; + } + if (p[j] >= 0) t[p[j]] = i; /* tag j's predecessor with the current anchor index */ + } + if (!krmq_itr_prev(lc_elem, &itr)) break; + } + } + } + } + // set max + assert(max_j < 0 || (a[max_j].x < a[i].x && (int32_t)a[max_j].y < (int32_t)a[i].y)); + f[i] = max_f, p[i] = max_j; + v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak + if (mmax_f < max_f) mmax_f = max_f; + if (max_rmq_size < krmq_size(head, root)) max_rmq_size = krmq_size(head, root); + } + km_destroy(mem_mp); /* tears down the allocator created by km_init2() above */ + + u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, max_drop, &n_u, &n_v); + *n_u_ = n_u, *_u = u; // NB: note that u[] may not be sorted by score here + kfree(km, p); kfree(km, f); kfree(km, t); + if (n_u == 0) { + kfree(km, a); kfree(km, v); + return 0; + } + return compact_a(km, n_u, u, n_v, v, a); +} diff --git a/lib/minimap2/lchain.c b/lib/minimap2/lchain.c index 244d301e4..7df5cab5d 100644 --- a/lib/minimap2/lchain.c +++ b/lib/minimap2/lchain.c @@ -35,7 +35,7 @@ uint64_t *mg_chain_backtrack(void *km, int64_t n, const int32_t *f, const int64_ for (i = 0, n_z = 0; i < n; ++i) // precompute n_z if (f[i] >= min_sc) ++n_z; if (n_z == 0) return 0; - KMALLOC(km, z, n_z); + z = Kmalloc(km, mm128_t, n_z); for (i = 0, k = 0; i < n; ++i) // populate z[] if (f[i] >= min_sc) z[k].x = f[i], z[k++].y = i; radix_sort_128x(z, z + n_z); @@ -54,7 +54,7 @@ uint64_t *mg_chain_backtrack(void *km, int64_t n, const int32_t *f, const int64_ else n_v = n_v0; } } - KMALLOC(km, u, n_u); + u = Kmalloc(km, uint64_t, n_u); memset(t, 0, n * 4); for (k = n_z - 1, n_v = n_u = 0; k >= 0; --k) { // populate u[] if (t[z[k].y] == 0) { @@ -82,7 +82,7 @@ static mm128_t *compact_a(void *km, int32_t n_u, uint64_t *u, int32_t n_v, int32 int64_t i, j, k; // write the result to b[] - KMALLOC(km, b, n_v); + b = Kmalloc(km, mm128_t, n_v); for (i = 0, k = 0; i < n_u; 
++i) { int32_t k0 = k, ni = (int32_t)u[i]; for (j = 0; j < ni; ++j) @@ -91,13 +91,13 @@ static mm128_t *compact_a(void *km, int32_t n_u, uint64_t *u, int32_t n_v, int32 kfree(km, v); // sort u[] and a[] by the target position, such that adjacent chains may be joined - KMALLOC(km, w, n_u); + w = Kmalloc(km, mm128_t, n_u); for (i = k = 0; i < n_u; ++i) { w[i].x = b[k].x, w[i].y = (uint64_t)k<<32|i; k += (int32_t)u[i]; } radix_sort_128x(w, w + n_u); - KMALLOC(km, u2, n_u); + u2 = Kmalloc(km, uint64_t, n_u); for (i = k = 0; i < n_u; ++i) { int32_t j = (int32_t)w[i].y, n = (int32_t)u[j]; u2[i] = u[j]; @@ -160,10 +160,10 @@ mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int if (max_dist_x < bw) max_dist_x = bw; if (max_dist_y < bw && !is_cdna) max_dist_y = bw; if (is_cdna) max_drop = INT32_MAX; - KMALLOC(km, p, n); - KMALLOC(km, f, n); - KMALLOC(km, v, n); - KCALLOC(km, t, n); + p = Kmalloc(km, int64_t, n); + f = Kmalloc(km, int32_t, n); + v = Kmalloc(km, int32_t, n); + t = Kcalloc(km, int32_t, n); // fill the score and backtrack arrays for (i = 0, max_ii = -1; i < n; ++i) { @@ -264,10 +264,10 @@ mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_ski } if (max_dist < bw) max_dist = bw; if (max_dist_inner <= 0 || max_dist_inner >= max_dist) max_dist_inner = 0; - KMALLOC(km, p, n); - KMALLOC(km, f, n); - KCALLOC(km, t, n); - KMALLOC(km, v, n); + p = Kmalloc(km, int64_t, n); + f = Kmalloc(km, int32_t, n); + t = Kcalloc(km, int32_t, n); + v = Kmalloc(km, int32_t, n); mem_mp = km_init2(km, 0x10000); mp = kmp_init_rmq(mem_mp); diff --git a/lib/minimap2/main 2.c b/lib/minimap2/main 2.c new file mode 100644 index 000000000..135e26078 --- /dev/null +++ b/lib/minimap2/main 2.c @@ -0,0 +1,464 @@ +#include +#include +#include +#include +#include "bseq.h" +#include "minimap.h" +#include "mmpriv.h" +#include "ketopt.h" + +#define MM_VERSION "2.24-r1155-dirty" + +#ifdef __linux__ +#include +#include +void liftrlimit() +{ + struct 
rlimit r; + getrlimit(RLIMIT_AS, &r); + r.rlim_cur = r.rlim_max; + setrlimit(RLIMIT_AS, &r); +} +#else +void liftrlimit() {} +#endif + +static ko_longopt_t long_options[] = { + { "bucket-bits", ko_required_argument, 300 }, + { "mb-size", ko_required_argument, 'K' }, + { "seed", ko_required_argument, 302 }, + { "no-kalloc", ko_no_argument, 303 }, + { "print-qname", ko_no_argument, 304 }, + { "no-self", ko_no_argument, 'D' }, + { "print-seeds", ko_no_argument, 306 }, + { "max-chain-skip", ko_required_argument, 307 }, + { "min-dp-len", ko_required_argument, 308 }, + { "print-aln-seq", ko_no_argument, 309 }, + { "splice", ko_no_argument, 310 }, + { "cost-non-gt-ag", ko_required_argument, 'C' }, + { "no-long-join", ko_no_argument, 312 }, + { "sr", ko_no_argument, 313 }, + { "frag", ko_required_argument, 314 }, + { "secondary", ko_required_argument, 315 }, + { "cs", ko_optional_argument, 316 }, + { "end-bonus", ko_required_argument, 317 }, + { "no-pairing", ko_no_argument, 318 }, + { "splice-flank", ko_required_argument, 319 }, + { "idx-no-seq", ko_no_argument, 320 }, + { "end-seed-pen", ko_required_argument, 321 }, + { "for-only", ko_no_argument, 322 }, + { "rev-only", ko_no_argument, 323 }, + { "heap-sort", ko_required_argument, 324 }, + { "all-chain", ko_no_argument, 'P' }, + { "dual", ko_required_argument, 326 }, + { "max-clip-ratio", ko_required_argument, 327 }, + { "min-occ-floor", ko_required_argument, 328 }, + { "MD", ko_no_argument, 329 }, + { "lj-min-ratio", ko_required_argument, 330 }, + { "score-N", ko_required_argument, 331 }, + { "eqx", ko_no_argument, 332 }, + { "paf-no-hit", ko_no_argument, 333 }, + { "split-prefix", ko_required_argument, 334 }, + { "no-end-flt", ko_no_argument, 335 }, + { "hard-mask-level",ko_no_argument, 336 }, + { "cap-sw-mem", ko_required_argument, 337 }, + { "max-qlen", ko_required_argument, 338 }, + { "max-chain-iter", ko_required_argument, 339 }, + { "junc-bed", ko_required_argument, 340 }, + { "junc-bonus", ko_required_argument, 
341 }, + { "sam-hit-only", ko_no_argument, 342 }, + { "chain-gap-scale",ko_required_argument, 343 }, + { "alt", ko_required_argument, 344 }, + { "alt-drop", ko_required_argument, 345 }, + { "mask-len", ko_required_argument, 346 }, + { "rmq", ko_optional_argument, 347 }, + { "qstrand", ko_no_argument, 348 }, + { "cap-kalloc", ko_required_argument, 349 }, + { "q-occ-frac", ko_required_argument, 350 }, + { "chain-skip-scale",ko_required_argument,351 }, + { "print-chains", ko_no_argument, 352 }, + { "no-hash-name", ko_no_argument, 353 }, + { "secondary-seq", ko_no_argument, 354 }, + { "help", ko_no_argument, 'h' }, + { "max-intron-len", ko_required_argument, 'G' }, + { "version", ko_no_argument, 'V' }, + { "min-count", ko_required_argument, 'n' }, + { "min-chain-score",ko_required_argument, 'm' }, + { "mask-level", ko_required_argument, 'M' }, + { "min-dp-score", ko_required_argument, 's' }, + { "sam", ko_no_argument, 'a' }, + { 0, 0, 0 } +}; + +static inline int64_t mm_parse_num2(const char *str, char **q) +{ + double x; + char *p; + x = strtod(str, &p); + if (*p == 'G' || *p == 'g') x *= 1e9, ++p; + else if (*p == 'M' || *p == 'm') x *= 1e6, ++p; + else if (*p == 'K' || *p == 'k') x *= 1e3, ++p; + if (q) *q = p; + return (int64_t)(x + .499); +} + +static inline int64_t mm_parse_num(const char *str) +{ + return mm_parse_num2(str, 0); +} + +static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const char *arg, int yes_to_set) +{ + if (yes_to_set) { + if (strcmp(arg, "yes") == 0 || strcmp(arg, "y") == 0) opt->flag |= flag; + else if (strcmp(arg, "no") == 0 || strcmp(arg, "n") == 0) opt->flag &= ~flag; + else fprintf(stderr, "[WARNING]\033[1;31m option '--%s' only accepts 'yes' or 'no'.\033[0m\n", long_options[long_idx].name); + } else { + if (strcmp(arg, "yes") == 0 || strcmp(arg, "y") == 0) opt->flag &= ~flag; + else if (strcmp(arg, "no") == 0 || strcmp(arg, "n") == 0) opt->flag |= flag; + else fprintf(stderr, "[WARNING]\033[1;31m option '--%s' 
only accepts 'yes' or 'no'.\033[0m\n", long_options[long_idx].name); + } +} + +int main(int argc, char *argv[]) +{ + const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:j:"; + ketopt_t o = KETOPT_INIT; + mm_mapopt_t opt; + mm_idxopt_t ipt; + int i, c, n_threads = 3, n_parts, old_best_n = -1; + char *fnw = 0, *rg = 0, *junc_bed = 0, *s, *alt_list = 0; + FILE *fp_help = stderr; + mm_idx_reader_t *idx_rdr; + mm_idx_t *mi; + + mm_verbose = 3; + liftrlimit(); + mm_realtime0 = realtime(); + mm_set_opt(0, &ipt, &opt); + + while ((c = ketopt(&o, argc, argv, 1, opt_str, long_options)) >= 0) { // test command line options and apply option -x/preset first + if (c == 'x') { + if (mm_set_opt(o.arg, &ipt, &opt) < 0) { + fprintf(stderr, "[ERROR] unknown preset '%s'\n", o.arg); + return 1; + } + } else if (c == ':') { + fprintf(stderr, "[ERROR] missing option argument\n"); + return 1; + } else if (c == '?') { + fprintf(stderr, "[ERROR] unknown option in \"%s\"\n", argv[o.i - 1]); + return 1; + } + } + o = KETOPT_INIT; + + while ((c = ketopt(&o, argc, argv, 1, opt_str, long_options)) >= 0) { + if (c == 'w') ipt.w = atoi(o.arg), ipt.flag &= ~MM_I_SYNCMER; + else if (c == 'j') ipt.w = atoi(o.arg), ipt.flag |= MM_I_SYNCMER; + else if (c == 'k') ipt.k = atoi(o.arg); + else if (c == 'H') ipt.flag |= MM_I_HPC; + else if (c == 'd') fnw = o.arg; // the above are indexing related options, except -I + else if (c == 't') n_threads = atoi(o.arg); + else if (c == 'v') mm_verbose = atoi(o.arg); + else if (c == 'g') opt.max_gap = (int)mm_parse_num(o.arg); + else if (c == 'G') mm_mapopt_max_intron_len(&opt, (int)mm_parse_num(o.arg)); + else if (c == 'F') opt.max_frag_len = (int)mm_parse_num(o.arg); + else if (c == 'N') old_best_n = opt.best_n, opt.best_n = atoi(o.arg); + else if (c == 'p') opt.pri_ratio = atof(o.arg); + else if (c == 'M') opt.mask_level = atof(o.arg); + else if (c == 'c') opt.flag |= MM_F_OUT_CG | MM_F_CIGAR; + else if (c == 'D') 
opt.flag |= MM_F_NO_DIAG; + else if (c == 'P') opt.flag |= MM_F_ALL_CHAINS; + else if (c == 'X') opt.flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN; // -D -P --no-long-join --dual=no + else if (c == 'a') opt.flag |= MM_F_OUT_SAM | MM_F_CIGAR; + else if (c == 'Q') opt.flag |= MM_F_NO_QUAL; + else if (c == 'Y') opt.flag |= MM_F_SOFTCLIP; + else if (c == 'L') opt.flag |= MM_F_LONG_CIGAR; + else if (c == 'y') opt.flag |= MM_F_COPY_COMMENT; + else if (c == 'T') opt.sdust_thres = atoi(o.arg); + else if (c == 'n') opt.min_cnt = atoi(o.arg); + else if (c == 'm') opt.min_chain_score = atoi(o.arg); + else if (c == 'A') opt.a = atoi(o.arg); + else if (c == 'B') opt.b = atoi(o.arg); + else if (c == 's') opt.min_dp_max = atoi(o.arg); + else if (c == 'C') opt.noncan = atoi(o.arg); + else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg); + else if (c == 'K') opt.mini_batch_size = mm_parse_num(o.arg); + else if (c == 'e') opt.occ_dist = mm_parse_num(o.arg); + else if (c == 'R') rg = o.arg; + else if (c == 'h') fp_help = stdout; + else if (c == '2') opt.flag |= MM_F_2_IO_THREADS; + else if (c == 'o') { + if (strcmp(o.arg, "-") != 0) { + if (freopen(o.arg, "wb", stdout) == NULL) { + fprintf(stderr, "[ERROR]\033[1;31m failed to write the output to file '%s'\033[0m: %s\n", o.arg, strerror(errno)); + exit(1); + } + } + } + else if (c == 300) ipt.bucket_bits = atoi(o.arg); // --bucket-bits + else if (c == 302) opt.seed = atoi(o.arg); // --seed + else if (c == 303) mm_dbg_flag |= MM_DBG_NO_KALLOC; // --no-kalloc + else if (c == 304) mm_dbg_flag |= MM_DBG_PRINT_QNAME; // --print-qname + else if (c == 306) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_SEED, n_threads = 1; // --print-seed + else if (c == 307) opt.max_chain_skip = atoi(o.arg); // --max-chain-skip + else if (c == 339) opt.max_chain_iter = atoi(o.arg); // --max-chain-iter + else if (c == 308) opt.min_ksw_len = atoi(o.arg); // --min-dp-len + else if (c == 309) mm_dbg_flag |= MM_DBG_PRINT_QNAME | 
MM_DBG_PRINT_ALN_SEQ, n_threads = 1; // --print-aln-seq + else if (c == 310) opt.flag |= MM_F_SPLICE; // --splice + else if (c == 312) opt.flag |= MM_F_NO_LJOIN; // --no-long-join + else if (c == 313) opt.flag |= MM_F_SR; // --sr + else if (c == 317) opt.end_bonus = atoi(o.arg); // --end-bonus + else if (c == 318) opt.flag |= MM_F_INDEPEND_SEG; // --no-pairing + else if (c == 320) ipt.flag |= MM_I_NO_SEQ; // --idx-no-seq + else if (c == 321) opt.anchor_ext_shift = atoi(o.arg); // --end-seed-pen + else if (c == 322) opt.flag |= MM_F_FOR_ONLY; // --for-only + else if (c == 323) opt.flag |= MM_F_REV_ONLY; // --rev-only + else if (c == 327) opt.max_clip_ratio = atof(o.arg); // --max-clip-ratio + else if (c == 328) opt.min_mid_occ = atoi(o.arg); // --min-occ-floor + else if (c == 329) opt.flag |= MM_F_OUT_MD; // --MD + else if (c == 331) opt.sc_ambi = atoi(o.arg); // --score-N + else if (c == 332) opt.flag |= MM_F_EQX; // --eqx + else if (c == 333) opt.flag |= MM_F_PAF_NO_HIT; // --paf-no-hit + else if (c == 334) opt.split_prefix = o.arg; // --split-prefix + else if (c == 335) opt.flag |= MM_F_NO_END_FLT; // --no-end-flt + else if (c == 336) opt.flag |= MM_F_HARD_MLEVEL; // --hard-mask-level + else if (c == 337) opt.max_sw_mat = mm_parse_num(o.arg); // --cap-sw-mat + else if (c == 338) opt.max_qlen = mm_parse_num(o.arg); // --max-qlen + else if (c == 340) junc_bed = o.arg; // --junc-bed + else if (c == 341) opt.junc_bonus = atoi(o.arg); // --junc-bonus + else if (c == 342) opt.flag |= MM_F_SAM_HIT_ONLY; // --sam-hit-only + else if (c == 343) opt.chain_gap_scale = atof(o.arg); // --chain-gap-scale + else if (c == 351) opt.chain_skip_scale = atof(o.arg); // --chain-skip-scale + else if (c == 344) alt_list = o.arg; // --alt + else if (c == 345) opt.alt_drop = atof(o.arg); // --alt-drop + else if (c == 346) opt.mask_len = mm_parse_num(o.arg); // --mask-len + else if (c == 348) opt.flag |= MM_F_QSTRAND | MM_F_NO_INV; // --qstrand + else if (c == 349) opt.cap_kalloc = 
mm_parse_num(o.arg); // --cap-kalloc + else if (c == 350) opt.q_occ_frac = atof(o.arg); // --q-occ-frac + else if (c == 352) mm_dbg_flag |= MM_DBG_PRINT_CHAIN; // --print-chains + else if (c == 353) opt.flag |= MM_F_NO_HASH_NAME; // --no-hash-name + else if (c == 347) opt.flag |= MM_F_SECONDARY_SEQ; // --secondary-seq + else if (c == 330) { + fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n"); + } else if (c == 314) { // --frag + yes_or_no(&opt, MM_F_FRAG_MODE, o.longidx, o.arg, 1); + } else if (c == 315) { // --secondary + yes_or_no(&opt, MM_F_NO_PRINT_2ND, o.longidx, o.arg, 0); + } else if (c == 316) { // --cs + opt.flag |= MM_F_OUT_CS | MM_F_CIGAR; + if (o.arg == 0 || strcmp(o.arg, "short") == 0) { + opt.flag &= ~MM_F_OUT_CS_LONG; + } else if (strcmp(o.arg, "long") == 0) { + opt.flag |= MM_F_OUT_CS_LONG; + } else if (strcmp(o.arg, "none") == 0) { + opt.flag &= ~MM_F_OUT_CS; + } else if (mm_verbose >= 2) { + fprintf(stderr, "[WARNING]\033[1;31m --cs only takes 'short' or 'long'. Invalid values are assumed to be 'short'.\033[0m\n"); + } + } else if (c == 319) { // --splice-flank + yes_or_no(&opt, MM_F_SPLICE_FLANK, o.longidx, o.arg, 1); + } else if (c == 324) { // --heap-sort + yes_or_no(&opt, MM_F_HEAP_SORT, o.longidx, o.arg, 1); + } else if (c == 326) { // --dual + yes_or_no(&opt, MM_F_NO_DUAL, o.longidx, o.arg, 0); + } else if (c == 347) { // --rmq + if (o.arg) yes_or_no(&opt, MM_F_RMQ, o.longidx, o.arg, 1); + else opt.flag |= MM_F_RMQ; + } else if (c == 'S') { + opt.flag |= MM_F_OUT_CS | MM_F_CIGAR | MM_F_OUT_CS_LONG; + if (mm_verbose >= 2) + fprintf(stderr, "[WARNING]\033[1;31m option -S is deprecated and may be removed in future. 
Please use --cs=long instead.\033[0m\n"); + } else if (c == 'V') { + puts(MM_VERSION); + return 0; + } else if (c == 'r') { + opt.bw = (int)mm_parse_num2(o.arg, &s); + if (*s == ',') opt.bw_long = (int)mm_parse_num2(s + 1, &s); + } else if (c == 'U') { + opt.min_mid_occ = strtol(o.arg, &s, 10); + if (*s == ',') opt.max_mid_occ = strtol(s + 1, &s, 10); + } else if (c == 'f') { + double x; + char *p; + x = strtod(o.arg, &p); + if (x < 1.0) opt.mid_occ_frac = x, opt.mid_occ = 0; + else opt.mid_occ = (int)(x + .499); + if (*p == ',') opt.max_occ = (int)(strtod(p+1, &p) + .499); + } else if (c == 'u') { + if (*o.arg == 'b') opt.flag |= MM_F_SPLICE_FOR|MM_F_SPLICE_REV; // both strands + else if (*o.arg == 'f') opt.flag |= MM_F_SPLICE_FOR, opt.flag &= ~MM_F_SPLICE_REV; // match GT-AG + else if (*o.arg == 'r') opt.flag |= MM_F_SPLICE_REV, opt.flag &= ~MM_F_SPLICE_FOR; // match CT-AC (reverse complement of GT-AG) + else if (*o.arg == 'n') opt.flag &= ~(MM_F_SPLICE_FOR|MM_F_SPLICE_REV); // don't try to match the GT-AG signal + else { + fprintf(stderr, "[ERROR]\033[1;31m unrecognized cDNA direction\033[0m\n"); + return 1; + } + } else if (c == 'z') { + opt.zdrop = opt.zdrop_inv = strtol(o.arg, &s, 10); + if (*s == ',') opt.zdrop_inv = strtol(s + 1, &s, 10); + } else if (c == 'O') { + opt.q = opt.q2 = strtol(o.arg, &s, 10); + if (*s == ',') opt.q2 = strtol(s + 1, &s, 10); + } else if (c == 'E') { + opt.e = opt.e2 = strtol(o.arg, &s, 10); + if (*s == ',') opt.e2 = strtol(s + 1, &s, 10); + } + } + if ((opt.flag & MM_F_SPLICE) && (opt.flag & MM_F_FRAG_MODE)) { + fprintf(stderr, "[ERROR]\033[1;31m --splice and --frag should not be specified at the same time.\033[0m\n"); + return 1; + } + if (!fnw && !(opt.flag&MM_F_CIGAR)) + ipt.flag |= MM_I_NO_SEQ; + if (mm_check_opt(&ipt, &opt) < 0) + return 1; + if (opt.best_n == 0) { + fprintf(stderr, "[WARNING]\033[1;31m changed '-N 0' to '-N %d --secondary=no'.\033[0m\n", old_best_n); + opt.best_n = old_best_n, opt.flag |= MM_F_NO_PRINT_2ND; 
+ } + + if (argc == o.ind || fp_help == stdout) { + fprintf(fp_help, "Usage: minimap2 [options] | [query.fa] [...]\n"); + fprintf(fp_help, "Options:\n"); + fprintf(fp_help, " Indexing:\n"); + fprintf(fp_help, " -H use homopolymer-compressed k-mer (preferrable for PacBio)\n"); + fprintf(fp_help, " -k INT k-mer size (no larger than 28) [%d]\n", ipt.k); + fprintf(fp_help, " -w INT minimizer window size [%d]\n", ipt.w); + fprintf(fp_help, " -j INT syncmer submer size (overriding -w) []\n"); + fprintf(fp_help, " -I NUM split index for every ~NUM input bases [4G]\n"); + fprintf(fp_help, " -d FILE dump index to FILE []\n"); + fprintf(fp_help, " Mapping:\n"); + fprintf(fp_help, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%g]\n", opt.mid_occ_frac); + fprintf(fp_help, " -g NUM stop chain enlongation if there are no minimizers in INT-bp [%d]\n", opt.max_gap); + fprintf(fp_help, " -G NUM max intron length (effective with -xsplice; changing -r) [200k]\n"); + fprintf(fp_help, " -F NUM max fragment length (effective with -xsr or in the fragment mode) [800]\n"); + fprintf(fp_help, " -r NUM[,NUM] chaining/alignment bandwidth and long-join bandwidth [%d,%d]\n", opt.bw, opt.bw_long); + fprintf(fp_help, " -n INT minimal number of minimizers on a chain [%d]\n", opt.min_cnt); + fprintf(fp_help, " -m INT minimal chaining score (matching bases minus log gap penalty) [%d]\n", opt.min_chain_score); +// fprintf(fp_help, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres); // TODO: this option is never used; might be buggy + fprintf(fp_help, " -X skip self and dual mappings (for the all-vs-all mode)\n"); + fprintf(fp_help, " -p FLOAT min secondary-to-primary score ratio [%g]\n", opt.pri_ratio); + fprintf(fp_help, " -N INT retain at most INT secondary alignments [%d]\n", opt.best_n); + fprintf(fp_help, " Alignment:\n"); + fprintf(fp_help, " -A INT matching score [%d]\n", opt.a); + fprintf(fp_help, " -B INT mismatch penalty (larger value for lower 
divergence) [%d]\n", opt.b); + fprintf(fp_help, " -O INT[,INT] gap open penalty [%d,%d]\n", opt.q, opt.q2); + fprintf(fp_help, " -E INT[,INT] gap extension penalty; a k-long gap costs min{O1+k*E1,O2+k*E2} [%d,%d]\n", opt.e, opt.e2); + fprintf(fp_help, " -z INT[,INT] Z-drop score and inversion Z-drop score [%d,%d]\n", opt.zdrop, opt.zdrop_inv); + fprintf(fp_help, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max); + fprintf(fp_help, " -u CHAR how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n"); + fprintf(fp_help, " Input/Output:\n"); + fprintf(fp_help, " -a output in the SAM format (PAF by default)\n"); + fprintf(fp_help, " -o FILE output alignments to FILE [stdout]\n"); + fprintf(fp_help, " -L write CIGAR with >65535 ops at the CG tag\n"); + fprintf(fp_help, " -R STR SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n"); + fprintf(fp_help, " -c output CIGAR in PAF\n"); + fprintf(fp_help, " --cs[=STR] output the cs tag; STR is 'short' (if absent) or 'long' [none]\n"); + fprintf(fp_help, " --MD output the MD tag\n"); + fprintf(fp_help, " --eqx write =/X CIGAR operators\n"); + fprintf(fp_help, " -Y use soft clipping for supplementary alignments\n"); + fprintf(fp_help, " -t INT number of threads [%d]\n", n_threads); + fprintf(fp_help, " -K NUM minibatch size for mapping [500M]\n"); +// fprintf(fp_help, " -v INT verbose level [%d]\n", mm_verbose); + fprintf(fp_help, " --version show version number\n"); + fprintf(fp_help, " Preset:\n"); + fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n"); + fprintf(fp_help, " - map-pb/map-ont - PacBio CLR/Nanopore vs reference mapping\n"); + fprintf(fp_help, " - map-hifi - PacBio HiFi reads vs reference mapping\n"); + fprintf(fp_help, " - ava-pb/ava-ont - PacBio/Nanopore read overlap\n"); + fprintf(fp_help, " - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n"); + fprintf(fp_help, " - 
splice/splice:hq - long-read/Pacbio-CCS spliced alignment\n"); + fprintf(fp_help, " - sr - genomic short-read mapping\n"); + fprintf(fp_help, "\nSee `man ./minimap2.1' for detailed description of these and other advanced command-line options.\n"); + return fp_help == stdout? 0 : 1; + } + + if ((opt.flag & MM_F_SR) && argc - o.ind > 3) { + fprintf(stderr, "[ERROR] incorrect input: in the sr mode, please specify no more than two query files.\n"); + return 1; + } + idx_rdr = mm_idx_reader_open(argv[o.ind], &ipt, fnw); + if (idx_rdr == 0) { + fprintf(stderr, "[ERROR] failed to open file '%s': %s\n", argv[o.ind], strerror(errno)); + return 1; + } + if (!idx_rdr->is_idx && fnw == 0 && argc - o.ind < 2) { + fprintf(stderr, "[ERROR] missing input: please specify a query file to map or option -d to keep the index\n"); + mm_idx_reader_close(idx_rdr); + return 1; + } + if (opt.best_n == 0 && (opt.flag&MM_F_CIGAR) && mm_verbose >= 2) + fprintf(stderr, "[WARNING]\033[1;31m `-N 0' reduces alignment accuracy. Please use --secondary=no to suppress secondary alignments.\033[0m\n"); + while ((mi = mm_idx_reader_read(idx_rdr, n_threads)) != 0) { + int ret; + if ((opt.flag & MM_F_CIGAR) && (mi->flag & MM_I_NO_SEQ)) { + fprintf(stderr, "[ERROR] the prebuilt index doesn't contain sequences.\n"); + mm_idx_destroy(mi); + mm_idx_reader_close(idx_rdr); + return 1; + } + if ((opt.flag & MM_F_OUT_SAM) && idx_rdr->n_parts == 1) { + if (mm_idx_reader_eof(idx_rdr)) { + if (opt.split_prefix == 0) + ret = mm_write_sam_hdr(mi, rg, MM_VERSION, argc, argv); + else + ret = mm_write_sam_hdr(0, rg, MM_VERSION, argc, argv); + } else { + ret = mm_write_sam_hdr(0, rg, MM_VERSION, argc, argv); + if (opt.split_prefix == 0 && mm_verbose >= 2) + fprintf(stderr, "[WARNING]\033[1;31m For a multi-part index, no @SQ lines will be outputted. 
Please use --split-prefix.\033[0m\n"); + } + if (ret != 0) { + mm_idx_destroy(mi); + mm_idx_reader_close(idx_rdr); + return 1; + } + } + if (mm_verbose >= 3) + fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n", + __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq); + if (argc != o.ind + 1) mm_mapopt_update(&opt, mi); + if (mm_verbose >= 3) mm_idx_stat(mi); + if (junc_bed) mm_idx_bed_read(mi, junc_bed, 1); + if (alt_list) mm_idx_alt_read(mi, alt_list); + if (argc - (o.ind + 1) == 0) { + mm_idx_destroy(mi); + continue; // no query files + } + ret = 0; + if (!(opt.flag & MM_F_FRAG_MODE)) { + for (i = o.ind + 1; i < argc; ++i) { + ret = mm_map_file(mi, argv[i], &opt, n_threads); + if (ret < 0) break; + } + } else { + ret = mm_map_file_frag(mi, argc - (o.ind + 1), (const char**)&argv[o.ind + 1], &opt, n_threads); + } + mm_idx_destroy(mi); + if (ret < 0) { + fprintf(stderr, "ERROR: failed to map the query file\n"); + exit(EXIT_FAILURE); + } + } + n_parts = idx_rdr->n_parts; + mm_idx_reader_close(idx_rdr); + + if (opt.split_prefix) + mm_split_merge(argc - (o.ind + 1), (const char**)&argv[o.ind + 1], &opt, n_parts); + + if (fflush(stdout) == EOF) { + perror("[ERROR] failed to write the results"); + exit(EXIT_FAILURE); + } + + if (mm_verbose >= 3) { + fprintf(stderr, "[M::%s] Version: %s\n", __func__, MM_VERSION); + fprintf(stderr, "[M::%s] CMD:", __func__); + for (i = 0; i < argc; ++i) + fprintf(stderr, " %s", argv[i]); + fprintf(stderr, "\n[M::%s] Real time: %.3f sec; CPU: %.3f sec; Peak RSS: %.3f GB\n", __func__, realtime() - mm_realtime0, cputime(), peakrss() / 1024.0 / 1024.0 / 1024.0); + } + return 0; +} diff --git a/lib/minimap2/main.c b/lib/minimap2/main.c index 135e26078..4da93bcb3 100644 --- a/lib/minimap2/main.c +++ b/lib/minimap2/main.c @@ -7,8 +7,6 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.24-r1155-dirty" - #ifdef __linux__ #include #include @@ -79,6 +77,7 @@ 
static ko_longopt_t long_options[] = { { "print-chains", ko_no_argument, 352 }, { "no-hash-name", ko_no_argument, 353 }, { "secondary-seq", ko_no_argument, 354 }, + { "ds", ko_no_argument, 355 }, { "help", ko_no_argument, 'h' }, { "max-intron-len", ko_required_argument, 'G' }, { "version", ko_no_argument, 'V' }, @@ -122,7 +121,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const int main(int argc, char *argv[]) { - const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:j:"; + const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:b:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:"; ketopt_t o = KETOPT_INIT; mm_mapopt_t opt; mm_idxopt_t ipt; @@ -154,8 +153,7 @@ int main(int argc, char *argv[]) o = KETOPT_INIT; while ((c = ketopt(&o, argc, argv, 1, opt_str, long_options)) >= 0) { - if (c == 'w') ipt.w = atoi(o.arg), ipt.flag &= ~MM_I_SYNCMER; - else if (c == 'j') ipt.w = atoi(o.arg), ipt.flag |= MM_I_SYNCMER; + if (c == 'w') ipt.w = atoi(o.arg); else if (c == 'k') ipt.k = atoi(o.arg); else if (c == 'H') ipt.flag |= MM_I_HPC; else if (c == 'd') fnw = o.arg; // the above are indexing related options, except -I @@ -181,6 +179,7 @@ int main(int argc, char *argv[]) else if (c == 'm') opt.min_chain_score = atoi(o.arg); else if (c == 'A') opt.a = atoi(o.arg); else if (c == 'B') opt.b = atoi(o.arg); + else if (c == 'b') opt.transition = atoi(o.arg); else if (c == 's') opt.min_dp_max = atoi(o.arg); else if (c == 'C') opt.noncan = atoi(o.arg); else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg); @@ -189,7 +188,12 @@ int main(int argc, char *argv[]) else if (c == 'R') rg = o.arg; else if (c == 'h') fp_help = stdout; else if (c == '2') opt.flag |= MM_F_2_IO_THREADS; - else if (c == 'o') { + else if (c == 'J') { + int t; + t = atoi(o.arg); + if (t == 0) opt.flag |= MM_F_SPLICE_OLD; + else if (t == 1) opt.flag &= ~MM_F_SPLICE_OLD; + } else if (c == 'o') { if (strcmp(o.arg, "-") != 0) { if 
(freopen(o.arg, "wb", stdout) == NULL) { fprintf(stderr, "[ERROR]\033[1;31m failed to write the output to file '%s'\033[0m: %s\n", o.arg, strerror(errno)); @@ -239,7 +243,8 @@ int main(int argc, char *argv[]) else if (c == 350) opt.q_occ_frac = atof(o.arg); // --q-occ-frac else if (c == 352) mm_dbg_flag |= MM_DBG_PRINT_CHAIN; // --print-chains else if (c == 353) opt.flag |= MM_F_NO_HASH_NAME; // --no-hash-name - else if (c == 347) opt.flag |= MM_F_SECONDARY_SEQ; // --secondary-seq + else if (c == 354) opt.flag |= MM_F_SECONDARY_SEQ; // --secondary-seq + else if (c == 355) opt.flag |= MM_F_OUT_DS; // --ds else if (c == 330) { fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n"); } else if (c == 314) { // --frag @@ -326,8 +331,7 @@ int main(int argc, char *argv[]) fprintf(fp_help, " -H use homopolymer-compressed k-mer (preferrable for PacBio)\n"); fprintf(fp_help, " -k INT k-mer size (no larger than 28) [%d]\n", ipt.k); fprintf(fp_help, " -w INT minimizer window size [%d]\n", ipt.w); - fprintf(fp_help, " -j INT syncmer submer size (overriding -w) []\n"); - fprintf(fp_help, " -I NUM split index for every ~NUM input bases [4G]\n"); + fprintf(fp_help, " -I NUM split index for every ~NUM input bases [8G]\n"); fprintf(fp_help, " -d FILE dump index to FILE []\n"); fprintf(fp_help, " Mapping:\n"); fprintf(fp_help, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%g]\n", opt.mid_occ_frac); @@ -349,6 +353,7 @@ int main(int argc, char *argv[]) fprintf(fp_help, " -z INT[,INT] Z-drop score and inversion Z-drop score [%d,%d]\n", opt.zdrop, opt.zdrop_inv); fprintf(fp_help, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max); fprintf(fp_help, " -u CHAR how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n"); + fprintf(fp_help, " -J INT splice mode. 
0: original minimap2 model; 1: miniprot model [1]\n"); fprintf(fp_help, " Input/Output:\n"); fprintf(fp_help, " -a output in the SAM format (PAF by default)\n"); fprintf(fp_help, " -o FILE output alignments to FILE [stdout]\n"); @@ -356,6 +361,7 @@ int main(int argc, char *argv[]) fprintf(fp_help, " -R STR SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n"); fprintf(fp_help, " -c output CIGAR in PAF\n"); fprintf(fp_help, " --cs[=STR] output the cs tag; STR is 'short' (if absent) or 'long' [none]\n"); + fprintf(fp_help, " --ds output the ds tag, which is an extension to cs\n"); fprintf(fp_help, " --MD output the MD tag\n"); fprintf(fp_help, " --eqx write =/X CIGAR operators\n"); fprintf(fp_help, " -Y use soft clipping for supplementary alignments\n"); @@ -365,12 +371,12 @@ int main(int argc, char *argv[]) fprintf(fp_help, " --version show version number\n"); fprintf(fp_help, " Preset:\n"); fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n"); - fprintf(fp_help, " - map-pb/map-ont - PacBio CLR/Nanopore vs reference mapping\n"); - fprintf(fp_help, " - map-hifi - PacBio HiFi reads vs reference mapping\n"); - fprintf(fp_help, " - ava-pb/ava-ont - PacBio/Nanopore read overlap\n"); + fprintf(fp_help, " - lr:hq - accurate long reads (error rate <1%%) against a reference genome\n"); + fprintf(fp_help, " - splice/splice:hq - spliced alignment for long reads/accurate long reads\n"); fprintf(fp_help, " - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n"); - fprintf(fp_help, " - splice/splice:hq - long-read/Pacbio-CCS spliced alignment\n"); - fprintf(fp_help, " - sr - genomic short-read mapping\n"); + fprintf(fp_help, " - sr - short reads against a reference\n"); + fprintf(fp_help, " - map-pb/map-hifi/map-ont/map-iclr - CLR/HiFi/Nanopore/ICLR vs reference mapping\n"); + fprintf(fp_help, " - ava-pb/ava-ont - PacBio CLR/Nanopore read overlap\n"); fprintf(fp_help, "\nSee `man 
./minimap2.1' for detailed description of these and other advanced command-line options.\n"); return fp_help == stdout? 0 : 1; } diff --git a/lib/minimap2/map 2.c b/lib/minimap2/map 2.c new file mode 100644 index 000000000..2342c9ee3 --- /dev/null +++ b/lib/minimap2/map 2.c @@ -0,0 +1,714 @@ +#include +#include +#include +#include +#include "kthread.h" +#include "kvec.h" +#include "kalloc.h" +#include "sdust.h" +#include "mmpriv.h" +#include "bseq.h" +#include "khash.h" + +struct mm_tbuf_s { + void *km; + int rep_len, frag_gap; +}; + +mm_tbuf_t *mm_tbuf_init(void) +{ + mm_tbuf_t *b; + b = (mm_tbuf_t*)calloc(1, sizeof(mm_tbuf_t)); + if (!(mm_dbg_flag & 1)) b->km = km_init(); + return b; +} + +void mm_tbuf_destroy(mm_tbuf_t *b) +{ + if (b == 0) return; + km_destroy(b->km); + free(b); +} + +void *mm_tbuf_get_km(mm_tbuf_t *b) +{ + return b->km; +} + +static int mm_dust_minier(void *km, int n, mm128_t *a, int l_seq, const char *seq, int sdust_thres) +{ + int n_dreg, j, k, u = 0; + const uint64_t *dreg; + sdust_buf_t *sdb; + if (sdust_thres <= 0) return n; + sdb = sdust_buf_init(km); + dreg = sdust_core((const uint8_t*)seq, l_seq, sdust_thres, 64, &n_dreg, sdb); + for (j = k = 0; j < n; ++j) { // squeeze out minimizers that significantly overlap with LCRs + int32_t qpos = (uint32_t)a[j].y>>1, span = a[j].x&0xff; + int32_t s = qpos - (span - 1), e = s + span; + while (u < n_dreg && (int32_t)dreg[u] <= s) ++u; + if (u < n_dreg && (int32_t)(dreg[u]>>32) < e) { + int v, l = 0; + for (v = u; v < n_dreg && (int32_t)(dreg[v]>>32) < e; ++v) { // iterate over LCRs overlapping this minimizer + int ss = s > (int32_t)(dreg[v]>>32)? s : dreg[v]>>32; + int ee = e < (int32_t)dreg[v]? 
e : (uint32_t)dreg[v]; + l += ee - ss; + } + if (l <= span>>1) a[k++] = a[j]; // keep the minimizer if less than half of it falls in masked region + } else a[k++] = a[j]; + } + sdust_buf_destroy(sdb); + return k; // the new size +} + +static void collect_minimizers(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, mm128_v *mv) +{ + int i, n, sum = 0; + mv->n = 0; + for (i = n = 0; i < n_segs; ++i) { + size_t j; + mm_sketch2(km, seqs[i], qlens[i], mi->w, mi->k, i, mi->flag&MM_I_HPC, mi->flag&MM_I_SYNCMER, mv); + for (j = n; j < mv->n; ++j) + mv->a[j].y += sum << 1; + if (opt->sdust_thres > 0) // mask low-complexity minimizers + mv->n = n + mm_dust_minier(km, mv->n - n, mv->a + n, qlens[i], seqs[i], opt->sdust_thres); + sum += qlens[i], n = mv->n; + } +} + +#include "ksort.h" +#define heap_lt(a, b) ((a).x > (b).x) +KSORT_INIT(heap, mm128_t, heap_lt) + +static inline int skip_seed(int flag, uint64_t r, const mm_seed_t *q, const char *qname, int qlen, const mm_idx_t *mi, int *is_self) +{ + *is_self = 0; + if (qname && (flag & (MM_F_NO_DIAG|MM_F_NO_DUAL))) { + const mm_idx_seq_t *s = &mi->seq[r>>32]; + int cmp; + cmp = strcmp(qname, s->name); + if ((flag&MM_F_NO_DIAG) && cmp == 0 && (int)s->len == qlen) { + if ((uint32_t)r>>1 == (q->q_pos>>1)) return 1; // avoid the diagnonal anchors + if ((r&1) == (q->q_pos&1)) *is_self = 1; // this flag is used to avoid spurious extension on self chain + } + if ((flag&MM_F_NO_DUAL) && cmp > 0) // all-vs-all mode: map once + return 1; + } + if (flag & (MM_F_FOR_ONLY|MM_F_REV_ONLY)) { + if ((r&1) == (q->q_pos&1)) { // forward strand + if (flag & MM_F_REV_ONLY) return 1; + } else { + if (flag & MM_F_FOR_ONLY) return 1; + } + } + return 0; +} + +static mm128_t *collect_seed_hits_heap(void *km, const mm_mapopt_t *opt, int max_occ, const mm_idx_t *mi, const char *qname, const mm128_v *mv, int qlen, int64_t *n_a, int *rep_len, + int *n_mini_pos, uint64_t **mini_pos) +{ + int i, n_m, 
heap_size = 0; + int64_t j, n_for = 0, n_rev = 0; + mm_seed_t *m; + mm128_t *a, *heap; + + m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos); + + heap = (mm128_t*)kmalloc(km, n_m * sizeof(mm128_t)); + a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t)); + + for (i = 0, heap_size = 0; i < n_m; ++i) { + if (m[i].n > 0) { + heap[heap_size].x = m[i].cr[0]; + heap[heap_size].y = (uint64_t)i<<32; + ++heap_size; + } + } + ks_heapmake_heap(heap_size, heap); + while (heap_size > 0) { + mm_seed_t *q = &m[heap->y>>32]; + mm128_t *p; + uint64_t r = heap->x; + int32_t is_self, rpos = (uint32_t)r >> 1; + if (!skip_seed(opt->flag, r, q, qname, qlen, mi, &is_self)) { + if ((r&1) == (q->q_pos&1)) { // forward strand + p = &a[n_for++]; + p->x = (r&0xffffffff00000000ULL) | rpos; + p->y = (uint64_t)q->q_span << 32 | q->q_pos >> 1; + } else { // reverse strand + p = &a[(*n_a) - (++n_rev)]; + p->x = 1ULL<<63 | (r&0xffffffff00000000ULL) | rpos; + p->y = (uint64_t)q->q_span << 32 | (qlen - ((q->q_pos>>1) + 1 - q->q_span) - 1); + } + p->y |= (uint64_t)q->seg_id << MM_SEED_SEG_SHIFT; + if (q->is_tandem) p->y |= MM_SEED_TANDEM; + if (is_self) p->y |= MM_SEED_SELF; + } + // update the heap + if ((uint32_t)heap->y < q->n - 1) { + ++heap[0].y; + heap[0].x = m[heap[0].y>>32].cr[(uint32_t)heap[0].y]; + } else { + heap[0] = heap[heap_size - 1]; + --heap_size; + } + ks_heapdown_heap(0, heap_size, heap); + } + kfree(km, m); + kfree(km, heap); + + // reverse anchors on the reverse strand, as they are in the descending order + for (j = 0; j < n_rev>>1; ++j) { + mm128_t t = a[(*n_a) - 1 - j]; + a[(*n_a) - 1 - j] = a[(*n_a) - (n_rev - j)]; + a[(*n_a) - (n_rev - j)] = t; + } + if (*n_a > n_for + n_rev) { + memmove(a + n_for, a + (*n_a) - n_rev, n_rev * sizeof(mm128_t)); + *n_a = n_for + n_rev; + } + return a; +} + +static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ, const mm_idx_t *mi, const char *qname, 
const mm128_v *mv, int qlen, int64_t *n_a, int *rep_len, + int *n_mini_pos, uint64_t **mini_pos) +{ + int i, n_m; + mm_seed_t *m; + mm128_t *a; + m = mm_collect_matches(km, &n_m, qlen, max_occ, opt->max_max_occ, opt->occ_dist, mi, mv, n_a, rep_len, n_mini_pos, mini_pos); + a = (mm128_t*)kmalloc(km, *n_a * sizeof(mm128_t)); + for (i = 0, *n_a = 0; i < n_m; ++i) { + mm_seed_t *q = &m[i]; + const uint64_t *r = q->cr; + uint32_t k; + for (k = 0; k < q->n; ++k) { + int32_t is_self, rpos = (uint32_t)r[k] >> 1; + mm128_t *p; + if (skip_seed(opt->flag, r[k], q, qname, qlen, mi, &is_self)) continue; + p = &a[(*n_a)++]; + if ((r[k]&1) == (q->q_pos&1)) { // forward strand + p->x = (r[k]&0xffffffff00000000ULL) | rpos; + p->y = (uint64_t)q->q_span << 32 | q->q_pos >> 1; + } else if (!(opt->flag & MM_F_QSTRAND)) { // reverse strand and not in the query-strand mode + p->x = 1ULL<<63 | (r[k]&0xffffffff00000000ULL) | rpos; + p->y = (uint64_t)q->q_span << 32 | (qlen - ((q->q_pos>>1) + 1 - q->q_span) - 1); + } else { // reverse strand; query-strand + int32_t len = mi->seq[r[k]>>32].len; + p->x = 1ULL<<63 | (r[k]&0xffffffff00000000ULL) | (len - (rpos + 1 - q->q_span) - 1); // coordinate only accurate for non-HPC seeds + p->y = (uint64_t)q->q_span << 32 | q->q_pos >> 1; + } + p->y |= (uint64_t)q->seg_id << MM_SEED_SEG_SHIFT; + if (q->is_tandem) p->y |= MM_SEED_TANDEM; + if (is_self) p->y |= MM_SEED_SELF; + } + } + kfree(km, m); + radix_sort_128x(a, a + (*n_a)); + return a; +} + +static void chain_post(const mm_mapopt_t *opt, int max_chain_gap_ref, const mm_idx_t *mi, void *km, int qlen, int n_segs, const int *qlens, int *n_regs, mm_reg1_t *regs, mm128_t *a) +{ + if (!(opt->flag & MM_F_ALL_CHAINS)) { // don't choose primary mapping(s) + mm_set_parent(km, opt->mask_level, opt->mask_len, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); + if (n_segs <= 1) mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, 1, opt->max_gap * 0.8, n_regs, regs); + else 
mm_select_sub_multi(km, opt->pri_ratio, 0.2f, 0.7f, max_chain_gap_ref, mi->k*2, opt->best_n, n_segs, qlens, n_regs, regs); + } +} + +static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *km, int qlen, const char *seq, int *n_regs, mm_reg1_t *regs, mm128_t *a) +{ + if (!(opt->flag & MM_F_CIGAR)) return regs; + regs = mm_align_skeleton(km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs() + if (!(opt->flag & MM_F_ALL_CHAINS)) { // don't choose primary mapping(s) + mm_set_parent(km, opt->mask_level, opt->mask_len, *n_regs, regs, opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); + mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, 0, opt->max_gap * 0.8, n_regs, regs); + mm_set_sam_pri(*n_regs, regs); + } + return regs; +} + +void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname) +{ + int i, j, rep_len, qlen_sum, n_regs0, n_mini_pos; + int max_chain_gap_qry, max_chain_gap_ref, is_splice = !!(opt->flag & MM_F_SPLICE), is_sr = !!(opt->flag & MM_F_SR); + uint32_t hash; + int64_t n_a; + uint64_t *u, *mini_pos; + mm128_t *a; + mm128_v mv = {0,0,0}; + mm_reg1_t *regs0; + km_stat_t kmst; + float chn_pen_gap, chn_pen_skip; + + for (i = 0, qlen_sum = 0; i < n_segs; ++i) + qlen_sum += qlens[i], n_regs[i] = 0, regs[i] = 0; + + if (qlen_sum == 0 || n_segs <= 0 || n_segs > MM_MAX_SEG) return; + if (opt->max_qlen > 0 && qlen_sum > opt->max_qlen) return; + + hash = qname && !(opt->flag & MM_F_NO_HASH_NAME)? 
__ac_X31_hash_string(qname) : 0; + hash ^= __ac_Wang_hash(qlen_sum) + __ac_Wang_hash(opt->seed); + hash = __ac_Wang_hash(hash); + + collect_minimizers(b->km, opt, mi, n_segs, qlens, seqs, &mv); + if (opt->q_occ_frac > 0.0f) mm_seed_mz_flt(b->km, &mv, opt->mid_occ, opt->q_occ_frac); + if (opt->flag & MM_F_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->mid_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); + else a = collect_seed_hits(b->km, opt, opt->mid_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); + + if (mm_dbg_flag & MM_DBG_PRINT_SEED) { + fprintf(stderr, "RS\t%d\n", rep_len); + for (i = 0; i < n_a; ++i) + fprintf(stderr, "SD\t%s\t%d\t%c\t%d\t%d\t%d\n", mi->seq[a[i].x<<1>>33].name, (int32_t)a[i].x, "+-"[a[i].x>>63], (int32_t)a[i].y, (int32_t)(a[i].y>>32&0xff), + i == 0? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x)); + } + + // set max chaining gap on the query and the reference sequence + if (is_sr) + max_chain_gap_qry = qlen_sum > opt->max_gap? 
qlen_sum : opt->max_gap; + else max_chain_gap_qry = opt->max_gap; + if (opt->max_gap_ref > 0) { + max_chain_gap_ref = opt->max_gap_ref; // always honor mm_mapopt_t::max_gap_ref if set + } else if (opt->max_frag_len > 0) { + max_chain_gap_ref = opt->max_frag_len - qlen_sum; + if (max_chain_gap_ref < opt->max_gap) max_chain_gap_ref = opt->max_gap; + } else max_chain_gap_ref = opt->max_gap; + + chn_pen_gap = opt->chain_gap_scale * 0.01 * mi->k; + chn_pen_skip = opt->chain_skip_scale * 0.01 * mi->k; + if (opt->flag & MM_F_RMQ) { + a = mg_lchain_rmq(opt->max_gap, opt->rmq_inner_dist, opt->bw, opt->max_chain_skip, opt->rmq_size_cap, opt->min_cnt, opt->min_chain_score, + chn_pen_gap, chn_pen_skip, n_a, a, &n_regs0, &u, b->km); + } else { + a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, + chn_pen_gap, chn_pen_skip, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); + } + + if (opt->bw_long > opt->bw && (opt->flag & (MM_F_SPLICE|MM_F_SR|MM_F_NO_LJOIN)) == 0 && n_segs == 1 && n_regs0 > 1) { // re-chain/long-join for long sequences + int32_t st = (int32_t)a[0].y, en = (int32_t)a[(int32_t)u[0] - 1].y; + if (qlen_sum - (en - st) > opt->rmq_rescue_size || en - st > qlen_sum * opt->rmq_rescue_ratio) { + int32_t i; + for (i = 0, n_a = 0; i < n_regs0; ++i) n_a += (int32_t)u[i]; + kfree(b->km, u); + radix_sort_128x(a, a + n_a); + a = mg_lchain_rmq(opt->max_gap, opt->rmq_inner_dist, opt->bw_long, opt->max_chain_skip, opt->rmq_size_cap, opt->min_cnt, opt->min_chain_score, + chn_pen_gap, chn_pen_skip, n_a, a, &n_regs0, &u, b->km); + } + } else if (opt->max_occ > opt->mid_occ && rep_len > 0 && !(opt->flag & MM_F_RMQ)) { // re-chain, mostly for short reads + int rechain = 0; + if (n_regs0 > 0) { // test if the best chain has all the segments + int n_chained_segs = 1, max = 0, max_i = -1, max_off = -1, off = 0; + for (i = 0; i < n_regs0; ++i) { // find the best chain + if (max < (int)(u[i]>>32)) 
max = u[i]>>32, max_i = i, max_off = off; + off += (uint32_t)u[i]; + } + for (i = 1; i < (int32_t)u[max_i]; ++i) // count the number of segments in the best chain + if ((a[max_off+i].y&MM_SEED_SEG_MASK) != (a[max_off+i-1].y&MM_SEED_SEG_MASK)) + ++n_chained_segs; + if (n_chained_segs < n_segs) + rechain = 1; + } else rechain = 1; + if (rechain) { // redo chaining with a higher max_occ threshold + kfree(b->km, a); + kfree(b->km, u); + kfree(b->km, mini_pos); + if (opt->flag & MM_F_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); + else a = collect_seed_hits(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); + a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, + chn_pen_gap, chn_pen_skip, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); + } + } + b->frag_gap = max_chain_gap_ref; + b->rep_len = rep_len; + + regs0 = mm_gen_regs(b->km, hash, qlen_sum, n_regs0, u, a, !!(opt->flag&MM_F_QSTRAND)); + if (mi->n_alt) { + mm_mark_alt(mi, n_regs0, regs0); + mm_hit_sort(b->km, &n_regs0, regs0, opt->alt_drop); // this step can be merged into mm_gen_regs(); will do if this shows up in profile + } + + if (mm_dbg_flag & (MM_DBG_PRINT_SEED|MM_DBG_PRINT_CHAIN)) + for (j = 0; j < n_regs0; ++j) + for (i = regs0[j].as; i < regs0[j].as + regs0[j].cnt; ++i) + fprintf(stderr, "CN\t%d\t%s\t%d\t%c\t%d\t%d\t%d\n", j, mi->seq[a[i].x<<1>>33].name, (int32_t)a[i].x, "+-"[a[i].x>>63], (int32_t)a[i].y, (int32_t)(a[i].y>>32&0xff), + i == regs0[j].as? 
0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x)); + + chain_post(opt, max_chain_gap_ref, mi, b->km, qlen_sum, n_segs, qlens, &n_regs0, regs0, a); + if (!is_sr && !(opt->flag&MM_F_QSTRAND)) { + mm_est_err(mi, qlen_sum, n_regs0, regs0, a, n_mini_pos, mini_pos); + n_regs0 = mm_filter_strand_retained(n_regs0, regs0); + } + + if (n_segs == 1) { // uni-segment + regs0 = align_regs(opt, mi, b->km, qlens[0], seqs[0], &n_regs0, regs0, a); + regs0 = (mm_reg1_t*)realloc(regs0, sizeof(*regs0) * n_regs0); + mm_set_mapq(b->km, n_regs0, regs0, opt->min_chain_score, opt->a, rep_len, is_sr); + n_regs[0] = n_regs0, regs[0] = regs0; + } else { // multi-segment + mm_seg_t *seg; + seg = mm_seg_gen(b->km, hash, n_segs, qlens, n_regs0, regs0, n_regs, regs, a); // split fragment chain to separate segment chains + free(regs0); + for (i = 0; i < n_segs; ++i) { + mm_set_parent(b->km, opt->mask_level, opt->mask_len, n_regs[i], regs[i], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); // update mm_reg1_t::parent + regs[i] = align_regs(opt, mi, b->km, qlens[i], seqs[i], &n_regs[i], regs[i], seg[i].a); + mm_set_mapq(b->km, n_regs[i], regs[i], opt->min_chain_score, opt->a, rep_len, is_sr); + } + mm_seg_free(b->km, n_segs, seg); + if (n_segs == 2 && opt->pe_ori >= 0 && (opt->flag&MM_F_CIGAR)) + mm_pair(b->km, max_chain_gap_ref, opt->pe_bonus, opt->a * 2 + opt->b, opt->a, qlens, n_regs, regs); // pairing + } + + kfree(b->km, mv.a); + kfree(b->km, a); + kfree(b->km, u); + kfree(b->km, mini_pos); + + if (b->km) { + km_stat(b->km, &kmst); + if (mm_dbg_flag & MM_DBG_PRINT_QNAME) + fprintf(stderr, "QM\t%s\t%d\tcap=%ld,nCore=%ld,largest=%ld\n", qname, qlen_sum, kmst.capacity, kmst.n_cores, kmst.largest); + assert(kmst.n_blocks == kmst.n_cores); // otherwise, there is a memory leak + if (kmst.largest > 1U<<28 || (opt->cap_kalloc > 0 && kmst.capacity > opt->cap_kalloc)) { + if (mm_dbg_flag & MM_DBG_PRINT_QNAME) + fprintf(stderr, "[W::%s] reset thread-local 
memory after read %s\n", __func__, qname); + km_destroy(b->km); + b->km = km_init(); + } + } +} + +mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname) // single-segment wrapper around mm_map_frag() +{ + mm_reg1_t *regs; + mm_map_frag(mi, 1, &qlen, &seq, n_regs, &regs, b, opt, qname); // pass the address of regs so mm_map_frag() can store the hit array in it + return regs; +} + +/************************** + * Multi-threaded mapping * + **************************/ + +typedef struct { + int n_processed, n_threads, n_fp; + int64_t mini_batch_size; + const mm_mapopt_t *opt; + mm_bseq_file_t **fp; + const mm_idx_t *mi; + kstring_t str; + + int n_parts; + uint32_t *rid_shift; + FILE *fp_split, **fp_parts; +} pipeline_t; + +typedef struct { + const pipeline_t *p; + int n_seq, n_frag; + mm_bseq1_t *seq; + int *n_reg, *seg_off, *n_seg, *rep_len, *frag_gap; + mm_reg1_t **reg; + mm_tbuf_t **buf; +} step_t; + +static void worker_for(void *_data, long i, int tid) // kt_for() callback +{ + step_t *s = (step_t*)_data; + int qlens[MM_MAX_SEG], j, off = s->seg_off[i], pe_ori = s->p->opt->pe_ori; + const char *qseqs[MM_MAX_SEG]; + double t = 0.0; + mm_tbuf_t *b = s->buf[tid]; + assert(s->n_seg[i] <= MM_MAX_SEG); + if (mm_dbg_flag & MM_DBG_PRINT_QNAME) { + fprintf(stderr, "QR\t%s\t%d\t%d\n", s->seq[off].name, tid, s->seq[off].l_seq); + t = realtime(); + } + for (j = 0; j < s->n_seg[i]; ++j) { + if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1)))) + mm_revcomp_bseq(&s->seq[off + j]); + qlens[j] = s->seq[off + j].l_seq; + qseqs[j] = s->seq[off + j].seq; + } + if (s->p->opt->flag & MM_F_INDEPEND_SEG) { + for (j = 0; j < s->n_seg[i]; ++j) { + mm_map_frag(s->p->mi, 1, &qlens[j], &qseqs[j], &s->n_reg[off+j], &s->reg[off+j], b, s->p->opt, s->seq[off+j].name); + s->rep_len[off + j] = b->rep_len; + s->frag_gap[off + j] = b->frag_gap; + } + } else { + mm_map_frag(s->p->mi, s->n_seg[i], qlens, qseqs, &s->n_reg[off], &s->reg[off], b, s->p->opt, s->seq[off].name); + for (j = 0; j < s->n_seg[i]; ++j) { + 
s->rep_len[off + j] = b->rep_len; + s->frag_gap[off + j] = b->frag_gap; + } + } + for (j = 0; j < s->n_seg[i]; ++j) // flip the query strand and coordinate to the original read strand + if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1)))) { + int k, t; + mm_revcomp_bseq(&s->seq[off + j]); + for (k = 0; k < s->n_reg[off + j]; ++k) { + mm_reg1_t *r = &s->reg[off + j][k]; + t = r->qs; + r->qs = qlens[j] - r->qe; + r->qe = qlens[j] - t; + r->rev = !r->rev; + } + } + if (mm_dbg_flag & MM_DBG_PRINT_QNAME) + fprintf(stderr, "QT\t%s\t%d\t%.6f\n", s->seq[off].name, tid, realtime() - t); +} + +static void merge_hits(step_t *s) +{ + int f, i, k0, k, max_seg = 0, *n_reg_part, *rep_len_part, *frag_gap_part, *qlens; + void *km; + FILE **fp = s->p->fp_parts; + const mm_mapopt_t *opt = s->p->opt; + + km = km_init(); + for (f = 0; f < s->n_frag; ++f) + max_seg = max_seg > s->n_seg[f]? max_seg : s->n_seg[f]; + qlens = CALLOC(int, max_seg + s->p->n_parts * 3); + n_reg_part = qlens + max_seg; + rep_len_part = n_reg_part + s->p->n_parts; + frag_gap_part = rep_len_part + s->p->n_parts; + for (f = 0, k = k0 = 0; f < s->n_frag; ++f) { + k0 = k; + for (i = 0; i < s->n_seg[f]; ++i, ++k) { + int j, l, t, rep_len = 0; + qlens[i] = s->seq[k].l_seq; + for (j = 0, s->n_reg[k] = 0; j < s->p->n_parts; ++j) { + mm_err_fread(&n_reg_part[j], sizeof(int), 1, fp[j]); + mm_err_fread(&rep_len_part[j], sizeof(int), 1, fp[j]); + mm_err_fread(&frag_gap_part[j], sizeof(int), 1, fp[j]); + s->n_reg[k] += n_reg_part[j]; + if (rep_len < rep_len_part[j]) + rep_len = rep_len_part[j]; + } + s->reg[k] = CALLOC(mm_reg1_t, s->n_reg[k]); + for (j = 0, l = 0; j < s->p->n_parts; ++j) { + for (t = 0; t < n_reg_part[j]; ++t, ++l) { + mm_reg1_t *r = &s->reg[k][l]; + uint32_t capacity; + mm_err_fread(r, sizeof(mm_reg1_t), 1, fp[j]); + r->rid += s->p->rid_shift[j]; + if (opt->flag & MM_F_CIGAR) { + mm_err_fread(&capacity, 4, 1, fp[j]); + r->p = (mm_extra_t*)calloc(capacity, 4); + r->p->capacity = 
capacity; + mm_err_fread(r->p, r->p->capacity, 4, fp[j]); + } + } + } + if (!(opt->flag&MM_F_SR) && s->seq[k].l_seq >= opt->rank_min_len) + mm_update_dp_max(s->seq[k].l_seq, s->n_reg[k], s->reg[k], opt->rank_frac, opt->a, opt->b); + for (j = 0; j < s->n_reg[k]; ++j) { + mm_reg1_t *r = &s->reg[k][j]; + if (r->p) r->p->dp_max2 = 0; // reset ->dp_max2 as mm_set_parent() doesn't clear it; necessary with mm_update_dp_max() + r->subsc = 0; // this may not be necessary + r->n_sub = 0; // n_sub will be an underestimate as we don't see all the chains now, but it can't be accurate anyway + } + mm_hit_sort(km, &s->n_reg[k], s->reg[k], opt->alt_drop); + mm_set_parent(km, opt->mask_level, opt->mask_len, s->n_reg[k], s->reg[k], opt->a * 2 + opt->b, opt->flag&MM_F_HARD_MLEVEL, opt->alt_drop); + if (!(opt->flag & MM_F_ALL_CHAINS)) { + mm_select_sub(km, opt->pri_ratio, s->p->mi->k*2, opt->best_n, 0, opt->max_gap * 0.8, &s->n_reg[k], s->reg[k]); + mm_set_sam_pri(s->n_reg[k], s->reg[k]); + } + mm_set_mapq(km, s->n_reg[k], s->reg[k], opt->min_chain_score, opt->a, rep_len, !!(opt->flag & MM_F_SR)); + } + if (s->n_seg[f] == 2 && opt->pe_ori >= 0 && (opt->flag&MM_F_CIGAR)) + mm_pair(km, frag_gap_part[0], opt->pe_bonus, opt->a * 2 + opt->b, opt->a, qlens, &s->n_reg[k0], &s->reg[k0]); + } + free(qlens); + km_destroy(km); +} + +static void *worker_pipeline(void *shared, int step, void *in) +{ + int i, j, k; + pipeline_t *p = (pipeline_t*)shared; + if (step == 0) { // step 0: read sequences + int with_qual = (!!(p->opt->flag & MM_F_OUT_SAM) && !(p->opt->flag & MM_F_NO_QUAL)); + int with_comment = !!(p->opt->flag & MM_F_COPY_COMMENT); + int frag_mode = (p->n_fp > 1 || !!(p->opt->flag & MM_F_FRAG_MODE)); + step_t *s; + s = (step_t*)calloc(1, sizeof(step_t)); + if (p->n_fp > 1) s->seq = mm_bseq_read_frag2(p->n_fp, p->fp, p->mini_batch_size, with_qual, with_comment, &s->n_seq); + else s->seq = mm_bseq_read3(p->fp[0], p->mini_batch_size, with_qual, with_comment, frag_mode, &s->n_seq); + if 
(s->seq) { + s->p = p; + for (i = 0; i < s->n_seq; ++i) + s->seq[i].rid = p->n_processed++; + s->buf = (mm_tbuf_t**)calloc(p->n_threads, sizeof(mm_tbuf_t*)); + for (i = 0; i < p->n_threads; ++i) + s->buf[i] = mm_tbuf_init(); + s->n_reg = (int*)calloc(5 * s->n_seq, sizeof(int)); + s->seg_off = s->n_reg + s->n_seq; // seg_off, n_seg, rep_len and frag_gap are allocated together with n_reg + s->n_seg = s->seg_off + s->n_seq; + s->rep_len = s->n_seg + s->n_seq; + s->frag_gap = s->rep_len + s->n_seq; + s->reg = (mm_reg1_t**)calloc(s->n_seq, sizeof(mm_reg1_t*)); + for (i = 1, j = 0; i <= s->n_seq; ++i) + if (i == s->n_seq || !frag_mode || !mm_qname_same(s->seq[i-1].name, s->seq[i].name)) { + s->n_seg[s->n_frag] = i - j; + s->seg_off[s->n_frag++] = j; + j = i; + } + return s; + } else free(s); + } else if (step == 1) { // step 1: map + if (p->n_parts > 0) merge_hits((step_t*)in); + else kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_frag); + return in; + } else if (step == 2) { // step 2: output + void *km = 0; + step_t *s = (step_t*)in; + const mm_idx_t *mi = p->mi; + for (i = 0; i < p->n_threads; ++i) mm_tbuf_destroy(s->buf[i]); + free(s->buf); + if ((p->opt->flag & MM_F_OUT_CS) && !(mm_dbg_flag & MM_DBG_NO_KALLOC)) km = km_init(); + for (k = 0; k < s->n_frag; ++k) { + int seg_st = s->seg_off[k], seg_en = s->seg_off[k] + s->n_seg[k]; + for (i = seg_st; i < seg_en; ++i) { + mm_bseq1_t *t = &s->seq[i]; + if (p->opt->split_prefix && p->n_parts == 0) { // then write to temporary files + mm_err_fwrite(&s->n_reg[i], sizeof(int), 1, p->fp_split); + mm_err_fwrite(&s->rep_len[i], sizeof(int), 1, p->fp_split); + mm_err_fwrite(&s->frag_gap[i], sizeof(int), 1, p->fp_split); + for (j = 0; j < s->n_reg[i]; ++j) { + mm_reg1_t *r = &s->reg[i][j]; + mm_err_fwrite(r, sizeof(mm_reg1_t), 1, p->fp_split); + if (p->opt->flag & MM_F_CIGAR) { + mm_err_fwrite(&r->p->capacity, 4, 1, p->fp_split); + mm_err_fwrite(r->p, r->p->capacity, 4, p->fp_split); + } + } + } else if (s->n_reg[i] > 0) { 
// the query has at least one hit + for (j = 0; j < s->n_reg[i]; ++j) { + mm_reg1_t *r = &s->reg[i][j]; + assert(!r->sam_pri || r->id == r->parent); + if ((p->opt->flag & MM_F_NO_PRINT_2ND) && r->id != r->parent) + continue; + if (p->opt->flag & MM_F_OUT_SAM) + mm_write_sam3(&p->str, mi, t, i - seg_st, j, s->n_seg[k], &s->n_reg[seg_st], (const mm_reg1_t*const*)&s->reg[seg_st], km, p->opt->flag, s->rep_len[i]); + else + mm_write_paf3(&p->str, mi, t, r, km, p->opt->flag, s->rep_len[i]); + mm_err_puts(p->str.s); + } + } else if ((p->opt->flag & MM_F_PAF_NO_HIT) || ((p->opt->flag & MM_F_OUT_SAM) && !(p->opt->flag & MM_F_SAM_HIT_ONLY))) { // output an empty hit, if requested + if (p->opt->flag & MM_F_OUT_SAM) + mm_write_sam3(&p->str, mi, t, i - seg_st, -1, s->n_seg[k], &s->n_reg[seg_st], (const mm_reg1_t*const*)&s->reg[seg_st], km, p->opt->flag, s->rep_len[i]); + else + mm_write_paf3(&p->str, mi, t, 0, 0, p->opt->flag, s->rep_len[i]); + mm_err_puts(p->str.s); + } + } + for (i = seg_st; i < seg_en; ++i) { + for (j = 0; j < s->n_reg[i]; ++j) free(s->reg[i][j].p); + free(s->reg[i]); + free(s->seq[i].seq); free(s->seq[i].name); + if (s->seq[i].qual) free(s->seq[i].qual); + if (s->seq[i].comment) free(s->seq[i].comment); + } + } + free(s->reg); free(s->n_reg); free(s->seq); // seg_off, n_seg, rep_len and frag_gap were allocated together with n_reg, so freeing n_reg releases them too; no memory leak here + km_destroy(km); + if (mm_verbose >= 3) + fprintf(stderr, "[M::%s::%.3f*%.2f] mapped %d sequences\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), s->n_seq); + free(s); + } + return 0; +} + +static mm_bseq_file_t **open_bseqs(int n, const char **fn) +{ + mm_bseq_file_t **fp; + int i, j; + fp = (mm_bseq_file_t**)calloc(n, sizeof(mm_bseq_file_t*)); + for (i = 0; i < n; ++i) { + if ((fp[i] = mm_bseq_open(fn[i])) == 0) { + if (mm_verbose >= 1) + fprintf(stderr, "ERROR: failed to open file '%s': %s\n", fn[i], strerror(errno)); + for (j = 0; j < i; ++j) + mm_bseq_close(fp[j]); + free(fp); + 
return 0; + } + } + return fp; +} + +int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads) +{ + int i, pl_threads; + pipeline_t pl; + if (n_segs < 1) return -1; + memset(&pl, 0, sizeof(pipeline_t)); + pl.n_fp = n_segs; + pl.fp = open_bseqs(pl.n_fp, fn); + if (pl.fp == 0) return -1; + pl.opt = opt, pl.mi = idx; + pl.n_threads = n_threads > 1? n_threads : 1; + pl.mini_batch_size = opt->mini_batch_size; + if (opt->split_prefix) + pl.fp_split = mm_split_init(opt->split_prefix, idx); + pl_threads = n_threads == 1? 1 : (opt->flag&MM_F_2_IO_THREADS)? 3 : 2; + kt_pipeline(pl_threads, worker_pipeline, &pl, 3); + + free(pl.str.s); + if (pl.fp_split) fclose(pl.fp_split); + for (i = 0; i < pl.n_fp; ++i) + mm_bseq_close(pl.fp[i]); + free(pl.fp); + return 0; +} + +int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads) +{ + return mm_map_file_frag(idx, 1, &fn, opt, n_threads); +} + +int mm_split_merge(int n_segs, const char **fn, const mm_mapopt_t *opt, int n_split_idx) // returns 0 on success, -1 on bad arguments or setup failure +{ + int i; + pipeline_t pl; + mm_idx_t *mi; + if (n_segs < 1 || n_split_idx < 1) return -1; + memset(&pl, 0, sizeof(pipeline_t)); + pl.n_fp = n_segs; + pl.fp = open_bseqs(pl.n_fp, fn); + if (pl.fp == 0) return -1; + pl.opt = opt; + pl.mini_batch_size = opt->mini_batch_size; + + pl.n_parts = n_split_idx; + pl.fp_parts = CALLOC(FILE*, pl.n_parts); + pl.rid_shift = CALLOC(uint32_t, pl.n_parts); + pl.mi = mi = mm_split_merge_prep(opt->split_prefix, n_split_idx, pl.fp_parts, pl.rid_shift); + if (pl.mi == 0) { // also release the query files opened by open_bseqs() above; they were previously leaked on this path + for (i = 0; i < pl.n_fp; ++i) + mm_bseq_close(pl.fp[i]); + free(pl.fp); + free(pl.fp_parts); + free(pl.rid_shift); + return -1; + } + for (i = n_split_idx - 1; i > 0; --i) + pl.rid_shift[i] = pl.rid_shift[i - 1]; + for (pl.rid_shift[0] = 0, i = 1; i < n_split_idx; ++i) + pl.rid_shift[i] += pl.rid_shift[i - 1]; + if (opt->flag & MM_F_OUT_SAM) + for (i = 0; i < (int32_t)pl.mi->n_seq; ++i) + printf("@SQ\tSN:%s\tLN:%d\n", pl.mi->seq[i].name, pl.mi->seq[i].len); + + kt_pipeline(2, worker_pipeline, 
&pl, 3); + + free(pl.str.s); + mm_idx_destroy(mi); + free(pl.rid_shift); + for (i = 0; i < n_split_idx; ++i) + fclose(pl.fp_parts[i]); + free(pl.fp_parts); + for (i = 0; i < pl.n_fp; ++i) + mm_bseq_close(pl.fp[i]); + free(pl.fp); + mm_split_rm_tmp(opt->split_prefix, n_split_idx); + return 0; +} diff --git a/lib/minimap2/map.c b/lib/minimap2/map.c index 2342c9ee3..038888fbf 100644 --- a/lib/minimap2/map.c +++ b/lib/minimap2/map.c @@ -10,11 +10,6 @@ #include "bseq.h" #include "khash.h" -struct mm_tbuf_s { - void *km; - int rep_len, frag_gap; -}; - mm_tbuf_t *mm_tbuf_init(void) { mm_tbuf_t *b; @@ -67,7 +62,7 @@ static void collect_minimizers(void *km, const mm_mapopt_t *opt, const mm_idx_t mv->n = 0; for (i = n = 0; i < n_segs; ++i) { size_t j; - mm_sketch2(km, seqs[i], qlens[i], mi->w, mi->k, i, mi->flag&MM_I_HPC, mi->flag&MM_I_SYNCMER, mv); + mm_sketch(km, seqs[i], qlens[i], mi->w, mi->k, i, mi->flag&MM_I_HPC, mv); for (j = n; j < mv->n; ++j) mv->a[j].y += sum << 1; if (opt->sdust_thres > 0) // mask low-complexity minimizers diff --git a/lib/minimap2/minimap.h b/lib/minimap2/minimap.h index 9e436b0ff..b944199a5 100644 --- a/lib/minimap2/minimap.h +++ b/lib/minimap2/minimap.h @@ -5,47 +5,50 @@ #include #include -#define MM_F_NO_DIAG 0x001 // no exact diagonal hit -#define MM_F_NO_DUAL 0x002 // skip pairs where query name is lexicographically larger than target name -#define MM_F_CIGAR 0x004 -#define MM_F_OUT_SAM 0x008 -#define MM_F_NO_QUAL 0x010 -#define MM_F_OUT_CG 0x020 -#define MM_F_OUT_CS 0x040 -#define MM_F_SPLICE 0x080 // splice mode -#define MM_F_SPLICE_FOR 0x100 // match GT-AG -#define MM_F_SPLICE_REV 0x200 // match CT-AC, the reverse complement of GT-AG -#define MM_F_NO_LJOIN 0x400 -#define MM_F_OUT_CS_LONG 0x800 -#define MM_F_SR 0x1000 -#define MM_F_FRAG_MODE 0x2000 -#define MM_F_NO_PRINT_2ND 0x4000 -#define MM_F_2_IO_THREADS 0x8000 -#define MM_F_LONG_CIGAR 0x10000 -#define MM_F_INDEPEND_SEG 0x20000 -#define MM_F_SPLICE_FLANK 0x40000 -#define MM_F_SOFTCLIP 
0x80000 -#define MM_F_FOR_ONLY 0x100000 -#define MM_F_REV_ONLY 0x200000 -#define MM_F_HEAP_SORT 0x400000 -#define MM_F_ALL_CHAINS 0x800000 -#define MM_F_OUT_MD 0x1000000 -#define MM_F_COPY_COMMENT 0x2000000 -#define MM_F_EQX 0x4000000 // use =/X instead of M -#define MM_F_PAF_NO_HIT 0x8000000 // output unmapped reads to PAF -#define MM_F_NO_END_FLT 0x10000000 -#define MM_F_HARD_MLEVEL 0x20000000 -#define MM_F_SAM_HIT_ONLY 0x40000000 +#define MM_VERSION "2.27-r1193" + +#define MM_F_NO_DIAG (0x001LL) // no exact diagonal hit +#define MM_F_NO_DUAL (0x002LL) // skip pairs where query name is lexicographically larger than target name +#define MM_F_CIGAR (0x004LL) +#define MM_F_OUT_SAM (0x008LL) +#define MM_F_NO_QUAL (0x010LL) +#define MM_F_OUT_CG (0x020LL) +#define MM_F_OUT_CS (0x040LL) +#define MM_F_SPLICE (0x080LL) // splice mode +#define MM_F_SPLICE_FOR (0x100LL) // match GT-AG +#define MM_F_SPLICE_REV (0x200LL) // match CT-AC, the reverse complement of GT-AG +#define MM_F_NO_LJOIN (0x400LL) +#define MM_F_OUT_CS_LONG (0x800LL) +#define MM_F_SR (0x1000LL) +#define MM_F_FRAG_MODE (0x2000LL) +#define MM_F_NO_PRINT_2ND (0x4000LL) +#define MM_F_2_IO_THREADS (0x8000LL) +#define MM_F_LONG_CIGAR (0x10000LL) +#define MM_F_INDEPEND_SEG (0x20000LL) +#define MM_F_SPLICE_FLANK (0x40000LL) +#define MM_F_SOFTCLIP (0x80000LL) +#define MM_F_FOR_ONLY (0x100000LL) +#define MM_F_REV_ONLY (0x200000LL) +#define MM_F_HEAP_SORT (0x400000LL) +#define MM_F_ALL_CHAINS (0x800000LL) +#define MM_F_OUT_MD (0x1000000LL) +#define MM_F_COPY_COMMENT (0x2000000LL) +#define MM_F_EQX (0x4000000LL) // use =/X instead of M +#define MM_F_PAF_NO_HIT (0x8000000LL) // output unmapped reads to PAF +#define MM_F_NO_END_FLT (0x10000000LL) +#define MM_F_HARD_MLEVEL (0x20000000LL) +#define MM_F_SAM_HIT_ONLY (0x40000000LL) #define MM_F_RMQ (0x80000000LL) #define MM_F_QSTRAND (0x100000000LL) #define MM_F_NO_INV (0x200000000LL) #define MM_F_NO_HASH_NAME (0x400000000LL) -#define MM_F_SECONDARY_SEQ (0x800000000LL) 
//output SEQ field for seqondary alignments using hard clipping -#define MM_F_SPLICE_OLD (0x800000000LL) +#define MM_F_SECONDARY_SEQ (0x1000000000LL) //output SEQ field for secondary alignments using hard clipping +#define MM_F_OUT_DS (0x2000000000LL) #define MM_I_HPC 0x1 #define MM_I_NO_SEQ 0x2 #define MM_I_NO_NAME 0x4 -#define MM_I_SYNCMER 0x8 #define MM_IDX_MAGIC "MMI\2" @@ -95,6 +98,7 @@ typedef struct { typedef struct { uint32_t capacity; // the capacity of cigar[] int32_t dp_score, dp_max, dp_max2; // DP score; score of the max-scoring segment; score of the best alternate mappings + int32_t dp_max0; // DP score before mm_update_dp_max() adjustment uint32_t n_ambi:30, trans_strand:2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for - uint32_t n_cigar; // number of cigar operations in cigar[] uint32_t cigar[]; @@ -151,6 +155,7 @@ typedef struct { float alt_drop; int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties + int transition; // transition mismatch score (A:G, C:T) int sc_ambi; // score when one or both bases are "N" int noncan; // cost of non-canonical splicing sites int junc_bonus; @@ -191,6 +196,11 @@ typedef struct { } mm_idx_reader_t; // memory buffer for thread-local storage during mapping +struct mm_tbuf_s { + void *km; + int rep_len, frag_gap; +}; + typedef struct mm_tbuf_s mm_tbuf_t; // global variables diff --git a/lib/minimap2/minimap2.1 b/lib/minimap2/minimap2.1 index 08d9c949e..aa674d190 100644 --- a/lib/minimap2/minimap2.1 +++ b/lib/minimap2/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "18 December 2021" "minimap2-2.24 (r1122)" "Bioinformatics tools" +.TH minimap2 1 "12 March 2024" "minimap2-2.27 (r1193)" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences @@ -101,7 +101,7 @@ on the HPC sequence. .BI -I \ NUM Load at most .I NUM -target bases into RAM for indexing [4G]. 
If there are more than +target bases into RAM for indexing [8G]. If there are more than .I NUM bases in .IR target.fa , @@ -343,6 +343,10 @@ Matching score [2] .BI -B \ INT Mismatching penalty [4] .TP +.BI -b \ INT +Mismatching penalty for transitions [same as +.BR -B ]. +.TP .BI -O \ INT1[,INT2] Gap open penalty [4,24]. If .I INT2 @@ -356,10 +360,19 @@ costs .RI min{ O1 + k * E1 , O2 + k * E2 }. In the splice mode, the second gap penalties are not used. .TP +.BI -J \ INT +Splice model [1]. 0 for the original minimap2 splice model that always penalizes non-GT-AG splicing; +1 for the miniprot model that considers non-GT-AG. Option +.B -C +has no effect with the default +.BR -J1 . +.BR -J0 . +.TP .BI -C \ INT Cost for a non-canonical GT-AG splicing (effective with -.BR --splice ) -[0] +.B --splice +.BR -J0 ) +[0]. .TP .BI -z \ INT1[,INT2] Truncate an alignment if the running alignment score drops too quickly along @@ -506,6 +519,9 @@ Output =/X CIGAR operators for sequence match/mismatch. .B -Y In SAM output, use soft clipping for supplementary alignments. .TP +.B --secondary-seq +In SAM output, show query sequences for secondary alignments. +.TP .BI --seed \ INT Integer seed for randomizing equally best hits. Minimap2 hashes .I INT @@ -566,15 +582,43 @@ are: Align noisy long reads of ~10% error rate to a reference genome. This is the default mode. .TP +.B lr:hq +Align accurate long reads (error rate <1%) to a reference genome +.RB ( -k19 +.B -w19 -U50,500 +.BR -g10k ). +This was recommended by ONT developers for recent Nanopore reads +produced with chemistry v14 that can reach ~99% in accuracy. +It was shown to work better for accurate Nanopore reads +than +.BR map-hifi . +.TP .B map-hifi Align PacBio high-fidelity (HiFi) reads to a reference genome -.RB ( -k19 -.B -w19 -U50,500 -g10k -A1 -B4 -O6,26 -E2,1 +.RB ( -xlr:hq +.B -A1 -B4 -O6,26 -E2,1 .BR -s200 ). +It differs from +.B lr:hq +only in scoring. 
It has not been tested whether +.B lr:hq +would work better for PacBio HiFi reads. .TP .B map-pb Align older PacBio continuous long (CLR) reads to a reference genome .RB ( -Hk19 ). +Note that this data type is effectively deprecated by HiFi. +Unless you work on very old data, you probably want to use +.B map-hifi +or +.BR lr:hq . +.TP +.B map-iclr +Align Illumina Complete Long Reads (ICLR) to a reference genome +.RB ( -k19 +.B -B6 -b4 +.BR -O10,50 ). +This was recommended by Illumina developers. .TP .B asm5 Long assembly to reference mapping @@ -582,21 +626,21 @@ Long assembly to reference mapping .B -w19 -U50,500 --rmq -r1k,100k -g10k -A1 -B19 -O39,81 -E3,1 -s200 -z200 .BR -N50 ). Typically, the alignment will not extend to regions with 5% or higher sequence -divergence. Only use this preset if the average divergence is far below 5%. +divergence. Use this preset if the average divergence is not much higher than 0.1%. .TP .B asm10 Long assembly to reference mapping .RB ( -k19 .B -w19 -U50,500 --rmq -r1k,100k -g10k -A1 -B9 -O16,41 -E2,1 -s200 -z200 .BR -N50 ). -Up to 10% sequence divergence. +Use this if the average divergence is around 1%. .TP .B asm20 Long assembly to reference mapping .RB ( -k19 .B -w10 -U50,500 --rmq -r1k,100k -g10k -A1 -B4 -O6,26 -E2,1 -s200 -z200 .BR -N50 ). -Up to 20% sequence divergence. +Use this if the average divergence is around several percent. .TP .B splice Long-read spliced alignment @@ -612,13 +656,13 @@ costs are different during chaining; 4) the computation of the tag ignores introns to demote hits to pseudogenes. .TP .B splice:hq -Long-read splice alignment for PacBio CCS reads +Spliced alignment for accurate long RNA-seq reads such as PacBio iso-seq .RB ( -xsplice .B -C5 -O6,24 .BR -B4 ). 
.TP .B sr -Short single-end reads without splicing +Short-read alignment without splicing .RB ( -k21 .B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -b0 -r100 -p.5 -N20 -f1000,5000 -n2 -m25 .B -s40 -g100 -2K50m --heap-sort=yes diff --git a/lib/minimap2/misc/paftools.js b/lib/minimap2/misc/paftools.js index bc2f29d7f..67819e9b0 100755 --- a/lib/minimap2/misc/paftools.js +++ b/lib/minimap2/misc/paftools.js @@ -1,6 +1,6 @@ #!/usr/bin/env k8 -var paftools_version = '2.24-r1152-dirty'; +var paftools_version = '2.27-r1193'; /***************************** ***** Library functions ***** @@ -133,26 +133,50 @@ Interval.find_ovlp = function(a, st, en) function fasta_read(fn) { - var h = {}, gt = '>'.charCodeAt(0); + var h = {}, seqlen = []; + var buf = new Bytes(); var file = fn == '-'? new File() : new File(fn); - var buf = new Bytes(), seq = null, name = null, seqlen = []; - while (file.readline(buf) >= 0) { - if (buf[0] == gt) { - if (seq != null && name != null) { - seqlen.push([name, seq.length]); - h[name] = seq; - name = seq = null; - } - var m, line = buf.toString(); - if ((m = /^>(\S+)/.exec(line)) != null) { - name = m[1]; - seq = new Bytes(); - } - } else seq.set(buf); - } - if (seq != null && name != null) { - seqlen.push([name, seq.length]); - h[name] = seq; + if (typeof k8_version == "undefined") { // for k8-0.x + var seq = null, name = null, gt = '>'.charCodeAt(0); + while (file.readline(buf) >= 0) { + if (buf[0] == gt) { + if (seq != null && name != null) { + seqlen.push([name, seq.length]); + h[name] = seq; + name = seq = null; + } + var m, line = buf.toString(); + if ((m = /^>(\S+)/.exec(line)) != null) { + name = m[1]; + seq = new Bytes(); + } + } else seq.set(buf); + } + if (seq != null && name != null) { + seqlen.push([name, seq.length]); + h[name] = seq; + } + } else { // for k8-1.x + var seq = null, name = null; + while (file.readline(buf) >= 0) { + var line = buf.toString(); + if (line[0] == ">") { + if (seq != null && name != null) { + seqlen.push([name, 
seq.length]); + h[name] = new Uint8Array(seq.buffer); + name = seq = null; + } + var m; + if ((m = /^>(\S+)/.exec(line)) != null) { + name = m[1]; + seq = new Bytes(); + } + } else seq.set(line); + } + if (seq != null && name != null) { + seqlen.push([name, seq.length]); + h[name] = new Uint8Array(seq.buffer); + } } buf.destroy(); file.close(); @@ -161,16 +185,27 @@ function fasta_read(fn) function fasta_free(fa) { - for (var name in fa) - fa[name].destroy(); + if (typeof k8_version == "undefined") + for (var name in fa) + fa[name].destroy(); + // FIXME: for k8-1.0, sequences are not freed. This is ok for now but not general. } Bytes.prototype.reverse = function() { - for (var i = 0; i < this.length>>1; ++i) { - var tmp = this[i]; - this[i] = this[this.length - i - 1]; - this[this.length - i - 1] = tmp; + if (typeof k8_version === "undefined") { // k8-0.x + for (var i = 0; i < this.length>>1; ++i) { + var tmp = this[i]; + this[i] = this[this.length - i - 1]; + this[this.length - i - 1] = tmp; + } + } else { // k8-1.x + var buf = new Uint8Array(this.buffer); + for (var i = 0; i < buf.length>>1; ++i) { + var tmp = buf[i]; + buf[i] = buf[buf.length - i - 1]; + buf[buf.length - i - 1] = tmp; + } } } @@ -185,13 +220,24 @@ Bytes.prototype.revcomp = function() for (var i = 0; i < s1.length; ++i) Bytes.rctab[s1.charCodeAt(i)] = s2.charCodeAt(i); } - for (var i = 0; i < this.length>>1; ++i) { - var tmp = this[this.length - i - 1]; - this[this.length - i - 1] = Bytes.rctab[this[i]]; - this[i] = Bytes.rctab[tmp]; + if (typeof k8_version === "undefined") { // k8-0.x + for (var i = 0; i < this.length>>1; ++i) { + var tmp = this[this.length - i - 1]; + this[this.length - i - 1] = Bytes.rctab[this[i]]; + this[i] = Bytes.rctab[tmp]; + } + if (this.length&1) + this[this.length>>1] = Bytes.rctab[this[this.length>>1]]; + } else { // k8-1.x + var buf = new Uint8Array(this.buffer); + for (var i = 0; i < buf.length>>1; ++i) { + var tmp = buf[buf.length - i - 1]; + buf[buf.length - i - 
1] = Bytes.rctab[buf[i]]; + buf[i] = Bytes.rctab[tmp]; + } + if (buf.length&1) + buf[buf.length>>1] = Bytes.rctab[buf[buf.length>>1]]; } - if (this.length&1) - this[this.length>>1] = Bytes.rctab[this[this.length>>1]]; } /******************** @@ -2051,7 +2097,7 @@ function paf_mapeval(args) warn("Usage: paftools.js mapeval [options] |"); warn("Options:"); warn(" -r FLOAT mapping correct if overlap_length/union_length>FLOAT [" + ovlp_ratio + "]"); - warn(" -Q INT print wrong mappings with mapQ>INT [don't print]"); + warn(" -Q INT print wrong mappings with mapQ>=INT [don't print]"); warn(" -m INT 0: eval the longest aln only; 1: first aln only; 2: all primary aln [0]"); exit(1); } diff --git a/lib/minimap2/mmpriv 2.h b/lib/minimap2/mmpriv 2.h new file mode 100644 index 000000000..7b51b9845 --- /dev/null +++ b/lib/minimap2/mmpriv 2.h @@ -0,0 +1,134 @@ +#ifndef MMPRIV2_H +#define MMPRIV2_H + +#include +#include "minimap.h" +#include "bseq.h" +#include "kseq.h" + +#define MM_PARENT_UNSET (-1) +#define MM_PARENT_TMP_PRI (-2) + +#define MM_DBG_NO_KALLOC 0x1 +#define MM_DBG_PRINT_QNAME 0x2 +#define MM_DBG_PRINT_SEED 0x4 +#define MM_DBG_PRINT_ALN_SEQ 0x8 +#define MM_DBG_PRINT_CHAIN 0x10 + +#define MM_SEED_LONG_JOIN (1ULL<<40) +#define MM_SEED_IGNORE (1ULL<<41) +#define MM_SEED_TANDEM (1ULL<<42) +#define MM_SEED_SELF (1ULL<<43) + +#define MM_SEED_SEG_SHIFT 48 +#define MM_SEED_SEG_MASK (0xffULL<<(MM_SEED_SEG_SHIFT)) + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#define mm_seq4_set(s, i, c) ((s)[(i)>>3] |= (uint32_t)(c) << (((i)&7)<<2)) +#define mm_seq4_get(s, i) ((s)[(i)>>3] >> (((i)&7)<<2) & 0xf) + +#define MALLOC(type, len) ((type*)malloc((len) * sizeof(type))) +#define CALLOC(type, len) ((type*)calloc((len), sizeof(type))) + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + uint32_t n; + uint32_t q_pos; + uint32_t q_span:31, flt:1; + uint32_t seg_id:31, is_tandem:1; + 
const uint64_t *cr; +} mm_seed_t; + +typedef struct { + int n_u, n_a; + uint64_t *u; + mm128_t *a; +} mm_seg_t; + +double cputime(void); +double realtime(void); +long peakrss(void); + +void radix_sort_128x(mm128_t *beg, mm128_t *end); +void radix_sort_64(uint64_t *beg, uint64_t *end); +uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); + +void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p); +void mm_sketch_syncmer(void *km, const char *str, int len, int smer, int k, uint32_t rid, int is_hpc, mm128_v *p); +void mm_sketch2(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, int is_syncmer, mm128_v *p); + +mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos); +void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac); + +double mm_event_identity(const mm_reg1_t *r); +int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]); +void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag); +void mm_write_paf3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int64_t opt_flag, int rep_len); +void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs); +void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int seg_idx, int reg_idx, int n_seg, const int *n_regs, const mm_reg1_t *const* regs, void *km, int64_t opt_flag); +void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int seg_idx, int reg_idx, int n_seg, const int *n_regss, const mm_reg1_t *const* regss, void *km, int64_t opt_flag, int rep_len); + +void mm_idxopt_init(mm_idxopt_t *opt); +const uint64_t *mm_idx_get(const 
mm_idx_t *mi, uint64_t minier, int *n); +int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f); +int mm_idx_getseq2(const mm_idx_t *mi, int is_rev, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq); +mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a); +mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a, int is_qstrand); + +mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float gap_scale, + int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km); +mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip, + int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km); +mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip, + int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km); + +void mm_mark_alt(const mm_idx_t *mi, int n, mm_reg1_t *r); +void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a, int is_qstrand); +void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs); +int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a); +int mm_set_sam_pri(int n, mm_reg1_t *r); +void mm_set_parent(void *km, float mask_level, int mask_len, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac); +void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int check_strand, int min_strand_sc, int *n_, mm_reg1_t *r); +void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r); +int mm_filter_strand_retained(int n_regs, mm_reg1_t *r); 
+void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs); +void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r, float alt_diff_frac); +void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr); +void mm_update_dp_max(int qlen, int n_regs, mm_reg1_t *regs, float frac, int a, int b); + +void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const mm128_t *a, int32_t n, const uint64_t *mini_pos); + +mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a); +void mm_seg_free(void *km, int n_segs, mm_seg_t *segs); +void mm_pair(void *km, int max_gap_ref, int dp_bonus, int sub_diff, int match_sc, const int *qlens, int *n_regs, mm_reg1_t **regs); + +FILE *mm_split_init(const char *prefix, const mm_idx_t *mi); +mm_idx_t *mm_split_merge_prep(const char *prefix, int n_splits, FILE **fp, uint32_t *n_seq_part); +int mm_split_merge(int n_segs, const char **fn, const mm_mapopt_t *opt, int n_split_idx); +void mm_split_rm_tmp(const char *prefix, int n_splits); + +void mm_err_puts(const char *str); +void mm_err_fwrite(const void *p, size_t size, size_t nitems, FILE *fp); +void mm_err_fread(void *p, size_t size, size_t nitems, FILE *fp); + +static inline float mg_log2(float x) // NB: this doesn't work when x<2 +{ + union { float f; uint32_t i; } z = { x }; + float log_2 = ((z.i >> 23) & 255) - 128; + z.i &= ~(255 << 23); + z.i += 127 << 23; + log_2 += (-0.34484843f * z.f + 2.02466578f) * z.f - 0.67487759f; + return log_2; +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/lib/minimap2/mmpriv.h b/lib/minimap2/mmpriv.h index 7b51b9845..2f5034b7d 100644 --- a/lib/minimap2/mmpriv.h +++ b/lib/minimap2/mmpriv.h @@ -60,8 +60,6 @@ void radix_sort_64(uint64_t *beg, uint64_t *end); uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); void mm_sketch(void *km, const 
char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p); -void mm_sketch_syncmer(void *km, const char *str, int len, int smer, int k, uint32_t rid, int is_hpc, mm128_v *p); -void mm_sketch2(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, int is_syncmer, mm128_v *p); mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos); void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac); diff --git a/lib/minimap2/options.c b/lib/minimap2/options.c index 235b6dd89..4ed6d3476 100644 --- a/lib/minimap2/options.c +++ b/lib/minimap2/options.c @@ -8,7 +8,7 @@ void mm_idxopt_init(mm_idxopt_t *opt) opt->k = 15, opt->w = 10, opt->flag = 0; opt->bucket_bits = 14; opt->mini_batch_size = 50000000; - opt->batch_size = 4000000000ULL; + opt->batch_size = 8000000000ULL; } void mm_mapopt_init(mm_mapopt_t *opt) @@ -45,6 +45,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->alt_drop = 0.15f; opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1; + opt->transition = 0; opt->sc_ambi = 1; opt->zdrop = 400, opt->zdrop_inv = 200; opt->end_bonus = -1; @@ -90,7 +91,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) if (preset == 0) { mm_idxopt_init(io); mm_mapopt_init(mo); - } else if (strcmp(preset, "map-ont") == 0) { // this is the same as the default + } else if (strcmp(preset, "lr") == 0 || strcmp(preset, "map-ont") == 0) { // this is the same as the default } else if (strcmp(preset, "ava-ont") == 0) { io->flag = 0, io->k = 15, io->w = 5; mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN; @@ -105,13 +106,22 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_chain_skip = 25; mo->bw_long = mo->bw; mo->occ_dist = 0; - } else if (strcmp(preset, 
"map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) { + } else if (strcmp(preset, "lr:hq") == 0 || strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) { io->flag = 0, io->k = 19, io->w = 19; mo->max_gap = 10000; - mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1; - mo->occ_dist = 500; mo->min_mid_occ = 50, mo->max_mid_occ = 500; - mo->min_dp_max = 200; + if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) { + mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1; + mo->min_dp_max = 200; + } + } else if (strcmp(preset, "map-iclr-prerender") == 0) { + io->flag = 0, io->k = 15; + mo->b = 6, mo->transition = 1; + mo->q = 10, mo->q2 = 50; + } else if (strcmp(preset, "map-iclr") == 0) { + io->flag = 0, io->k = 19; + mo->b = 6, mo->transition = 4; + mo->q = 10, mo->q2 = 50; } else if (strncmp(preset, "asm", 3) == 0) { io->flag = 0, io->k = 19, io->w = 19; mo->bw = 1000, mo->bw_long = 100000; @@ -156,7 +166,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) mo->junc_bonus = 9; mo->zdrop = 200, mo->zdrop_inv = 100; // because mo->a is halved if (strcmp(preset, "splice:hq") == 0) - mo->junc_bonus = 5, mo->b = 4, mo->q = 6, mo->q2 = 24; + mo->noncan = 5, mo->b = 4, mo->q = 6, mo->q2 = 24; } else return -1; return 0; } diff --git a/lib/minimap2/pyproject.toml b/lib/minimap2/pyproject.toml new file mode 100644 index 000000000..bc2be6730 --- /dev/null +++ b/lib/minimap2/pyproject.toml @@ -0,0 +1,2 @@ +[build-system] +requires = ["setuptools", "wheel", "Cython"] diff --git a/lib/minimap2/python/README.rst b/lib/minimap2/python/README.rst index 3082980a4..a3aad6184 100644 --- a/lib/minimap2/python/README.rst +++ b/lib/minimap2/python/README.rst @@ -77,7 +77,9 @@ This constructor accepts the following arguments: * **min_chain_score**: minimum chaing score -* **bw**: chaining and alignment band width +* **bw**: chaining and alignment band width (initial chaining and extension) + +* **bw_long**: 
chaining and alignment band width (RMQ-based rechaining and closing gaps) * **best_n**: max number of alignments to return diff --git a/lib/minimap2/python/cmappy.pxd b/lib/minimap2/python/cmappy.pxd index c208c4c32..47855febf 100644 --- a/lib/minimap2/python/cmappy.pxd +++ b/lib/minimap2/python/cmappy.pxd @@ -36,6 +36,7 @@ cdef extern from "minimap.h": float alt_drop int a, b, q, e, q2, e2 + int transition int sc_ambi int noncan int junc_bonus diff --git a/lib/minimap2/python/mappy.pyx b/lib/minimap2/python/mappy.pyx index cf4763202..c51f06468 100644 --- a/lib/minimap2/python/mappy.pyx +++ b/lib/minimap2/python/mappy.pyx @@ -3,7 +3,7 @@ from libc.stdlib cimport free cimport cmappy import sys -__version__ = '2.24' +__version__ = '2.27' cmappy.mm_reset_timer() @@ -112,7 +112,7 @@ cdef class Aligner: cdef cmappy.mm_idxopt_t idx_opt cdef cmappy.mm_mapopt_t map_opt - def __cinit__(self, fn_idx_in=None, preset=None, k=None, w=None, min_cnt=None, min_chain_score=None, min_dp_score=None, bw=None, best_n=None, n_threads=3, fn_idx_out=None, max_frag_len=None, extra_flags=None, seq=None, scoring=None): + def __cinit__(self, fn_idx_in=None, preset=None, k=None, w=None, min_cnt=None, min_chain_score=None, min_dp_score=None, bw=None, bw_long=None, best_n=None, n_threads=3, fn_idx_out=None, max_frag_len=None, extra_flags=None, seq=None, scoring=None): self._idx = NULL cmappy.mm_set_opt(NULL, &self.idx_opt, &self.map_opt) # set the default options if preset is not None: @@ -125,6 +125,7 @@ cdef class Aligner: if min_chain_score is not None: self.map_opt.min_chain_score = min_chain_score if min_dp_score is not None: self.map_opt.min_dp_max = min_dp_score if bw is not None: self.map_opt.bw = bw + if bw_long is not None: self.map_opt.bw_long = bw_long if best_n is not None: self.map_opt.best_n = best_n if max_frag_len is not None: self.map_opt.max_frag_len = max_frag_len if extra_flags is not None: self.map_opt.flag |= extra_flags @@ -172,6 +173,7 @@ cdef class Aligner: cdef 
cmappy.mm_mapopt_t map_opt if self._idx == NULL: return + if ((self.map_opt.flag & 4) and (self._idx.flag & 2)): return map_opt = self.map_opt if max_frag_len is not None: map_opt.max_frag_len = max_frag_len if extra_flags is not None: map_opt.flag |= extra_flags @@ -217,6 +219,7 @@ cdef class Aligner: cdef int l cdef char *s if self._idx == NULL: return + if ((self.map_opt.flag & 4) and (self._idx.flag & 2)): return s = cmappy.mappy_fetch_seq(self._idx, name.encode(), start, end, &l) if l == 0: return None r = s[:l] if isinstance(s, str) else s[:l].decode() diff --git a/lib/minimap2/sdust 2.c b/lib/minimap2/sdust 2.c new file mode 100644 index 000000000..176dcb04b --- /dev/null +++ b/lib/minimap2/sdust 2.c @@ -0,0 +1,213 @@ +#include +#include +#include +#include "kalloc.h" +#include "kdq.h" +#include "kvec.h" +#include "sdust.h" + +#define SD_WLEN 3 +#define SD_WTOT (1<<(SD_WLEN<<1)) +#define SD_WMSK (SD_WTOT - 1) + +typedef struct { + int start, finish; + int r, l; +} perf_intv_t; + +typedef kvec_t(perf_intv_t) perf_intv_v; +typedef kvec_t(uint64_t) uint64_v; + +KDQ_INIT(int) + +#if defined(_NO_NT4_TBL) || defined(_SDUST_MAIN) +unsigned char seq_nt4_table[256] = { + 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; +#else 
+extern unsigned char seq_nt4_table[256]; +#endif + +struct sdust_buf_s { + kdq_t(int) *w; + perf_intv_v P; // the list of perfect intervals for the current window, sorted by descending start and then by ascending finish + uint64_v res; // the result + void *km; // memory pool +}; + +sdust_buf_t *sdust_buf_init(void *km) +{ + sdust_buf_t *buf; + buf = (sdust_buf_t*)kcalloc(km, 1, sizeof(sdust_buf_t)); + buf->km = km; + buf->w = kdq_init(int, buf->km); + kdq_resize(int, buf->w, 8); + return buf; +} + +void sdust_buf_destroy(sdust_buf_t *buf) +{ + if (buf == 0) return; + kdq_destroy(int, buf->w); + kfree(buf->km, buf->P.a); kfree(buf->km, buf->res.a); kfree(buf->km, buf); +} + +static inline void shift_window(int t, kdq_t(int) *w, int T, int W, int *L, int *rw, int *rv, int *cw, int *cv) +{ + int s; + if ((int)kdq_size(w) >= W - SD_WLEN + 1) { // TODO: is this right for SD_WLEN!=3? + s = *kdq_shift(int, w); + *rw -= --cw[s]; + if (*L > (int)kdq_size(w)) + --*L, *rv -= --cv[s]; + } + kdq_push(int, w, t); + ++*L; + *rw += cw[t]++; + *rv += cv[t]++; + if (cv[t] * 10 > T<<1) { + do { + s = kdq_at(w, kdq_size(w) - *L); + *rv -= --cv[s]; + --*L; + } while (s != t); + } +} + +static inline void save_masked_regions(void *km, uint64_v *res, perf_intv_v *P, int start) +{ + int i, saved = 0; + perf_intv_t *p; + if (P->n == 0 || P->a[P->n - 1].start >= start) return; + p = &P->a[P->n - 1]; + if (res->n) { + int s = res->a[res->n - 1]>>32, f = (uint32_t)res->a[res->n - 1]; + if (p->start <= f) // if overlapping with or adjacent to the previous interval + saved = 1, res->a[res->n - 1] = (uint64_t)s<<32 | (f > p->finish? 
f : p->finish); + } + if (!saved) kv_push(uint64_t, km, *res, (uint64_t)p->start<<32|p->finish); + for (i = P->n - 1; i >= 0 && P->a[i].start < start; --i); // remove perfect intervals that have falled out of the window + P->n = i + 1; +} + +static void find_perfect(void *km, perf_intv_v *P, const kdq_t(int) *w, int T, int start, int L, int rv, const int *cv) +{ + int c[SD_WTOT], r = rv, i, max_r = 0, max_l = 0; + memcpy(c, cv, SD_WTOT * sizeof(int)); + for (i = (long)kdq_size(w) - L - 1; i >= 0; --i) { + int j, t = kdq_at(w, i), new_r, new_l; + r += c[t]++; + new_r = r, new_l = kdq_size(w) - i - 1; + if (new_r * 10 > T * new_l) { + for (j = 0; j < (int)P->n && P->a[j].start >= i + start; ++j) { // find insertion position + perf_intv_t *p = &P->a[j]; + if (max_r == 0 || p->r * max_l > max_r * p->l) + max_r = p->r, max_l = p->l; + } + if (max_r == 0 || new_r * max_l >= max_r * new_l) { // then insert + max_r = new_r, max_l = new_l; + if (P->n == P->m) kv_resize(perf_intv_t, km, *P, P->n + 1); + memmove(&P->a[j+1], &P->a[j], (P->n - j) * sizeof(perf_intv_t)); // make room + ++P->n; + P->a[j].start = i + start, P->a[j].finish = kdq_size(w) + (SD_WLEN - 1) + start; + P->a[j].r = new_r, P->a[j].l = new_l; + } + } + } +} + +const uint64_t *sdust_core(const uint8_t *seq, int l_seq, int T, int W, int *n, sdust_buf_t *buf) +{ + int rv = 0, rw = 0, L = 0, cv[SD_WTOT], cw[SD_WTOT]; + int i, start, l; // _start_: start of the current window; _l_: length of a contiguous A/C/G/T (sub)sequence + unsigned t; // current word + + buf->P.n = buf->res.n = 0; + buf->w->front = buf->w->count = 0; + memset(cv, 0, SD_WTOT * sizeof(int)); + memset(cw, 0, SD_WTOT * sizeof(int)); + if (l_seq < 0) l_seq = strlen((const char*)seq); + for (i = l = t = 0; i <= l_seq; ++i) { + int b = i < l_seq? seq_nt4_table[seq[i]] : 4; + if (b < 4) { // an A/C/G/T base + ++l, t = (t<<2 | b) & SD_WMSK; + if (l >= SD_WLEN) { // we have seen a word + start = (l - W > 0? 
l - W : 0) + (i + 1 - l); // set the start of the current window + save_masked_regions(buf->km, &buf->res, &buf->P, start); // save intervals falling out of the current window? + shift_window(t, buf->w, T, W, &L, &rw, &rv, cw, cv); + if (rw * 10 > L * T) + find_perfect(buf->km, &buf->P, buf->w, T, start, L, rv, cv); + } + } else { // N or the end of sequence; N effectively breaks input into pieces of independent sequences + start = (l - W + 1 > 0? l - W + 1 : 0) + (i + 1 - l); + while (buf->P.n) save_masked_regions(buf->km, &buf->res, &buf->P, start++); // clear up unsaved perfect intervals + l = t = 0; + } + } + *n = buf->res.n; + return buf->res.a; +} + +uint64_t *sdust(void *km, const uint8_t *seq, int l_seq, int T, int W, int *n) +{ + uint64_t *ret; + sdust_buf_t *buf; + buf = sdust_buf_init(km); + ret = (uint64_t*)sdust_core(seq, l_seq, T, W, n, buf); + buf->res.a = 0; + sdust_buf_destroy(buf); + return ret; +} + +#ifdef _SDUST_MAIN +#include +#include +#include "ketopt.h" +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +int main(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *ks; + int W = 64, T = 20, c; + ketopt_t o = KETOPT_INIT; + + while ((c = ketopt(&o, argc, argv, 1, "w:t:", 0)) >= 0) { + if (c == 'w') W = atoi(o.arg); + else if (c == 't') T = atoi(o.arg); + } + if (o.ind == argc) { + fprintf(stderr, "Usage: sdust [-w %d] [-t %d] \n", W, T); + return 1; + } + fp = strcmp(argv[o.ind], "-")? 
gzopen(argv[o.ind], "r") : gzdopen(fileno(stdin), "r"); + ks = kseq_init(fp); + while (kseq_read(ks) >= 0) { + uint64_t *r; + int i, n; + r = sdust(0, (uint8_t*)ks->seq.s, -1, T, W, &n); + for (i = 0; i < n; ++i) + printf("%s\t%d\t%d\n", ks->name.s, (int)(r[i]>>32), (int)r[i]); + free(r); + } + kseq_destroy(ks); + gzclose(fp); + return 0; +} +#endif diff --git a/lib/minimap2/sdust 2.h b/lib/minimap2/sdust 2.h new file mode 100644 index 000000000..a12cab28c --- /dev/null +++ b/lib/minimap2/sdust 2.h @@ -0,0 +1,25 @@ +#ifndef SDUST_H +#define SDUST_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct sdust_buf_s; +typedef struct sdust_buf_s sdust_buf_t; + +// the simple interface +uint64_t *sdust(void *km, const uint8_t *seq, int l_seq, int T, int W, int *n); + +// the following interface dramatically reduce heap allocations when sdust is frequently called. +sdust_buf_t *sdust_buf_init(void *km); +void sdust_buf_destroy(sdust_buf_t *buf); +const uint64_t *sdust_core(const uint8_t *seq, int l_seq, int T, int W, int *n, sdust_buf_t *buf); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/lib/minimap2/seed.c b/lib/minimap2/seed.c index 08fe5ad89..76a67aedd 100644 --- a/lib/minimap2/seed.c +++ b/lib/minimap2/seed.c @@ -7,7 +7,7 @@ void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac) mm128_t *a; size_t i, j, st; if (mv->n <= q_occ_max || q_occ_frac <= 0.0f || q_occ_max <= 0) return; - KMALLOC(km, a, mv->n); + a = Kmalloc(km, mm128_t, mv->n); for (i = 0; i < mv->n; ++i) a[i].x = mv->a[i].x, a[i].y = i; radix_sort_128x(a, a + mv->n); diff --git a/lib/minimap2/setup 2.py b/lib/minimap2/setup 2.py new file mode 100644 index 000000000..ce4d79c28 --- /dev/null +++ b/lib/minimap2/setup 2.py @@ -0,0 +1,65 @@ +try: + from setuptools import setup, Extension + from setuptools.command.build_ext import build_ext +except ImportError: + from distutils.core import setup + from distutils.extension import Extension + from 
distutils.command.build_ext import build_ext + +import sys, platform, subprocess + + +def readme(): + with open('python/README.rst') as f: + return f.read() + + +class LibMM2Build(build_ext): + # Uses Makefile to build library, avoids duplicating logic + # determining which objects to compile but does require + # end users to have Make (since precompiled wheels are not + # distributed on PyPI). + def run(self): + def compile_libminimap2(*args, **kwargs): + cmd = ['make', 'libminimap2.a'] + list(args) + subprocess.check_call(cmd) + options = [] + if platform.machine() in ["aarch64", "arm64"]: + options = ["arm_neon=1", "aarch64=1"] + self.execute( + compile_libminimap2, options, + 'Compiling libminimap2 using Makefile') + build_ext.run(self) + + +setup( + name = 'mappy', + version = '2.24', + url = 'https://github.com/lh3/minimap2', + description = 'Minimap2 python binding', + long_description = readme(), + author = 'Heng Li', + author_email = 'lh3@me.com', + license = 'MIT', + keywords = 'sequence-alignment', + scripts = ['python/minimap2.py'], + cmdclass = {'build_ext': LibMM2Build}, + ext_modules = [ + Extension( + 'mappy', + sources = ['python/mappy.pyx'], + depends = ['python/cmappy.h', 'python/cmappy.pxd'], + include_dirs = ['.'], + extra_objects = ['libminimap2.a'], + libraries = ['z', 'm', 'pthread'])], + classifiers = [ + 'Development Status :: 5 - Production/Stable', + 'License :: OSI Approved :: MIT License', + 'Operating System :: POSIX', + 'Programming Language :: C', + 'Programming Language :: Cython', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Intended Audience :: Science/Research', + 'Topic :: Scientific/Engineering :: Bio-Informatics'], + setup_requires=["cython"]) diff --git a/lib/minimap2/setup.py b/lib/minimap2/setup.py index ce4d79c28..04db5b185 100644 --- a/lib/minimap2/setup.py +++ b/lib/minimap2/setup.py @@ -1,40 +1,29 @@ try: from setuptools import setup, Extension - from 
setuptools.command.build_ext import build_ext except ImportError: from distutils.core import setup from distutils.extension import Extension - from distutils.command.build_ext import build_ext -import sys, platform, subprocess +import sys, platform +sys.path.append('python') + +extra_compile_args = ['-DHAVE_KALLOC'] +include_dirs = ["."] + +if platform.machine() in ["aarch64", "arm64"]: + include_dirs.append("sse2neon/") + extra_compile_args.extend(['-ftree-vectorize', '-DKSW_SSE2_ONLY', '-D__SSE2__']) +else: + extra_compile_args.append('-msse4.1') # WARNING: ancient x86_64 CPUs don't have SSE4 def readme(): with open('python/README.rst') as f: return f.read() - -class LibMM2Build(build_ext): - # Uses Makefile to build library, avoids duplicating logic - # determining which objects to compile but does require - # end users to have Make (since precompiled wheels are not - # distributed on PyPI). - def run(self): - def compile_libminimap2(*args, **kwargs): - cmd = ['make', 'libminimap2.a'] + list(args) - subprocess.check_call(cmd) - options = [] - if platform.machine() in ["aarch64", "arm64"]: - options = ["arm_neon=1", "aarch64=1"] - self.execute( - compile_libminimap2, options, - 'Compiling libminimap2 using Makefile') - build_ext.run(self) - - setup( name = 'mappy', - version = '2.24', + version = '2.27', url = 'https://github.com/lh3/minimap2', description = 'Minimap2 python binding', long_description = readme(), @@ -43,15 +32,16 @@ def compile_libminimap2(*args, **kwargs): license = 'MIT', keywords = 'sequence-alignment', scripts = ['python/minimap2.py'], - cmdclass = {'build_ext': LibMM2Build}, - ext_modules = [ - Extension( - 'mappy', - sources = ['python/mappy.pyx'], - depends = ['python/cmappy.h', 'python/cmappy.pxd'], - include_dirs = ['.'], - extra_objects = ['libminimap2.a'], - libraries = ['z', 'm', 'pthread'])], + ext_modules = [Extension('mappy', + sources = ['python/mappy.pyx', 'align.c', 'bseq.c', 'lchain.c', 'seed.c', 'format.c', 'hit.c', 'index.c', 
'pe.c', 'options.c', + 'ksw2_extd2_sse.c', 'ksw2_exts2_sse.c', 'ksw2_extz2_sse.c', 'ksw2_ll_sse.c', + 'kalloc.c', 'kthread.c', 'map.c', 'misc.c', 'sdust.c', 'sketch.c', 'esterr.c', 'splitidx.c'], + depends = ['minimap.h', 'bseq.h', 'kalloc.h', 'kdq.h', 'khash.h', 'kseq.h', 'ksort.h', + 'ksw2.h', 'kthread.h', 'kvec.h', 'mmpriv.h', 'sdust.h', + 'python/cmappy.h', 'python/cmappy.pxd'], + extra_compile_args = extra_compile_args, + include_dirs = include_dirs, + libraries = ['z', 'm', 'pthread'])], classifiers = [ 'Development Status :: 5 - Production/Stable', 'License :: OSI Approved :: MIT License', diff --git a/lib/minimap2/sketch 2.c b/lib/minimap2/sketch 2.c new file mode 100644 index 000000000..1f6b7da46 --- /dev/null +++ b/lib/minimap2/sketch 2.c @@ -0,0 +1,198 @@ +#include +#include +#include +#include +#define __STDC_LIMIT_MACROS +#include "kvec.h" +#include "mmpriv.h" + +unsigned char seq_nt4_table[256] = { + 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; + +static inline uint64_t hash64(uint64_t key, uint64_t mask) +{ + key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1; + key = key ^ key >> 24; + key = ((key + (key << 3)) + (key << 8)) & mask; // key * 265 + key = key ^ key >> 14; + key = ((key + (key << 
2)) + (key << 4)) & mask; // key * 21 + key = key ^ key >> 28; + key = (key + (key << 31)) & mask; + return key; +} + +typedef struct { // a simplified version of kdq + int front, count; + int a[32]; +} tiny_queue_t; + +static inline void tq_push(tiny_queue_t *q, int x) +{ + q->a[((q->count++) + q->front) & 0x1f] = x; +} + +static inline int tq_shift(tiny_queue_t *q) +{ + int x; + if (q->count == 0) return -1; + x = q->a[q->front++]; + q->front &= 0x1f; + --q->count; + return x; +} + +/** + * Find symmetric (w,k)-minimizers on a DNA sequence + * + * @param km thread-local memory pool; using NULL falls back to malloc() + * @param str DNA sequence + * @param len length of $str + * @param w find a minimizer for every $w consecutive k-mers + * @param k k-mer size + * @param rid reference ID; will be copied to the output $p array + * @param is_hpc homopolymer-compressed or not + * @param p minimizers + * p->a[i].x = kMer<<8 | kmerSpan + * p->a[i].y = rid<<32 | lastPos<<1 | strand + * where lastPos is the position of the last base of the i-th minimizer, + * and strand indicates whether the minimizer comes from the top or the bottom strand. 
+ * Callers may want to set "p->n = 0"; otherwise results are appended to p + */ +void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p) +{ + uint64_t shift1 = 2 * (k - 1), mask = (1ULL<<2*k) - 1, kmer[2] = {0,0}; + int i, j, l, buf_pos, min_pos, kmer_span = 0; + mm128_t buf[256], min = { UINT64_MAX, UINT64_MAX }; + tiny_queue_t tq; + + assert(len > 0 && (w > 0 && w < 256) && (k > 0 && k <= 28)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice + memset(buf, 0xff, w * 16); + memset(&tq, 0, sizeof(tiny_queue_t)); + kv_resize(mm128_t, km, *p, p->n + len/w); + + for (i = l = buf_pos = min_pos = 0; i < len; ++i) { + int c = seq_nt4_table[(uint8_t)str[i]]; + mm128_t info = { UINT64_MAX, UINT64_MAX }; + if (c < 4) { // not an ambiguous base + int z; + if (is_hpc) { + int skip_len = 1; + if (i + 1 < len && seq_nt4_table[(uint8_t)str[i + 1]] == c) { + for (skip_len = 2; i + skip_len < len; ++skip_len) + if (seq_nt4_table[(uint8_t)str[i + skip_len]] != c) + break; + i += skip_len - 1; // put $i at the end of the current homopolymer run + } + tq_push(&tq, skip_len); + kmer_span += skip_len; + if (tq.count > k) kmer_span -= tq_shift(&tq); + } else kmer_span = l + 1 < k? l + 1 : k; + kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer + kmer[1] = (kmer[1] >> 2) | (3ULL^c) << shift1; // reverse k-mer + if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know it strand + z = kmer[0] < kmer[1]? 
0 : 1; // strand + ++l; + if (l >= k && kmer_span < 256) { + info.x = hash64(kmer[z], mask) << 8 | kmer_span; + info.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z; + } + } else l = 0, tq.count = tq.front = 0, kmer_span = 0; + buf[buf_pos] = info; // need to do this here as appropriate buf_pos and buf[buf_pos] are needed below + if (l == w + k - 1 && min.x != UINT64_MAX) { // special case for the first window - because identical k-mers are not stored yet + for (j = buf_pos + 1; j < w; ++j) + if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, km, *p, buf[j]); + for (j = 0; j < buf_pos; ++j) + if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, km, *p, buf[j]); + } + if (info.x <= min.x) { // a new minimum; then write the old min + if (l >= w + k && min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min); + min = info, min_pos = buf_pos; + } else if (buf_pos == min_pos) { // old min has moved outside the window + if (l >= w + k - 1 && min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min); + for (j = buf_pos + 1, min.x = UINT64_MAX; j < w; ++j) // the two loops are necessary when there are identical k-mers + if (min.x >= buf[j].x) min = buf[j], min_pos = j; // >= is important s.t. 
min is always the closest k-mer + for (j = 0; j <= buf_pos; ++j) + if (min.x >= buf[j].x) min = buf[j], min_pos = j; + if (l >= w + k - 1 && min.x != UINT64_MAX) { // write identical k-mers + for (j = buf_pos + 1; j < w; ++j) // these two loops make sure the output is sorted + if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, km, *p, buf[j]); + for (j = 0; j <= buf_pos; ++j) + if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, km, *p, buf[j]); + } + } + if (++buf_pos == w) buf_pos = 0; + } + if (min.x != UINT64_MAX) + kv_push(mm128_t, km, *p, min); +} + +void mm_sketch_syncmer(void *km, const char *str, int len, int smer, int k, uint32_t rid, int is_hpc, mm128_v *p) +{ + uint64_t shift1 = 2 * (k - 1), mask = (1ULL<<2*k) - 1, smask = (1ULL<<2*smer) - 1, kmer[2] = {0,0}; + int i, j, l, buf_pos, min_pos, kmer_span = 0; + tiny_queue_t tq; + + assert(len > 0 && (smer > 0 && smer <= k) && (k > 0 && k <= 28)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice + memset(&tq, 0, sizeof(tiny_queue_t)); + kv_resize(mm128_t, km, *p, p->n + len/(k - smer)); + + for (i = l = buf_pos = min_pos = 0; i < len; ++i) { + int c = seq_nt4_table[(uint8_t)str[i]]; + if (c < 4) { // not an ambiguous base + int z; + if (is_hpc) { + int skip_len = 1; + if (i + 1 < len && seq_nt4_table[(uint8_t)str[i + 1]] == c) { + for (skip_len = 2; i + skip_len < len; ++skip_len) + if (seq_nt4_table[(uint8_t)str[i + skip_len]] != c) + break; + i += skip_len - 1; // put $i at the end of the current homopolymer run + } + tq_push(&tq, skip_len); + kmer_span += skip_len; + if (tq.count > k) kmer_span -= tq_shift(&tq); + } else kmer_span = l + 1 < k? l + 1 : k; + kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer + kmer[1] = (kmer[1] >> 2) | (3ULL^c) << shift1; // reverse k-mer + if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know it strand + z = kmer[0] < kmer[1]? 
0 : 1; // strand + ++l; + if (l >= k && kmer_span < 256) { + uint64_t x, min = UINT64_MAX; + x = hash64(kmer[z], mask); + for (j = 0; j <= k - smer; ++j) { + uint64_t y = x >> (j + j) & smask; + min = min < y? min : y; + } + if ((x & smask) == min) { + mm128_t t; + t.x = x << 8 | kmer_span; + t.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z; + kv_push(mm128_t, km, *p, t); + } + } + } else l = 0, tq.count = tq.front = 0, kmer_span = 0; + } +} + +void mm_sketch2(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, int is_syncmer, mm128_v *p) +{ + if (is_syncmer) mm_sketch_syncmer(km, str, len, w, k, rid, is_hpc, p); + else mm_sketch(km, str, len, w, k, rid, is_hpc, p); +} diff --git a/lib/minimap2/sketch.c b/lib/minimap2/sketch.c index 1f6b7da46..f83069389 100644 --- a/lib/minimap2/sketch.c +++ b/lib/minimap2/sketch.c @@ -141,58 +141,3 @@ void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, i if (min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min); } - -void mm_sketch_syncmer(void *km, const char *str, int len, int smer, int k, uint32_t rid, int is_hpc, mm128_v *p) -{ - uint64_t shift1 = 2 * (k - 1), mask = (1ULL<<2*k) - 1, smask = (1ULL<<2*smer) - 1, kmer[2] = {0,0}; - int i, j, l, buf_pos, min_pos, kmer_span = 0; - tiny_queue_t tq; - - assert(len > 0 && (smer > 0 && smer <= k) && (k > 0 && k <= 28)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice - memset(&tq, 0, sizeof(tiny_queue_t)); - kv_resize(mm128_t, km, *p, p->n + len/(k - smer)); - - for (i = l = buf_pos = min_pos = 0; i < len; ++i) { - int c = seq_nt4_table[(uint8_t)str[i]]; - if (c < 4) { // not an ambiguous base - int z; - if (is_hpc) { - int skip_len = 1; - if (i + 1 < len && seq_nt4_table[(uint8_t)str[i + 1]] == c) { - for (skip_len = 2; i + skip_len < len; ++skip_len) - if (seq_nt4_table[(uint8_t)str[i + skip_len]] != c) - break; - i += skip_len - 1; // put $i at the end of the current homopolymer run - } - tq_push(&tq, skip_len); - 
kmer_span += skip_len; - if (tq.count > k) kmer_span -= tq_shift(&tq); - } else kmer_span = l + 1 < k? l + 1 : k; - kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer - kmer[1] = (kmer[1] >> 2) | (3ULL^c) << shift1; // reverse k-mer - if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know it strand - z = kmer[0] < kmer[1]? 0 : 1; // strand - ++l; - if (l >= k && kmer_span < 256) { - uint64_t x, min = UINT64_MAX; - x = hash64(kmer[z], mask); - for (j = 0; j <= k - smer; ++j) { - uint64_t y = x >> (j + j) & smask; - min = min < y? min : y; - } - if ((x & smask) == min) { - mm128_t t; - t.x = x << 8 | kmer_span; - t.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z; - kv_push(mm128_t, km, *p, t); - } - } - } else l = 0, tq.count = tq.front = 0, kmer_span = 0; - } -} - -void mm_sketch2(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, int is_syncmer, mm128_v *p) -{ - if (is_syncmer) mm_sketch_syncmer(km, str, len, w, k, rid, is_hpc, p); - else mm_sketch(km, str, len, w, k, rid, is_hpc, p); -} diff --git a/lib/minimap2/tex/Makefile b/lib/minimap2/tex/Makefile new file mode 100644 index 000000000..418a9628b --- /dev/null +++ b/lib/minimap2/tex/Makefile @@ -0,0 +1,21 @@ +.SUFFIXES: .gp .tex .eps .pdf .eps.gz + +.eps.pdf: + epstopdf --outfile $@ $< + +.eps.gz.pdf: + gzip -dc $< | epstopdf --filter > $@ + +.pdf.eps: + pdftops -eps $< $@ + +all:minimap2.pdf + +roc-color.eps:roc.gp + gnuplot roc.gp + +minimap2.pdf:minimap2.tex minimap2.bib roc-color.pdf + pdflatex minimap2; bibtex minimap2; pdflatex minimap2; pdflatex minimap2; + +clean: + rm -fr *.toc *.aux *.bbl *.blg *.idx *.log *.out *~ minimap2.pdf diff --git a/lib/minimap2/tex/bioinfo.cls b/lib/minimap2/tex/bioinfo.cls new file mode 100644 index 000000000..48f78669f --- /dev/null +++ b/lib/minimap2/tex/bioinfo.cls @@ -0,0 +1,930 @@ +\newcommand\classname{bioinfo} +\newcommand\lastmodifieddate{2003/02/08} +\newcommand\versionnumber{0.1} + +% Are we printing crop marks? 
+\newif\if@cropmarkson \@cropmarksontrue + +\NeedsTeXFormat{LaTeX2e}[2001/06/01] +\ProvidesClass{\classname}[\lastmodifieddate\space\versionnumber] + +\setlength{\paperheight}{11truein} +\setlength{\paperwidth}{8.5truein} + +\newif\if@final + +\DeclareOption{draft}{\PassOptionsToPackage{draft}{graphicx}} +\DeclareOption{a4paper}{\PassOptionsToPackage{a4}{crop}} +\DeclareOption{centre}{\PassOptionsToPackage{center}{crop}} +\DeclareOption{crop}{\PassOptionsToPackage{cam}{crop}\global\@cropmarksontrue} +\DeclareOption{nocrop}{\PassOptionsToPackage{off}{crop}\global\@cropmarksonfalse} +\DeclareOption{info}{\PassOptionsToPackage{info}{crop}} +\DeclareOption{noinfo}{\PassOptionsToPackage{noinfo}{crop}} +\DeclareOption{final}{\global\@finaltrue} + +\ExecuteOptions{a4paper,nocrop,centre,info} + +\ProcessOptions + +% Load all necessary packages +\RequirePackage{inputenc,crop,graphicx,amsmath,array,color,amssymb,flushend,stfloats,amsthm,chngpage,times} +%\RequirePackage[LY1]{fontenc} +%\RequirePackage[LY1,mtbold]{mathtime} +\def\authoraffliate{\fontfamily{phv}\selectfont} +\def\helvetica{\fontfamily{phv}\selectfont} +\def\helveticaitalic{\fontfamily{phv}\itshape\selectfont} +\def\helveticabold{\fontfamily{phv}\bfseries\selectfont} +\def\helveticabolditalic{\fontfamily{phv}\bfseries\itshape\selectfont} + +% Not sure if needed. 
+\newcommand\@ptsize{0} + +% Set twoside printing +\@twosidetrue + +% Marginal notes are on the outside edge +\@mparswitchfalse + +\reversemarginpar + +\renewcommand\normalsize{% + \@setfontsize\normalsize{9}{11}% + \abovedisplayskip 10\p@ \@plus2\p@ \@minus5\p@ + \abovedisplayshortskip \z@ \@plus3\p@ + \belowdisplayshortskip 6\p@ \@plus3\p@ \@minus3\p@ + \belowdisplayskip \abovedisplayskip + \let\@listi\@listI} +\normalsize +\let\@bls\baselineskip + +\newcommand\small{% + \@setfontsize\small{9}{11}% + \abovedisplayskip 11\p@ minus 3\p@ + \belowdisplayskip \abovedisplayskip + \abovedisplayshortskip \z@ plus 2\p@ + \belowdisplayshortskip 4\p@ plus 2\p@ minus2\p@ + \def\@listi{\topsep 4.5\p@ plus 2\p@ minus 1\p@ + \itemsep \parsep + \topsep 4\p@ plus 2\p@ minus 2\p@}} + +\newcommand\footnotesize{% + \@setfontsize\footnotesize{8}{10}% + \abovedisplayskip 6\p@ minus 3\p@ + \belowdisplayskip\abovedisplayskip + \abovedisplayshortskip \z@ plus 3\p@ + \belowdisplayshortskip 6\p@ plus 3\p@ minus 3\p@ + \def\@listi{\topsep 3\p@ plus 1\p@ minus 1\p@ + \parsep 2\p@ plus 1\p@ minus 1\p@\itemsep \parsep}} + +\def\scriptsize{\@setfontsize\scriptsize{7pt}{9pt}} +\def\tiny{\@setfontsize\tiny{5pt}{7pt}} +\def\large{\@setfontsize\large{11.5pt}{12pt}} +\def\Large{\@setfontsize\Large{14pt}{16}} +\def\LARGE{\@setfontsize\LARGE{15pt}{17pt}} +\def\huge{\@setfontsize\huge{22pt}{22pt}} +\def\Huge{\@setfontsize\Huge{30pt}{30pt}} + +\DeclareOldFontCommand{\rm}{\normalfont\rmfamily}{\mathrm} +\DeclareOldFontCommand{\sf}{\normalfont\sffamily}{\mathsf} +\DeclareOldFontCommand{\tt}{\normalfont\ttfamily}{\mathtt} +\DeclareOldFontCommand{\bf}{\normalfont\bfseries}{\mathbf} +\DeclareOldFontCommand{\it}{\normalfont\itshape}{\mathit} +\DeclareOldFontCommand{\sl}{\normalfont\slshape}{\@nomath\sl} +\DeclareOldFontCommand{\sc}{\normalfont\scshape}{\@nomath\sc} + +% Line spacing +\setlength\lineskip{1\p@} +\setlength\normallineskip{1\p@} +\renewcommand\baselinestretch{} + +% Paragraph dimensions and 
inter-para spacing +\setlength\parskip{0\p@} +\setlength\parindent{3mm} + +% Set inter-para skips +\setlength\smallskipamount{3\p@ \@plus 1\p@ \@minus 1\p@} +\setlength\medskipamount{6\p@ \@plus 2\p@} +\setlength\bigskipamount{12\p@ \@plus 4\p@ \@minus 4\p@} + +% Page break penalties +\@lowpenalty 51 +\@medpenalty 151 +\@highpenalty 301 + +% Disallow widows and orphans +\clubpenalty 10000 +\widowpenalty 10000 + +% Disable page breaks before equations, allow pagebreaks after +% equations and discourage widow lines before equations. +\displaywidowpenalty 100 +\predisplaypenalty 10000 +\postdisplaypenalty 2500 + +% Allow breaking the page in the middle of a paragraph +\interlinepenalty 0 + +% Disallow breaking the page after a hyphenated line +\brokenpenalty 10000 + +% Hyphenation; don't split words into less than three characters +\lefthyphenmin=3 +\righthyphenmin=3 + +% +% Set page layout dimensions +% +\setlength\headheight{16\p@} % height of running head +\setlength\topmargin{2.9pc} % head margin +\addtolength\topmargin{-1in} % subtract out the 1 inch driver margin + +\setlength\topskip{10\p@} % height of first line of text +\setlength\headsep{19\p@} % space below running head -- + +\setlength\footskip{34\p@} % space above footer line +\setlength\maxdepth{.5\topskip} % pages can be short or deep by half a line? 
+ +\setlength\textwidth{42pc} % text measure excluding margins + +\setlength\textheight{58\baselineskip} % 54 lines on a full page, +\addtolength\textheight{\topskip} % including the first + % line on the page + +% Set the margins +\setlength\marginparsep{3\p@} +\setlength\marginparpush{3\p@} +\setlength\marginparwidth{35\p@} + +\setlength\oddsidemargin{4.5pc} +\addtolength\oddsidemargin{-1in} % subtract out the 1 inch driver margin +\setlength\@tempdima{\paperwidth} +\addtolength\@tempdima{-\textwidth} +\addtolength\@tempdima{-4.5pc} +\setlength\evensidemargin{\@tempdima} +\addtolength\evensidemargin{-1in} + +\setlength\columnsep{1.5pc} % space between columns for double-column text +\setlength\columnseprule{0\p@} % width of rule between two columns + +% Footnotes +\setlength\footnotesep{9\p@} % space between footnotes +% space between text and footnote +\setlength{\skip\footins}{12\p@ \@plus 6\p@ \@minus 1\p@} + +% Float placement parameters + +% The total number of floats that can be allowed on a page. +\setcounter{totalnumber}{10} +% The maximum number of floats at the top and bottom of a page. +\setcounter{topnumber}{5} +\setcounter{bottomnumber}{5} +% The maximum part of the top or bottom of a text page that can be +% occupied by floats. This is set so that at least four lines of text +% fit on the page. +\renewcommand\topfraction{.9} +\renewcommand\bottomfraction{.9} +% The minimum amount of a text page that must be occupied by text. +% This should accommodate four lines of text. +\renewcommand\textfraction{.06} +% The minimum amount of a float page that must be occupied by floats. 
+\renewcommand\floatpagefraction{.94} + +% The same parameters repeated for double column output +\renewcommand\dbltopfraction{.9} +\renewcommand\dblfloatpagefraction{.9} + +% Space between floats +\setlength\floatsep {12\p@ \@plus 2\p@ \@minus 2\p@} +% Space between floats and text +\setlength\textfloatsep{20\p@ \@plus 2\p@ \@minus 4\p@} +% Space above and below an inline figure +\setlength\intextsep {18\p@ \@plus 2\p@ \@minus 2\p@} + +% For double column floats +\setlength\dblfloatsep {12\p@ \@plus 2\p@ \@minus 2\p@} +\setlength\dbltextfloatsep{20\p@ \@plus 2\p@ \@minus 4\p@} + +% Space left at top, bottom and in between floats on a float page. +\setlength\@fptop{0\p@} % no space above float page figures +\setlength\@fpsep{12\p@ \@plus 1fil} +\setlength\@fpbot{0\p@} + +% The same for double column +\setlength\@dblfptop{0\p@} +\setlength\@dblfpsep{12\p@ \@plus 1fil} +\setlength\@dblfpbot{0\p@} + +% Override settings in mathtime back to TeX defaults +\DeclareMathSizes{5} {5} {5} {5} +\DeclareMathSizes{6} {6} {5} {5} +\DeclareMathSizes{7} {7} {5} {5} +\DeclareMathSizes{8} {8} {6} {5} +\DeclareMathSizes{9} {9} {6.5} {5} +\DeclareMathSizes{10} {10} {7.5} {5} +\DeclareMathSizes{12} {12} {9} {7} + +% Page styles +\def\ps@headings + {% + \def\@oddfoot{\vbox to 12.5\p@{\hbox{\rule{\textwidth}{0.5\p@}}\vss + \hbox to \textwidth{\hfill\helveticabold\small\thepage}% + }}% + \def\@evenfoot{\vbox to 12.5\p@{\rule{\textwidth}{0.5\p@}\vss + \hbox to \textwidth{\helveticabold\small\thepage\hfill}% + }}% + \def\@evenhead{\vbox{\hbox to \textwidth{\fontsize{8}{10}\selectfont + \helveticabold{\fontshape{it}\selectfont + \strut\leftmark}\hfill}\vspace{6.5\p@}\rule{\textwidth}{0.5\p@}}}% + \def\@oddhead{\vbox{\hbox to \textwidth{\hfill\fontsize{8}{10}\selectfont + \helveticabold{\fontshape{it}\selectfont\strut\rightmark}}% + \vspace{6.5\p@}\rule{\textwidth}{0.5\p@}}}% + \def\titlemark##1{\markboth{##1}{##1}}% + \def\authormark##1{\gdef\leftmark{##1}}% + } + +\def\ps@opening + {% + 
\def\@oddfoot{\vbox to 13\p@{\hbox{\rule{\textwidth}{1\p@}}\vss + \hbox to \textwidth{\helvetica + \fontsize{7}{9}\fontshape{n}\selectfont% + \hfill\small\helveticabold\thepage}% + }}% + \def\@evenfoot{\vbox to 13\p@{\rule{\textwidth}\vss + \hbox to \textwidth{\helvetica\thepage\hfill + \fontsize{7}{9}\fontshape{n}\selectfont}% + }}% + \let\@evenhead\relax + \let\@oddhead\relax} + +% Page range +\newif\iflastpagegiven \lastpagegivenfalse +\newcommand\firstpage[1]{% + \gdef\@firstpage{#1}% + \ifnum\@firstpage>\c@page + \setcounter{page}{#1}% + \ClassWarning{BIO}{Increasing pagenumber to \@firstpage}% + \else \ifnum\@firstpage<\c@page + \ClassWarning{BIO}{Firstpage lower than pagenumber}\fi\fi + \xdef\@firstpage{\the\c@page}% + } +\def\@firstpage{1} +\def\pagenumbering#1{% + \global\c@page \@ne + \gdef\thepage{\csname @#1\endcsname \c@page}% + \gdef\thefirstpage{% + \csname @#1\endcsname \@firstpage}% + \gdef\thelastpage{% + \csname @#1\endcsname \@lastpage}% + } + +\newcommand\lastpage[1]{\xdef\@lastpage{#1}% + \global\lastpagegiventrue} +\def\@lastpage{0} +\def\setlastpage{\iflastpagegiven\else + \edef\@tempa{@lastpage@}% + \expandafter + \ifx \csname \@tempa \endcsname \relax + \gdef\@lastpage{0}% + \else + \xdef\@lastpage{\@nameuse{@lastpage@}}% + \fi + \fi } +\def\writelastpage{% + \iflastpagegiven \else + \immediate\write\@auxout% + {\string\global\string\@namedef{@lastpage@}{\the\c@page}}% + \fi + } +\def\thepagerange{% + \ifnum\@lastpage =0 {\ \bf ???} \else + \ifnum\@lastpage = \@firstpage \ \thefirstpage\else + \thefirstpage--\thelastpage \fi\fi} + +\AtBeginDocument{\setlastpage + \pagenumbering{arabic}% + } +\AtEndDocument{% + \writelastpage + \if@final + \clearemptydoublepage + \else + \clearpage + \fi} + +% +% Sectional units +% + +% Counters +\newcounter{section} +\newcounter{subsection}[section] +\newcounter{subsubsection}[subsection] +\newcounter{paragraph}[subsubsection] +\newcounter{subparagraph}[paragraph] +\newcounter{figure} +\newcounter{table} + 
+% Form of the numbers +\newcommand\thepage{\arabic{page}} +\renewcommand\thesection{\arabic{section}} +\renewcommand\thesubsection{{\thesection.\arabic{subsection}}} +\renewcommand\thesubsubsection{{\thesubsection.\arabic{subsubsection}}} +\renewcommand\theparagraph{\thesubsubsection.\arabic{paragraph}} +\renewcommand\thesubparagraph{\theparagraph.\arabic{subparagraph}} +\renewcommand\theequation{\arabic{equation}} + +% Form of the words +\newcommand\contentsname{Contents} +\newcommand\listfigurename{List of Figures} +\newcommand\listtablename{List of Tables} +\newcommand\partname{Part} +\newcommand\appendixname{Appendix} +\newcommand\abstractname{Abstract} +\newcommand\refname{References} +\newcommand\bibname{References} +\newcommand\indexname{Index} +\newcommand\figurename{Fig.} +\newcommand\tablename{Table} + +% Clearemptydoublepage should really clear the running heads too +\newcommand{\clearemptydoublepage}{\newpage{\pagestyle{empty}\cleardoublepage}} + +% Frontmatter, mainmatter and backmatter + +\newif\if@mainmatter \@mainmattertrue + +\newcommand\frontmatter{% + \clearpage + \@mainmatterfalse + \pagenumbering{roman}} + +\newcommand\mainmatter{% + \clearpage + \@mainmattertrue + \pagenumbering{arabic}} + +\newcommand\backmatter{% + \clearpage + \@mainmatterfalse} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% TITLE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\newlength{\dropfromtop} +\setlength{\dropfromtop}{\z@} + +% Application Notes +\newif\if@appnotes +\newcommand{\application}{% +% \setlength{\dropfromtop}{-2.25pc}% + \global\@appnotestrue} + +\long\def\title{\@ifnextchar[{\short@title}{\@@title}} +\def\short@title[#1]{\titlemark{#1}\@@@title} +\def\@@title#1{\authormark{#1}\@@@title{#1}} +\long\def\@@@title#1{\gdef\@title{#1}} + +\long\def\author{\@ifnextchar[{\short@uthor}{\@uthor}} +\def\short@uthor[#1]{\authormark{#1}\@@author} +\def\@uthor#1{\authormark{#1}\@@author{#1}} +\long\def\@@author#1{\gdef\@author{#1}} + +\def\vol#1{\global\def\@vol{#1}} 
+\def\issue#1{\global\def\@issue{#1}} +\def\address#1{\global\def\@issue{#1}} +\def\history#1{\global\def\@history{#1}} +\def\editor#1{\global\def\@editor{#1}} +\def\pubyear#1{\global\def\@pubyear{#1}} +\def\copyrightyear#1{\global\def\@copyrightyear{#1}} +\def\address#1{\global\def\@address{#1}} +\def\DOI#1{\global\def\@DOI{#1}} + +\definecolor{gray}{cmyk}{0, 0, 0, 0.15} +\newlength{\extraspace} +\setlength{\extraspace}{\z@} + +\newcommand\maketitle{\par + \begingroup + \renewcommand\thefootnote{\@fnsymbol\c@footnote}% + \def\@makefnmark{\rlap{\@textsuperscript{\normalfont\@thefnmark}}}% + \long\def\@makefntext##1{\parindent 3mm\noindent +% \@textsuperscript{\normalfont\@thefnmark}\raggedright##1}% + \@textsuperscript{\normalfont\@thefnmark}##1}% + \if@twocolumn + \ifnum \col@number=\@ne + \@maketitle + \else + \twocolumn[\@maketitle]% + \fi + \else + \newpage + \global\@topnum\z@ % Prevents figures from going at top of page. + \@maketitle + \fi + \thispagestyle{opening}\@thanks + \endgroup + \setcounter{footnote}{0}% + \global\let\thanks\relax + \global\let\maketitle\relax + \global\let\@maketitle\relax + \global\let\@address\@empty + \global\let\@history\@empty + \global\let\@editor\@empty + \global\let\@thanks\@empty + \global\let\@author\@empty + \global\let\@date\@empty + \global\let\@title\@empty + \global\let\@pubyear\@empty + \global\let\address\relax + \global\let\history\relax + \global\let\editor\relax + \global\let\title\relax + \global\let\author\relax + \global\let\date\relax + \global\let\pubyear\relax + \global\let\@copyrightline\@empty + \global\let\and\relax + \@afterindentfalse\@afterheading +} + +\newlength{\aboveskipchk}%for checking oddpage or evenpage top skip +\setlength{\aboveskipchk}{\z@}% + +\def\@maketitle{% + \let\footnote\thanks + \clearemptydoublepage + \checkoddpage\ifcpoddpage\setlength{\aboveskipchk}{-3pc}\else\setlength{\aboveskipchk}{-5pc}\fi%for checking oddpage or evenpage top skip%% + \vspace*{\aboveskipchk}% + 
\vspace{\dropfromtop}% + \hbox to \textwidth{% + {\helvetica\itshape\bfseries\fontsize{19}{12}\selectfont {\color{gray}TECHNICAL REPORT} + \hfil + \if@appnotes APPLICATIONS NOTE\hfil\fi + }% +\enskip \parbox[b]{11.3pc}{% + \helvetica + \flushright\fontsize{8}{10}\fontshape{it}\selectfont + \hfill + }} + \rule{\textwidth}{1\p@}\par% + \helvetica + \hbox to \textwidth{% + \parbox[t]{41pc}{% + \vspace*{1sp} + {\helveticabold\fontsize{16}{21}\selectfont\raggedright \@title \par}% + \vspace{4.5\p@} + {\authoraffliate\fontsize{11}{13}\selectfont\raggedright \@author \par}% + \vspace{4\p@} + {\authoraffliate\fontsize{9}{11}\selectfont\raggedright \@address \par}% + \vspace{4\p@} + %{\helvetica\fontsize{8}{10}\selectfont\raggedright \@history \par} + %\vspace{24\p@} + %{\helvetica\fontsize{10}{12}\selectfont\raggedright \@editor \par} + %\vspace{20\p@} + }% + } + \vspace{4.5\p@}% + \rule{\textwidth}{1\p@}% + \vspace{12\p@ plus 6\p@ minus 6\p@}% + \vspace{\extraspace} + } +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +%%%%%%%%%%%%%%%%%%%%%%%%%%%% Abstract %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\newcommand{\absection}[1]{% + \par\noindent{\bfseries #1}\space\ignorespaces} + +\newenvironment{abstract}{% + \begingroup + \let\section\absection + \fontfamily{\sfdefault}\fontsize{8}{11}\sffamily\selectfont + {\fontseries{b}\selectfont ABSTRACT}\par} +{\endgroup\bigskip\@afterheading\@afterindentfalse\vskip 12pt plus 3pt minus 1pt} + +% Section macros + +% Lowest level heading that takes a number by default +\setcounter{secnumdepth}{3} + +\renewcommand{\@seccntformat}[1]{\csname the#1\endcsname\quad} + +\def\section{% + \@startsection{section}{1}{\z@} + {-22\p@ plus -3\p@}{3\p@} + {\reset@font\raggedright\helveticabold\fontsize{10}{12}\selectfont\MakeUppercase}} + +\def\subsection{% + \@startsection{subsection}{2}{\z@} + {-11\p@ plus -2\p@}{3\p@} + {\reset@font\raggedright\mathversion{bold}\fontseries{b}\fontsize{10}{12}\selectfont}} + 
+\def\subsubsection{% + \@startsection{subsubsection}{3}{\z@} + %{-11\p@ plus -1\p@}{-1em} + {-11\p@ plus -1\p@}{0.001em} + {\reset@font\normalfont\normalsize\itshape}} + +\def\textcolon{\text{\rm :}} + + \def\paragraph{% + \@startsection{paragraph}{4}{\z@} + {-6\p@} + {-.4em} + {\reset@font\itshape}} + +% ******************** +% Figures and tables * +% ******************** + +% Table and array parameters +\setlength\arraycolsep{.5em} +\setlength\tabcolsep{.5em} +\setlength\arrayrulewidth{.5pt} +\setlength\doublerulesep{2.5pt} +\setlength\extrarowheight{\z@} +\renewcommand\arraystretch{1} + +\newlength{\abovecaptionskip} +\newlength{\belowcaptionskip} +\setlength{\abovecaptionskip}{13pt} +\setlength{\belowcaptionskip}{10.5pt} + +\long\def\@makecaption#1#2{\vspace{\abovecaptionskip}% + \begingroup + \footnotesize + \textbf{#1.}\enskip{#2}\par + \endgroup} + +\long\def\@tablecaption#1#2{% + \begingroup + \footnotesize + \textbf{#1.}\enskip{#2\strut\par} + \endgroup\vspace{\belowcaptionskip}} + +% Table rules +\def\toprule{\noalign{\ifnum0=`}\fi\hrule \@height 0.5pt \hrule \@height 6pt \@width 0pt \futurelet + \@tempa\@xhline} +\def\midrule{\noalign{\ifnum0=`}\fi \hrule \@height 6.75pt \@width 0pt \hrule \@height 0.5pt + \hrule \@height 6pt \@width 0pt \futurelet \@tempa\@xhline} +\def\botrule{\noalign{\ifnum0=`}\fi \hrule \@height 5.75pt \@width 0pt \hrule \@height 0.5pt \futurelet + \@tempa\@xhline} +\def\hrulefill{\leavevmode\leaders\hrule height .5pt\hfill\kern\z@} + +\def\thefigure{\@arabic\c@figure} +\def\fps@figure{tbp} +\def\ftype@figure{1} +\def\ext@figure{lof} +\def\fnum@figure{\figurename~\thefigure} +\def\figure{\@float{figure}} +\let\endfigure\end@float +\@namedef{figure*}{\@dblfloat{figure}} +\@namedef{endfigure*}{\end@dblfloat} +\def\thetable{\@arabic\c@table} +\def\fps@table{tbp} +\def\ftype@table{2} +\def\ext@table{lot} +\def\fnum@table{Table~\thetable} +\def\table{\let\@makecaption\@tablecaption\let\source\tablesource\@float{table}} 
+\def\endtable{\end@float} +\@namedef{table*}{\let\@makecaption\@tablecaption\@dblfloat{table}} +\@namedef{endtable*}{\end@dblfloat} + +\newif\if@rotate \@rotatefalse +\newif\if@rotatecenter \@rotatecenterfalse +\def\rotatecenter{\global\@rotatecentertrue} +\def\rotateendcenter{\global\@rotatecenterfalse} +\def\rotate{\global\@rotatetrue} +\def\endrotate{\global\@rotatefalse} +\newdimen\rotdimen +\def\rotstart#1{\special{ps: gsave currentpoint currentpoint translate + #1 neg exch neg exch translate}} +\def\rotfinish{\special{ps: currentpoint grestore moveto}} +\def\rotl#1{\rotdimen=\ht#1\advance\rotdimen by \dp#1 + \hbox to \rotdimen{\vbox to\wd#1{\vskip \wd#1 + \rotstart{270 rotate}\box #1\vss}\hss}\rotfinish} +\def\rotr#1{\rotdimen=\ht #1\advance\rotdimen by \dp#1 + \hbox to \rotdimen{\vbox to \wd#1{\vskip \wd#1 + \rotstart{90 rotate}\box #1\vss}\hss}\rotfinish} + +\newdimen\tempdime +\newbox\temptbox + +% From ifmtarg.sty +% Copyright Peter Wilson and Donald Arseneau, 2000 +\begingroup +\catcode`\Q=3 +\long\gdef\@ifmtarg#1{\@xifmtarg#1QQ\@secondoftwo\@firstoftwo\@nil} +\long\gdef\@xifmtarg#1#2Q#3#4#5\@nil{#4} +\long\gdef\@ifnotmtarg#1{\@xifmtarg#1QQ\@firstofone\@gobble\@nil} +\endgroup + +\def\tablesize{\@setfontsize\tablesize{8\p@}{10\p@}} + +\newenvironment{processtable}[3]{\setbox\temptbox=\hbox{{\tablesize #2}}% +\tempdime\wd\temptbox\@processtable{#1}{#2}{#3}{\tempdime}} +{\relax} + +\newcommand{\@processtable}[4]{% +\if@rotate +\setbox4=\vbox to \hsize{\vss\hbox to \textheight{% +\begin{minipage}{#4}% +\@ifmtarg{#1}{}{\caption{#1}}{\tablesize #2}% +\vskip7\p@\noindent +\parbox{#4}{\fontsize{7}{9}\selectfont #3\par}% +\end{minipage}}\vss}% +\rotr{4} +\else +\hbox to \hsize{\hss\begin{minipage}[t]{#4}% +\vskip2.9pt +\@ifmtarg{#1}{}{\caption{#1}}{\tablesize #2}% +\vskip6\p@\noindent +\parbox{#4}{\fontsize{7}{9}\selectfont #3\par}% +\end{minipage}\hss}\fi}% + +\newcolumntype{P}[1]{>{\raggedright\let\\\@arraycr\hangindent1em}p{#1}} + +% 
****************************** +% List numbering and lettering * +% ****************************** +\def\labelenumi{{\rm\arabic{enumi}.}} +\def\theenumi{\arabic{enumi}} +\def\labelenumii{{\rm\alph{enumii}.}} +\def\theenumii{\alph{enumii}} +\def\p@enumii{\theenumi} +\def\labelenumiii{{\rm(\arabic{enumiii})}} +\def\theenumiii{\roman{enumiii}} +\def\p@enumiii{\theenumi(\theenumii)} +\def\labelenumiv{{\rm(\arabic{enumiv})}} +\def\theenumiv{\Alph{enumiv}} +\def\p@enumiv{\p@enumiii\theenumiii} +\def\labelitemi{{\small$\bullet$}} +\def\labelitemii{{\small$\bullet$}} +\def\labelitemiii{{\small$\bullet$}} +\def\labelitemiv{{\small$\bullet$}} + +\def\@listI{\leftmargin\leftmargini \topsep\medskipamount} +\let\@listi\@listI +\@listi +\def\@listii{\topsep\z@\leftmargin\leftmarginii} +\def\@listiii{\leftmargin\leftmarginiii \topsep\z@} +\def\@listiv{\leftmargin\leftmarginiv \topsep\z@} +\def\@listv{\leftmargin\leftmarginv \topsep\z@} +\def\@listvi{\leftmargin\leftmarginvi \topsep\z@} + +\setlength{\leftmargini}{3mm} +\setlength{\leftmarginii}{\z@} +\setlength{\leftmarginiii}{\z@} +\setlength{\leftmarginiv}{\z@} + +% Changes to the list parameters for enumerate +\def\enumargs{% + \partopsep \z@ + \itemsep 3\p@ + \parsep \z@ + \labelsep 0.5em + \listparindent \parindent + \itemindent \z@ + \topsep 11\p@ +} + +\def\enumerate{% + \@ifnextchar[{\@numerate}{\@numerate[0]}} + +\def\@numerate[#1]{% + \ifnum \@enumdepth >3 \@toodeep\else + \advance\@enumdepth \@ne + \edef\@enumctr{enum\romannumeral\the\@enumdepth} + \list{\csname label\@enumctr\endcsname}{% + \enumargs + \setlength{\leftmargin}{\csname leftmargin\romannumeral\the\@enumdepth\endcsname} + \usecounter{\@enumctr} + \settowidth\labelwidth{#1} + \addtolength{\leftmargin}{\labelwidth} + \addtolength{\leftmargin}{\labelsep} + \def\makelabel##1{\hss \llap{##1}}}% + \fi + } +\let\endenumerate\endlist + +% Changes to the list parameters for itemize +\def\itemargs{% + \partopsep \z@ + \itemsep 3\p@ + \parsep \z@ + \labelsep 0.5em + 
\rightmargin \z@ + \listparindent \parindent + \itemindent \z@ + \topsep11\p@ +} + +\def\itemize{% + \@ifnextchar[{\@itemize}{\@itemize[$\bullet$]}} + +\def\@itemize[#1]{% + \ifnum \@itemdepth >3 \@toodeep\else + \advance\@itemdepth \@ne + \edef\@itemctr{item\romannumeral\the\@itemdepth} + \list{\csname label\@itemctr\endcsname}{% + \itemargs + \setlength{\leftmargin}{\csname leftmargin\romannumeral\the\@itemdepth\endcsname} + \settowidth\labelwidth{#1} + \addtolength{\leftmargin}{\labelwidth} + \addtolength{\leftmargin}{\labelsep} + \def\makelabel##1{\hss \llap{##1}}}% + \fi + } +\let\enditemize\endlist + +\newenvironment{unlist}{% + \begin{list}{}% + {\setlength{\labelwidth}{\z@}% + \setlength{\labelsep}{\z@}% + \setlength{\topsep}{\medskipamount}% + \setlength{\itemsep}{3\p@}% + \setlength{\leftmargin}{2em}% + \setlength{\itemindent}{-2em}}} +{\end{list}} + + +% *********************** +% Quotes and Quotations * +% *********************** +\def\quotation{\par\begin{list}{}{ + \setlength{\topsep}{\medskipamount} + \setlength{\leftmargin}{2em}% + \setlength{\rightmargin}{\z@}% + \setlength\labelwidth{0pt}% + \setlength\labelsep{0pt}% + \listparindent\parindent}% + \item[]} +\def\endquotation{\end{list}} +\let\quote\quotation +\let\endquote\endquotation + +\skip\@mpfootins = \skip\footins +\fboxsep=6\p@ +\fboxrule=1\p@ + +% ******************* +% Table of contents * +% ******************* +\newcommand\@pnumwidth{4em} +\newcommand\@tocrmarg{2.55em plus 1fil} +\newcommand\@dotsep{1000} +\setcounter{tocdepth}{4} + +\def\numberline#1{\hbox to \@tempdima{{#1}}} + +\def\@authortocline#1#2#3#4#5{% + \vskip 1.5\p@ + \ifnum #1>\c@tocdepth \else + {\leftskip #2\relax \rightskip \@tocrmarg \parfillskip -\rightskip + \parindent #2\relax\@afterindenttrue + \interlinepenalty\@M + \leavevmode + \@tempdima #3\relax + \advance\leftskip \@tempdima \null\nobreak\hskip -\leftskip + {\itshape #4}\nobreak + \leaders\hbox{$\m@th + \mkern \@dotsep mu\hbox{.}\mkern \@dotsep + mu$}\hfill + 
\nobreak + \hb@xt@\@pnumwidth{\hfil}% + \par}% + \fi} + +\newcommand*\l@author{\@authortocline{2}{0pt}{30pt}} +\newcommand*\l@section{\@dottedtocline{3}{11pt}{20pt}} +\newcommand*\l@subsection{\@dottedtocline{4}{31pt}{29pt}} +\newcommand*\l@subsubsection[2]{} + + + +% *********** +% Footnotes * +% *********** + +\def\footnoterule{\noindent\rule{\columnwidth}{0.5pt}} +\def\@makefnmark{\@textsuperscript{\normalfont\@thefnmark}}% +\newcommand\@makefntext[1]{\noindent{\@makefnmark}\enskip#1} + +% *********** +% References * +% *********** + +\providecommand{\newblock}{} +\newenvironment{thebibliography}{% + \section{\bibname}% + \begingroup + \small + \begin{list}{}{% + \setlength{\topsep}{\z@}% + \setlength{\labelsep}{\z@}% + \settowidth{\labelwidth}{\z@}% + \setlength{\leftmargin}{4mm}% + \setlength{\itemindent}{-4mm}}\small} +{\end{list}\endgroup} + +\RequirePackage{natbib} + +% ********** +% Appendix * +% ********** +\newif\ifappend % Are we in the Appendix? +\def\appendix{\par + \setcounter{section}{0} + \setcounter{subsection}{0} + \appendtrue +} + +%Math parameters + +\setlength{\jot}{5\p@} +\mathchardef\@m=1500 % adapted value + +\def\frenchspacing{\sfcode`\.\@m \sfcode`\?\@m \sfcode`\!\@m + \sfcode`\:\@m \sfcode`\;\@m \sfcode`\,\@m} + +% Theorems +\def\th@plain{% +%% \let\thm@indent\noindent % no indent +\thm@headfont{\quad\scshape}% heading font is bold +\thm@notefont{\upshape\mdseries}% same as heading font +\thm@headpunct{.}% no period after heading +\thm@headsep 5\p@ plus\p@ minus\p@\relax +%% \let\thm@swap\@gobble +%% \thm@preskip\topsep +%% \thm@postskip\theorempreskipamount +\itshape % body font +} + +\vbadness=9999 +\tolerance=9999 +\doublehyphendemerits=10000 +\doublehyphendemerits 640000 % corresponds to badness 800 +\finalhyphendemerits 1000000 % corresponds to badness 1000 + +\flushbottom +\frenchspacing +\ps@headings +\twocolumn + +% Screen PDF compatability +\newcommand{\medline}[1]{% + \unskip\unskip\ignorespaces} + + +%%%%for smaller size text 
+\newenvironment{methods}{% + \begingroup +\def\section{% + \@startsection{section}{1}{\z@} + {-24\p@ plus -3\p@}{4\p@} + {\reset@font\raggedright\helveticabold\fontsize{10}{12}\selectfont\MakeUppercase}} + \def\subsection{% + \@startsection{subsection}{2}{\z@} + {-5\p@ plus -2\p@}{4\p@} + {\reset@font\raggedright\mathversion{bold}\fontseries{b}\fontsize{10}{12}\selectfont}} + \def\subsubsection{% + \@startsection{subsubsection}{3}{\z@} +% {-6\p@ plus -1\p@}{-1em} + {-6\p@ plus -1\p@}{0.001em} + {\reset@font\normalfont\normalsize\itshape}} +\footnotesize + \par} +{\par\endgroup\bigskip\@afterheading\@afterindentfalse} + + + +\graphicspath{{g:/artwork/oup/bioinfo/}} + +\language=2 + +\hyphenation{Figure Table Figures Tables} + +\newcommand{\href}[2]{#2} + +\renewenvironment{proof}[1][\proofname]{\par + \normalfont \topsep6\p@\@plus6\p@\relax + \labelsep 0.5em + \trivlist + \item[\hskip\labelsep\hskip1em\textsc{#1}.]\ignorespaces +}{\endtrivlist\@endpefalse} + +%%Different Bonds + +\def\sbond{\ensuremath{\raise.25ex\hbox{${-}\!\!\!\!{-}$}}\kern -.9pt} +\def\dbond{\ensuremath{\raise.25ex\hbox{=$\!$=}}} +\def\tbond{\ensuremath{\raise.20ex\hbox{${\equiv}\!\!\!{\equiv}$}}} + +% Author queries +%\fboxsep=4\p@ +%\fboxrule=0.5\p@ +\newcommand{\query}[2][0pt]{}% +% \marginpar{\vspace*{#1}% +% {\parbox{\marginparwidth}{% +% \raggedright\fontsize{6}{8}\selectfont +% #2}}}} + +\renewcommand{\dag}{{\mathversion{normal}$^{\dagger}$}} + +\endinput diff --git a/lib/minimap2/tex/blasr-mc.eval b/lib/minimap2/tex/blasr-mc.eval new file mode 100644 index 000000000..1c4314885 --- /dev/null +++ b/lib/minimap2/tex/blasr-mc.eval @@ -0,0 +1,17 @@ +Q 60 32681 57 0.001744133 +Q 39 3 1 0.001774569 +Q 38 3 1 0.001804999 +Q 35 5 1 0.001835311 +Q 34 31 2 0.001894692 +Q 20 11 2 0.001955154 +Q 19 4 1 0.001985460 +Q 15 29 5 0.002136296 +Q 14 6 1 0.002166417 +Q 10 11 1 0.002196193 +Q 6 11 2 0.002256442 +Q 5 1 1 0.002286864 +Q 4 1 1 0.002317285 +Q 3 36 15 0.002771602 +Q 2 5 2 0.002832085 +Q 1 12 9 
0.003105023 +Q 0 220 83 0.005594194 diff --git a/lib/minimap2/tex/bowtie2-s3.sam.eval b/lib/minimap2/tex/bowtie2-s3.sam.eval new file mode 100644 index 000000000..092247a53 --- /dev/null +++ b/lib/minimap2/tex/bowtie2-s3.sam.eval @@ -0,0 +1,28 @@ +Q 42 16872292 669 0.000039651 16872292 +Q 40 835329 636 0.000073697 17707621 +Q 31 6544 2 0.000073783 17714165 +Q 30 8882 6 0.000074084 17723047 +Q 27 68499 9 0.000074305 17791546 +Q 26 132041 81 0.000078277 17923587 +Q 25 129378 96 0.000083033 18052965 +Q 24 92056 382 0.000103665 18145021 +Q 23 14341 402 0.000125720 18159362 +Q 22 132838 146 0.000132789 18292200 +Q 21 122274 124 0.000138641 18414474 +Q 18 112183 103 0.000143361 18526657 +Q 17 126981 213 0.000153804 18653638 +Q 16 16356 208 0.000164810 18669994 +Q 15 42804 782 0.000206223 18712798 +Q 14 16026 318 0.000223025 18728824 +Q 12 170250 814 0.000264087 18899074 +Q 11 48351 1409 0.000337777 18947425 +Q 8 1843 311 0.000354156 18949268 +Q 7 62266 4435 0.000586276 19011534 +Q 6 413997 50057 0.003150647 19425531 +Q 5 404 58 0.003153568 19425935 +Q 4 704 154 0.003161381 19426639 +Q 3 1473 681 0.003196193 19428112 +Q 2 17541 16462 0.004039875 19445653 +Q 1 534344 354879 0.021693547 19979997 +Q 0 11939 9917 0.022176642 19991936 +U 8064 diff --git a/lib/minimap2/tex/bwa-s3.sam.eval b/lib/minimap2/tex/bwa-s3.sam.eval new file mode 100644 index 000000000..3aa9161b6 --- /dev/null +++ b/lib/minimap2/tex/bwa-s3.sam.eval @@ -0,0 +1,52 @@ +Q 60 18784147 3 0.000000160 18784147 +Q 52 19002 1 0.000000213 18803149 +Q 50 7152 2 0.000000319 18810301 +Q 49 6797 1 0.000000372 18817098 +Q 48 52188 2 0.000000477 18869286 +Q 47 48775 3 0.000000634 18918061 +Q 46 19447 2 0.000000739 18937508 +Q 45 25983 3 0.000000896 18963491 +Q 44 13455 1 0.000000949 18976946 +Q 43 14573 2 0.000001053 18991519 +Q 42 8697 4 0.000001263 19000216 +Q 41 8645 2 0.000001368 19008861 +Q 40 176603 75 0.000005264 19185464 +Q 38 2503 2 0.000005368 19187967 +Q 37 4117 3 0.000005523 19192084 +Q 36 2924 16 0.000006356 
19195008 +Q 35 2323 8 0.000006772 19197331 +Q 34 2344 10 0.000007292 19199675 +Q 33 4279 6 0.000007603 19203954 +Q 32 2092 4 0.000007810 19206046 +Q 31 2625 11 0.000008382 19208671 +Q 30 2828 13 0.000009057 19211499 +Q 29 1581 1 0.000009108 19213080 +Q 28 1543 6 0.000009420 19214623 +Q 27 70916 223 0.000020948 19285539 +Q 26 1288 16 0.000021777 19286827 +Q 25 25551 122 0.000028065 19312378 +Q 24 14345 84 0.000032390 19326723 +Q 23 7308 87 0.000036878 19334031 +Q 22 8358 125 0.000043325 19342389 +Q 21 4836 71 0.000046983 19347225 +Q 20 5888 123 0.000053325 19353113 +Q 19 4656 83 0.000057600 19357769 +Q 18 3948 87 0.000062081 19361717 +Q 17 4418 114 0.000067954 19366135 +Q 16 4226 131 0.000074702 19370361 +Q 15 5760 164 0.000083144 19376121 +Q 14 4697 257 0.000096384 19380818 +Q 13 5246 313 0.000112503 19386064 +Q 12 4170 241 0.000124908 19390234 +Q 11 4095 304 0.000140557 19394329 +Q 10 3857 360 0.000159087 19398186 +Q 9 5300 438 0.000181617 19403486 +Q 8 4206 572 0.000211050 19407692 +Q 7 4676 787 0.000251541 19412368 +Q 6 3923 688 0.000286924 19416291 +Q 5 3294 708 0.000323333 19419585 +Q 4 2936 693 0.000358965 19422521 +Q 3 3928 816 0.000400897 19426449 +Q 2 2613 810 0.000442533 19429062 +Q 1 3515 1188 0.000503587 19432577 +Q 0 567423 376636 0.019321100 20000000 diff --git a/lib/minimap2/tex/bwa.eval b/lib/minimap2/tex/bwa.eval new file mode 100644 index 000000000..d61596f07 --- /dev/null +++ b/lib/minimap2/tex/bwa.eval @@ -0,0 +1,55 @@ +Q 60 31721 27 0.000851171 +Q 59 54 4 0.000975610 +Q 58 29 5 0.001131933 +Q 57 21 2 0.001194030 +Q 56 14 4 0.001319137 +Q 55 22 6 0.001506544 +Q 54 12 4 0.001631475 +Q 53 16 3 0.001724733 +Q 51 10 1 0.001755541 +Q 50 10 1 0.001786330 +Q 49 11 3 0.001879699 +Q 47 8 2 0.001941869 +Q 46 17 1 0.001972140 +Q 44 8 3 0.002065534 +Q 43 10 1 0.002096174 +Q 42 13 1 0.002126595 +Q 41 14 3 0.002219444 +Q 40 13 2 0.002281036 +Q 38 17 4 0.002404747 +Q 37 15 4 0.002528484 +Q 36 12 1 0.002558742 +Q 35 19 3 0.002650783 +Q 34 12 3 0.002743313 +Q 33 
7 1 0.002773882 +Q 32 21 3 0.002865508 +Q 31 11 2 0.002926799 +Q 30 14 3 0.003018891 +Q 29 17 1 0.003048401 +Q 28 11 2 0.003109549 +Q 27 20 5 0.003262998 +Q 26 11 1 0.003292948 +Q 25 14 4 0.003415725 +Q 24 16 5 0.003569212 +Q 23 43 6 0.003750426 +Q 21 15 1 0.003779664 +Q 20 29 7 0.003992943 +Q 19 22 2 0.004052089 +Q 18 28 4 0.004172204 +Q 16 25 5 0.004323390 +Q 15 24 5 0.004474480 +Q 14 25 5 0.004625204 +Q 13 23 3 0.004714365 +Q 12 22 1 0.004741963 +Q 11 32 11 0.005075674 +Q 10 35 7 0.005285315 +Q 9 32 12 0.005648503 +Q 8 33 8 0.005888126 +Q 7 39 7 0.006095506 +Q 6 42 14 0.006515953 +Q 5 38 15 0.006966725 +Q 4 37 12 0.007325113 +Q 3 49 18 0.007862737 +Q 2 63 21 0.008486434 +Q 1 55 27 0.009292156 +Q 0 153 77 0.011576593 diff --git a/lib/minimap2/tex/eval2roc.pl b/lib/minimap2/tex/eval2roc.pl new file mode 100755 index 000000000..9c32e38d9 --- /dev/null +++ b/lib/minimap2/tex/eval2roc.pl @@ -0,0 +1,33 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use Getopt::Std; + +my %opts = (n=>33088, s=>100); # defaults: -n divisor of the last output column, -s minimum growth of the running total between rows +getopts('n:s:', \%opts); # accept -s too, so the step default above can actually be overridden + +my $pseudo = .5; # pseudo-count seeding both running totals, so $err/$tot is always defined +my $tot = $pseudo; +my $err = $pseudo; +my $tot_last_out = -$opts{s}; +my $state = 0; # 1 when the latest matching row was accumulated but not yet printed +my $mapq = 0; +while (<>) { + chomp; + if (/^Q\t(\d+)\t(\d+)\t(\d+)/) { + $tot += $2; + $err += $3; + if ($tot - $tot_last_out >= $opts{s}) { + print join("\t", $1, $err/$tot, $tot / $opts{n}), "\n"; + $tot_last_out = $tot; + $state = 0; + } else { + $state = 1; + $mapq = $1; + } + } +} +if ($state) { # flush the final accumulated row that the step threshold suppressed + print join("\t", $mapq, $err/$tot, $tot / $opts{n}), "\n"; +} diff --git a/lib/minimap2/tex/graphmap.eval b/lib/minimap2/tex/graphmap.eval new file mode 100644 index 000000000..899428120 --- /dev/null +++ b/lib/minimap2/tex/graphmap.eval @@ -0,0 +1,4 @@ +Q 40 31897 63 0.001975107 +Q 3 423 267 0.010210396 +Q 2 162 120 0.013853827 +Q 1 188 172 0.019038874 diff --git a/lib/minimap2/tex/hs38-simu.sh b/lib/minimap2/tex/hs38-simu.sh new file mode 100644 index 000000000..0fe08aaf6 --- /dev/null +++ b/lib/minimap2/tex/hs38-simu.sh @@ -0,0 +1,10 @@
+./pbsim --prefix pb-1 --depth 0.1 --sample-fastq m131017_060208_42213_c100579642550000001823095604021496_s1_p0.1.subreads.fastq --length-min 1000 --length-max 30000 --seed 11 hs38.fa + +bin/mason_variator -ir hs38.fa -s 1 -ov hs38-s1.vcf --snp-rate 1e-3 --small-indel-rate 2e-4 --sv-indel-rate 0 --sv-inversion-rate 0 --sv-translocation-rate 0 --sv-duplication-rate 0 --max-small-indel-size 10 +bin/mason_simulator -ir hs38.fa -iv hs38-s1.vcf -n 1000000 --seed 1 -o s1_1.fq -or s1_2.fq -oa s1.sam --illumina-prob-mismatch-scale 2.5 + +bin/mason_variator -ir hs38.fa -s 2 -ov hs38-s2.vcf --snp-rate 1e-3 --small-indel-rate 2e-4 --sv-indel-rate 0 --sv-inversion-rate 0 --sv-translocation-rate 0 --sv-duplication-rate 0 --max-small-indel-size 10 +bin/mason_simulator -ir hs38.fa -iv hs38-s2.vcf -n 1000000 --seed 2 -o mason-s2_1.fq -or mason-s2_2.fq -oa mason-s2.sam --illumina-prob-mismatch-scale 2.5 --illumina-read-length 150 + +bin/mason_variator -ir hs38.fa -s 3 -ov hs38-s3.vcf --snp-rate 1e-3 --small-indel-rate 2e-4 --sv-indel-rate 0 --sv-inversion-rate 0 --sv-translocation-rate 0 --sv-duplication-rate 0 --max-small-indel-size 10 +bin/mason_simulator -ir hs38.fa -iv hs38-s3.vcf -n 10000000 --seed 3 -o mason-s3_1.fq -or mason-s3_2.fq -oa mason-s3.sam --illumina-prob-mismatch-scale 2.5 --illumina-read-length 150 diff --git a/lib/minimap2/tex/minialign.eval b/lib/minimap2/tex/minialign.eval new file mode 100644 index 000000000..246ddb6f3 --- /dev/null +++ b/lib/minimap2/tex/minialign.eval @@ -0,0 +1,49 @@ +Q 60 32070 190 0.005924540 +Q 59 62 2 0.005975352 +Q 58 37 5 0.006123908 +Q 57 40 7 0.006333633 +Q 56 39 6 0.006512032 +Q 55 32 2 0.006567534 +Q 54 54 2 0.006618420 +Q 53 33 4 0.006735255 +Q 52 39 2 0.006788866 +Q 51 48 3 0.006871264 +Q 50 34 2 0.006925634 +Q 49 32 3 0.007011070 +Q 48 35 2 0.007064967 +Q 47 36 4 0.007179896 +Q 46 23 1 0.007205495 +Q 45 25 1 0.007230614 +Q 44 17 3 0.007318716 +Q 43 17 2 0.007376121 +Q 42 31 5 0.007522016 +Q 41 25 4 0.007638486 +Q 40 26 4 
0.007754541 +Q 39 35 2 0.007807258 +Q 37 18 4 0.007924896 +Q 36 13 3 0.008013162 +Q 35 15 2 0.008070411 +Q 34 20 3 0.008156805 +Q 33 11 1 0.008184501 +Q 32 15 3 0.008272003 +Q 31 25 1 0.008296107 +Q 29 8 1 0.008324472 +Q 28 7 2 0.008383452 +Q 27 9 2 0.008441894 +Q 26 30 2 0.008494888 +Q 23 2 1 0.008524710 +Q 22 11 3 0.008612846 +Q 20 23 3 0.008697760 +Q 19 6 1 0.008726479 +Q 18 8 1 0.008754658 +Q 16 6 1 0.008783354 +Q 13 2 1 0.008813108 +Q 12 4 2 0.008872604 +Q 11 7 2 0.008931275 +Q 10 4 3 0.009021009 +Q 9 6 4 0.009140436 +Q 8 6 3 0.009229559 +Q 7 5 1 0.009258419 +Q 6 8 3 0.009346925 +Q 4 8 5 0.009495872 +Q 3 17 8 0.009732801 diff --git a/lib/minimap2/tex/minimap2.bib b/lib/minimap2/tex/minimap2.bib new file mode 100644 index 000000000..99f81bc22 --- /dev/null +++ b/lib/minimap2/tex/minimap2.bib @@ -0,0 +1,460 @@ +@article{Chaisson:2012aa, + Author = {Chaisson, Mark J and Tesler, Glenn}, + Journal = {BMC Bioinformatics}, + Pages = {238}, + Title = {{Mapping single molecule sequencing reads using basic local alignment with successive refinement (BLASR): application and theory}}, + Volume = {13}, + Year = {2012}} + +@article{Liu:2016ab, + Author = {Liu, Bo and others}, + Journal = {Bioinformatics}, + Pages = {1625-31}, + Title = {{rHAT}: fast alignment of noisy long reads with regional hashing}, + Volume = {32}, + Year = {2016}} + +@article{Liu:2017aa, + Author = {Liu, Bo and others}, + Journal = {Bioinformatics}, + Pages = {192-201}, + Title = {{LAMSA}: fast split read alignment with long approximate matches}, + Volume = {33}, + Year = {2017}} + +@article{Lin:2017aa, + Author = {Lin, Hsin-Nan and Hsu, Wen-Lian}, + Journal = {Bioinformatics}, + Title = {Kart: a divide-and-conquer algorithm for {NGS} read alignment}, + Year = {2017}} + +@article{Li:2013aa, + Author = {Li, Heng}, + Journal = {arXiv:1303.3997}, + Title = {Aligning sequence reads, clone sequences and assembly contigs with {BWA-MEM}}, + archivePrefix = "arXiv", + eprint = {1303.3997}, + primaryClass = 
"q-bio", + Year = {2013}} + +@article{Sovic:2016aa, + Author = {Sovi{\'c}, Ivan and others}, + Journal = {Nat Commun}, + Pages = {11307}, + Title = {Fast and sensitive mapping of nanopore sequencing reads with {GraphMap}}, + Volume = {7}, + Year = {2016}} + +@article{Langmead:2012fk, + Author = {Langmead, Ben and Salzberg, Steven L}, + Journal = {Nat Methods}, + Pages = {357-9}, + Title = {Fast gapped-read alignment with {Bowtie} 2}, + Volume = {9}, + Year = {2012}} + +@article{Li:2016aa, + Author = {Li, Heng}, + Journal = {Bioinformatics}, + Pages = {2103-10}, + Title = {Minimap and miniasm: fast mapping and de novo assembly for noisy long sequences}, + Volume = {32}, + Year = {2016}} + +@misc{Ruan:2016, + title = {Ultra-fast de novo assembler using long noisy reads}, + author = {Jue Ruan}, + journal = {Unpulished}, + howpublished = {\href{https://github.com/ruanjue/smartdenovo}{https://github.com/ruanjue/smartdenovo}}, + year = {2016}} + +@article{Miller:1988aa, + Author = {Miller, W and Myers, E W}, + Journal = {Bull Math Biol}, + Number = {2}, + Pages = {97-120}, + Title = {Sequence comparison with concave weighting functions}, + Volume = {50}, + Year = {1988}} + +@article{Gotoh:1990aa, + Author = {Gotoh, O}, + Journal = {Bull Math Biol}, + Pages = {359-73}, + Title = {Optimal sequence alignment allowing for long gaps}, + Volume = {52}, + Year = {1990}} + +@article{Wu:1996aa, + Author = {Wu, Sun and others}, + Journal = {Algorithmica}, + Pages = {50-67}, + Title = {A subquadratic algorithm for approximate limited expression matching}, + Volume = {15}, + Year = {1996}} + +@article{Daily:2016aa, + Author = {Daily, Jeff}, + Journal = {BMC Bioinformatics}, + Month = {Feb}, + Pages = {81}, + Title = {Parasail: {SIMD C} library for global, semi-global, and local pairwise sequence alignments}, + Volume = {17}, + Year = {2016}} + +@article{Sedlazeck169557, + author = {Sedlazeck, Fritz J and others}, + title = {Accurate detection of complex structural variations using 
single molecule sequencing}, + note = {doi:10.1101/169557}, + journal = {bioRxiv}, + year = {2017}} + +@article{Altschul:1997vn, + Author = {Altschul, S F and others}, + Journal = {Nucleic Acids Res}, + Pages = {3389-402}, + Title = {Gapped {BLAST} and {PSI-BLAST}: a new generation of protein database search programs}, + Volume = {25}, + Year = {1997}} + +@article{Sosic:2017aa, + Author = {{\v S}o{\v s}i\'{c}, Martin and {\v S}ikic, Mile}, + Journal = {Bioinformatics}, + Pages = {1394-1395}, + Title = {Edlib: a {C/C++} library for fast, exact sequence alignment using edit distance}, + Volume = {33}, + Year = {2017}} + +@article{Abouelhoda:2005aa, + Author = {Mohamed Ibrahim Abouelhoda and Enno Ohlebusch}, + Journal = {J. Discrete Algorithms}, + Pages = {321-41}, + Title = {Chaining algorithms for multiple genome comparison}, + Volume = {3}, + Year = {2005}} + +@article{Ono:2013aa, + Author = {Ono, Yukiteru and others}, + Journal = {Bioinformatics}, + Pages = {119-21}, + Title = {{PBSIM}: {PacBio} reads simulator--toward accurate genome assembly}, + Volume = {29}, + Year = {2013}} + +@article {Jain128835, + author = {Jain, Miten and others}, + title = {Nanopore sequencing and assembly of a human genome with ultra-long reads}, + year = {2017}, + note = {doi:10.1101/128835}, + publisher = {Cold Spring Harbor Labs Journals}, + journal = {bioRxiv}} + +@article{Lau:2016aa, + Author = {Lau, Bayo and others}, + Journal = {Bioinformatics}, + Pages = {3829-3832}, + Title = {{LongISLND}: in silico sequencing of lengthy and noisy datatypes}, + Volume = {32}, + Year = {2016}} + +@article{Robinson:2011aa, + Author = {Robinson, James T and others}, + Journal = {Nat Biotechnol}, + Pages = {24-6}, + Title = {Integrative genomics viewer}, + Volume = {29}, + Year = {2011}} + +@article{Gotoh:1982aa, + Author = {Gotoh, O}, + Journal = {J Mol Biol}, + Pages = {705-8}, + Title = {An improved algorithm for matching biological sequences}, + Volume = {162}, + Year = {1982}} + 
+@article{Altschul:1986aa, + Author = {Altschul, S F and Erickson, B W}, + Journal = {Bull Math Biol}, + Pages = {603-16}, + Title = {Optimal sequence alignment using affine gap costs}, + Volume = {48}, + Year = {1986}} + +@article{Wu:2005vn, + Author = {Wu, Thomas D and Watanabe, Colin K}, + Journal = {Bioinformatics}, + Pages = {1859-75}, + Title = {{GMAP}: a genomic mapping and alignment program for {mRNA} and {EST} sequences}, + Volume = {21}, + Year = {2005}} + +@article{Iwata:2012aa, + Author = {Iwata, Hiroaki and Gotoh, Osamu}, + Journal = {Nucleic Acids Res}, + Pages = {e161}, + Title = {Benchmarking spliced alignment programs including {Spaln2}, an extended version of {Spaln} that incorporates additional species-specific features}, + Volume = {40}, + Year = {2012}} + +@article{Dobin:2013kx, + Author = {Dobin, Alexander and others}, + Journal = {Bioinformatics}, + Pages = {15-21}, + Title = {{STAR}: ultrafast universal {RNA-seq} aligner}, + Volume = {29}, + Year = {2013}} + +@article{Byrne:2017aa, + Author = {Byrne, Ashley and others}, + Journal = {Nat Commun}, + Pages = {16027}, + Title = {Nanopore long-read {RNAseq} reveals widespread transcriptional variation among the surface receptors of individual {B} cells}, + Volume = {8}, + Year = {2017}} + +@article{Roberts:2004fv, + Author = {Roberts, Michael and others}, + Journal = {Bioinformatics}, + Pages = {3363-9}, + Title = {Reducing storage requirements for biological sequence comparison}, + Volume = {20}, + Year = {2004}} + +@article{Zhang:2006aa, + Author = {Zhang, Miao and Gish, Warren}, + Journal = {Bioinformatics}, + Pages = {13-20}, + Title = {Improved spliced alignment from an information theoretic approach}, + Volume = {22}, + Year = {2006}} + +@article{Li:2007aa, + Author = {Li, Heng and others}, + Journal = {BMC Bioinformatics}, + Pages = {349}, + Title = {A cross-species alignment tool {(CAT)}}, + Volume = {8}, + Year = {2007}} + +@article{Farrar:2007hs, + Author = {Farrar, Michael}, + Journal 
= {Bioinformatics}, + Pages = {156-61}, + Title = {{Striped Smith-Waterman speeds database searches six times over other SIMD implementations}}, + Volume = {23}, + Year = {2007}} + +@techreport{Holtgrewe:2010aa, + Address = {Freie Universit{\"a}t Berlin}, + Author = {Holtgrewe, M.}, + Institution = {Institut f{\"u}r Mathematik und Informatik}, + Number = {TR-B-10-06}, + Title = {Mason -- a read simulator for second generation sequencing data}, + Year = {2010}} + +@article{Zaharia:2011aa, + Author = {Zaharia, Matei and others}, + Journal = {arXiv:1111.5572}, + Title = {Faster and More Accurate Sequence Alignment with {SNAP}}, + Year = {2011}} + +@article{Irimia:2008aa, + Author = {Irimia, Manuel and Roy, Scott William}, + Journal = {PLoS Genet}, + Pages = {e1000148}, + Title = {Evolutionary convergence on highly-conserved 3' intron structures in intron-poor eukaryotes and insights into the ancestral eukaryotic genome}, + Volume = {4}, + Year = {2008}} + +@article{Depristo:2011vn, + Author = {Depristo, Mark A and others}, + Journal = {Nat Genet}, + Pages = {491-8}, + Title = {A framework for variation discovery and genotyping using next-generation {DNA} sequencing data}, + Volume = {43}, + Year = {2011}} + +@article{Kurtz:2004zr, + Author = {Kurtz, Stefan and others}, + Journal = {Genome Biol}, + Pages = {R12}, + Title = {Versatile and open software for comparing large genomes}, + Volume = {5}, + Year = {2004}} + +@article {Li223297, + author = {Li, Heng and others}, + title = {New synthetic-diploid benchmark for accurate variant calling evaluation}, + year = {2017}, + note = {doi:10.1101/223297}, + journal = {bioRxiv} +} + +@article{Berlin:2015xy, + Author = {Berlin, Konstantin and others}, + Journal = {Nat Biotechnol}, + Pages = {623-30}, + Title = {Assembling large genomes with single-molecule sequencing and locality-sensitive hashing}, + Volume = {33}, + Year = {2015}} + +@article{Gurevich:2013aa, + Author = {Gurevich, Alexey and others}, + Journal = 
{Bioinformatics}, + Pages = {1072-5}, + Title = {{QUAST}: quality assessment tool for genome assemblies}, + Volume = {29}, + Year = {2013}} + +@article{Li:2010fk, + Author = {Li, Heng and Durbin, Richard}, + Journal = {Bioinformatics}, + Pages = {589-95}, + Title = {Fast and accurate long-read alignment with {Burrows-Wheeler} transform}, + Volume = {26}, + Year = {2010}} + +@article{Marcais:2018aa, + Author = {Mar{\c c}ais, Guillaume and others}, + Journal = {PLoS Comput Biol}, + Pages = {e1005944}, + Title = {{MUMmer4}: A fast and versatile genome alignment system}, + Volume = {14}, + Year = {2018}} + +@article{Li:2009ys, + Author = {Li, Heng and others}, + Journal = {Bioinformatics}, + Pages = {2078-9}, + Title = {The {Sequence Alignment/Map format and SAMtools}}, + Volume = {25}, + Year = {2009}} + +@article{Suzuki:2018aa, + Author = {Suzuki, Hajime and Kasahara, Masahiro}, + Journal = {BMC Bioinformatics}, + Pages = {45}, + Title = {Introducing difference recurrence relations for faster semi-global alignment of long sequences}, + Volume = {19}, + Year = {2018}} + +@article{Li:2018ab, + Author = {Li, Heng}, + Journal = {Bioinformatics}, + Pages = {3094-3100}, + Title = {Minimap2: pairwise alignment for nucleotide sequences}, + Volume = {34}, + Year = {2018}} + +@article{Jain:2020aa, + Author = {Jain, Chirag and others}, + Journal = {Bioinformatics}, + Pages = {i111-i118}, + Title = {Weighted minimizer sampling improves long read mapping}, + Volume = {36}, + Year = {2020}} + +@article{Miga:2020aa, + Author = {Miga, Karen H and others}, + Journal = {Nature}, + Pages = {79-84}, + Title = {Telomere-to-telomere assembly of a complete human {X} chromosome}, + Volume = {585}, + Year = {2020}} + +@article {Jain2020.11.01.363887, + author = {Jain, Chirag and others}, + title = {A long read mapping method for highly repetitive reference sequences}, + elocation-id = {2020.11.01.363887}, + year = {2020}, + doi = {10.1101/2020.11.01.363887}, + publisher = {Cold Spring Harbor 
Laboratory}, + URL = {https://www.biorxiv.org/content/early/2020/11/02/2020.11.01.363887}, + eprint = {https://www.biorxiv.org/content/early/2020/11/02/2020.11.01.363887.full.pdf}, + journal = {bioRxiv} +} + +@article{Li:2020aa, + Author = {Li, Heng and others}, + Journal = {Genome Biol}, + Pages = {265}, + Title = {The design and construction of reference pangenome graphs with minigraph}, + Volume = {21}, + Year = {2020}} + +@article{Ren:2021aa, + Author = {Ren, Jingwen and Chaisson, Mark J P}, + Journal = {PLoS Comput Biol}, + Pages = {e1009078}, + Title = {lra: A long read aligner for sequences and contigs}, + Volume = {17}, + Year = {2021}} + +@inproceedings{DBLP:conf/wabi/AbouelhodaO03, + Author = {Mohamed Ibrahim Abouelhoda and Enno Ohlebusch}, + Booktitle = {Algorithms in Bioinformatics, Third International Workshop, {WABI} 2003, Budapest, Hungary, September 15-20, 2003, Proceedings}, + Crossref = {DBLP:conf/wabi/2003}, + Pages = {1--16}, + Title = {A Local Chaining Algorithm and Its Applications in Comparative Genomics}, + Year = {2003}} + +@article{Ono:2021aa, + Author = {Ono, Yukiteru and others}, + Journal = {Bioinformatics}, + Pages = {589-595}, + Title = {{PBSIM2}: a simulator for long-read sequencers with a novel generative model of quality scores}, + Volume = {37}, + Year = {2021}} + +@article{Sedlazeck:2018ab, + Author = {Sedlazeck, Fritz J and others}, + Journal = {Nat Methods}, + Pages = {461-468}, + Title = {Accurate detection of complex structural variations using single-molecule sequencing}, + Volume = {15}, + Year = {2018}} + +@article{Jeffares:2017aa, + Author = {Jeffares, Daniel C and others}, + Journal = {Nat Commun}, + Pages = {14061}, + Title = {Transient structural variations have strong effects on quantitative traits and reproductive isolation in fission yeast}, + Volume = {8}, + Year = {2017}} + +@article{Zook:2020aa, + Author = {Zook, Justin M and others}, + Journal = {Nat Biotechnol}, + Pages = {1347-1355}, + Title = {A robust 
benchmark for detection of germline large deletions and insertions}, + Volume = {38}, + Year = {2020}} + +@article{Harpak:2017aa, + Author = {Harpak, Arbel and others}, + Journal = {Proc Natl Acad Sci U S A}, + Pages = {12779-12784}, + Title = {Frequent nonallelic gene conversion on the human lineage and its effect on the divergence of gene duplicates}, + Volume = {114}, + Year = {2017}} + +@article{Li:2018aa, + Author = {Li, Heng and others}, + Journal = {Nat Methods}, + Month = {Aug}, + Number = {8}, + Pages = {595-597}, + Title = {A synthetic-diploid benchmark for accurate variant-calling evaluation}, + Volume = {15}, + Year = {2018}} + +@article{Gu:1995wt, + author = {Gu, X and Li, W H}, + journal = {J Mol Evol}, + month = {Apr}, + number = {4}, + pages = {464-73}, + title = {The size distribution of insertions and deletions in human and rodent pseudogenes suggests the logarithmic gap penalty for sequence alignment}, + volume = {40}, + year = {1995}} diff --git a/lib/minimap2/tex/minimap2.tex b/lib/minimap2/tex/minimap2.tex new file mode 100644 index 000000000..77b6dbca5 --- /dev/null +++ b/lib/minimap2/tex/minimap2.tex @@ -0,0 +1,724 @@ +\documentclass{bioinfo} +\copyrightyear{2018} +\pubyear{2018} + +\usepackage{graphicx} +\usepackage{hyperref} +\usepackage{url} +\usepackage{amsmath} +\usepackage[ruled,vlined]{algorithm2e} +\newcommand\mycommfont[1]{\footnotesize\rmfamily{\it #1}} +\SetCommentSty{mycommfont} +\SetKwComment{Comment}{$\triangleright$\ }{} + +\usepackage{natbib} +\bibliographystyle{apalike} + +\DeclareMathOperator*{\argmax}{argmax} + +\begin{document} +\firstpage{1} + +\title[Aligning nucleotide sequences with minimap2]{Minimap2: pairwise alignment for nucleotide sequences} +\author[Li]{Heng Li} +\address{Broad Institute, 415 Main Street, Cambridge, MA 02142, USA} + +\maketitle + +\begin{abstract} + +\section{Motivation:} Recent advances in sequencing technologies promise +ultra-long reads of $\sim$100 kilo bases (kb) on average, full-length 
mRNA or +cDNA reads in high throughput and genomic contigs over 100 mega bases (Mb) in +length. Existing alignment programs are unable or inefficient to process such data +at scale, which presses for the development of new alignment algorithms. + +\section{Results:} Minimap2 is a general-purpose alignment program to map DNA or long +mRNA sequences against a large reference database. It works with accurate short +reads of $\ge$100bp in length, $\ge$1kb genomic reads at error rate $\sim$15\%, +full-length noisy Direct RNA or cDNA reads, and assembly contigs or closely +related full chromosomes of hundreds of megabases in length. Minimap2 does +split-read alignment, employs concave gap cost for long insertions and +deletions (INDELs) and introduces new heuristics to reduce spurious alignments. +It is 3--4 times as fast as mainstream short-read mappers at comparable +accuracy, and is $\ge$30 times faster than long-read genomic or cDNA +mappers at higher accuracy, surpassing most aligners specialized in one type of +alignment. + +\section{Availability and implementation:} +\href{https://github.com/lh3/minimap2}{https://github.com/lh3/minimap2} + +\section{Contact:} hengli@broadinstitute.org +\end{abstract} + +\section{Introduction} + +Single Molecule Real-Time (SMRT) sequencing technology and Oxford Nanopore +technologies (ONT) produce reads over 10kbp in length at an error rate +$\sim$15\%. Several aligners have been developed for such +data~\citep{Chaisson:2012aa,Li:2013aa,Liu:2016ab,Sovic:2016aa,Liu:2017aa,Lin:2017aa,Sedlazeck169557}. +Most of them were five times as slow as mainstream short-read +aligners~\citep{Langmead:2012fk,Li:2013aa} in terms of the number of bases +mapped per second. We speculated there could be substantial room for speedup on +the thought that 10kb long sequences should be easier to map than 100bp reads +because we can more effectively skip repetitive regions, which are often the +bottleneck of short-read alignment. 
We confirmed our speculation by achieving +approximate mapping 50 times faster than BWA-MEM~\citep{Li:2016aa}. +\citet{Suzuki:2018aa} extended our work with a fast and novel algorithm on +generating base-level alignment, which in turn inspired us to develop minimap2 +with added functionality. + +Both SMRT and ONT have been applied to the sequencing of spliced mRNAs (RNA-seq). While +traditional mRNA aligners work~\citep{Wu:2005vn,Iwata:2012aa}, they are not +optimized for long noisy sequence reads and are tens of times slower than +dedicated long-read aligners. When developing minimap2 initially for aligning +genomic DNA only, we realized minor modifications could enable the base +algorithm to map mRNAs as well. Minimap2 becomes a first RNA-seq aligner +specifically designed for long noisy reads. We have also extended the original +algorithm to map short reads at a speed faster than several mainstream +short-read mappers. + +In this article, we will describe the minimap2 algorithm and its applications +to different types of input sequences. We will evaluate the performance and +accuracy of minimap2 on several simulated and real data sets and demonstrate +the versatility of minimap2. + +\begin{methods} +\section{Methods} + +Minimap2 follows a typical seed-chain-align procedure as is used by most +full-genome aligners. It collects minimizers~\citep{Roberts:2004fv} of the +reference sequences and indexes them in a hash table, with the key being the +hash of a minimizer and the value being a list of locations of the minimizer +copies. Then for each query +sequence, minimap2 takes query minimizers as \emph{seeds}, finds exact matches +(i.e. \emph{anchors}) to the reference, and identifies sets of colinear anchors as +\emph{chains}. If base-level alignment is requested, minimap2 applies dynamic +programming (DP) to extend from the ends of chains and to close +regions between adjacent anchors in chains. 
+ +Minimap2 uses indexing and seeding algorithms similar to +minimap~\citep{Li:2016aa}, and furthers the predecessor with more accurate +chaining, the ability to produce base-level alignment and the support of +spliced alignment. + +\subsection{Chaining} + +\subsubsection{Chaining} +An \emph{anchor} is a 3-tuple $(x,y,w)$, indicating interval $[x-w+1,x]$ on the +reference matching interval $[y-w+1,y]$ on the query. Given a list of anchors +sorted by ending reference position $x$, let $f(i)$ be the maximal chaining +score up to the $i$-th anchor in the list. $f(i)$ can be calculated with +dynamic programming: +\begin{equation}\label{eq:chain} +f(i)=\max\big\{\max_{i>j\ge 1} \{ f(j)+\alpha(j,i)-\beta(j,i) \},w_i\big\} +\end{equation} +where $\alpha(j,i)=\min\big\{\min\{y_i-y_j,x_i-x_j\},w_i\big\}$ is the number of +matching bases between the two anchors. $\beta(j,i)>0$ is the gap cost. It +equals $\infty$ if $y_j\ge y_i$ or $\max\{y_i-y_j,x_i-x_j\}>G$ (i.e. the +distance between two anchors is too large); otherwise +\begin{equation}\label{eq:chain-gap} +\beta(j,i)=\gamma_c\big((y_i-y_j)-(x_i-x_j)\big) +\end{equation} +In implementation, a gap of length $l$ costs +\[ +\gamma_c(l)=\left\{\begin{array}{ll} +0.01\cdot \bar{w}\cdot|l|+0.5\log_2|l| & (l\not=0) \\ +0 & (l=0) +\end{array}\right. +\] +where $\bar{w}$ is the average seed length. For $N$ anchors, directly computing all $f(\cdot)$ with +Eq.~(\ref{eq:chain}) takes $O(N^2)$ time. Although theoretically faster +chaining algorithms exist~\citep{Abouelhoda:2005aa}, they +are inapplicable to generic gap cost, complex to implement and usually +associated with a large constant. We introduced a simple heuristic to +accelerate chaining. + +We note that if anchor $i$ is chained to $j$, chaining $i$ to a predecessor +of $j$ is likely to yield a lower score. When evaluating Eq.~(\ref{eq:chain}), +we start from anchor $i-1$ and stop the process if we cannot find a better +score after up to $h$ iterations. 
This approach reduces the average time to +$O(hN)$. In practice, we can almost always find the optimal chain with +$h=50$; even if the heuristic fails, the optimal chain is often close. + +\subsubsection{Backtracking} +Let $P(i)$ be the index of the best predecessor of anchor $i$. It equals 0 if +$f(i)=w_i$ or $\argmax_j\{f(j)+\alpha(j,i)-\beta(j,i)\}$ otherwise. For each +anchor $i$ in the descending order of $f(i)$, we apply $P(\cdot)$ repeatedly to +find its predecessor and mark each visited $i$ as `used', until $P(i)=0$ or we +reach an already `used' $i$. This way we find all chains with no anchors used +in more than one chain. + +\subsubsection{Identifying primary chains}\label{sec:primary} +In the absence of copy number changes, each query segment should not be mapped +to two places in the reference. However, chains found at the previous step may +have significant or complete overlaps due to repeats in the reference~\citep{Li:2010fk}. +Minimap2 uses the following procedure to identify \emph{primary chains} that do +not greatly overlap on the query. + +Let $Q$ be an empty set initially. For each +chain from the best to the worst according to their chaining scores: if on the +query, the chain overlaps with a chain in $Q$ by 50\% or higher percentage of +the shorter chain, mark the chain as secondary to the chain in $Q$; otherwise, +add the chain to $Q$. In the end, $Q$ contains all the primary chains. We did +not choose a more sophisticated data structure (e.g. range tree or k-d tree) +because this step is not the performance bottleneck. + +For each primary chain, minimap2 estimates its mapping quality with an +empirical formula: +\[ +{\rm mapQ}=40\cdot (1-f_2/f_1)\cdot\min\{1,m/10\}\cdot\log f_1 +\] +where $\log$ denotes natural logarithm, $m$ is the number of anchors on the primary chain, $f_1$ is the chaining +score, and $f_2\le f_1$ is the score of the best chain that is secondary to the +primary chain. 
Intuitively, a chain is assigned to a higher mapping quality if +it is long and its best secondary chain is weak. + +\subsubsection{Estimating per-base sequence divergence} +Suppose a query sequence harbors $n$ seeds of length $k$, $m$ of which are +present in a chain. We want to estimate the sequence divergence $\epsilon$ +between the query and the reference sequences in the chain. This is useful +when base-level alignment is too expensive to perform. + +If we model substitutions with a homogeneous Poisson process along the query +sequence, the probability of seeing $k$ consecutive bases without substitutions +is $e^{-k\epsilon}$. On the assumption that all $k$-mers are independent of +each other, the likelihood function of $\epsilon$ is +\[ +\mathcal{L}(\epsilon|n,m,k)=e^{-m\cdot k\epsilon}(1-e^{-k\epsilon})^{n-m} +\] +The maximum likelihood estimate of $\epsilon$ is +\[ +\hat{\epsilon}=\frac{1}{k}\log\frac{n}{m} +\] +In reality, sequencing errors are sometimes clustered and $k$-mers are not +independent of each other, especially when we take minimizers as seeds. These +violate the assumptions in the derivation above. As a result, $\hat{\epsilon}$ +is only approximate and can be biased. It also ignores long deletions from the +reference sequence. In practice, fortunately, $\hat{\epsilon}$ is often close +to and strongly correlated with the sequence divergence estimated from +base-level alignments. On the several datasets used in +Section~\ref{sec:long-genomic}, the Spearman correlation coefficient is around +$0.9$. + +\subsubsection{Indexing with homopolymer compressed $k$-mers} +SmartDenovo +(\href{https://github.com/ruanjue/smartdenovo}{https://github.com/ruanjue/smartdenovo}; +J. Ruan, personal communication) indexes reads with homopolymer-compressed (HPC) +$k$-mers and finds the strategy improves overlap sensitivity for SMRT reads. +Minimap2 adopts the same heuristic. 
+ +The HPC string of a string $s$, denoted by ${\rm HPC}(s)$, is constructed by +contracting homopolymers in $s$ to a single base. An HPC $k$-mer of $s$ is a +$k$-long substring of ${\rm HPC}(s)$. For example, suppose $s={\tt GGATTTTCCA}$, +${\rm HPC}(s)={\tt GATCA}$ and the first HPC 4-mer is ${\tt GATC}$. + +To demonstrate the effectiveness of HPC $k$-mers, we performed read overlapping +for the example {\it E. coli} SMRT reads from PBcR~\citep{Berlin:2015xy}, using +different types of $k$-mers. With normal 15bp minimizers per 5bp window, +minimap2 finds 90.9\% of $\ge$2kb overlaps inferred from the read-to-reference +alignment. With HPC 19-mers per 5bp window, minimap2 finds 97.4\% of overlaps. It achieves this +higher sensitivity by indexing 1/3 fewer minimizers, which further helps +performance. HPC-based indexing reduces the sensitivity for current ONT reads, though. + +\subsection{Aligning genomic DNA}\label{sec:genomic} + +\subsubsection{Alignment with 2-piece affine gap cost} + +Minimap2 performs DP-based global alignment between adjacent anchors in a +chain. It uses a 2-piece affine gap cost~\citep{Gotoh:1990aa}: +\begin{equation}\label{eq:2-piece} +\gamma_a(l)=\min\{q+|l|\cdot e,\tilde{q}+|l|\cdot\tilde{e}\} +\end{equation} +Without losing generality, we always assume $q+e<\tilde{q}+\tilde{e}$. +On the condition that $e>\tilde{e}$, it applies cost $q+|l|\cdot e$ to gaps +shorter than $\lceil(\tilde{q}-q)/(e-\tilde{e})\rceil$ and applies +$\tilde{q}+|l|\cdot\tilde{e}$ to longer gaps. This scheme helps to recover +longer insertions and deletions~(INDELs). 
+ +The equation to compute the optimal alignment under $\gamma_a(\cdot)$ is +\begin{equation}\label{eq:ae86} +\left\{\begin{array}{l} +H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij},\tilde{F}_{ij}\}\\ +E_{i+1,j}= \max\{H_{ij}-q,E_{ij}\}-e\\ +F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\ +\tilde{E}_{i+1,j}= \max\{H_{ij}-\tilde{q},\tilde{E}_{ij}\}-\tilde{e}\\ +\tilde{F}_{i,j+1}= \max\{H_{ij}-\tilde{q},\tilde{F}_{ij}\}-\tilde{e} +\end{array}\right. +\end{equation} +where $s(i,j)$ is the score between the $i$-th reference base and $j$-th query +base. Eq.~(\ref{eq:ae86}) is a natural extension to the equation under affine +gap cost~\citep{Gotoh:1982aa,Altschul:1986aa}. + +\subsubsection{The Suzuki-Kasahara formulation} + +When we allow gaps longer than several hundred base pairs, nucleotide-level +alignment is much slower than chaining. SSE acceleration is critical to the +performance of minimap2. Traditional SSE implementations~\citep{Farrar:2007hs} +based on Eq.~(\ref{eq:ae86}) can achieve 16-way parallelization for short +sequences, but only 4-way parallelization when the peak alignment score reaches +32767. Long sequence alignment may exceed this threshold. Inspired by +\citet{Wu:1996aa} and the following work, \citet{Suzuki:2018aa} proposed a +difference-based formulation that lifted this limitation. +In case of 2-piece gap cost, define +\[ +\left\{\begin{array}{ll} +u_{ij}\triangleq H_{ij}-H_{i-1,j} & v_{ij}\triangleq H_{ij}-H_{i,j-1} \\ +x_{ij}\triangleq E_{i+1,j}-H_{ij} & \tilde{x}_{ij}\triangleq \tilde{E}_{i+1,j}-H_{ij} \\ +y_{ij}\triangleq F_{i,j+1}-H_{ij} & \tilde{y}_{ij}\triangleq \tilde{F}_{i,j+1}-H_{ij} +\end{array}\right. 
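+\] +We can transform Eq.~(\ref{eq:ae86}) to +\begin{equation}\label{eq:suzuki} +\left\{\begin{array}{lll} +z_{ij}&=&\max\{s(i,j),x_{i-1,j}+v_{i-1,j},y_{i,j-1}+u_{i,j-1},\\ +&&\tilde{x}_{i-1,j}+v_{i-1,j},\tilde{y}_{i,j-1}+u_{i,j-1}\}\\ +u_{ij}&=&z_{ij}-v_{i-1,j}\\ +v_{ij}&=&z_{ij}-u_{i,j-1}\\ +x_{ij}&=&\max\{0,x_{i-1,j}+v_{i-1,j}-z_{ij}+q\}-q-e\\ +y_{ij}&=&\max\{0,y_{i,j-1}+u_{i,j-1}-z_{ij}+q\}-q-e\\ +\tilde{x}_{ij}&=&\max\{0,\tilde{x}_{i-1,j}+v_{i-1,j}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e}\\ +\tilde{y}_{ij}&=&\max\{0,\tilde{y}_{i,j-1}+u_{i,j-1}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e} +\end{array}\right. +\end{equation} +where $z_{ij}$ is a temporary variable that does not need to be stored. + +An important property of Eq.~(\ref{eq:suzuki}) is that all values are bounded +by scoring parameters. To see that, +\[ +x_{ij}=E_{i+1,j}-H_{ij}=\max\{-q,E_{ij}-H_{ij}\}-e +\] +With $E_{ij}\le H_{ij}$, we have +\[ +-q-e\le x_{ij}\le\max\{-q,0\}-e=-e +\] +and similar inequalities for $y_{ij}$, $\tilde{x}_{ij}$ and $\tilde{y}_{ij}$. +In addition, +\[ +u_{ij}=z_{ij}-v_{i-1,j}\ge\max\{x_{i-1,j},\tilde{x}_{i-1,j}\}\ge-q-e +\] +As the maximum value of $z_{ij}=H_{ij}-H_{i-1,j-1}$ is $M$, the maximal +matching score, we can derive +\[ +u_{ij}\le M-v_{i-1,j}\le M+q+e +\] +In conclusion, in Eq.~(\ref{eq:suzuki}), $x$ and $y$ are bounded by $[-q-e,-e]$, +$\tilde{x}$ and $\tilde{y}$ by $[-\tilde{q}-\tilde{e},-\tilde{e}]$, and $u$ and +$v$ by $[-q-e,M+q+e]$. When $-128\le-q-e<M+q+e\le127$, each of them can be stored as +an 8-bit integer. This enables 16-way SSE vectorization regardless of the peak +score of the alignment. + +For a more efficient SSE implementation, we transform the row-column coordinate +to the diagonal-antidiagonal coordinate by letting $r\gets i+j$ and $t\gets i$. +Eq.~(\ref{eq:suzuki}) becomes: +\[ +\left\{\begin{array}{lll} +z_{rt}&=&\max\{s(t,r-t),x_{r-1,t-1}+v_{r-1,t-1},y_{r-1,t}+u_{r-1,t},\\ +&&\tilde{x}_{r-1,t-1}+v_{r-1,t-1},\tilde{y}_{r-1,t}+u_{r-1,t}\}\\ +u_{rt}&=&z_{rt}-v_{r-1,t-1}\\ +v_{rt}&=&z_{rt}-u_{r-1,t}\\ +x_{rt}&=&\max\{0,x_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+q\}-q-e\\ +y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\}-q-e\\ +\tilde{x}_{rt}&=&\max\{0,\tilde{x}_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+\tilde{q}\}-\tilde{q}-\tilde{e}\\ +\tilde{y}_{rt}&=&\max\{0,\tilde{y}_{r-1,t}+u_{r-1,t}-z_{rt}+\tilde{q}\}-\tilde{q}-\tilde{e} +\end{array}\right. +\] +In this formulation, cells with the same diagonal index $r$ are independent of +each other, so all cells on the same anti-diagonal can be computed in one +vectorized inner loop. + +On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, the initial +values in the diagonal-antidiagonal formulation are +\[ +\left\{\begin{array}{l} +x_{r-1,-1}=y_{r-1,r}=-q-e\\ +\tilde{x}_{r-1,-1}=\tilde{y}_{r-1,r}=-\tilde{q}-\tilde{e}\\ +u_{r-1,r}=v_{r-1,-1}=\eta(r)\\ +\end{array}\right. 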
+\] +where +\[ +\eta(r)=\left\{\begin{array}{ll} +-q-e & (r=0) \\ +-e & (r<\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\ +r\cdot(e-\tilde{e})-(\tilde{q}-q)-\tilde{e} & (r=\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\ +-\tilde{e} & (r>\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) +\end{array}\right. +\] +These can be derived from the initial values for Eq.~(\ref{eq:ae86}). + +When performing global alignment, we do not need to compute $H_{rt}$ in each cell. +We use 16-way vectorization throughout the alignment process. When extending +alignments from ends of chains, we need to find the cell $(r,t)$ where $H_{rt}$ +reaches the maximum. We resort to 4-way vectorization to compute +$H_{rt}=H_{r-1,t}+u_{rt}$. Because this computation is simple, +Eq.~(\ref{eq:suzuki}) is still the dominant performance bottleneck. + +In practice, our 16-way vectorized implementation of global alignment is three +times as fast as Parasail's 4-way vectorization~\citep{Daily:2016aa}. Without +banding, our implementation is slower than Edlib~\citep{Sosic:2017aa}, but with +a 1000bp band, it is considerably faster. When performing global alignment +between anchors, we expect the alignment to stay close to the diagonal of the +DP matrix. Banding is applicable most of the time. + +\subsubsection{The Z-drop heuristic} + +With global alignment, minimap2 may be forced to align unrelated sequences between +two adjacent anchors. To avoid such an artifact, we compute accumulative +alignment score along the alignment path and break the alignment where the +score drops too fast in the diagonal direction. More precisely, let $S(i,j)$ be +the alignment score along the alignment path ending at cell $(i,j)$ in the DP +matrix. We break the alignment if there exist $(i',j')$ and $(i,j)$, $i'<i$ and +$j'<j$, such that +\[ +S(i',j')-S(i,j)>Z+e\cdot|(i-i')-(j-j')| +\] +where $e$ is the gap extension cost and $Z$ is an arbitrary threshold. +This strategy is first used in BWA-MEM. 
It is similar to X-drop employed in +BLAST~\citep{Altschul:1997vn}, but unlike X-drop, it would not break the +alignment in the presence of a single long gap. + +When minimap2 breaks a global alignment between two anchors, it performs local +alignment between the two subsequences involved in the global alignment, but +this time with the one subsequence reverse complemented. This additional +alignment step may identify short inversions that are missed during chaining. + +\subsubsection{Filtering out misplaced anchors} +Due to sequencing errors and local homology, some anchors in a chain may be +wrong. If we blindly align regions between two misplaced anchors, we will +produce a suboptimal alignment. To reduce this artifact, we filter out +anchors that lead to a $>$10bp insertion and a $>$10bp deletion at the same +time, and filter out terminal anchors that lead to a long gap towards the ends +of a chain. These heuristics greatly alleviate the issues with misplaced +anchors, but they are unable to fix all such errors. Local misalignment is a +limitation of minimap2 which we hope to address in future. + +\subsection{Aligning spliced sequences} + +The algorithm described above can be adapted to spliced alignment. In this +mode, the chaining gap cost distinguishes insertions to and deletions from the +reference: $\gamma_c(l)$ in Eq.~(\ref{eq:chain-gap}) takes the form of +\[ +\gamma_c(l)=\left\{\begin{array}{ll} +0.01\cdot\bar{w}\cdot l+0.5\log_2 l & (l>0) \\ +\min\{0.01\cdot\bar{w}\cdot|l|,\log_2|l|\} & (l<0) +\end{array}\right. +\] +Similarly, the gap cost function used for DP-based alignment is changed to +\[ +\gamma_a(l)=\left\{\begin{array}{ll} +q+l\cdot e & (l>0) \\ +\min\{q+|l|\cdot e,\tilde{q}\} & (l<0) +\end{array}\right. +\] +In alignment, a deletion no shorter than $\lceil(\tilde{q}-q)/e\rceil$ is +regarded as an intron, which pays no cost to gap extensions. 
+ +To pinpoint precise splicing junctions, minimap2 introduces reference-dependent +cost to penalize non-canonical splicing: +\begin{equation}\label{eq:splice} +\left\{\begin{array}{l} +H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij}-a(i)\}\\ +E_{i+1,j}= \max\{H_{ij}-q,E_{ij}\}-e\\ +F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\ +\tilde{E}_{i+1,j}= \max\{H_{ij}-d(i)-\tilde{q},\tilde{E}_{ij}\}\\ +\end{array}\right. +\end{equation} +Let $T$ be the reference sequence. $d(i)$ is computed as +\[d(i)=\left\{\begin{array}{ll} +0 & \mbox{if $T[i+1,i+3]$ is ${\tt GTA}$ or ${\tt GTG}$} \\ +p/2 & \mbox{if $T[i+1,i+3]$ is ${\tt GTC}$ or ${\tt GTT}$} \\ +p & \mbox{otherwise} +\end{array}\right.\] +where $T[i,j]$ extracts a substring of $T$ between $i$ and $j$ inclusively. +$d(i)$ penalizes non-canonical donor sites with $p$ and less frequent Eukaryotic +splicing signal ${\tt GT[C/T]}$ with $p/2$~\citep{Irimia:2008aa}. Similarly, +\[a(i)=\left\{\begin{array}{ll} +0 & \mbox{if $T[i-2,i]$ is ${\tt CAG}$ or ${\tt TAG}$} \\ +p/2 & \mbox{if $T[i-2,i]$ is ${\tt AAG}$ or ${\tt GAG}$} \\ +p & \mbox{otherwise} +\end{array}\right.\] +models the acceptor signal. Eq.~(\ref{eq:splice}) is close to an equation in +\citet{Zhang:2006aa} except that we allow insertions immediately followed by +deletions and vice versa; in addition, we use the Suzuki-Kasahara diagonal +formulation in actual implementation. + +If RNA-seq reads are not sequenced from stranded libraries, the read strand +relative to the underlying transcript is unknown. By default, minimap2 aligns +each chain twice, first assuming ${\tt GT}$--${\tt AG}$ as the splicing signal +and then assuming ${\tt CT}$--${\tt AC}$, the reverse complement of ${\tt +GT}$--${\tt AG}$, as the splicing signal. The alignment with a higher score is +taken as the final alignment. This procedure also infers the relative strand of +reads that span canonical splicing sites. 
+ +In the spliced alignment mode, minimap2 further increases the density of +minimizers and disables banded alignment. Together with the two-round DP-based +alignment, spliced alignment is several times slower than genomic DNA +alignment. + +\subsection{Aligning short paired-end reads} + +During chaining, minimap2 takes a pair of reads as one fragment with a gap of +unknown length in the middle. It applies a normal gap cost between seeds on the +same read but a more permissive gap cost between seeds on different reads. +More precisely, the gap cost during chaining is ($l\not=0$): +\[ +\gamma_c(l)=\left\{\begin{array}{ll} +0.01\cdot\bar{w}\cdot |l|+0.5\log_2 |l| & \mbox{if two seeds on the same read} \\ +\min\{0.01\cdot\bar{w}\cdot|l|,\log_2|l|\} & \mbox{otherwise} +\end{array}\right. +\] +After identifying primary chains (Section~\ref{sec:primary}), we split each +fragment chain into two read chains and perform alignment for each read as in +Section~\ref{sec:genomic}. Finally, we pair hits of each read end to find +consistent paired-end alignments. + +\end{methods} + +\section{Results} + +Minimap2 is implemented in the C programming language and comes with APIs in +both C and Python. It is distributed under the MIT license, free to both +commercial and academic uses. Minimap2 uses the same base algorithm for all +applications, but it has to apply different sets of parameters depending on +input data types. Similar to BWA-MEM, minimap2 introduces `presets' that +modify multiple parameters with a simple invocation. Detailed settings +and command-line options can be found in the minimap2 manpage. In addition to +the applications evaluated in the following sections, minimap2 also retains +minimap's functionality to find overlaps between long reads and to search +against large multi-species databases such as \emph{nt} from NCBI. 
+ +\subsection{Aligning long genomic reads}\label{sec:long-genomic} + +\begin{figure}[!tb] +\centering +\includegraphics[width=.5\textwidth]{roc-color.pdf} +\caption{Evaluation on aligning simulated reads. Simulated reads were mapped +to the primary assembly of human genome GRCh38. A read is considered correctly +mapped if its longest alignment overlaps with the true interval, and the +overlap length is $\ge$10\% of the true interval length. Read alignments are +sorted by mapping quality in the descending order. For each mapping quality +threshold, the fraction of alignments (out of the number of input reads) with +mapping quality above the threshold and their error rate are +plotted along the curve. (a) long-read alignment evaluation. 33,088 $\ge$1000bp +reads were simulated using pbsim~\citep{Ono:2013aa} with error profile sampled +from file `m131017\_060208\_42213\_*.1.*' downloaded at +\href{http://bit.ly/chm1p5c3}{http://bit.ly/chm1p5c3}. The N50 read length is +11,628. Aligners were run under the default setting for SMRT reads. +Kart outputted all alignments at mapping quality 60, so is not shown in the +figure. It mapped nearly all reads with 4.1\% of alignments being wrong, less +accurate than others. (b) short-read alignment evaluation. 10 million pairs of +150bp reads were simulated using mason2~\citep{Holtgrewe:2010aa} with option +`\mbox{--illumina-prob-mismatch-scale 2.5}'. Short-read aligners were run under +the default setting except for changing the maximum fragment length to +800bp.}\label{fig:eval} +\end{figure} + +As a sanity check, we evaluated minimap2 on simulated human reads along with +BLASR~(v1.MC.rc64; \citealp{Chaisson:2012aa}), +BWA-MEM~(v0.7.15; \citealp{Li:2013aa}), +GraphMap~(v0.5.2; \citealp{Sovic:2016aa}), +Kart~(v2.2.5; \citealp{Lin:2017aa}), +minialign~(v0.5.3; \href{https://github.com/ocxtal/minialign}{https://github.com/ocxtal/minialign}) and +NGMLR~(v0.2.5; \citealp{Sedlazeck169557}). 
We excluded rHAT~\citep{Liu:2016ab} +and LAMSA~\citep{Liu:2017aa} because they either +crashed or produced malformatted output. In this evaluation, minimap2 has +higher power to distinguish unique and repetitive hits, and achieves overall +higher mapping accuracy (Fig.~\ref{fig:eval}a). Minimap2 and +NGMLR provide better mapping quality estimate: they rarely give repetitive hits +high mapping quality. Apparently, other aligners may +occasionally miss close suboptimal hits and be overconfident in wrong mappings. +On run time, minimap2 took 200 CPU seconds, comparable to minialign and Kart, and is over +30 times faster than the rest. Minimap2 consumed 6.8GB memory at the peak, +more than BWA-MEM (5.4GB), similar to NGMLR and less than others. + +On real human SMRT reads, the relative performance and fraction of mapped reads reported by +these aligners are broadly similar to the metrics on simulated data. We are +unable to provide a good estimate of mapping error rate due to the lack of the +truth. On ONT $\sim$100kb human reads~\citep{Jain128835}, BWA-MEM failed. +Kart, minialign and minimap2 are over 70 times faster than others. We have also +examined tens of $\ge$100bp INDELs in IGV~\citep{Robinson:2011aa} and can +confirm the observation by~\citet{Sedlazeck169557} that BWA-MEM often breaks +them into shorter gaps. The issue is much alleviated with minimap2, thanks +to the 2-piece affine gap cost. + +\subsection{Aligning long spliced reads} + +We evaluated minimap2 on SIRV control data~(AC:SRR5286959; +\citealp{Byrne:2017aa}) where the truth is known. Minimap2 predicted 59\,918 +introns from 11\,018 reads. 93.8\% of splice junctions are precise. We examined +wrongly predicted junctions and found the majority were caused by clustered +splicing signals (e.g. two adjacent ${\tt GT}$ sites). When INDEL sequencing +errors are frequent, it is difficult to find precise splicing sites in this +case. 
If we allow up to 10bp distance from true splicing sites, 98.4\% of +aligned introns are approximately correct. It is worth noting that for SIRV, we +asked minimap2 to model the ${\tt GT..AG}$ splicing signal only without extra +bases. This is because SIRV does not honor the evolutionarily prevalent signal +${\tt GT[A/G]..[C/T]AG}$~\citep{Irimia:2008aa}. + +\begin{table}[!tb] +\processtable{Evaluation of junction accuracy on 2D ONT reads} +{\footnotesize\label{tab:intron} +\begin{tabular}{p{3.1cm}rrrr} +\toprule +& GMAP & minimap2 & SpAln & STAR\\ +\midrule +Run time (CPU min) & 631 & 15.9 & 2\,076 & 33.9 \\ +Peak RAM (GByte) & 8.9 & 14.5 & 3.2 & 29.2\vspace{1em}\\ +\# aligned reads & 103\,669 & 104\,199 & 103\,711 & 26\,479 \\ +\# chimeric alignments & 1\,904 & 1\,488 & 0 & 0 \\ +\# non-spliced alignments & 15\,854 & 14\,798 & 17\,033 & 10\,545\vspace{1em}\\ +\# aligned introns & 692\,275 & 693\,553 & 692\,945 & 78\,603 \\ +\# novel introns & 11\,239 & 3\,113 & 8\,550 & 1\,214 \\ +\% exact introns & 83.8\% & 94.0\% & 87.9\% & 55.2\% \\ +\% approx. introns & 91.8\% & 96.9\% & 92.5\% & 82.4\% \\ +\botrule +\end{tabular} +}{Mouse cDNA reads (AC:SRR5286960; R9.4 chemistry) were mapped to the primary assembly of mouse +genome GRCm38 with the following tools and command options: minimap2 (`-ax +splice'); GMAP (`-n 0 --min-intronlength 30 --cross-species'); SpAln (`-Q7 -LS +-S3'); STARlong (according to +\href{http://bit.ly/star-pb}{http://bit.ly/star-pb}). The alignments were +compared to the EnsEMBL gene annotation, release 89. A predicted intron +is \emph{novel} if it has no overlaps with any annotated introns. An intron +is \emph{exact} if it is identical to an annotated intron. An intron is +\emph{approximate} if both its 5'- and 3'-end are within 10bp around the ends +of an annotated intron. 
Chimeric alignments are defined in the SAM spec~\citep{Li:2009ys}.} +\end{table} + +We next aligned real mouse reads~\citep{Byrne:2017aa} with GMAP~(v2017-06-20; +\citealp{Wu:2005vn}), minimap2, SpAln~(v2.3.1; \citealp{Iwata:2012aa}) and +STAR~(v2.5.3a; \citealp{Dobin:2013kx}). In general, minimap2 is more +consistent with existing annotations (Table~\ref{tab:intron}): it finds +more junctions with a higher percentage being exactly or approximately correct. +Minimap2 is over 40 times faster than GMAP and SpAln. While STAR is close to +minimap2 in speed, it does not work well with noisy reads. + +We have also evaluated spliced aligners on a human Nanopore Direct RNA-seq +dataset (\href{http://bit.ly/na12878ont}{http://bit.ly/na12878ont}). Minimap2 +aligned 10 million reads in $<$1 wall-clock hour using 16 CPU cores. 94.2\% of +aligned splice junctions are consistent with gene annotations. In comparison, +GMAP under option `-k 14 -n 0 --min-intronlength 30 --cross-species' is 160 +times slower; 68.7\% of GMAP junctions are found in known gene annotations. The +percentage increases to 84.1\% if an aligned junction within 10bp from an +annotated junction is considered to be correct. On a public Iso-Seq dataset +(human Alzheimer brain from +\href{http://bit.ly/isoseqpub}{http://bit.ly/isoseqpub}), minimap2 is also +faster at higher junction accuracy in comparison to other aligners in +Table~\ref{tab:intron}. + +We noted that GMAP and SpAln have not been optimized for noisy reads. We are +showing the best setting we have experimented, but their developers should be +able to improve their accuracy further. 
+ +%\begin{table}[!tb] +%\processtable{Evaluation of junction accuracy on SMRT Iso-Seq reads} +%{\footnotesize +%\begin{tabular}{lrrrr} +%\toprule +% & GMAP & minimap2 & SpAln & STAR \\ % one GMAP thread took 14 days to align a tiny fraction of reads +%\midrule +%Run time (CPU min) & - & 243 & 2,352 & 1,647 \\ +%\# aligned reads & 1,113,502 & 1,123,025 & 1,094,092 & 682,452 \\ +%\# chimeric alignments & 48,927 & 33,091 & 0 & 0 \\ +%\# non-spliced alignments & 334,097 & 339,081 & 291,447 & 272,536 \vspace{1em}\\ +%\# aligned introns & 8,922,221 & 9,071,755 & 9,208,564 & 3,029,121 \\ +%\# novel introns & 48,927 & 42,773 & 82,230 & 17,791 \\ +%\% exact introns & 90.6\% & 94.9\% & 91.7\% & 84.7\% \\ +%\% approx. introns & 94.0\% & 96.9\% & 93.4\% & 93.8\% \\ +%\botrule +%\end{tabular} +%}{} +%\end{table} + +\subsection{Aligning short genomic reads} + +We evaluated minimap2 along with Bowtie2~(v2.3.3; \citealt{Langmead:2012fk}), BWA-MEM and +SNAP (v1.0beta23; \citealt{Zaharia:2011aa}). Minimap2 is 3--4 times as fast as Bowtie2 and +BWA-MEM, but is 1.3 times slower than SNAP. Minimap2 is more accurate on this +simulated data set than Bowtie2 and SNAP but less accurate than BWA-MEM +(Fig.~\ref{fig:eval}b). Closer investigation reveals that BWA-MEM achieves +a higher accuracy partly because it tries to locally align a read in a small +region close to its mate. If we disable this feature, BWA-MEM becomes slightly +less accurate than minimap2. We might implement a similar heuristic +in minimap2 in future. + +To evaluate the accuracy of minimap2 on real data, we aligned human reads +(AC:ERR1341796) with BWA-MEM and minimap2, and called SNPs and small INDELs +with GATK HaplotypeCaller v3.5~\citep{Depristo:2011vn}. This run was sequenced +from experimentally mixed CHM1 and CHM13 cell lines. Both of them are homozygous +across the whole genome and have been \emph{de novo} assembled with SMRT reads +to high quality. 
This allowed us to construct an independent truth variant +dataset~\citep{Li223297} for +ERR1341796. In this evaluation, minimap2 has higher SNP false negative rate +(FNR; 2.6\% of minimap2 vs 2.3\% of BWA-MEM), but fewer false positive SNPs per +million bases (FPPM; 7.0 vs 8.8), similar INDEL FNR (11.2\% vs 11.3\%) and +similar INDEL FPPM (6.4 vs 6.5). Minimap2 is broadly comparable to BWA-MEM in the +context of small variant calling. + +\subsection{Aligning long-read assemblies} + +Minimap2 can align a SMRT assembly (AC:GCA\_001297185.1) against GRCh38 in 7 +minutes using 8 CPU cores, over 20 times faster than nucmer from +MUMmer4~\citep{Marcais:2018aa}. With the paftools.js script from the minimap2 +package, we called 2.67 million single-base substitutions out of 2.78Gbp +genomic regions. The transition-to-transversion ratio (ts/tv) is 2.01. In +comparison, using MUMmer4's dnadiff pipeline, we called 2.86 million +substitutions in 2.83Gbp at ts/tv=1.87. Given that ts/tv averaged across the +human genome is about 2 but ts/tv averaged over random errors is 0.5, the +minimap2 callset arguably has higher precision at lower sensitivity. + +The sample being assembled is a female. Minimap2 still called 201 substitutions +on the Y chromosome. These substitutions all come from one contig aligned at +96.8\% sequence identity. The contig could be a segmental duplication +absent from GRCh38. In contrast, dnadiff called 9070 substitutions on the Y +chromosome across 73 SMRT contigs. This again implies our minimap2-based +pipeline has higher precision. + +\section{Discussions} + +Minimap2 is a versatile mapper and pairwise aligner for nucleotide sequences. +It works with short reads, assembly contigs and long noisy genomic and RNA-seq +reads, and can be used as a read mapper, long-read overlapper or a full-genome +aligner. Minimap2 is also accurate and efficient, often outperforming other +domain-specific alignment tools in terms of both speed and accuracy. 
+ +The capability of minimap2 comes from a fast base-level alignment algorithm and +an accurate chaining algorithm. When aligning long query sequences, base-level +alignment is often the performance bottleneck. The Suzuki-Kasahara algorithm +greatly alleviates the bottleneck and enables DP-based splice alignment +involving $>$100kb introns, which was impractically slow ten years ago. The +minimap2 chaining algorithm is fast and highly accurate by itself. In fact, +chaining alone is more accurate than all the other long-read mappers in +Fig.~\ref{fig:eval}a (data not shown). This accuracy helps to reduce downstream +base-level alignment of candidate chains, which is still several times slower than +chaining even with the Suzuki-Kasahara improvement. In addition, taking a +general form, minimap2 chaining can be adapted to non-typical data types such as +spliced reads and multiple reads per fragment. This gives us the opportunity to +extend the same base algorithm to a variety of use cases. + +Modern mainstream aligners often use a full-text index, such as suffix array or +FM-index, to index reference sequences. An advantage of this approach is that +we can use exact seeds of arbitrary lengths, which helps to increase seed +uniqueness and reduce unsuccessful extensions. Minimap2 indexes reference +k-mers with a hash table instead. Such fixed-length seeds are inferior to +variable-length seeds in theory, but can be computed much more efficiently in +practice. When a query sequence has multiple seed hits, we can afford to skip +highly repetitive seeds without affecting the final accuracy. This further +alleviates the concern with the seeding uniqueness. At the same time, at low +sequence identity, it is rare to see long seeds anyway. Hash table is the ideal +data structure for mapping long noisy sequences. + +\section*{Acknowledgements} +We owe a debt of gratitude to H. Suzuki and M. 
Kasahara for releasing their +masterpiece and insightful notes before formal publication. We thank M. +Schatz, P. Rescheneder and F. Sedlazeck for pointing out the limitation of +BWA-MEM. We are also grateful to minimap2 users who have greatly helped to +suggest features and to fix various issues. + +\paragraph{Funding\textcolon} NHGRI 1R01HG010040-01 + +\bibliography{minimap2} + +\end{document} diff --git a/lib/minimap2/tex/mm2-s3.sam.eval b/lib/minimap2/tex/mm2-s3.sam.eval new file mode 100644 index 000000000..6732dd294 --- /dev/null +++ b/lib/minimap2/tex/mm2-s3.sam.eval @@ -0,0 +1,62 @@ +Q 60 18579866 27 0.000001453 18579866 +Q 59 27087 4 0.000001666 18606953 +Q 58 21435 1 0.000001718 18628388 +Q 57 45663 3 0.000001874 18674051 +Q 56 36031 2 0.000001978 18710082 +Q 55 18499 2 0.000002082 18728581 +Q 54 14754 2 0.000002187 18743335 +Q 53 25541 2 0.000002291 18768876 +Q 52 26397 5 0.000002554 18795273 +Q 51 15090 3 0.000002711 18810363 +Q 50 13425 11 0.000003294 18823788 +Q 49 15175 2 0.000003397 18838963 +Q 48 19407 4 0.000003606 18858370 +Q 47 11538 16 0.000004452 18869908 +Q 46 12558 17 0.000005349 18882466 +Q 45 40362 28 0.000006817 18922828 +Q 44 10465 13 0.000007500 18933293 +Q 43 10098 20 0.000008552 18943391 +Q 42 10682 19 0.000009549 18954073 +Q 41 9823 11 0.000010125 18963896 +Q 40 9685 16 0.000010963 18973581 +Q 39 10273 18 0.000011905 18983854 +Q 38 9515 18 0.000012847 18993369 +Q 37 9474 27 0.000014261 19002843 +Q 36 10430 25 0.000015568 19013273 +Q 35 9241 34 0.000017348 19022514 +Q 34 9162 31 0.000018968 19031676 +Q 33 10164 49 0.000021532 19041840 +Q 32 9152 55 0.000024408 19050992 +Q 31 9252 35 0.000026233 19060244 +Q 30 9872 55 0.000029103 19070116 +Q 29 8938 65 0.000032496 19079054 +Q 28 8951 73 0.000036306 19088005 +Q 27 9949 95 0.000041261 19097954 +Q 26 9784 97 0.000046316 19107738 +Q 25 10126 97 0.000051366 19117864 +Q 24 11260 123 0.000057765 19129124 +Q 23 10047 114 0.000063691 19139171 +Q 22 9661 123 0.000070083 19148832 +Q 21 10339 168 
0.000078813 19159171 +Q 20 17928 193 0.000088804 19177099 +Q 19 9842 193 0.000098817 19186941 +Q 18 14737 247 0.000111605 19201678 +Q 17 10218 238 0.000123934 19211896 +Q 16 10271 242 0.000136457 19222167 +Q 15 12241 333 0.000153683 19234408 +Q 14 9189 336 0.000171070 19243597 +Q 13 9493 515 0.000197734 19253090 +Q 12 11502 743 0.000236185 19264592 +Q 11 8211 507 0.000262390 19272803 +Q 10 9133 606 0.000293695 19281936 +Q 9 10014 931 0.000341801 19291950 +Q 8 8436 698 0.000377816 19300386 +Q 7 8443 705 0.000414163 19308829 +Q 6 10203 944 0.000462808 19319032 +Q 5 6936 756 0.000501760 19325968 +Q 4 6732 843 0.000545190 19332700 +Q 3 8215 1104 0.000602040 19340915 +Q 2 21201 5440 0.000882342 19362116 +Q 1 82328 22186 0.002019600 19444444 +Q 0 553853 371953 0.020562901 19998297 +U 1703 diff --git a/lib/minimap2/tex/mm2-update.tex b/lib/minimap2/tex/mm2-update.tex new file mode 100644 index 000000000..432c01fd5 --- /dev/null +++ b/lib/minimap2/tex/mm2-update.tex @@ -0,0 +1,240 @@ +\documentclass{bioinfo} +\copyrightyear{2021} +\pubyear{2021} + +\usepackage{graphicx} +\usepackage{hyperref} +\usepackage{url} +\usepackage{amsmath} +\usepackage[ruled,vlined]{algorithm2e} +\newcommand\mycommfont[1]{\footnotesize\rmfamily{\it #1}} +\SetCommentSty{mycommfont} +\SetKwComment{Comment}{$\triangleright$\ }{} + +\usepackage{natbib} +\bibliographystyle{apalike} + +\DeclareMathOperator*{\argmax}{argmax} + +\begin{document} +\firstpage{1} + +\title[Improvements to minimap2]{New strategies to improve minimap2 alignment accuracy} +\author[Li]{Heng Li$^{1,2}$} +\address{$^1$Dana-Farber Cancer Institute, 450 Brookline Ave, Boston, MA 02215, USA, +$^2$Harvard Medical School, 10 Shattuck St, Boston, MA 02215, USA} + +\maketitle + +\begin{abstract} + +\section{Summary:} We present several recent improvements to minimap2, a +versatile pairwise aligner for nucleotide sequences. 
Now minimap2 v2.22 can +more accurately map long reads to highly repetitive regions and align through +insertions or deletions up to 100kb by default, addressing major weakness in +minimap2 v2.18 or earlier. + +\section{Availability and implementation:} +\href{https://github.com/lh3/minimap2}{https://github.com/lh3/minimap2} + +\section{Contact:} hli@ds.dfci.harvard.edu +\end{abstract} + +\section{Introduction} +Minimap2~\citep{Li:2018ab} is widely used for mapping long sequence +reads and assembly contigs. \citet{Jain:2020aa} found minimap2 v2.18 or earlier occasionally +misaligned reads from highly repetitive regions as minimap2 ignored seeds of +high occurrence. They also noticed minimap2 may misplace reads with structural +variations (SVs) in such regions~\citep{Jain2020.11.01.363887}. These +misalignments have become a pressing issue in the advent of +telomere-to-telomere human assembly~\citep{Miga:2020aa}. Meanwhile, old minimap2 +was unable to efficiently align long insertions/deletions (INDELs) and often +breaks an alignment around variable-number tandem repeats (VNTRs). This has +inspired new chaining algorithms~\citep{Li:2020aa,Ren:2021aa} which are not +integrated into minimap2. Here we will describe recent efforts implemented +in v2.19 through v2.22 to improve mapping results. + +\begin{methods} +\section{Methods} + +\subsection{Rescuing high-occurrence $k$-mers}\label{sec:high-occ} +Minimap2 keeps all $k$-mer minimizers~\citep{Roberts:2004fv} during indexing. Its original +implementation only selected low-occurrence minimizers during mapping. The +cutoff is a few hundred for mapping long reads against a human genome. If a +read harbors only a few or even no low-occurrence minimizers, it will fail +chaining due to insufficient anchors. + +To resolve this issue, we implemented a new heuristic to add additional +minimizers. Suppose we are looking at two adjacent low-occurrence $k$-mers +located at position $x_1$ and $x_2$, respectively.
If $|x_1-x_2|\ge L$, +minimap2 v2.22 additionally selects $\lfloor|x_1-x_2|/L\rfloor$ minimizers +of the lowest occurrence among minimizers between $x_1$ and $x_2$. Here +parameter $L$ controls the frequency of sampling. It defaults to 500. +This strategy adds necessary anchors at the cost of increasing total alignment +time by a few percent on real data. + +\subsection{Aligning through longer INDELs} +The original minimap2 may fail to align long INDELs due to its chaining +heuristics. Briefly, minimap2 applies dynamic programming (DP) to chain +minimizer anchors. This is a quadratic algorithm, slow for chaining +contigs. For acceptable performance, the original minimap2 uses a 500bp band by +default, which means a gap longer than 500bp will stop chaining. +To align through longer gaps, older minimap2 implemented a long-join heuristic as follows. +If there is an INDEL longer than 500bp and the two chains around the INDEL +have no overlaps on either the query or the reference sequence, minimap2 may +join the two short chains later. +This heuristic may fail around VNTRs because short chains +often have overlaps in VNTRs. More subtly, minimap2 may escape the inner DP +loop early, again for performance, if the chaining result is not improved for +50 iterations. When there is a copy number change in a long segmental +duplication, the early escape may break around the event even if users +specify a large band. + +In minigraph~\citep{Li:2020aa}, we developed a new chaining algorithm that +finds up to 1kb INDELs with DP-based chaining and goes through longer INDELs with a +subquadratic algorithm~\citep{DBLP:conf/wabi/AbouelhodaO03}. We ported the same +algorithm to minimap2 for contig mapping. For long-read mapping, the minigraph +algorithm is slower. Minimap2 v2.22 still uses the DP-based algorithm to +find short chains and then invokes the minigraph algorithm to rechain anchors in +these short chains.
The rechaining step achieves the same goal as long-join +but is more reliable because it can resolve overlaps between short chains. The old +long-join heuristic has since been removed. + +\subsection{Properly mapping long reads with SVs} +The original minimap2 ranks an alignment by its Smith-Waterman score and +outputs the best scoring alignment. However, when there are SVs on the read, +the best scoring alignment is sometimes not the correct alignment. +\citet{Jain2020.11.01.363887} resolved this dilemma by altering the mapping +algorithm. + +In our view, this problem is rooted in inappropriate scoring: affine-gap penalty +over-penalizes a long INDEL that was often evolutionarily created in one event. +We should not penalize an SV by a function linear in the SV length. Minimap2 v2.22 instead rescores +an alignment with the following scoring function. Suppose an alignment consists +of $M$ matching bases, $N$ substitutions and $G$ gap opens, we empirically +score the alignment with +$$ +S=M-\frac{N+G}{2d}-\sum_{i=1}^G\log_2(1+g_i) +$$ +where $g_i\ge1$ is the length of the $i$-th gap and +$$ +d=\max\left\{\frac{N+G}{M+N+G},0.02\right\} +$$ +It approximates per-base sequence divergence except with the smallest value set +to 2\%. As an analogy to affine-gap scoring, the matching score in our scheme +is 1, the mismatch and gap open penalties are both $1/(2d)$ and the gap extension +penalty is a logarithmic function of the gap length~\citep{Gu:1995wt}. Our scoring gives a long SV +a much milder penalty. In terms of time complexity, scoring an alignment is +linear in the length of the alignment. The time spent on rescoring is negligible in +practice. + +%If we assume sequences evolve under a duplication-mutation model, we may have a +%better way to choose the best alignment. If a long read can be mapped to $n$ +%loci, we can take the read as the template and build a +%pseudo-multi-sequence-alignment (pMSA) of $n+1$ sequences.
In this pMSA, we say +%a site on the read is informative if the $n$ reference subsequences differ at +%the position. + +\end{methods} + +\section{Results} + +\begin{table} +\processtable{Evaluation of minimap2 v2.22} +{\footnotesize\label{tab:1}\begin{tabular}{p{4.2cm}rrrr} +\toprule +$[$Benchmark$]$ Metric & v2.22 & v2.18 & Winno & lra \\ +\midrule +$[$sim-map$]$ \% mapped reads at Q10 & 97.9 & 97.6 & {\bf 99.0}& 97.3 \\ +$[$sim-map$]$ err. rate at Q10 (phredQ) & {\bf 52} & {\bf 52} & 38 & 24 \\ +$[$winno-cmp$]$ rate of diff. (phredQ) & {\bf 41} & 37 & truth & 18 \\ +$[$winno-cmp$]$ CPU time (hour) & {\bf 5.0} & 5.3 & 71.8 & 13.1 \\ +$[$winno-cmp$]$ peak RAM (Gb) & 17.1 & 14.4 & {\bf 9.6} & 12.4 \\ +$[$sim-sv$]$ \% false negative rate & {\bf 0.5} & 2.0 & {\bf 0.5} & 1.4 \\ +$[$sim-sv$]$ \% false discovery rate & {\bf 0.0} & 0.1 & {\bf 0.0} & 0.1 \\ +$[$real-sv-1k$]$ \% false negative rate & {\bf 7.3} & 20.0 & 13.0 & N/A \\ +$[$real-sv-1k$]$ \% false discovery rate & 2.7 & {\bf 2.4} & 2.7 & N/A \\ +\botrule +\end{tabular}} +{In $[$sim-map$]$, 152,713 reads were simulated from the CHM13 telomere-to-telomere assembly v1.1 +(AC: GCA\_009914755.3) with pbsim2~\citep{Ono:2021aa}: ``pbsim2 -{}-hmm\_model R94.model -{}-length-min +5000 -{}-length-mean 20000 -{}-accuracy-mean 0.95''. Alignments of mapping quality +10 or higher were evaluated by ``paftools.js mapeval''. The mapping error rate +is measured in the phred scale: if the error rate is $e$, $-10\log_{10}e$ is +reported in the table. In $[$winno-cmp$]$, 1.39 million CHM13 HiFi reads from +SRR11292121 were mapped against the same CHM13 assembly. 99.3\% of them were mapped by Winnowmap2 +at mapping quality 10 or higher and were taken as ground truth to evaluate +minimap2 and lra with ``paftools.js pafcmp''. $[$sim-sv$]$ simulated 1,000 +50bp to 1000bp INDELs from chr8 in CHM13 using SURVIVOR~\citep{Jeffares:2017aa} and simulated Nanopore +reads at 30-fold coverage with the same pbsim2 command line. 
SVs were called with +``sniffles -q 10''~\citep{Sedlazeck:2018ab} and compared to the simulated truth with ``SURVIVOR eval +call.vcf truth.bed 50''. In $[$real-sv-1k$]$, small and long variants were +called by dipcall-0.3~\citep{Li:2018aa} for HG002 assemblies (AC: GCA\_018852605.1 and +GCA\_018852615.1) and compared to the GIAB truth~\citep{Zook:2020aa} using ``truvari -r 2000 -s +1000 -S 400 -{}-multimatch -{}-passonly'' which sets the minimum INDEL size to 1kb in evaluation. } +\end{table} + +We evaluated minimap2 v2.22 along with v2.18, Winnowmap2 v2.03 and lra v1.3.2 +(Table~\ref{tab:1}), using the default setting of each mapper according to the input data types. +Both versions of minimap2 achieved high mapping accuracy on +simulated Nanopore reads (sim-map). Winnowmap2 aligned more reads at mapping +quality 10 or higher (mapQ10). However, it may occasionally assign a high mapping +quality to a read with multiple identical best alignments. This reduced its +mapping accuracy. + +In the absence of ground truth for real data, we took Winnowmap2 mapping as ground +truth to evaluate other mappers (winno-cmp in Table~\ref{tab:1}). Out of 1,378,092 reads with mapQ10 +alignments by Winnowmap2, minimap2 v2.22 could map all of them. 118 reads, less +than 0.01\% of all reads, were mapped differently by v2.22. 51 of them have +multiple identical best alignments. We believe these are more likely to be +Winnowmap2 errors. Most of the remaining 67 (=118-51) reads have multiple +highly similar but not identical alignments. +Minimap2 v2.18 is less consistent with 275 differences including 30 unmapped +reads mappable by both Winnowmap2 and v2.22. + +For the minimizer rescuing parameter $L$ in Section~\ref{sec:high-occ}, +we set its default to 500 such that v2.22 has comparable performance to v2.18 given simulated PacBio and Nanopore human reads. +To see the effect of this parameter on real data, we tried several different $L$ values.
+v2.22 gave 99 mapping differences at $L=200$, +118 at $L=500$ (default), 167 at $L=750$ and 224 differences at $L=1000$ in comparison to Winnowmap2. +$L=200$ is 28\% slower than the default while $L=1000$ is 9\% faster. +Changing the default minimizer window size (option ``-w'') +and the initial minimizer occurrence cutoff (option ``-f'') +also affects performance and accuracy to a similar magnitude. + +The two benchmarks above only evaluate read mappings when there are no variations between the reads and the reference. +To measure the mapping accuracy in the presence of SVs (sim-sv), we reproduced +the results by~\citep{Jain2020.11.01.363887}. Minimap2 v2.22 is as good as +Winnowmap2 now. Note that we set the Sniffles mapping quality +threshold to 10 to be consistent with the benchmarks above. If we used the +default threshold 20, v2.22 would miss five additional SVs (accounting for +0.5\% of simulated SVs). For four out of these five missing SVs, minimap2 v2.22 +mapped more variant reads than Winnowmap2. Sniffles did not call these SVs +because minimap2 tended to give them conservative mapping quality. It is worth +noting that the simulation here only considers a simple scenario in evolution. +Non-allelic gene conversions, which happen often in segmental +duplications~\citep{Harpak:2017aa}, would obscure the optimal mapping +strategies. How much such simple SV simulation informs real-world SV calling +remains a question. + +To see if minimap2 v2.22 could improve long INDEL alignment, we ran dipcall on +contig-to-reference alignments and focused on INDELs longer than 1kb +(real-sv-1k). v2.22 is more sensitive at comparable specificity, confirming its +advantage in more contiguous alignment. We could not get dipcall to work well with lra, +so did not report the numbers. + +Minimap2 spends most computing time on base alignment.
As recent improvements +in v2.22 incur little additional computing and do not change the base alignment +algorithm, the new version has similar performance to older versions. It is +consistently faster than Winnowmap2 by several times. Sometimes simple +heuristics can be as effective as more sophisticated yet slower solutions. + +\section*{Acknowledgements} +We thank Arang Rhie and Chirag Jain for providing motivating examples for which +older minimap2 underperforms. + +\paragraph{Funding\textcolon} This work is funded by NHGRI grant R01HG010040. + +\bibliography{minimap2} + +\end{document} diff --git a/lib/minimap2/tex/mm2.approx.eval b/lib/minimap2/tex/mm2.approx.eval new file mode 100644 index 000000000..801be8b1c --- /dev/null +++ b/lib/minimap2/tex/mm2.approx.eval @@ -0,0 +1,12 @@ +Q 60 32084 0 0.000000000 32084 +Q 24 318 2 0.000061725 32402 +Q 11 98 2 0.000123077 32500 +Q 8 37 2 0.000184405 32537 +Q 7 37 3 0.000276294 32574 +Q 6 40 3 0.000367940 32614 +Q 5 34 2 0.000428816 32648 +Q 4 37 5 0.000581306 32685 +Q 3 28 6 0.000764222 32713 +Q 2 38 6 0.000946536 32751 +Q 1 50 21 0.001585318 32801 +Q 0 286 150 0.006105117 33087 diff --git a/lib/minimap2/tex/mm2.eval b/lib/minimap2/tex/mm2.eval new file mode 100644 index 000000000..38736f229 --- /dev/null +++ b/lib/minimap2/tex/mm2.eval @@ -0,0 +1,13 @@ +Q 60 32477 0 0.000000000 32477 +Q 22 16 1 0.000030776 32493 +Q 21 44 1 0.000061468 32537 +Q 19 73 1 0.000091996 32610 +Q 14 66 1 0.000122414 32676 +Q 10 26 3 0.000214054 32702 +Q 8 14 1 0.000244529 32716 +Q 7 13 2 0.000305539 32729 +Q 6 47 1 0.000335611 32776 +Q 3 10 1 0.000366010 32786 +Q 2 20 2 0.000426751 32806 +Q 1 248 94 0.003267381 33054 +Q 0 31 17 0.003778147 33085 diff --git a/lib/minimap2/tex/natbib.bst b/lib/minimap2/tex/natbib.bst new file mode 100644 index 000000000..a679e1d65 --- /dev/null +++ b/lib/minimap2/tex/natbib.bst @@ -0,0 +1,1288 @@ +%% +%% This is file `natbib.bst', generated +%% on <1994/9/16> with the docstrip utility (2.2h). 
+%% +%% The original source files were: +%% +%% genbst.mbs (with options: `ay,nat,seq-lab,nm-rev,dt-beg,yr-par,vol-bf, +%% volp-com,etal-it') +%% ---------------------------------------- +%% *** Personal bib style, PWD *** +%% +%% (Here are the specifications of the source file) +%% \ProvidesFile{genbst.mbs}[1994/09/16 1.5 (PWD)] +%% For use with BibTeX version 0.99a or later +%% and with LaTeX 2.09 or 2e +%%------------------------------------------------------------------- +%% NOTICE: +%% This file may be used for non-profit purposes. +%% It may not be distributed in exchange for money, +%% other than distribution costs. +%% +%% The author provides it `as is' and does not guarantee it in any way. +%% +%% Copyright (C) 1994 Patrick W. Daly +%% Max-Planck-Institut f\"ur Aeronomie +%% Postfach 20 +%% D-37189 Katlenburg-Lindau +%% Germany +%% +%% E-mail: +%% SPAN-- nsp::linmpi::daly (note nsp also known as ecd1) +%% Internet-- daly@linmpi.dnet.gwdg.de +%%----------------------------------------------------------- +%% \CharacterTable +%% {Upper-case \A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z +%% Lower-case \a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z +%% Digits \0\1\2\3\4\5\6\7\8\9 +%% Exclamation \! Double quote \" Hash (number) \# +%% Dollar \$ Percent \% Ampersand \& +%% Acute accent \' Left paren \( Right paren \) +%% Asterisk \* Plus \+ Comma \, +%% Minus \- Point \. Solidus \/ +%% Colon \: Semicolon \; Less than \< +%% Equals \= Greater than \> Question mark \? +%% Commercial at \@ Left bracket \[ Backslash \\ +%% Right bracket \] Circumflex \^ Underscore \_ +%% Grave accent \` Left brace \{ Vertical bar \| +%% Right brace \} Tilde \~} +%%--------------------------------------------------------------------- + % This is an author-year citation style bibliography. As such, it is + % non-standard LaTeX, and requires a special package file to function properly. + % Such a package is natbib.sty by Patrick W. 
Daly + % The form of the \bibitem entries is + % \bibitem[Jones et al.(1990)]{key}... + % \bibitem[Jones et al.(1990)Jones, Baker, and Smith]{key}... + % The essential feature is that the label (the part in brackets) consists + % of the author names, as they should appear in the citation, with the year + % in parentheses following. There must be no space before the opening + % parenthesis! + % With natbib v5.3, a full list of authors may also follow the year. + % In natbib.sty, it is possible to define the type of enclosures that is + % really wanted (brackets or parentheses), but in either case, there must + % be parentheses in the label. + % The \cite command functions as follows: + % \cite{key} ==>> Jones et al. (1990) + % \cite[]{key} ==>> (Jones et al., 1990) + % \cite[chap. 2]{key} ==>> (Jones et al., 1990, chap. 2) + % \cite[e.g.][]{key} ==>> (e.g. Jones et al., 1990) + % \cite[e.g.][p. 32]{key} ==>> (e.g. Jones et al., p. 32) + % \citeauthor{key} Jones et al. + % \citefullauthor{key} Jones, Baker, and Smith + % \citeyear{key} 1990 +%%--------------------------------------------------------------------- + +ENTRY + { address + author + booktitle + chapter + edition + editor + howpublished + institution + journal + key + month + note + number + organization + pages + publisher + school + series + title + type + volume + year + } + {} + { label extra.label sort.label } + +INTEGERS { output.state before.all mid.sentence after.sentence after.block } + +FUNCTION {init.state.consts} +{ #0 'before.all := + #1 'mid.sentence := + #2 'after.sentence := + #3 'after.block := +} + +STRINGS { s t } + +FUNCTION {output.nonnull} +{ 's := + output.state mid.sentence = + { ", " * write$ } + { output.state after.block = + { add.period$ write$ + newline$ + "\newblock " write$ + } + { output.state before.all = + 'write$ + { add.period$ " " * write$ } + if$ + } + if$ + mid.sentence 'output.state := + } + if$ + s +} + +FUNCTION {output} +{ duplicate$ empty$ + 'pop$ + 
'output.nonnull + if$ +} + +FUNCTION {output.check} +{ 't := + duplicate$ empty$ + { pop$ "empty " t * " in " * cite$ * warning$ } + 'output.nonnull + if$ +} + +FUNCTION {fin.entry} +{ add.period$ + write$ + newline$ +} + +FUNCTION {new.block} +{ output.state before.all = + 'skip$ + { after.block 'output.state := } + if$ +} + +FUNCTION {new.sentence} +{ output.state after.block = + 'skip$ + { output.state before.all = + 'skip$ + { after.sentence 'output.state := } + if$ + } + if$ +} + +FUNCTION {not} +{ { #0 } + { #1 } + if$ +} + +FUNCTION {and} +{ 'skip$ + { pop$ #0 } + if$ +} + +FUNCTION {or} +{ { pop$ #1 } + 'skip$ + if$ +} + +FUNCTION {non.stop} +{ duplicate$ + "}" * add.period$ + #-1 #1 substring$ "." = +} + +FUNCTION {new.block.checkb} +{ empty$ + swap$ empty$ + and + 'skip$ + 'new.block + if$ +} + +FUNCTION {field.or.null} +{ duplicate$ empty$ + { pop$ "" } + 'skip$ + if$ +} + +FUNCTION {emphasize} +{ duplicate$ empty$ + { pop$ "" } + { "{\em " swap$ * non.stop + { "\/}" * } + { "}" * } + if$ + } + if$ +} + +FUNCTION {bolden} +{ duplicate$ empty$ + { pop$ "" } + { "{\bf " swap$ * "}" * } + if$ +} + +INTEGERS { nameptr namesleft numnames } + +FUNCTION {format.names} +{ 's := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr + "{vv~}{ll}{, jj}{, f.}" format.name$ 't := + nameptr #1 > + { + namesleft #1 > + { ", " * t * } + { + numnames #2 > + { "," * } + 'skip$ + if$ + t "others" = + { " " * "et~al." emphasize * } + { " and " * t * } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {format.names.ed} +{ 's := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr + "{f.~}{vv~}{ll}{, jj}" + format.name$ 't := + nameptr #1 > + { + namesleft #1 > + { ", " * t * } + { + numnames #2 > + { "," * } + 'skip$ + if$ + t "others" = + { " " * "et~al." 
emphasize * } + { " and " * t * } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {format.key} +{ empty$ + { key field.or.null } + { "" } + if$ +} + +FUNCTION {format.authors} +{ author empty$ + { "" } + { author format.names } + if$ +} + +FUNCTION {format.editors} +{ editor empty$ + { "" } + { editor format.names + editor num.names$ #1 > + { ", editors" * } + { ", editor" * } + if$ + } + if$ +} + +FUNCTION {format.in.editors} +{ editor empty$ + { "" } + { editor format.names.ed + editor num.names$ #1 > + { ", editors" * } + { ", editor" * } + if$ + } + if$ +} + +FUNCTION {format.title} +{ title empty$ + { "" } + { title "t" change.case$ + } + if$ +} + +FUNCTION {format.full.names} +{'s := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr + "{vv~}{ll}" format.name$ 't := + nameptr #1 > + { + namesleft #1 > + { ", " * t * } + { + numnames #2 > + { "," * } + 'skip$ + if$ + t "others" = + { " " * "et~al." 
emphasize * } + { " and " * t * } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {author.editor.key.full} +{ author empty$ + { editor empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { editor format.full.names } + if$ + } + { author format.full.names } + if$ +} + +FUNCTION {author.key.full} +{ author empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { author format.full.names } + if$ +} + +FUNCTION {editor.key.full} +{ editor empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { editor format.full.names } + if$ +} + +FUNCTION {make.full.names} +{ type$ "book" = + type$ "inbook" = + or + 'author.editor.key.full + { type$ "proceedings" = + 'editor.key.full + 'author.key.full + if$ + } + if$ +} + +FUNCTION {output.bibitem} +{ newline$ + "\bibitem[" write$ + label write$ + ")" make.full.names * "]{" * write$ + cite$ write$ + "}" write$ + newline$ + "" + before.all 'output.state := +} + +FUNCTION {n.dashify} +{ 't := + "" + { t empty$ not } + { t #1 #1 substring$ "-" = + { t #1 #2 substring$ "--" = not + { "--" * + t #2 global.max$ substring$ 't := + } + { { t #1 #1 substring$ "-" = } + { "-" * + t #2 global.max$ substring$ 't := + } + while$ + } + if$ + } + { t #1 #1 substring$ * + t #2 global.max$ substring$ 't := + } + if$ + } + while$ +} + +FUNCTION {word.in} +{ "In " } + +FUNCTION {format.date} +{ year duplicate$ empty$ + { "empty year in " cite$ * "; set to ????" * warning$ + pop$ "????" 
} + 'skip$ + if$ + before.all 'output.state := + " (" swap$ * extra.label * ")" * +} + +FUNCTION {format.btitle} +{ title emphasize +} + +FUNCTION {tie.or.space.connect} +{ duplicate$ text.length$ #3 < + { "~" } + { " " } + if$ + swap$ * * +} + +FUNCTION {either.or.check} +{ empty$ + 'pop$ + { "can't use both " swap$ * " fields in " * cite$ * warning$ } + if$ +} + +FUNCTION {format.bvolume} +{ volume empty$ + { "" } + { "volume" volume tie.or.space.connect + series empty$ + 'skip$ + { " of " * series emphasize * } + if$ + "volume and number" number either.or.check + } + if$ +} + +FUNCTION {format.number.series} +{ volume empty$ + { number empty$ + { series field.or.null } + { output.state mid.sentence = + { "number" } + { "Number" } + if$ + number tie.or.space.connect + series empty$ + { "there's a number but no series in " cite$ * warning$ } + { " in " * series * } + if$ + } + if$ + } + { "" } + if$ +} + +FUNCTION {format.edition} +{ edition empty$ + { "" } + { output.state mid.sentence = + { edition "l" change.case$ " edition" * } + { edition "t" change.case$ " edition" * } + if$ + } + if$ +} + +INTEGERS { multiresult } + +FUNCTION {multi.page.check} +{ 't := + #0 'multiresult := + { multiresult not + t empty$ not + and + } + { t #1 #1 substring$ + duplicate$ "-" = + swap$ duplicate$ "," = + swap$ "+" = + or or + { #1 'multiresult := } + { t #2 global.max$ substring$ 't := } + if$ + } + while$ + multiresult +} + +FUNCTION {format.pages} +{ pages empty$ + { "" } + { pages multi.page.check + { "pages" pages n.dashify tie.or.space.connect } + { "page" pages tie.or.space.connect } + if$ + } + if$ +} + +FUNCTION {format.vol.num.pages} +{ volume field.or.null + bolden + number empty$ + 'skip$ + { "(" number * ")" * * + volume empty$ + { "there's a number but no volume in " cite$ * warning$ } + 'skip$ + if$ + } + if$ + pages empty$ + 'skip$ + { duplicate$ empty$ + { pop$ format.pages } + { ", " * pages n.dashify * } + if$ + } + if$ +} + +FUNCTION {format.chapter.pages} 
+{ chapter empty$ + 'format.pages + { type empty$ + { "chapter" } + { type "l" change.case$ } + if$ + chapter tie.or.space.connect + pages empty$ + 'skip$ + { ", " * format.pages * } + if$ + } + if$ +} + +FUNCTION {format.in.ed.booktitle} +{ booktitle empty$ + { "" } + { editor empty$ + { word.in booktitle emphasize * } + { word.in format.in.editors * ", " * booktitle emphasize * } + if$ + } + if$ +} + +FUNCTION {format.thesis.type} +{ type empty$ + 'skip$ + { pop$ + type "t" change.case$ + } + if$ +} + +FUNCTION {format.tr.number} +{ type empty$ + { "Technical Report" } + 'type + if$ + number empty$ + { "t" change.case$ } + { number tie.or.space.connect } + if$ +} + +FUNCTION {format.article.crossref} +{ + word.in + "\cite{" * crossref * "}" * +} + +FUNCTION {format.book.crossref} +{ volume empty$ + { "empty volume in " cite$ * "'s crossref of " * crossref * warning$ + word.in + } + { "Volume" volume tie.or.space.connect + " of " * + } + if$ + "\cite{" * crossref * "}" * +} + +FUNCTION {format.incoll.inproc.crossref} +{ + word.in + "\cite{" * crossref * "}" * +} + +FUNCTION {article} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + new.block + format.title "title" output.check + new.block + crossref missing$ + { journal emphasize "journal" output.check + format.vol.num.pages output + } + { format.article.crossref output.nonnull + format.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {book} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check + editor format.key output + } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + format.date "year" output.check + new.block + format.btitle "title" output.check + crossref missing$ + { format.bvolume output + new.block + format.number.series output + new.sentence + publisher "publisher" output.check + address output 
+ } + { + new.block + format.book.crossref output.nonnull + } + if$ + format.edition output + new.block + note output + fin.entry +} + +FUNCTION {booklet} +{ output.bibitem + format.authors output + author format.key output + format.date "year" output.check + new.block + format.title "title" output.check + new.block + howpublished output + address output + new.block + note output + fin.entry +} + +FUNCTION {inbook} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check + editor format.key output + } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + format.date "year" output.check + new.block + format.btitle "title" output.check + crossref missing$ + { format.bvolume output + format.chapter.pages "chapter and pages" output.check + new.block + format.number.series output + new.sentence + publisher "publisher" output.check + address output + } + { format.chapter.pages "chapter and pages" output.check + new.block + format.book.crossref output.nonnull + } + if$ + format.edition output + new.block + note output + fin.entry +} + +FUNCTION {incollection} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + new.block + format.title "title" output.check + new.block + crossref missing$ + { format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.chapter.pages output + new.sentence + publisher "publisher" output.check + address output + format.edition output + } + { format.incoll.inproc.crossref output.nonnull + format.chapter.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {inproceedings} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + new.block + format.title "title" output.check + new.block + crossref missing$ + { format.in.ed.booktitle "booktitle" 
output.check + format.bvolume output + format.number.series output + format.pages output + address output + new.sentence + organization output + publisher output + } + { format.incoll.inproc.crossref output.nonnull + format.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {conference} { inproceedings } + +FUNCTION {manual} +{ output.bibitem + format.authors output + author format.key output + format.date "year" output.check + new.block + format.btitle "title" output.check + organization address new.block.checkb + organization output + address output + format.edition output + new.block + note output + fin.entry +} + +FUNCTION {mastersthesis} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + new.block + format.btitle "title" output.check + new.block + "Master's thesis" format.thesis.type output.nonnull + school "school" output.check + address output + new.block + note output + fin.entry +} + +FUNCTION {misc} +{ output.bibitem + format.authors output + author format.key output + format.date "year" output.check + new.block + format.title output + new.block + howpublished output + new.block + note output + fin.entry +} + +FUNCTION {phdthesis} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + new.block + format.btitle "title" output.check + new.block + "Ph.D. 
thesis" format.thesis.type output.nonnull + school "school" output.check + address output + new.block + note output + fin.entry +} + +FUNCTION {proceedings} +{ output.bibitem + format.editors output + editor format.key output + format.date "year" output.check + new.block + format.btitle "title" output.check + format.bvolume output + format.number.series output + address output + new.sentence + organization output + publisher output + new.block + note output + fin.entry +} + +FUNCTION {techreport} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + new.block + format.title "title" output.check + new.block + format.tr.number output.nonnull + institution "institution" output.check + address output + new.block + note output + fin.entry +} + +FUNCTION {unpublished} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + new.block + format.title "title" output.check + new.block + note "note" output.check + fin.entry +} + +FUNCTION {default.type} { misc } + +MACRO {jan} {"January"} + +MACRO {feb} {"February"} + +MACRO {mar} {"March"} + +MACRO {apr} {"April"} + +MACRO {may} {"May"} + +MACRO {jun} {"June"} + +MACRO {jul} {"July"} + +MACRO {aug} {"August"} + +MACRO {sep} {"September"} + +MACRO {oct} {"October"} + +MACRO {nov} {"November"} + +MACRO {dec} {"December"} + +MACRO {acmcs} {"ACM Computing Surveys"} + +MACRO {acta} {"Acta Informatica"} + +MACRO {cacm} {"Communications of the ACM"} + +MACRO {ibmjrd} {"IBM Journal of Research and Development"} + +MACRO {ibmsj} {"IBM Systems Journal"} + +MACRO {ieeese} {"IEEE Transactions on Software Engineering"} + +MACRO {ieeetc} {"IEEE Transactions on Computers"} + +MACRO {ieeetcad} + {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"} + +MACRO {ipl} {"Information Processing Letters"} + +MACRO {jacm} {"Journal of the ACM"} + +MACRO {jcss} {"Journal of Computer and System Sciences"} + 
+MACRO {scp} {"Science of Computer Programming"} + +MACRO {sicomp} {"SIAM Journal on Computing"} + +MACRO {tocs} {"ACM Transactions on Computer Systems"} + +MACRO {tods} {"ACM Transactions on Database Systems"} + +MACRO {tog} {"ACM Transactions on Graphics"} + +MACRO {toms} {"ACM Transactions on Mathematical Software"} + +MACRO {toois} {"ACM Transactions on Office Information Systems"} + +MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"} + +MACRO {tcs} {"Theoretical Computer Science"} + +READ + +FUNCTION {sortify} +{ purify$ + "l" change.case$ +} + +INTEGERS { len } + +FUNCTION {chop.word} +{ 's := + 'len := + s #1 len substring$ = + { s len #1 + global.max$ substring$ } + 's + if$ +} + +FUNCTION {format.lab.names} +{ 's := + s #1 "{vv~}{ll}" format.name$ + s num.names$ duplicate$ + #2 > + { pop$ " " * "et~al." emphasize * } + { #2 < + 'skip$ + { s #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" = + { " " * "et~al." emphasize * } + { " and " * s #2 "{vv~}{ll}" format.name$ * } + if$ + } + if$ + } + if$ +} + +FUNCTION {author.key.label} +{ author empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { author format.lab.names } + if$ +} + +FUNCTION {author.editor.key.label} +{ author empty$ + { editor empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { editor format.lab.names } + if$ + } + { author format.lab.names } + if$ +} + +FUNCTION {editor.key.label} +{ editor empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { editor format.lab.names } + if$ +} + +FUNCTION {calc.label} +{ type$ "book" = + type$ "inbook" = + or + 'author.editor.key.label + { type$ "proceedings" = + 'editor.key.label + 'author.key.label + if$ + } + if$ + "(" + * + year duplicate$ empty$ + { pop$ "????" 
} + { purify$ #-1 #4 substring$ } + if$ + * + 'label := +} + +FUNCTION {sort.format.names} +{ 's := + #1 'nameptr := + "" + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { nameptr #1 > + { " " * } + 'skip$ + if$ + s nameptr + "{vv{ } }{ll{ }}{ f{ }}{ jj{ }}" + format.name$ 't := + nameptr numnames = t "others" = and + { "et al" * } + { numnames #2 > nameptr #2 = and + { "zzzzzz" * #1 'namesleft := } + { t sortify * } + if$ + } + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {sort.format.title} +{ 't := + "A " #2 + "An " #3 + "The " #4 t chop.word + chop.word + chop.word + sortify + #1 global.max$ substring$ +} + +FUNCTION {author.sort} +{ author empty$ + { key empty$ + { "to sort, need author or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { author sort.format.names } + if$ +} + +FUNCTION {author.editor.sort} +{ author empty$ + { editor empty$ + { key empty$ + { "to sort, need author, editor, or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { editor sort.format.names } + if$ + } + { author sort.format.names } + if$ +} + +FUNCTION {editor.sort} +{ editor empty$ + { key empty$ + { "to sort, need editor or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { editor sort.format.names } + if$ +} + +FUNCTION {presort} +{ calc.label + label sortify + " " + * + type$ "book" = + type$ "inbook" = + or + 'author.editor.sort + { type$ "proceedings" = + 'editor.sort + 'author.sort + if$ + } + if$ + #1 entry.max$ substring$ + 'sort.label := + sort.label + * + " " + * + title field.or.null + sort.format.title + * + #1 entry.max$ substring$ + 'sort.key$ := +} + +ITERATE {presort} + +SORT + +STRINGS { last.label next.extra } + +INTEGERS { last.extra.num } + +FUNCTION {initialize.extra.label.stuff} +{ #0 int.to.chr$ 'last.label := + "" 'next.extra := + #0 'last.extra.num := +} + +FUNCTION {forward.pass} +{ last.label label = + { last.extra.num #1 + 
'last.extra.num := + last.extra.num int.to.chr$ 'extra.label := + } + { "a" chr.to.int$ 'last.extra.num := + "" 'extra.label := + label 'last.label := + } + if$ +} + +FUNCTION {reverse.pass} +{ next.extra "b" = + { "a" 'extra.label := } + 'skip$ + if$ + extra.label 'next.extra := + label extra.label * 'label := +} + +EXECUTE {initialize.extra.label.stuff} + +ITERATE {forward.pass} + +REVERSE {reverse.pass} + +FUNCTION {bib.sort.order} +{ sort.label + " " + * + year field.or.null sortify + * + " " + * + title field.or.null + sort.format.title + * + #1 entry.max$ substring$ + 'sort.key$ := +} + +ITERATE {bib.sort.order} + +SORT + +FUNCTION {begin.bib} +{ preamble$ empty$ + 'skip$ + { preamble$ write$ newline$ } + if$ + "\begin{thebibliography}{}" write$ newline$ +} + +EXECUTE {begin.bib} + +EXECUTE {init.state.consts} + +ITERATE {call.type$} + +FUNCTION {end.bib} +{ newline$ + "\end{thebibliography}" write$ newline$ +} + +EXECUTE {end.bib} +%% End of customized bst file + diff --git a/lib/minimap2/tex/natbib.sty b/lib/minimap2/tex/natbib.sty new file mode 100644 index 000000000..4c8c94843 --- /dev/null +++ b/lib/minimap2/tex/natbib.sty @@ -0,0 +1,803 @@ +%% +%% This is file `natbib.sty', +%% generated with the docstrip utility. +%% +%% The original source files were: +%% +%% natbib.dtx (with options: `package,all') +%% ============================================= +%% IMPORTANT NOTICE: +%% +%% This program can be redistributed and/or modified under the terms +%% of the LaTeX Project Public License Distributed from CTAN +%% archives in directory macros/latex/base/lppl.txt; either +%% version 1 of the License, or any later version. +%% +%% This is a generated file. +%% It may not be distributed without the original source file natbib.dtx. +%% +%% Full documentation can be obtained by LaTeXing that original file. +%% Only a few abbreviated comments remain here to describe the usage. 
+%% ============================================= +%% Copyright 1993-2000 Patrick W Daly +%% Max-Planck-Institut f\"ur Aeronomie +%% Max-Planck-Str. 2 +%% D-37191 Katlenburg-Lindau +%% Germany +%% E-mail: daly@linmpi.mpg.de +\NeedsTeXFormat{LaTeX2e}[1995/06/01] +\ProvidesPackage{natbib} + [2000/07/24 7.0a (PWD)] + % This package reimplements the LaTeX \cite command to be used for various + % citation styles, both author-year and numerical. It accepts BibTeX + % output intended for many other packages, and therefore acts as a + % general, all-purpose citation-style interface. + % + % With standard numerical .bst files, only numerical citations are + % possible. With an author-year .bst file, both numerical and + % author-year citations are possible. + % + % If author-year citations are selected, \bibitem must have one of the + % following forms: + % \bibitem[Jones et al.(1990)]{key}... + % \bibitem[Jones et al.(1990)Jones, Baker, and Williams]{key}... + % \bibitem[Jones et al., 1990]{key}... + % \bibitem[\protect\citeauthoryear{Jones, Baker, and Williams}{Jones + % et al.}{1990}]{key}... + % \bibitem[\protect\citeauthoryear{Jones et al.}{1990}]{key}... + % \bibitem[\protect\astroncite{Jones et al.}{1990}]{key}... + % \bibitem[\protect\citename{Jones et al., }1990]{key}... + % \harvarditem[Jones et al.]{Jones, Baker, and Williams}{1990}{key}... + % + % This is either to be made up manually, or to be generated by an + % appropriate .bst file with BibTeX. + % Author-year mode || Numerical mode + % Then, \citet{key} ==>> Jones et al. (1990) || Jones et al. 
[21] + % \citep{key} ==>> (Jones et al., 1990) || [21] + % Multiple citations as normal: + % \citep{key1,key2} ==>> (Jones et al., 1990; Smith, 1989) || [21,24] + % or (Jones et al., 1990, 1991) || [21,24] + % or (Jones et al., 1990a,b) || [21,24] + % \cite{key} is the equivalent of \citet{key} in author-year mode + % and of \citep{key} in numerical mode + % Full author lists may be forced with \citet* or \citep*, e.g. + % \citep*{key} ==>> (Jones, Baker, and Williams, 1990) + % Optional notes as: + % \citep[chap. 2]{key} ==>> (Jones et al., 1990, chap. 2) + % \citep[e.g.,][]{key} ==>> (e.g., Jones et al., 1990) + % \citep[see][pg. 34]{key}==>> (see Jones et al., 1990, pg. 34) + % (Note: in standard LaTeX, only one note is allowed, after the ref. + % Here, one note is like the standard, two make pre- and post-notes.) + % \citealt{key} ==>> Jones et al. 1990 + % \citealt*{key} ==>> Jones, Baker, and Williams 1990 + % \citealp{key} ==>> Jones et al., 1990 + % \citealp*{key} ==>> Jones, Baker, and Williams, 1990 + % Additional citation possibilities (both author-year and numerical modes) + % \citeauthor{key} ==>> Jones et al. + % \citeauthor*{key} ==>> Jones, Baker, and Williams + % \citeyear{key} ==>> 1990 + % \citeyearpar{key} ==>> (1990) + % \citetext{priv. comm.} ==>> (priv. comm.) + % Note: full author lists depend on whether the bib style supports them; + % if not, the abbreviated list is printed even when the full list is requested. + % + % For names like della Robbia at the start of a sentence, use + % \Citet{dRob98} ==>> Della Robbia (1998) + % \Citep{dRob98} ==>> (Della Robbia, 1998) + % \Citeauthor{dRob98} ==>> Della Robbia + % + % + % Citation aliasing is achieved with + % \defcitealias{key}{text} + % \citetalias{key} ==>> text + % \citepalias{key} ==>> (text) + % + % Defining the citation style of a given bib style: + % Use \bibpunct (in the preamble only) with 6 mandatory arguments: + % 1. opening bracket for citation + % 2. closing bracket + % 3. 
citation separator (for multiple citations in one \cite) + % 4. the letter n for numerical styles, s for superscripts + % else anything for author-year + % 5. punctuation between authors and date + % 6. punctuation between years (or numbers) when common authors missing + % One optional argument is the character coming before post-notes. It + % appears in square braces before all other arguments. May be left off. + % Example (and default) \bibpunct[, ]{(}{)}{;}{a}{,}{,} + % + % To make this automatic for a given bib style, named newbib, say, make + % a local configuration file, natbib.cfg, with the definition + % \newcommand{\bibstyle@newbib}{\bibpunct...} + % Then the \bibliographystyle{newbib} will cause \bibstyle@newbib to + % be called on THE NEXT LATEX RUN (via the aux file). + % + % Such preprogrammed definitions may be invoked in the text (preamble only) + % by calling \citestyle{newbib}. This is only useful if the style specified + % differs from that in \bibliographystyle. + % + % With \citeindextrue and \citeindexfalse, one can control whether the + % \cite commands make an automatic entry of the citation in the .idx + % indexing file. For this, \makeindex must also be given in the preamble. + % + % LaTeX2e Options: (for selecting punctuation) + % round - round parentheses are used (default) + % square - square brackets are used [option] + % curly - curly braces are used {option} + % angle - angle brackets are used