|
| 1 | + |
| 2 | +parse_stockholm_msa_chunk <- function(x, pos, acc) { |
| 3 | + |
| 4 | + gc <- stringr::str_match(x, "#=GC +([^ ]+) +(.+)")[,2:3] |
| 5 | + gc <- gc[stats::complete.cases(gc), , drop = FALSE] |
| 6 | + for (i in seq_len(nrow(gc))) { |
| 7 | + attr(acc, gc[i,1]) <- paste0(attr(acc, gc[i,1]), gc[i,2]) |
| 8 | + } |
| 9 | + |
| 10 | + x <- stringr::str_match(x, "^(\\d+\\|)?([^#][^ ]*) +([^ ]+)$")[,3:4] |
| 11 | + x <- x[stats::complete.cases(x), , drop = FALSE] |
| 12 | + for (i in seq_len(nrow(x))) { |
| 13 | + if (x[i,1] %in% names(acc)) { |
| 14 | + acc[[x[i,1]]] <- paste0(acc[[x[i,1]]], x[i,2]) |
| 15 | + } else { |
| 16 | + acc[[x[i,1]]] <- x[i,2] |
| 17 | + } |
| 18 | + } |
| 19 | + acc |
| 20 | +} |
| 21 | + |
| 22 | +#' Parse a Multiple Alignment in Stockholm Format |
| 23 | +#' |
| 24 | +#' Parses Stockholm-format multiple alignment files, including "\code{#=GC}" |
| 25 | +#' lines. Other annotations are ignored. |
| 26 | +#' |
| 27 | +#' @param stockholm (\code{character} scalar) Path to a file to parse |
| 28 | +#' |
| 29 | +#' @return a \code{list}. The sequences themselves are element "\code{alignment}", |
| 30 | +#' and \code{#=GC} annotations are given as \code{character} strings; e.g., |
| 31 | +#' the "\code{#=GC SS_cons}" annotation is stored as an element named |
| 32 | +#' "\code{SS_cons}". |
| 33 | +#' @export |
| 34 | +#' |
| 35 | +#' @examples |
| 36 | +#' msafile <- system.file(file.path("extdata", "sample.stk"), package = "inferrnal") |
| 37 | +#' msa <- read_stockholm_msa(msafile) |
| 38 | +#' msa$alignment |
| 39 | +#' # consensus secondary structure |
| 40 | +#' msa$SS_cons |
| 41 | +#' # reference sequence |
| 42 | +#' msa$RF |
| 43 | +read_stockholm_msa <- function(stockholm) { |
| 44 | + assertthat::assert_that((assertthat::is.string(stockholm) && |
| 45 | + file.exists(stockholm)) || |
| 46 | + methods::is(stockholm, "connection")) |
| 47 | + |
| 48 | + seqs <- |
| 49 | + readr::read_lines_chunked(stockholm, |
| 50 | + readr::AccumulateCallback$new(parse_stockholm_msa_chunk, acc = list())) |
| 51 | + out <- attributes(seqs) |
| 52 | + out[["alignment"]] <- Biostrings::RNAMultipleAlignment(unlist(seqs)) |
| 53 | + out |
| 54 | +} |
| 55 | + |
| 56 | +#' Search for Covariance Models (CM) in a set of sequences. |
| 57 | +#' |
| 58 | +#' This function calls "\code{cmsearch}" from |
| 59 | +#' \href{http://eddylab.org/infernal/}{Infernal}. Infernal must be installed. |
| 60 | +#' Many parameters are not included (yet!), and the function is focused on |
| 61 | +#' retrieving the hits table and, optionally, producing an alignment. |
| 62 | +#' |
| 63 | +#' @param cm (character of length 1) Path to the covariance model (.cm) file. |
| 64 | +#' The covariance model must include calibration data from running |
| 65 | +#' "\code{cmcalibrate}". |
| 66 | +#' @param seq (filename, character vector, or |
| 67 | +#' \code{\link[Biostrings]{XStringSet}}) Sequences to search with the CM. |
| 68 | +#' If a filename, the file can be of any type supported by Infernal. |
| 69 | +#' @param glocal (logical of length 1) Whether to run the search in glocal mode |
| 70 | +#' (global with respect to the model, local with respect to the sequence). |
| 71 | +#' When \code{TRUE}, the search is faster, but will fail to find matches with |
| 72 | +#' only partially homologous structure. |
| 73 | +#' @param alignment (filename) A file to save the aligned hits to. If given, |
| 74 | +#' the alignment is saved in Stockholm format with annotations for secondary |
| 75 | +#' structure, posterior probablility, etc. |
| 76 | +#' @param cpu (integer of length 1) The number of threads to use in the search. |
| 77 | +#' |
| 78 | +#' @return a \code{\link[tibble]{tibble}} with columns: |
| 79 | +#' \itemize{ |
| 80 | +#' \item{target_name}{ (character) the name of the target sequence} |
| 81 | +#' \item{taget_accession}{(character) the target's accession number} |
| 82 | +#' \item{query_name}{(character) the name of the query CM} |
| 83 | +#' \item{query_accession}{(character) the query CM's accession number} |
| 84 | +#' \item{mdl}{(character) the model type ("cm" or "hmm")} |
| 85 | +#' \item{mdl_from}{(integer) the start location of the hit in the model} |
| 86 | +#' \item{mdl_to}{(integer) the end location of the hit in the model} |
| 87 | +#' \item{seq_from}{(integer) the start location of the hit in the sequence} |
| 88 | +#' \item{seq_to}{(integer) the end location of the hit in the sequence} |
| 89 | +#' \item{strand}{(character) the strand the hit was found on ("+" or "-")} |
| 90 | +#' \item{trunc)}{(character) whether the hit is truncated, and where ("no", "5'", "3'", "5'&3'", or "-" for hmm hits).} |
| 91 | +#' \item{pass}{(integer) which algorithm pass the hit was found on.} |
| 92 | +#' \item{gc}{(numeric) GC content of the hit} |
| 93 | +#' \item{bias}{(numeric) biased composition correction. See the Infernal documentation.} |
| 94 | +#' \item{score}{(numeric) bit-score of the hit, including the biased |
| 95 | +#' composition correction.} |
| 96 | +#' \item{E_value}{(numeric) Expectation value for the hit.}} |
| 97 | +#' \item{inc}{(character) "!" if the sequence meets the inclusion threshold, |
| 98 | +#' "?" if it only meets the reporting threshold.} |
| 99 | +#' @export |
| 100 | +#' |
| 101 | +#' @examples |
| 102 | +#' # search sequences from a fasta file for Rfam RF00002 (5.8S rRNA) |
| 103 | +#' cm5_8S <- system.file(file.path("extdata", "RF00002.cm"), package = "inferrnal") |
| 104 | +#' sampfasta <- system.file(file.path("extdata", "sample.fasta"), package = "inferrnal") |
| 105 | +#' cmsearch(cm = cm5_8S, seq = sampfasta, cpu = 1) |
| 106 | +#' # also works if the fasta file has already been loaded |
| 107 | +#' samp <- Biostrings::readDNAStringSet(sampfasta) |
| 108 | +#' cmsearch(cm = cm5_8S, seq = samp, cpu = 1) |
| 109 | +cmsearch <- function(cm, |
| 110 | + seq, |
| 111 | + glocal = TRUE, |
| 112 | + alignment, |
| 113 | + cpu) { |
| 114 | + assertthat::assert_that(assertthat::is.string(cm), |
| 115 | + file.exists(cm), |
| 116 | + assertthat::is.flag(glocal)) |
| 117 | + tablefile <- tempfile("cmsearch", fileext = ".dat") |
| 118 | + on.exit(unlink(tablefile)) |
| 119 | + args <- c("--tblout", tablefile, "--toponly", "--noali") |
| 120 | + if (isTRUE(glocal)) args <- c(args, "-g") |
| 121 | + |
| 122 | + if (!missing(cpu)) { |
| 123 | + assertthat::assert_that(assertthat::is.count(cpu)) |
| 124 | + args <- c(args, "--cpu", cpu) |
| 125 | + } |
| 126 | + |
| 127 | + if (!missing(alignment)) { |
| 128 | + assertthat::assert_that(assertthat::is.string(alignment)) |
| 129 | + d <- dirname(alignment) |
| 130 | + if (nchar(d) > 0 && !dir.exists(d)) dir.create(d, recursive = TRUE) |
| 131 | + args <- c(args, "-A", alignment) |
| 132 | + } |
| 133 | + |
| 134 | + args <- c(args, cm) |
| 135 | + |
| 136 | + seqfile <- NULL |
| 137 | + if (assertthat::is.string(seq) && file.exists(seq)) { |
| 138 | + seqfile <- seq |
| 139 | + } else { |
| 140 | + seqfile <- tempfile("seq", fileext = ".fasta") |
| 141 | + if (is.character(seq)) { |
| 142 | + seq <- Biostrings::BStringSet(seq) |
| 143 | + abc <- Biostrings::uniqueLetters(seq) |
| 144 | + if (all(abc %in% Biostrings::DNA_ALPHABET)) { |
| 145 | + seq <- Biostrings::DNAStringSet(seq) |
| 146 | + seq <- Biostrings::RNAStringSet(seq) |
| 147 | + } else if (all(abc %in% Biostrings::RNA_ALPHABET)) { |
| 148 | + seq <- Biostrings::RNAStringSet(seq) |
| 149 | + } else stop("Sequence alphabet should be DNA or RNA for CMalign.") |
| 150 | + } |
| 151 | + if (methods::is(seq, "XStringSet")) { |
| 152 | + Biostrings::writeXStringSet(seq, seqfile) |
| 153 | + on.exit(unlink(seqfile)) |
| 154 | + } else { |
| 155 | + stop("'seq' should be a filename, XStringSet, or character vector.") |
| 156 | + } |
| 157 | + } |
| 158 | + args <- c(args, seqfile) |
| 159 | + |
| 160 | + system2("cmsearch", args) |
| 161 | + |
| 162 | + readr::read_table2(tablefile, |
| 163 | + col_names = c("target_name", "target_accession", |
| 164 | + "query_name", "query_accession", |
| 165 | + "mdl", "mdl_from", "mdl_to", |
| 166 | + "seq_from", "seq_to", "strand", |
| 167 | + "trunc", "pass", "gc", "bias", |
| 168 | + "score", "E_value", "inc", |
| 169 | + "description"), |
| 170 | + col_types = "ccccciiiicciddddcc", |
| 171 | + comment = "#") |
| 172 | +} |
| 173 | + |
| 174 | +#' Align sequences to a covariance model |
| 175 | +#' |
| 176 | +#' This function calls \code{cmalign} from |
| 177 | +#' \href{http://eddylab.org/infernal/}{Infernal}. Infernal must be installed |
| 178 | +#' and on the path. Not all options are included. |
| 179 | +#' |
| 180 | +#' One of the easiest places to obtain CMs is \href{https://rfam.xfam.org/}{Rfam}. |
| 181 | +#' |
| 182 | +#' @param cmfile (\code{character} scalar) path to a covariance model file |
| 183 | +#' @param seq (\code{character} scalar, names \code{character} vector, or |
| 184 | +#' \code{\link[Biostrings]{XStringSet}}) sequences to align to the |
| 185 | +#' covariance model. This may be given as a character path to a fasta |
| 186 | +#' file, or the sequences as a character vector object of class |
| 187 | +#' \code{\link[Biostrings]{DNAStringSet}} or |
| 188 | +#' \code{\link[Biostrings]{RNAStringSet}}. For \code{cmalign}, the |
| 189 | +#' sequences should be known \emph{a priori} to represent the same region |
| 190 | +#' as the CM; to find the region in longer sequences and align it, use |
| 191 | +#' the \code{alignment} option of \code{\link{cmsearch}}. |
| 192 | +#' @param glocal (\code{logical} scalar) If \code{TRUE}, align in glocal mode. |
| 193 | +#' See \href{http://eddylab.org/infernal/}{Infernal} documentation for |
| 194 | +#' more information. |
| 195 | +#' @param cpu (\code{integer} scalar) The number of cpus to use. |
| 196 | +#' |
| 197 | +#' @return the aligned sequences, as returned by |
| 198 | +#' \code{\link{read_stockholm_msa}}. |
| 199 | +#' @export |
| 200 | +#' |
| 201 | +#' @examples |
| 202 | +#' # align a set of unaligned 5.8S rRNA sequences to the Rfam CM. |
| 203 | +#' cm5_8S <- system.file(file.path("extdata", "RF00002.cm"), package = "inferrnal") |
| 204 | +#' unaln <- system.file(file.path("extdata", "samp_5_8S.fasta"), package = "inferrnal") |
| 205 | +#' cmalign(cm5_8S, unaln, cpu = 1) |
| 206 | +#' # also works if the fasta file has already been loaded |
| 207 | +#' unaln <- Biostrings::readRNAStringSet(unaln) |
| 208 | +#' cmalign(cm5_8S, unaln, cpu = 1) |
| 209 | +cmalign <- function(cmfile, seq, glocal = TRUE, cpu) { |
| 210 | + assertthat::assert_that(assertthat::is.readable(cmfile), |
| 211 | + assertthat::is.flag(glocal)) |
| 212 | + args <- "cmalign" |
| 213 | + if (isTRUE(glocal)) args <- c(args, "-g") |
| 214 | + if (!missing(cpu)) { |
| 215 | + assertthat::assert_that(assertthat::is.count(cpu)) |
| 216 | + args <- c(args, "--cpu", cpu) |
| 217 | + } |
| 218 | + |
| 219 | + seqfile <- NULL |
| 220 | + if (assertthat::is.string(seq) && file.exists(seq)) { |
| 221 | + seqfile <- seq |
| 222 | + } else { |
| 223 | + seqfile <- tempfile("seq", fileext = ".fasta") |
| 224 | + if (is.character(seq)) { |
| 225 | + seq <- Biostrings::BStringSet(seq) |
| 226 | + abc <- Biostrings::uniqueLetters(seq) |
| 227 | + if (all(abc %in% Biostrings::DNA_ALPHABET)) { |
| 228 | + seq <- Biostrings::DNAStringSet(seq) |
| 229 | + seq <- Biostrings::RNAStringSet(seq) |
| 230 | + } else if (all(abc %in% Biostrings::RNA_ALPHABET)) { |
| 231 | + seq <- Biostrings::RNAStringSet(seq) |
| 232 | + } else stop("Sequence alphabet should be DNA or RNA for CMalign.") |
| 233 | + } |
| 234 | + if (methods::is(seq, "XStringSet")) { |
| 235 | + Biostrings::writeXStringSet(seq, seqfile) |
| 236 | + on.exit(unlink(seqfile)) |
| 237 | + } else { |
| 238 | + stop("'seq' should be a filename, XStringSet, or character vector.") |
| 239 | + } |
| 240 | + } |
| 241 | + args <- c(args, cmfile, seqfile) |
| 242 | + args <- paste(args, collapse = " ") |
| 243 | + alnpipe <- pipe(args) |
| 244 | + read_stockholm_msa(alnpipe) |
| 245 | +} |
0 commit comments