Skip to content

Commit a75eeb1

Browse files
committedJan 14, 2020
First commit after extracting code from oueme-fungi-transect. Passes R CMD CHECK
1 parent e7d0a13 commit a75eeb1

17 files changed

+14254
-21
lines changed
 

‎.Rbuildignore

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
^inferrnal\.Rproj$
2+
^\.Rproj\.user$
3+
^LICENSE\.md$
4+
^README\.Rmd$

‎.gitignore

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
.Rproj.user
2+
.Rhistory
3+
.RData
4+
.Ruserdata

‎DESCRIPTION

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
Package: inferrnal
2+
Title: Interface to Call Programs from Infernal RNA Covariance Model Package
3+
Version: 0.0.1
4+
Authors@R:
5+
person(given = "Brendan",
6+
family = "Furneaux",
7+
role = c("aut", "cre"),
8+
email = "brendan.furneaux@gmail.com",
9+
comment = c(ORCID = "0000-0003-3522-7363"))
10+
Description: A thin wrapper around the Infernal package to search for matches to
11+
a covariance model in RNA sequences, or to align RNA sequences to a
12+
covariance model. Infernal is not included in the package and needs to be
13+
installed independently.
14+
License: MIT + file LICENSE
15+
Encoding: UTF-8
16+
LazyData: true
17+
SystemRequirements: Infernal
18+
URL: https://github.com/brendanf/inferrnal
19+
BugReports: https://github.com/brendanf/inferrnal/issues
20+
Imports:
21+
assertthat,
22+
Biostrings,
23+
readr,
24+
stringr,
25+
methods,
26+
stats
27+
RoxygenNote: 7.0.2

‎LICENSE

+2-21
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,2 @@
1-
MIT License
2-
3-
Copyright (c) 2020 brendanf
4-
5-
Permission is hereby granted, free of charge, to any person obtaining a copy
6-
of this software and associated documentation files (the "Software"), to deal
7-
in the Software without restriction, including without limitation the rights
8-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9-
copies of the Software, and to permit persons to whom the Software is
10-
furnished to do so, subject to the following conditions:
11-
12-
The above copyright notice and this permission notice shall be included in all
13-
copies or substantial portions of the Software.
14-
15-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21-
SOFTWARE.
1+
YEAR: 2020
2+
COPYRIGHT HOLDER: Brendan Furneaux

‎LICENSE.md

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# MIT License
2+
3+
Copyright (c) 2020 Brendan Furneaux
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

‎NAMESPACE

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Generated by roxygen2: do not edit by hand
2+
3+
export(cmalign)
4+
export(cmsearch)
5+
export(read_stockholm_msa)

‎R/inferrnal.R

+245
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,245 @@
1+
2+
parse_stockholm_msa_chunk <- function(x, pos, acc) {
3+
4+
gc <- stringr::str_match(x, "#=GC +([^ ]+) +(.+)")[,2:3]
5+
gc <- gc[stats::complete.cases(gc), , drop = FALSE]
6+
for (i in seq_len(nrow(gc))) {
7+
attr(acc, gc[i,1]) <- paste0(attr(acc, gc[i,1]), gc[i,2])
8+
}
9+
10+
x <- stringr::str_match(x, "^(\\d+\\|)?([^#][^ ]*) +([^ ]+)$")[,3:4]
11+
x <- x[stats::complete.cases(x), , drop = FALSE]
12+
for (i in seq_len(nrow(x))) {
13+
if (x[i,1] %in% names(acc)) {
14+
acc[[x[i,1]]] <- paste0(acc[[x[i,1]]], x[i,2])
15+
} else {
16+
acc[[x[i,1]]] <- x[i,2]
17+
}
18+
}
19+
acc
20+
}
21+
22+
#' Parse a Multiple Alignment in Stockholm Format
23+
#'
24+
#' Parses Stockholm-format multiple alignment files, including "\code{#=GC}"
25+
#' lines. Other annotations are ignored.
26+
#'
27+
#' @param stockholm (\code{character} scalar) Path to a file to parse
28+
#'
29+
#' @return a \code{list}. The sequences themselves are element "\code{alignment}",
30+
#' and \code{#=GC} annotations are given as \code{character} strings; e.g.,
31+
#' the "\code{#=GC SS_cons}" annotation is stored as an element named
32+
#' "\code{SS_cons}".
33+
#' @export
34+
#'
35+
#' @examples
36+
#' msafile <- system.file(file.path("extdata", "sample.stk"), package = "inferrnal")
37+
#' msa <- read_stockholm_msa(msafile)
38+
#' msa$alignment
39+
#' # consensus secondary structure
40+
#' msa$SS_cons
41+
#' # reference sequence
42+
#' msa$RF
43+
read_stockholm_msa <- function(stockholm) {
44+
assertthat::assert_that((assertthat::is.string(stockholm) &&
45+
file.exists(stockholm)) ||
46+
methods::is(stockholm, "connection"))
47+
48+
seqs <-
49+
readr::read_lines_chunked(stockholm,
50+
readr::AccumulateCallback$new(parse_stockholm_msa_chunk, acc = list()))
51+
out <- attributes(seqs)
52+
out[["alignment"]] <- Biostrings::RNAMultipleAlignment(unlist(seqs))
53+
out
54+
}
55+
56+
#' Search for Covariance Models (CM) in a set of sequences.
57+
#'
58+
#' This function calls "\code{cmsearch}" from
59+
#' \href{http://eddylab.org/infernal/}{Infernal}. Infernal must be installed.
60+
#' Many parameters are not included (yet!), and the function is focused on
61+
#' retrieving the hits table and, optionally, producing an alignment.
62+
#'
63+
#' @param cm (character of length 1) Path to the covariance model (.cm) file.
64+
#' The covariance model must include calibration data from running
65+
#' "\code{cmcalibrate}".
66+
#' @param seq (filename, character vector, or
67+
#' \code{\link[Biostrings]{XStringSet}}) Sequences to search with the CM.
68+
#' If a filename, the file can be of any type supported by Infernal.
69+
#' @param glocal (logical of length 1) Whether to run the search in glocal mode
70+
#' (global with respect to the model, local with respect to the sequence).
71+
#' When \code{TRUE}, the search is faster, but will fail to find matches with
72+
#' only partially homologous structure.
73+
#' @param alignment (filename) A file to save the aligned hits to. If given,
74+
#' the alignment is saved in Stockholm format with annotations for secondary
75+
#' structure, posterior probablility, etc.
76+
#' @param cpu (integer of length 1) The number of threads to use in the search.
77+
#'
78+
#' @return a \code{\link[tibble]{tibble}} with columns:
79+
#' \itemize{
80+
#' \item{target_name}{ (character) the name of the target sequence}
81+
#' \item{taget_accession}{(character) the target's accession number}
82+
#' \item{query_name}{(character) the name of the query CM}
83+
#' \item{query_accession}{(character) the query CM's accession number}
84+
#' \item{mdl}{(character) the model type ("cm" or "hmm")}
85+
#' \item{mdl_from}{(integer) the start location of the hit in the model}
86+
#' \item{mdl_to}{(integer) the end location of the hit in the model}
87+
#' \item{seq_from}{(integer) the start location of the hit in the sequence}
88+
#' \item{seq_to}{(integer) the end location of the hit in the sequence}
89+
#' \item{strand}{(character) the strand the hit was found on ("+" or "-")}
90+
#' \item{trunc)}{(character) whether the hit is truncated, and where ("no", "5'", "3'", "5'&3'", or "-" for hmm hits).}
91+
#' \item{pass}{(integer) which algorithm pass the hit was found on.}
92+
#' \item{gc}{(numeric) GC content of the hit}
93+
#' \item{bias}{(numeric) biased composition correction. See the Infernal documentation.}
94+
#' \item{score}{(numeric) bit-score of the hit, including the biased
95+
#' composition correction.}
96+
#' \item{E_value}{(numeric) Expectation value for the hit.}}
97+
#' \item{inc}{(character) "!" if the sequence meets the inclusion threshold,
98+
#' "?" if it only meets the reporting threshold.}
99+
#' @export
100+
#'
101+
#' @examples
102+
#' # search sequences from a fasta file for Rfam RF00002 (5.8S rRNA)
103+
#' cm5_8S <- system.file(file.path("extdata", "RF00002.cm"), package = "inferrnal")
104+
#' sampfasta <- system.file(file.path("extdata", "sample.fasta"), package = "inferrnal")
105+
#' cmsearch(cm = cm5_8S, seq = sampfasta, cpu = 1)
106+
#' # also works if the fasta file has already been loaded
107+
#' samp <- Biostrings::readDNAStringSet(sampfasta)
108+
#' cmsearch(cm = cm5_8S, seq = samp, cpu = 1)
109+
cmsearch <- function(cm,
110+
seq,
111+
glocal = TRUE,
112+
alignment,
113+
cpu) {
114+
assertthat::assert_that(assertthat::is.string(cm),
115+
file.exists(cm),
116+
assertthat::is.flag(glocal))
117+
tablefile <- tempfile("cmsearch", fileext = ".dat")
118+
on.exit(unlink(tablefile))
119+
args <- c("--tblout", tablefile, "--toponly", "--noali")
120+
if (isTRUE(glocal)) args <- c(args, "-g")
121+
122+
if (!missing(cpu)) {
123+
assertthat::assert_that(assertthat::is.count(cpu))
124+
args <- c(args, "--cpu", cpu)
125+
}
126+
127+
if (!missing(alignment)) {
128+
assertthat::assert_that(assertthat::is.string(alignment))
129+
d <- dirname(alignment)
130+
if (nchar(d) > 0 && !dir.exists(d)) dir.create(d, recursive = TRUE)
131+
args <- c(args, "-A", alignment)
132+
}
133+
134+
args <- c(args, cm)
135+
136+
seqfile <- NULL
137+
if (assertthat::is.string(seq) && file.exists(seq)) {
138+
seqfile <- seq
139+
} else {
140+
seqfile <- tempfile("seq", fileext = ".fasta")
141+
if (is.character(seq)) {
142+
seq <- Biostrings::BStringSet(seq)
143+
abc <- Biostrings::uniqueLetters(seq)
144+
if (all(abc %in% Biostrings::DNA_ALPHABET)) {
145+
seq <- Biostrings::DNAStringSet(seq)
146+
seq <- Biostrings::RNAStringSet(seq)
147+
} else if (all(abc %in% Biostrings::RNA_ALPHABET)) {
148+
seq <- Biostrings::RNAStringSet(seq)
149+
} else stop("Sequence alphabet should be DNA or RNA for CMalign.")
150+
}
151+
if (methods::is(seq, "XStringSet")) {
152+
Biostrings::writeXStringSet(seq, seqfile)
153+
on.exit(unlink(seqfile))
154+
} else {
155+
stop("'seq' should be a filename, XStringSet, or character vector.")
156+
}
157+
}
158+
args <- c(args, seqfile)
159+
160+
system2("cmsearch", args)
161+
162+
readr::read_table2(tablefile,
163+
col_names = c("target_name", "target_accession",
164+
"query_name", "query_accession",
165+
"mdl", "mdl_from", "mdl_to",
166+
"seq_from", "seq_to", "strand",
167+
"trunc", "pass", "gc", "bias",
168+
"score", "E_value", "inc",
169+
"description"),
170+
col_types = "ccccciiiicciddddcc",
171+
comment = "#")
172+
}
173+
174+
#' Align sequences to a covariance model
175+
#'
176+
#' This function calls \code{cmalign} from
177+
#' \href{http://eddylab.org/infernal/}{Infernal}. Infernal must be installed
178+
#' and on the path. Not all options are included.
179+
#'
180+
#' One of the easiest places to obtain CMs is \href{https://rfam.xfam.org/}{Rfam}.
181+
#'
182+
#' @param cmfile (\code{character} scalar) path to a covariance model file
183+
#' @param seq (\code{character} scalar, names \code{character} vector, or
184+
#' \code{\link[Biostrings]{XStringSet}}) sequences to align to the
185+
#' covariance model. This may be given as a character path to a fasta
186+
#' file, or the sequences as a character vector object of class
187+
#' \code{\link[Biostrings]{DNAStringSet}} or
188+
#' \code{\link[Biostrings]{RNAStringSet}}. For \code{cmalign}, the
189+
#' sequences should be known \emph{a priori} to represent the same region
190+
#' as the CM; to find the region in longer sequences and align it, use
191+
#' the \code{alignment} option of \code{\link{cmsearch}}.
192+
#' @param glocal (\code{logical} scalar) If \code{TRUE}, align in glocal mode.
193+
#' See \href{http://eddylab.org/infernal/}{Infernal} documentation for
194+
#' more information.
195+
#' @param cpu (\code{integer} scalar) The number of cpus to use.
196+
#'
197+
#' @return the aligned sequences, as returned by
198+
#' \code{\link{read_stockholm_msa}}.
199+
#' @export
200+
#'
201+
#' @examples
202+
#' # align a set of unaligned 5.8S rRNA sequences to the Rfam CM.
203+
#' cm5_8S <- system.file(file.path("extdata", "RF00002.cm"), package = "inferrnal")
204+
#' unaln <- system.file(file.path("extdata", "samp_5_8S.fasta"), package = "inferrnal")
205+
#' cmalign(cm5_8S, unaln, cpu = 1)
206+
#' # also works if the fasta file has already been loaded
207+
#' unaln <- Biostrings::readRNAStringSet(unaln)
208+
#' cmalign(cm5_8S, unaln, cpu = 1)
209+
cmalign <- function(cmfile, seq, glocal = TRUE, cpu) {
210+
assertthat::assert_that(assertthat::is.readable(cmfile),
211+
assertthat::is.flag(glocal))
212+
args <- "cmalign"
213+
if (isTRUE(glocal)) args <- c(args, "-g")
214+
if (!missing(cpu)) {
215+
assertthat::assert_that(assertthat::is.count(cpu))
216+
args <- c(args, "--cpu", cpu)
217+
}
218+
219+
seqfile <- NULL
220+
if (assertthat::is.string(seq) && file.exists(seq)) {
221+
seqfile <- seq
222+
} else {
223+
seqfile <- tempfile("seq", fileext = ".fasta")
224+
if (is.character(seq)) {
225+
seq <- Biostrings::BStringSet(seq)
226+
abc <- Biostrings::uniqueLetters(seq)
227+
if (all(abc %in% Biostrings::DNA_ALPHABET)) {
228+
seq <- Biostrings::DNAStringSet(seq)
229+
seq <- Biostrings::RNAStringSet(seq)
230+
} else if (all(abc %in% Biostrings::RNA_ALPHABET)) {
231+
seq <- Biostrings::RNAStringSet(seq)
232+
} else stop("Sequence alphabet should be DNA or RNA for CMalign.")
233+
}
234+
if (methods::is(seq, "XStringSet")) {
235+
Biostrings::writeXStringSet(seq, seqfile)
236+
on.exit(unlink(seqfile))
237+
} else {
238+
stop("'seq' should be a filename, XStringSet, or character vector.")
239+
}
240+
}
241+
args <- c(args, cmfile, seqfile)
242+
args <- paste(args, collapse = " ")
243+
alnpipe <- pipe(args)
244+
read_stockholm_msa(alnpipe)
245+
}

0 commit comments

Comments
 (0)
Please sign in to comment.