diff --git a/InfoAndInputs/.gitignore b/InfoAndInputs/.gitignore new file mode 100644 index 00000000..e43b0f98 --- /dev/null +++ b/InfoAndInputs/.gitignore @@ -0,0 +1 @@ +.DS_Store diff --git a/phyloscannerR/.gitignore b/phyloscannerR/.gitignore index 807ea251..728e389b 100644 --- a/phyloscannerR/.gitignore +++ b/phyloscannerR/.gitignore @@ -1,3 +1,4 @@ .Rproj.user .Rhistory .RData +.DS_Store diff --git a/phyloscannerR/DESCRIPTION b/phyloscannerR/DESCRIPTION index 05d5a78c..003b530d 100644 --- a/phyloscannerR/DESCRIPTION +++ b/phyloscannerR/DESCRIPTION @@ -4,7 +4,7 @@ Version: 1.8.0 Authors@R: c(person("Matthew", "Hall", email = "matthew.hall@bdi.ox.ac.uk", role = c("aut", "cre")), person("Chris", "Wymant", email = "matthew.hall@bdi.ox.ac.uk", role = c("aut")), person("Oliver", "Ratmann", email = "matthew.hall@bdi.ox.ac.uk", role = c("aut"))) Description: An R package for the second half of phyloscanner (tree analysis). Depends: R (>= 3.4.0) -Imports: ape, argparse, extraDistr, ff, GGally, ggtree, grid, gtable, kimisc, network, pegas, phangorn, phytools, prodlim, RColorBrewer, reshape2, scales, sna, tidyverse, treeio (>= 1.6.2), viridis +Imports: ape, argparse, extraDistr, ff, GGally, ggtree, grid, gtable, kimisc, network, pegas, phangorn, phytools, prodlim, RColorBrewer, reshape2, RBGL, scales, sna, tidyverse, treeio (>= 1.6.2), viridis License: GPL Encoding: UTF-8 LazyData: true diff --git a/phyloscannerR/NAMESPACE b/phyloscannerR/NAMESPACE index 8e1da200..ce40e8e3 100644 --- a/phyloscannerR/NAMESPACE +++ b/phyloscannerR/NAMESPACE @@ -54,11 +54,14 @@ export(downsample.host) export(downsample.tree) export(draw.summary.statistics) export(drop.tip.get.map) +export(extract.subgraph) export(extract.subtrees.for.hosts) export(extract.tt.subgraph) export(find.bam.and.references) export(find.duplicate.tips) export(find.gaps) +export(find.networks) +export(find.pairs.in.networks) export(form.rectangles) export(gather.summary.statistics) export(get.ancestral.sequence) @@ -110,6 +113,9 @@ export(phyloscanner.analyse.tree) export(phyloscanner.analyse.trees) export(phyloscanner.generate.blacklist) export(phyloscanner.reconstruct) +export(phyloscanner.to.simmap) +export(plot.chain) +export(plot.network) export(prepare.tree) export(process.tree) export(produce.host.graphs) @@ -127,6 +133,7 @@ export(reconstruct.host.ancestral.sequences) export(rename.blacklisted.tips) export(resolve.tree.into.host.clades) export(select.windows.by.read.and.tip.count) +export(simmap.to.phyloscanner) export(simplified.transmission.summary) export(simplify.summary) export(simplify.summary.multinomial) @@ -141,9 +148,12 @@ export(transmission.summary) export(write.ann.nexus) export(write.ann.tree) export(write.annotated.tree) +import(GGally) import(RColorBrewer) import(dplyr) import(ggplot2) +import(ggtree) +import(glue) import(grid) import(gtable) import(purrr) @@ -152,6 +162,7 @@ import(tidyr) import(tidyverse) import(viridis) importFrom(GGally,ggnet2) +importFrom(RBGL,edmondsOptimumBranching) importFrom(ape,.compressTipLabel) importFrom(ape,.uncompressTipLabel) importFrom(ape,as.DNAbin) @@ -175,7 +186,11 @@ importFrom(ggtree,geom_point2) importFrom(ggtree,geom_tiplab) importFrom(ggtree,geom_treescale) importFrom(ggtree,ggtree) +importFrom(igraph,clusters) +importFrom(igraph,graph.data.frame) +importFrom(igraph,igraph.to.graphNEL) importFrom(network,as.network.matrix) +importFrom(network,network) importFrom(phangorn,Ancestors) importFrom(phangorn,Children) importFrom(phangorn,Descendants) @@ -188,6 +203,7 @@ importFrom(phangorn,phyDat2alignment) importFrom(phangorn,pml) importFrom(phytools,nodeheight) importFrom(readr,read_csv) +importFrom(reshape2,dcast) importFrom(reshape2,melt) importFrom(scales,pretty_breaks) importFrom(tibble,as.tibble) diff --git a/phyloscannerR/R/cmd.R b/phyloscannerR/R/command_line_utilities.R similarity index 100% rename from phyloscannerR/R/cmd.R rename to phyloscannerR/R/command_line_utilities.R diff --git a/phyloscannerR/R/main_analysis.R b/phyloscannerR/R/main_analysis.R index 29bc9506..2a84b029 100644 --- a/phyloscannerR/R/main_analysis.R +++ b/phyloscannerR/R/main_analysis.R @@ -628,6 +628,7 @@ blacklist <- function(ptrees, #' @importFrom tidyr unnest #' @importFrom phangorn Ancestors Descendants Children mrca.phylo getRoot #' @export phyloscanner.analyse.trees +#' @seealso \code{\link{find.pairs.in.networks}}, \code{\link{find.networks}} #' @example inst/example/ex.phyloscanner.analyse.trees.R #' phyloscanner.analyse.trees <- function( @@ -1253,7 +1254,7 @@ multipage.summary.statistics <- function(ptrees, sum.stats, hosts = all.hosts.fr #' @import ggplot2 #' @export write.annotated.tree -write.annotated.tree <- function(ptree, file.name, format = c("pdf", "nex"), pdf.scale.bar.width = 0.01, pdf.w = 50, pdf.hm = 0.15, verbose = F){ +write.annotated.tree <- function(ptree, file.name, format = c("pdf", "nex", "ggplot"), tree.colours=NULL, pdf.scale.bar.width = 0.01, pdf.w = 50, pdf.hm = 0.15, verbose = F){ tree <- ptree$tree read.counts <- ptree$read.counts @@ -1266,12 +1267,26 @@ write.annotated.tree <- function(ptree, file.name, format = c("pdf", "nex"), pdf if(verbose) cat(paste0("Writing .",format," tree to file ",file.name,"\n")) - if(format == "pdf"){ + if(!format %in% c("pdf","nex","ggplot")){ + stop("Unknown tree output format") + } + + if(format == "nex"){ + write.ann.nexus(tree, file=file.name, annotations = c("INDIVIDUAL", "SPLIT", "READ_COUNT")) + } + + if(is.null(tree.colours)){ + hue.pal <- scales:::hue_pal(h= c(0, 360) + 15, c = 100, l = 65, h.start = 0, direction = 1) + tree.colours <- hue.pal(length(levels(attr(tree,'INDIVIDUAL')))) + names(tree.colours) <- levels(attr(tree,'INDIVIDUAL')) + } + + if(format %in% c("pdf","ggplot")){ - tree.display <- ggtree(tree, aes(color=BRANCH_COLOURS)) + + tree.display <- ggtree(tree, aes(color=BRANCH_COLOURS)) + geom_point2(aes(subset=SUBGRAPH_MRCA, color=INDIVIDUAL), shape = 23, size = 3, fill="white") + geom_point2(aes(color=INDIVIDUAL, shape=BLACKLISTED), size=1) + - scale_color_hue(na.value = "black", drop=F) + + scale_color_manual(values=tree.colours, na.value = "black", drop=F) + scale_shape_manual(values=c(16, 4)) + theme(legend.position="none") + geom_tiplab(aes(col=INDIVIDUAL)) + @@ -1296,12 +1311,14 @@ write.annotated.tree <- function(ptree, file.name, format = c("pdf", "nex"), pdf tree.display <- tree.display + ggplot2::xlim(0, 1.1*x.max) tree.display - ggsave(file.name, device="pdf", height = pdf.hm*length(tree$tip.label), width = pdf.w, limitsize = F) - - } else if(format == "nex"){ - write.ann.nexus(tree, file=file.name, annotations = c("INDIVIDUAL", "SPLIT", "READ_COUNT")) - } else { - stop("Unknown tree output format") + } + + if(format == "pdf"){ + ggsave(file.name, device="pdf", height = pdf.hm*length(tree$tip.label), width = pdf.w, limitsize = F) + } + + if(format == "ggplot"){ + return(tree.display) } } diff --git a/phyloscannerR/R/parsimony_reconstruction_methods.R b/phyloscannerR/R/parsimony_reconstruction_methods.R index fca5ce08..5e7c9fff 100644 --- a/phyloscannerR/R/parsimony_reconstruction_methods.R +++ b/phyloscannerR/R/parsimony_reconstruction_methods.R @@ -153,7 +153,7 @@ split.hosts.to.subgraphs <- function(tree, rs.subgraphs <- rs.subgraphs %>% mutate(host = map_chr(subgraph, function(x) unlist(strsplit(x, "-SPLIT"))[1]), tip = map(subgraph, function(x) tree$tip.label[results$split.tips[[x]]])) %>% - unnest() %>% + unnest(c(host, subgraph, tip)) %>% select(host, subgraph, tip) list(tree=tree, rs.subgraphs=rs.subgraphs) diff --git a/phyloscannerR/R/plotting_functions.R b/phyloscannerR/R/plotting_functions.R index ba60e030..30ed2999 100644 --- a/phyloscannerR/R/plotting_functions.R +++ b/phyloscannerR/R/plotting_functions.R @@ -435,18 +435,31 @@ produce.host.graphs <- function(sum.stats, host, xcoords, x.limits, missing.wind #' @param control List of plotting attributes. #' @return A list whose elements are \code{data}, the underlying data frame for the graph, and \code{graph}, the graph itself. #' @export produce.pairwise.graphs2 -#' @seealso \link{\code{classify.pairwise.relationships}} +#' @seealso \code{\link{classify.pairwise.relationships}} #' @examples #' # #' # Example on data from Rakai Community Cohort Study #' # remember that you can specify dwin to save computing time, if you have it already computed #' # #' \dontrun{ +#' # +#' # Example 1 +#' # #' file <- system.file(file.path('extdata','ptyr192_phsc_analyse_trees_output.RData'),package='phyloscannerR') #' load(file) #loads 'phsc', output from 'phyloscanner.analyse.trees' #' hosts <- c('RkA05868F','RkA05968M','RkA00369F','RkA01344M') #' inclusion <- "both" -#' tmp <- produce.pairwise.graphs2(phsc, hosts=hosts, inclusion = "both") +#' tmp <- phyloscannerR:::produce.pairwise.graphs2(phsc, hosts=hosts, inclusion = "both") +#' tmp$graph +#' +#' # +#' # Example 2 +#' # +#' infile <- system.file(file.path('extdata','Rakai_phscnetworks_allpairs_190706.RData'),package='phyloscannerR') +#' load(infile) +#' # loaded "dpl" (pairs of individuals between whom linkage not excluded), "dc" (phyloscanner classification counts), "dw" (phyloscanner classifications per window) +#' hosts <- c('RkA05868F','RkA05968M','RkA00369F','RkA01344M') +#' tmp <- phyloscannerR:::produce.pairwise.graphs2(hosts=hosts, dwin=dw, inclusion = "both") #' tmp$graph #' } produce.pairwise.graphs2 <- function( ptrees, @@ -471,6 +484,13 @@ produce.pairwise.graphs2 <- function( ptrees, relationship.types=c(), verbose=TRUE) } + if(!is.null(dwin)) + { + setnames(dwin, colnames(dwin), gsub('^h([1-2])','host\\.\\1',gsub('_','\\.',tolower(colnames(dwin)))) ) + if(!all(c('host.1','host.2','tree.id','basic.classification','patristic.distance')%in%colnames(dwin))) + stop("Expected columns 'host.1','host.2','tree.id','basic.classification','patristic.distance'") + } + dwin <- dwin %>% select(host.1, host.2, tree.id, basic.classification, patristic.distance) stopifnot( dwin%>%filter(host.2%nrow() == 0) # define basic topology dwin <- dwin %>% @@ -693,4 +713,273 @@ produce.pairwise.graphs <- function(ptrees, ylab("Host") return(list(graph = pairwise.plot, data = pair.data)) +} + + +#' @export +#' @author Oliver Ratmann +#' @import tidyverse grid ggtree GGally +#' @importFrom network network +#' @title Plot transmission chain +#' @description This function plots a transmission chain, showing one edge with two labels: +#' L indicates the proportion of deep-sequence phylogenies in whom the two individuals are phylogenetically close and adjacent. +#' D indicates the proportion of deep-sequence phylogenies that support the indicated direction of transmission, out of all +#' deep-sequence phylogenies that support either direction of transmission. +#' @param df tibble with the following columns 'H1', 'H2', 'CATEGORISATION', 'TYPE', 'SCORE' +#' @param di tibble with meta-data to customize the plot with columns 'H', node.shape, node.label, node.fill +#' @param control list of plotting control variables; see examples. +#' @return ggplot object +#' @seealso \code{\link{find.networks}}, \code{\link{plot.network}} +#' @example inst/example/ex.transmission.networks.plotting.R +plot.chain<- function(df, di, control) +{ + point.size <- control$point.size + edge.gap <- control$edge.gap + edge.size <- control$edge.size + curvature <- control$curvature + arrow <- control$arrow + curv.shift <- control$curv.shift + label.size <- control$label.size + node.label <- control$node.label + node.fill <- control$node.fill + node.shape <- control$node.shape + node.shape.values <- control$node.shape.values + node.fill.values <- control$node.fill.values + threshold.linked <- control$threshold.linked + layout <- control$layout + if(is.na(node.label)) + { + node.label<- paste0('DUMMY',1+length(which(grepl('DUMMY',colnames(di))))) + di <- di%>%mutate(!!node.label:=NA_character_) + } + if(is.na(node.shape)) + { + node.shape<- paste0('DUMMY',1+length(which(grepl('DUMMY',colnames(di))))) + di <- di%>%mutate(!!node.shape:='NA') + } + if(is.na(node.fill)) + { + node.fill<- paste0('DUMMY',1+length(which(grepl('DUMMY',colnames(di))))) + di <- di%>%mutate(!!node.fill:='NA') + } + if(any(is.na(node.fill.values))) + { + z <- unique(di[[node.fill]]) + node.fill.values <- heat.colors(length(z)) + names(node.fill.values) <- z + } + if(any(is.na(node.shape.values))) + { + z <- unique(di[[node.shape]]) + node.shape.values <- seq_along(z) + names(node.shape.values)<- z + } + di <- di %>% rename( NODE_LABEL:= !!node.label, + NODE_SHAPE:= !!node.shape, + NODE_FILL:= !!node.fill) + tmp <- c('NODE_LABEL','NODE_SHAPE','NODE_FILL')[which(c(node.label, node.shape, node.fill)=='H')] + if(length(tmp)) + di <- di %>% mutate(H:= !!rlang::sym(tmp)) + di <- di %>% select(H, NODE_LABEL, NODE_SHAPE, NODE_FILL) + # + if(is.null(layout)) + { + layout <- as_tibble(ggnet2(network(df %>% select(H1,H2) %>% distinct(), directed=FALSE, matrix.type="edgelist"))$data[,c("label", "x", "y")]) + } + if(any(grepl('label', colnames(layout)))) + { + setnames(layout, c('label','x','y'), c('H1','H1_X','H1_Y')) + } + if(any(colnames(layout)=='H')) + { + setnames(layout, c('H','X','Y'), c('H1','H1_X','H1_Y')) + } + df <- df %>% inner_join(layout, by='H1') + setnames(layout, c('H1','H1_X','H1_Y'), c('H2','H2_X','H2_Y')) + df <- df %>% inner_join(layout, by='H2') + setnames(layout, c('H2','H2_X','H2_Y'), c('H','X','Y')) + layout <- layout %>% inner_join(di, by='H') + + df <- df %>% mutate( EDGETEXT_X:= (H1_X+H2_X)/2, + EDGETEXT_Y:= (H1_Y+H2_Y)/2, + EDGE_LABEL:= paste0('L',round(100*SCORE_LINKED,d=1),'%',' // ','D',round(100*pmax(SCORE_DIR_12,SCORE_DIR_21),d=1),'%') + ) + if(is.na(threshold.linked)) + { + df <- df %>% mutate(EDGE_COL:='edge_col_2') + } + if(!is.na(threshold.linked)) + { + df <- df %>% + mutate( EDGE_COL:= case_when( SCORE_LINKED>=threshold.linked ~ 'edge_col_2', + SCORE_LINKED% mutate( MX:= (H2_X - H1_X), MY:= (H2_Y - H1_Y) ) + tmp <- sqrt(df$MX*df$MX+df$MY*df$MY) + df <- df %>% + mutate( MX:= MX/tmp, MY:= MY/tmp) %>% + mutate( H1_X:= H1_X + MX*edge.gap, + H1_Y:= H1_Y + MY*edge.gap, + H2_X:= H2_X - MX*edge.gap, + H2_Y:= H2_Y - MY*edge.gap + ) + # + p <- ggplot() + + geom_point(data=layout, aes(x=X, y=Y, colour=NODE_FILL, pch=NODE_SHAPE), size=point.size) + + geom_segment(data= df%>%filter(LINK_12==1), aes(x=H1_X, xend=H2_X, y=H1_Y, yend=H2_Y, size=edge.size*pmax(SCORE_DIR_21,SCORE_DIR_12), colour=EDGE_COL), arrow=arrow, lineend="butt") + + geom_segment(data= df%>%filter(LINK_21==1), aes(x=H2_X, xend=H1_X, y=H2_Y, yend=H1_Y, size=edge.size*pmax(SCORE_DIR_21,SCORE_DIR_12), colour=EDGE_COL), arrow=arrow, lineend="butt") + + scale_colour_manual(values=c(node.fill.values, 'edge_col_1'='grey80', 'edge_col_2'='grey40','NA'='grey50')) + + scale_shape_manual(values=c(node.shape.values, 'NA'=16)) + + scale_fill_manual(values=c(node.fill.values, 'NA'='grey50')) + + scale_size_identity() + + geom_text(data=df, aes(x=EDGETEXT_X, y=EDGETEXT_Y, label=EDGE_LABEL), size=label.size) + + geom_text(data=layout, aes(x=X, y=Y, label=NODE_LABEL)) + + theme_void() + + guides(colour='none', fill='none',size='none', pch='none') + layout <- subset(layout, select=c(H,X,Y)) + setnames(layout, c('H','X','Y'), c('label','x','y')) + p$layout <- layout + p +} + +#' @export +#' @author Oliver Ratmann +#' @import tidyverse grid ggtree GGally +#' @importFrom network network +#' @title Plot transmission network +#' @description This function plots a phylogenetic transmission network, showing three types of edges: +#' two directed edges respectively in the 1->2 and 2->1 direction, and an undirected edge that represents phylogenetic support of close and adjacent individuals without evidence into the direction transmission. +#' @param df tibble with the following columns 'H1', 'H2', 'CATEGORISATION', 'TYPE', 'SCORE' +#' @param di tibble with meta-data to customize the plot with columns 'H', node.shape, node.label, node.fill +#' @param control list of plotting control variables; see examples. +#' @return ggplot object +#' @seealso \code{\link{find.networks}}, \code{\link{plot.chain}} +#' @example inst/example/ex.transmission.networks.plotting.R +plot.network<- function(df, di, control) +{ + point.size <- control$point.size + edge.gap <- control$edge.gap + edge.size <- control$edge.size + curvature <- control$curvature + arrow <- control$arrow + curv.shift <- control$curv.shift + label.size <- control$label.size + node.label <- control$node.label + node.fill <- control$node.fill + node.shape <- control$node.shape + node.shape.values <- control$node.shape.values + node.fill.values <- control$node.fill.values + threshold.linked <- control$threshold.linked + if(is.na(node.label)) + { + node.label<- paste0('DUMMY',1+length(which(grepl('DUMMY',colnames(di))))) + di <- di%>%mutate(!!node.label:=NA_character_) + } + if(is.na(node.shape)) + { + node.shape<- paste0('DUMMY',1+length(which(grepl('DUMMY',colnames(di))))) + di <- di%>%mutate(!!node.shape:='NA') + } + if(is.na(node.fill)) + { + node.fill<- paste0('DUMMY',1+length(which(grepl('DUMMY',colnames(di))))) + di <- di%>%mutate(!!node.fill:='NA') + } + if(any(is.na(node.fill.values))) + { + z <- unique(di[[node.fill]]) + node.fill.values <- heat.colors(length(z)) + names(node.fill.values) <- z + } + if(any(is.na(node.shape.values))) + { + z <- unique(di[[node.shape]]) + node.shape.values <- seq_along(z) + names(node.shape.values)<- z + } + di <- di %>% rename( NODE_LABEL:= !!node.label, + NODE_SHAPE:= !!node.shape, + NODE_FILL:= !!node.fill) + tmp <- c('NODE_LABEL','NODE_SHAPE','NODE_FILL')[which(c(node.label, node.shape, node.fill)=='H')] + if(length(tmp)) + di <- di %>% mutate(H:= !!rlang::sym(tmp)) + di <- di %>% select(H, NODE_LABEL, NODE_SHAPE, NODE_FILL) + + layout <- as_tibble(ggnet2(network(df %>% select(H1,H2) %>% distinct(), directed=FALSE, matrix.type="edgelist"))$data[,c("label", "x", "y")]) + setnames(layout, c('label','x','y'), c('H1','H1_X','H1_Y')) + df <- df %>% inner_join(layout, by='H1') + setnames(layout, c('H1','H1_X','H1_Y'), c('H2','H2_X','H2_Y')) + df <- df %>% inner_join(layout, by='H2') + setnames(layout, c('H2','H2_X','H2_Y'), c('H','X','Y')) + layout <- layout %>% inner_join(di, by='H') + + df <- df %>% mutate( EDGETEXT_X:= (H1_X+H2_X)/2, + EDGETEXT_Y:= (H1_Y+H2_Y)/2) + # + # calculate score for linked + if(is.na(threshold.linked)) + { + df <- df %>% mutate(EDGE_COL:='edge_col_2') + } + if(!is.na(threshold.linked)) + { + tmp <- df %>% + filter(TYPE!='not.close.or.nonadjacent') %>% + group_by(H1,H2) %>% + summarise(SCORE:= sum(SCORE)) %>% + mutate( EDGE_COL:= case_when( SCORE>=threshold.linked ~ 'edge_col_2', + SCORE% + ungroup() %>% + select(-SCORE) + df <- df %>% inner_join(tmp, by=c('H1','H2')) + } + # for edges, move the start and end points on the line between X and Y + # define unit gradient + df <- df %>% mutate( MX:= (H2_X - H1_X), MY:= (H2_Y - H1_Y) ) + tmp <- sqrt(df$MX*df$MX+df$MY*df$MY) + df <- df %>% + mutate( MX:= MX/tmp, MY:= MY/tmp) %>% + mutate( H1_X:= H1_X + MX*edge.gap, + H1_Y:= H1_Y + MY*edge.gap, + H2_X:= H2_X - MX*edge.gap, + H2_Y:= H2_Y - MY*edge.gap + ) + # label could just be move on the tangent vector to the line + # define unit tangent + df <- df %>% + mutate(TX:= -MY, TY:=MX) %>% + mutate( EDGETEXT_X:= case_when( TYPE=='12' ~ EDGETEXT_X + TX*curv.shift, + TYPE=='21' ~ EDGETEXT_X - TX*curv.shift, + TYPE=='complex.or.no.ancestry' ~ EDGETEXT_X + ), + EDGETEXT_Y:= case_when( TYPE=='12' ~ EDGETEXT_Y + TY*curv.shift, + TYPE=='21' ~ EDGETEXT_Y - TY*curv.shift, + TYPE=='complex.or.no.ancestry' ~ EDGETEXT_Y + ) + ) + # + p <- ggplot() + + geom_point(data=layout, aes(x=X, y=Y, colour=NODE_FILL, pch=NODE_SHAPE), size=point.size) + + geom_segment(data= df%>%filter(EDGE_COL=='edge_col_1' & TYPE=='complex.or.no.ancestry' & SCORE>0), aes(x=H1_X, xend=H2_X, y=H1_Y, yend=H2_Y, size=edge.size*SCORE, colour=EDGE_COL), lineend="butt") + + geom_curve(data= df%>%filter(EDGE_COL=='edge_col_1' & TYPE=='12' & SCORE>0), aes(x=H1_X, xend=H2_X, y=H1_Y, yend=H2_Y, size=edge.size*SCORE, colour=EDGE_COL), curvature=curvature, arrow=arrow, lineend="butt") + + geom_curve(data= df%>%filter(EDGE_COL=='edge_col_1' & TYPE=='21' & SCORE>0), aes(x=H2_X, xend=H1_X, y=H2_Y, yend=H1_Y, size=edge.size*SCORE, colour=EDGE_COL), curvature=curvature, arrow=arrow, lineend="butt") + + geom_segment(data= df%>%filter(EDGE_COL=='edge_col_2' & TYPE=='complex.or.no.ancestry' & SCORE>0), aes(x=H1_X, xend=H2_X, y=H1_Y, yend=H2_Y, size=edge.size*SCORE, colour=EDGE_COL), lineend="butt") + + geom_curve(data= df%>%filter(EDGE_COL=='edge_col_2' & TYPE=='12' & SCORE>0), aes(x=H1_X, xend=H2_X, y=H1_Y, yend=H2_Y, size=edge.size*SCORE, colour=EDGE_COL), curvature=curvature, arrow=arrow, lineend="butt") + + geom_curve(data= df%>%filter(EDGE_COL=='edge_col_2' & TYPE=='21' & SCORE>0), aes(x=H2_X, xend=H1_X, y=H2_Y, yend=H1_Y, size=edge.size*SCORE, colour=EDGE_COL), curvature=curvature, arrow=arrow, lineend="butt") + + scale_colour_manual(values=c(node.fill.values, 'edge_col_1'='grey80', 'edge_col_2'='grey40','NA'='grey50')) + + scale_shape_manual(values=c(node.shape.values, 'NA'=21)) + + scale_fill_manual(values=c(node.fill.values, 'NA'='grey50')) + + scale_size_identity() + + geom_text(data= df%>%filter(TYPE!='not.close.or.nonadjacent' & SCORE>0), aes(x=EDGETEXT_X, y=EDGETEXT_Y, label=paste0(round(100*SCORE,d=1),'%')), size=label.size) + + geom_text(data=layout, aes(x=X, y=Y, label=NODE_LABEL)) + + theme_void() + + guides(colour='none', fill='none',size='none', pch='none') + layout <- layout %>% select(H,X,Y) + setnames(layout, c('H','X','Y'), c('label','x','y')) + p$layout <- layout + p } \ No newline at end of file diff --git a/phyloscannerR/R/relationship_summary_methods.R b/phyloscannerR/R/relationship_summary_methods.R index 1e70fc10..a92bd877 100644 --- a/phyloscannerR/R/relationship_summary_methods.R +++ b/phyloscannerR/R/relationship_summary_methods.R @@ -13,7 +13,7 @@ #' # continue Rakai example, #' # load phyloscanner output from 'phyloscanner.analyse.trees' #' # -#' file <- system.file(file.path('extdata','ptyr192_phsc_analyse_trees_output.R'),package='phyloscannerR') +#' file <- system.file(file.path('extdata','ptyr192_phsc_analyse_trees_output.Rdata'),package='phyloscannerR') #' load(file) #loads 'phsc', output from 'phyloscanner.analyse.trees' #' # use distance thresholds found in analysis of Rakai couples #' close.threshold <- 0.025 diff --git a/phyloscannerR/R/transmission_network_functions.R b/phyloscannerR/R/transmission_network_functions.R new file mode 100644 index 00000000..7462cfbe --- /dev/null +++ b/phyloscannerR/R/transmission_network_functions.R @@ -0,0 +1,351 @@ +#' @export +#' @author Oliver Ratmann +#' @import tidyverse +#' @title Find pairs of individuals between whom linkage is not excluded phylogenetically +#' @param dwin A \code{data.frame} describing pairwise relationships between the hosts in each tree; normally output of \code{classify.pairwise.relationships} +#' @param dc A \code{data.frame} summarising pairwise relationships between the hosts across all trees; normally output of \code{count.pairwise.relationships} +#' @param control List of control variables: +#' \itemize{ +#' \item{\code{linked.group}} Phyloscanner classification used to identify pairs in networks. Default 'close.and.adjacent.cat'. +#' \item{\code{linked.no}} Phyloscanner classification type quantifying that pairs are not linked. Default 'not.close.or.nonadjacent'. +#' \item{\code{linked.yes}} Phyloscanner classification type quantifying that pairs are linked. Default 'close.and.adjacent'. +#' \item{\code{conf.cut}} Threshold on the proportion of deep-sequence phylogenies with distant/disconnected subgraphs above which pairs are considered phylogenetically unlinked. Default: 0.6 +#' \item{\code{neff.cut}} Threshold on the minimum number of deep-sequence phylogenies with sufficient reads from two individuals to make any phylogenetic inferences. Default: 3. +#' } +#' @param verbose Flag to switch on/off verbose mode. Default: TRUE. +#' @param dmeta Optional individual-level meta-data that is to be added to output. Can be NULL. +#' @return +#' Three R objects are generated: +#' \itemize{ +#' \item{\code{network.pairs}} is a tibble that describes pairs of individuals between whom linkage is not excluded phylogenetically. +#' \item{\code{relationship.counts}} is a tibble that summarises the phylogenetic relationship counts for each pair. +#' \item{\code{windows}} is a tibble that describes the basic phyloscanner statistics (distance, adjacency, paths between subgraphs) in each deep-sequence phylogeny for each pair. +#' } +#' @description This function identifies pairs of individuals between whom linkage is not excluded phylogenetically in a large number of phyloscanner analyses, and provides detailed information on them. +#' @seealso \code{\link{phyloscanner.analyse.trees}}, \code{\link{cmd.phyloscanner.analyse.trees}} +#' @example inst/example/ex.transmission.networks.R +find.pairs.in.networks <- function(dwin, dc, control= list(linked.group='close.and.adjacent.cat',linked.no='not.close.or.nonadjacent',linked.yes='close.and.adjacent', conf.cut=0.6, neff.cut=3), dmeta=NULL, verbose=TRUE) +{ + # check if dmeta in right format + if(!is.null(dmeta)) + { + if( !'tbl_df'%in%class(dmeta) ) + stop('"dmeta" must have class tbl_df (be a tibble)') + if( !'ID'%in%colnames(dmeta) ) + stop('"dmeta" must have a column "ID"') + if( class(dmeta$ID)!='character' ) + stop('"dmeta$ID" must be a character') + } + # internal variables + linked.group <- control$linked.group + linked.no <- control$linked.no + linked.yes <- control$linked.yes + conf.cut <- control$conf.cut + neff.cut <- control$neff.cut + + # ensure IDs are characters + if(!all(c( class( dc$host.1 )=='character', + class( dc$host.2 )=='character', + class( dwin$host.1 )=='character', + class( dwin$host.2 )=='character' + ))) + stop('host.1 or host.2 not of character. This is unexpected, contact maintainer.') + # ensure host.1 < host.2 + if( dc %>% filter( host.1 < host.2) %>% nrow() != dc %>% nrow() ) + stop('Not all host.1 < host.2 in "dc". This is unexpected, contact maintainer.') + if( dwin %>% filter( host.1 < host.2) %>% nrow() != dwin %>% nrow() ) + stop('Not all host.1 < host.2 in "dwin". This is unexpected, contact maintainer.') + + # get list of pairs that are not ph-unlinked + dpl <- dc %>% + filter(categorisation==linked.group & type==linked.no) %>% + filter(n.eff>=neff.cut & k.eff/n.eff < conf.cut) %>% + select(host.1, host.2) %>% + mutate(PTY_RUN:= infiles$PTY_RUN[i]) + dpl <- dc %>% + filter(categorisation==linked.group & type==linked.yes) %>% + right_join(dpl, by=c('host.1','host.2')) + # the pairs may just be ambiguous, keep only those with some evidence for linkage + dpl <- dpl %>% filter(k.eff/n.eff > 0) + # gather all classifications counts for these pairs + dwin <- dpl %>% + select(PTY_RUN, host.1, host.2) %>% + left_join(dwin, by=c('host.1','host.2')) + # gather all classification counts for these pairs + dc <- dpl %>% + select(PTY_RUN, host.1, host.2) %>% + left_join(dc, by=c('host.1','host.2')) + + # + # upper case col names + setnames(dpl, colnames(dpl), toupper(gsub('\\.','_',gsub('host\\.','h',colnames(dpl))))) + setnames(dc, colnames(dc), toupper(gsub('\\.','_',gsub('host\\.','h',colnames(dc))))) + setnames(dwin, colnames(dwin), toupper(gsub('\\.','_',gsub('host\\.','h',colnames(dwin))))) + + # + # add meta-data if provided + if(!is.null(dmeta)) + { + if(verbose) cat('\nAdd meta-data...') + tmp <- unique(dmeta, by='ID') + setnames(tmp, colnames(tmp), gsub('H1_ID','H1',paste0('H1_',colnames(tmp)))) + dpl <- dpl %>% left_join(tmp, by='H1') + dc <- dc %>% left_join(tmp, by='H1') + dwin <- dwin %>% left_join(tmp, by='H1') + setnames(tmp, colnames(tmp), gsub('H1','H2',colnames(tmp))) + dpl <- dpl %>% left_join(tmp, by='H2') + dc <- dc %>% left_join(tmp, by='H2') + dwin <- dwin %>% left_join(tmp, by='H2') + } + + if(verbose) cat('\nDone. Found pairs, n=', nrow(dpl), '. Found relationship counts, n=', nrow(dc), '. Found phyloscanner statistics, n=', nrow(dwin), '.') + # return + list(network.pairs=dpl, relationship.counts=dc, windows=dwin) +} + + +#' @export +#' @author Oliver Ratmann +#' @import tidyverse glue +#' @importFrom igraph graph.data.frame clusters +#' @title Find phylogenetic transmission networks and most likely transmission chain +#' @param dc Summary of phylogenetic relationship counts for each pair, stored as tibble. +#' @param control List of control variables: +#' \itemize{ +#' \item{\code{linked.group}} Phyloscanner classification used to identify pairs in networks. Default 'close.and.adjacent.cat'. +#' \item{\code{linked.no}} Phyloscanner classification type quantifying that pairs are not linked. Default 'not.close.or.nonadjacent'. +#' \item{\code{linked.yes}} Phyloscanner classification type quantifying that pairs are linked. Default 'close.and.adjacent'. +#' \item{\code{neff.cut}} Threshold on the minimum number of deep-sequence phylogenies with sufficient reads from two individuals to make any phylogenetic inferences. Default: 3. +#' \item{\code{weight.complex.or.no.ancestry}} Weight given to score complex.or.no.ancestry. Default: 50%. +#' } +#' @param verbose Flag to switch on/off verbose mode. Default: TRUE. +#' @return list of two R objects +#' \itemize{ +#' \item{\code{transmission.networks}} is a tibble that describes the edge list of pairs of individuals in a network, and corresponding phyloscanner scores +#' \item{\code{most.likely.transmission.chains}} is a tibble that describes the edge list of pairs of individuals in the most likely chain, and corresponding phyloscanner scores +#' } +#' See description. +#' @description +#' This function computes transmission networks from phyloscanner output of a population-based deep sequence sample. +#' A transmission network is defined as a set of individuals between whom phylogenetic linkage is not excluded. +#' Every individual in the network has at least one partner in the network between whom evidence for being phylogenetically unlinked is below a threshold. +#' These pairs of individuals are identified with a separate function, \code{\link{find.pairs.in.networks}}. +#' Due to the nature of the deep-sequence +#' data, there are up to three edges between pairs of individuals, giving the strength of evidence +#' of spread in each direction (two possibilities) and the strength of evidence for phylogenetic linkage with +#' the direction remaining unclear. Some of these pairs have limited evidence for phylogenetic linkage. The networks +#' are best interpreted as partially sampled transmission chain. +#' +#' This function also finds the most likely transmission +#' chain among all chains spanning the nodes in a specified transmission network. +#' The transmission network consists of at most three edges between a set of +#' individuals (directed edge in either direction, and undirected edge). Chains +#' are defined as spanning graphs through the set of nodes, without loops and with +#' in-degree equal to one for all nodes, except the start node. Each directed edge +#' in a chain has a weight, which corresponds to the phylogenetic evidence of transmission +#' in this direction. It is set to the phyloscanner score for transmission in this +#' direction plus half the phyloscanner score of the undirected edge, for transmission +#' with direction unclear. The probability of the entire chain is given by the product +#' of the phyloscanner scores along each edge in the chain. +#' +#' @seealso \code{\link{find.pairs.in.networks}}, \code{\link{plot.network}}, \code{\link{plot.chain}} +#' @example inst/example/ex.transmission.networks.R +find.networks<- function(dc, control= list(linked.group='close.and.adjacent.cat',linked.no='not.close.or.nonadjacent',linked.yes='close.and.adjacent', neff.cut=3, weight.complex.or.no.ancestry=0.5), verbose=TRUE) +{ + # internal constants + linked.group <- control$linked.group + linked.no <- control$linked.no + linked.yes <- control$linked.yes + neff.cut <- control$neff.cut + weight.complex.or.no.ancestry <- control$weight.complex.or.no.ancestry + scores.group <- 'close.and.adjacent.and.ancestry.cat' + scores.nolink <- 'not.close.or.nonadjacent' + scores.ambig <- 'complex.or.no.ancestry' + dir.group <- "close.and.adjacent.and.directed.cat" + + # + # construct tri-edge transmission network + dnet <- dc %>% + filter( CATEGORISATION==linked.group & TYPE==linked.yes & N_EFF>neff.cut) %>% + select(-c(CATEGORICAL_DISTANCE, TYPE, K, K_EFF, N, N_EFF, SCORE)) + # define potential transmission network membership + if(verbose) cat(glue('\nReconstructing {nrow(dnet)} transmission networks among linked pairs')) + tmp <- dnet %>% select(H1, H2) + tmp <- igraph:::graph.data.frame(tmp, directed=FALSE, vertices=NULL) + rtc <- tibble(ID=V(tmp)$name, CLU=clusters(tmp, mode="weak")$membership) + rtc <- rtc %>% + group_by(CLU) %>% + summarise( CLU_SIZE:=length(ID) ) %>% + arrange(desc(CLU_SIZE)) %>% + ungroup() %>% + mutate( IDCLU:=seq_along(CLU_SIZE) ) %>% + inner_join(rtc, by='CLU') %>% + select(-CLU) + # add info on edges: network membership + rtc <- rtc %>% rename(H1:=ID) + dnet <- dnet %>% inner_join(rtc, by='H1') + rtc <- rtc %>% select(-CLU_SIZE) + rtc <- rtc %>% rename(H2:=H1) + dnet <- dnet %>% inner_join(rtc, by=c('H2','IDCLU')) + # add info on edges: direction 12, direction 21, direction ambiguous, unlinked + # add this info in new rows + tmp <- dc %>% + filter(CATEGORISATION==scores.group) %>% + select(PTY_RUN, H1, H2, TYPE, SCORE) + dnet <- dnet %>% inner_join(tmp, by=c('H1','H2','PTY_RUN')) + if(verbose) cat(glue('\nFound {length(unique(dnet$IDCLU))} transmission networks with + {dnet %>% filter(TYPE!=scores.nolink) %>% distinct(H1,H2) %>% nrow()} + links (in either direction or undirected) and {length(unique(c(dnet$H1, dnet$H2)))} + individuals')) + # generate most likely transmission chains + if(verbose) cat('\nReconstructing most likely transmission chains...') + tmp <- dnet %>% + filter(TYPE!=scores.nolink) %>% + select(PTY_RUN,H1,H2,IDCLU,CLU_SIZE,TYPE,SCORE) + dchain <- find.most.likely.chains.RBGLedmonds(tmp, weight.complex.or.no.ancestry=weight.complex.or.no.ancestry) + + # merge pw linkage scores to the ML chain representation + # rationale: this describes prob of linkage. here, any 'inconsistent direction' is still considered as prob for linkage + tmp <- dc %>% + filter( CATEGORISATION==linked.group & TYPE==linked.yes ) %>% + select(H1,H2,PTY_RUN,SCORE) %>% + rename(SCORE_LINKED=SCORE) + dchain <- dchain %>% inner_join(tmp, by=c('H1','H2','PTY_RUN')) + # merge pw direction scores to the ML chain representation + # this is considering in denominator 12 + 21 before reducing probs to achieve self-consistency + # rationale: decide on evidence for direction based on comparing only the flows in either direction, 12 vs 21 + tmp <- dc %>% + filter( CATEGORISATION==dir.group ) %>% + select(H1,H2,PTY_RUN,TYPE,SCORE) %>% + mutate(TYPE:= paste0('SCORE_DIR_',TYPE)) %>% + spread(TYPE,SCORE) + dchain <- dchain %>% left_join(tmp, by=c('H1','H2','PTY_RUN')) + # merge pw network scores to the ML chain representation + # this is considering in denominator 12 + 21 + unclear reducing probs to achieve self-consistency + # same as MX_PROB_12, MX_PROB_21, after the final step below that sets one of the two probs to zero + tmp <- dc %>% + filter( CATEGORISATION==scores.group & TYPE!=scores.nolink) %>% + select(H1,H2,PTY_RUN,TYPE,SCORE) %>% + mutate(TYPE:= replace(TYPE, TYPE==scores.ambig, 'AMB')) %>% + mutate(TYPE:= paste0('SCORE_NW_',TYPE)) %>% + spread(TYPE,SCORE) + dchain <- dchain %>% inner_join(tmp, by=c('H1','H2','PTY_RUN')) + # fill in NA + dchain <- dchain %>% + mutate_at(c('SCORE_DIR_12','SCORE_DIR_21','SCORE_NW_12','SCORE_NW_21'), function(x) replace(x, is.na(x), 0)) + # ensure scores are compatible with self-consistency in ML chain + dchain <- dchain %>% + filter(LINK_12==1 | LINK_21==1) %>% + mutate( SCORE_NW_12= replace(SCORE_NW_12, LINK_12==0 & LINK_21==1 & SCORE_NW_12>SCORE_NW_21, 0), + SCORE_NW_21= replace(SCORE_NW_21, LINK_12==1 & LINK_21==0 & SCORE_NW_21>SCORE_NW_12, 0), + SCORE_DIR_12= replace(SCORE_DIR_12, LINK_12==0 & LINK_21==1 & SCORE_DIR_12>SCORE_DIR_21, 0), + SCORE_DIR_21= replace(SCORE_DIR_21, LINK_12==1 & LINK_21==0 & SCORE_DIR_21>SCORE_DIR_12, 0), + ) + # copy meta data to ML chain + tmp <- dnet %>% + select(-c(CLU_SIZE, IDCLU, TYPE, SCORE)) %>% + distinct() + dchain <- dchain %>% inner_join(tmp, by=c('H1','H2','PTY_RUN')) + # verbose + if(verbose) cat(glue("\nIdentified {dchain %>% select(IDCLU) %>% distinct() %>% nrow()} most likely transmission chains + with {nrow(dchain)} links and {length(unique(c(dchain$H1, dchain$H2)))} individuals")) + if(verbose) cat('\nDone.') + # return + list(transmission.networks=dnet, most.likely.transmission.chains=dchain) +} + +#' @keywords internal +#' @author Oliver Ratmann +#' @importFrom igraph graph.data.frame igraph.to.graphNEL +#' @importFrom RBGL edmondsOptimumBranching +#' @title Find most likely transmission chains +#' @description This function finds the most likely transmission +#' chain among all chains spanning the nodes in a specified transmission network. +#' The transmission network consists of at most three edges between a set of +#' individuals (directed edge in either direction, and undirected edge). Chains +#' are defined as spanning graphs through the set of nodes, without loops and with +#' in-degree equal to one for all nodes, except the start node. Each directed edge +#' in a chain has a weight, which corresponds to the phylogenetic evidence of transmission +#' in this direction. It is set to the phyloscanner score for transmission in this +#' direction plus half the phyloscanner score of the undirected edge, for transmission +#' with direction unclear. The probability of the entire chain is given by the product +#' of the phyloscanner scores along each edge in the chain. +#' Edmonds algorithm is used to solve for the most probable chain. +#' @param rtnn tibble encoding the three edges of a phyloscanner transmission network. Must have columns 'H1','H2','IDCLU','TYPE','SCORE','K_EFF'. +#' @param weight.complex.or.no.ancestry weight given to score complex.or.no.ancestry, default: 50% +#' @return tibble encoding the most likely chain. Has columns 'H1','H2','IDCLU', 'LINK_12', 'LINK_21' (either 1 or 0 for a link in the corresponding direction), and 'MX_PROB_12', 'MX_PROB_21' (associated posterior probabilities) +#' @seealso \code{\link{find.networks}} +find.most.likely.chains.RBGLedmonds<- function(rtnn, weight.complex.or.no.ancestry=0.5, verbose=0) +{ + stopifnot(c('PTY_RUN','H1','H2','IDCLU','CLU_SIZE','TYPE','SCORE')%in%colnames(rtnn)) + if( length(setdiff(c('complex.or.no.ancestry','21','12'), rtnn %>% distinct(TYPE) %>% pull(TYPE)) )) + stop('Unexpected classification types. Contact maintainer.') + # define weights to convert tri-edges to bi-edges + # and create bi-edges + rtm <- rtnn %>% + mutate( ID1_IN_WEIGHT= case_when( TYPE=='12' ~ 0, + TYPE=='complex.or.no.ancestry' ~ weight.complex.or.no.ancestry, + TYPE=='21' ~ 1), + ID2_IN_WEIGHT= case_when( TYPE=='21' ~ 0, + TYPE=='complex.or.no.ancestry' ~ weight.complex.or.no.ancestry, + TYPE=='12' ~ 1) + ) %>% + group_by( PTY_RUN,IDCLU,CLU_SIZE,H1,H2 ) %>% + summarise( PROB_21= sum(SCORE*ID1_IN_WEIGHT), + PROB_12= sum(SCORE*ID2_IN_WEIGHT) + ) %>% + ungroup() + # handle networks of size 2 - this is easy + ans <- rtm %>% + filter(CLU_SIZE==2) %>% + transform(TIES_RND_BRK= runif(length(CLU_SIZE))) %>% + mutate( LINK_12 = ifelse((PROB_12>PROB_21) | (PROB_12==PROB_21 & TIES_RND_BRK>=0.5), 1L, 0L), + LINK_21 = ifelse((PROB_21>PROB_12) | (PROB_12==PROB_21 & TIES_RND_BRK<0.5), 1L, 0L) ) %>% + select(-TIES_RND_BRK) + # handle networks of size >2 - use Edmonds algorithm + rtm <- rtm %>% + filter(CLU_SIZE>2) %>% + group_by(IDCLU) %>% + group_split() + for(i in seq_along(rtm)) + { + # + #i <- 13 + if(verbose) + cat('\nIDCLU ',i) + # convert to weighted edge list + tmp <- rtm[[i]] %>% + mutate(weight:=PROB_12) %>% + select(H1,H2,weight) + edgelist <- rtm[[i]] %>% + mutate(weight:=PROB_21) %>% + rename(H1:= H2, H2:= H1) %>% + select(H1,H2,weight) %>% + bind_rows(tmp) %>% + filter(weight>0) + # maximisation is over sum of weights, so transform to log prob + # since edmonds cannot handle negative values or zeros, add some constant + edgelist <- edgelist %>% mutate(weight:= log(weight) - min(log(weight)) + 1 ) + # run Edmonds + g <- graph.data.frame(edgelist) + g2 <- igraph.to.graphNEL(g) + g3 <- edmondsOptimumBranching(g2) + edgelist <- as_tibble(t(g3$edgeList)) %>% + rename(H1:=from, H2:= to) %>% + mutate( LINK_12:= 1L, + LINK_21:= 0L) + edgelist <- edgelist %>% + rename(H1:= H2, H2:=H1, LINK_12:=LINK_21, LINK_21:=LINK_12) %>% + bind_rows(edgelist) + rtm[[i]] <- rtm[[i]] %>% inner_join(edgelist, by=c('H1','H2')) + } + rtm <- do.call('rbind',rtm) + ans <- rbind(ans, rtm) + ans <- ans %>% + select(IDCLU, CLU_SIZE, PTY_RUN, H1, H2, LINK_12, LINK_21) + #mutate( PROB_21= replace(PROB_21, LINK_12==1, 0), + # PROB_12= replace(PROB_12, LINK_21==1, 0) + # ) %>% + #rename(MX_PROB_21=PROB_21, MX_PROB_12=PROB_12, MX_KEFF_21=KEFF_21, MX_KEFF_12=KEFF_12 ) + ans +} \ No newline at end of file diff --git a/phyloscannerR/R/tree_utility_functions.R b/phyloscannerR/R/tree_utility_functions.R index e15a03ed..badc621b 100644 --- a/phyloscannerR/R/tree_utility_functions.R +++ b/phyloscannerR/R/tree_utility_functions.R @@ -553,3 +553,187 @@ select.windows.by.read.and.tip.count <- function(ptrees, dwin, tip.regex, min.re dwin } + +#' @title Cast phyloscanner tree to SIMMAP tree +#' @export phyloscanner.to.simmap +#' @author Oliver Ratmann +#' @importFrom reshape2 dcast +#' @usage phyloscanner.to.simmap(ph, delete.phyloscanner.structures) +#' @param ph A \code{phyloscanner.tree} with attributes \code{SPLIT}, \code{INDIVIDUAL}, \code{BRANCH_COLOURS}, \code{SUBGRAPH_MRCA}. +#' @param delete.phyloscanner.structures Logical value. If true \code{phyloscanner} attributes are removed from the tree. +#' @return Same tree in \code{SIMMAP} format, with elements \code{maps}, \code{mapped.edge}, \code{node.states}. +#' @seealso \code{\link{simmap.to.phyloscanner}}, \code{\link{extract.subgraph}} +#' @example inst/example/ex.cast.to.simmap.R +phyloscanner.to.simmap<- function(ph, delete.phyloscanner.structures=FALSE) +{ + # make maps for each edge + # attr(ph, 'SPLIT') specifies state of branch ending in node + edge.state <- as.character( attr(ph, 'SPLIT')[ ph$edge[,2] ] ) + edge.state[is.na(edge.state)] <- 'Unknown' + edge.maps <- ph$edge.length + # make mapped.edge matrix + mapped.edge <- data.frame(STATE= edge.state, LEN=edge.maps, IDX=seq_along(edge.state)) + mapped.edge <- reshape2:::dcast(mapped.edge, IDX~STATE, value.var='LEN') + for(x in colnames(mapped.edge)) + mapped.edge[[x]][ which(is.na(mapped.edge[[x]])) ] <- 0 + mapped.edge <- as.matrix(mapped.edge) + rownames(mapped.edge) <- apply(ph$edge,1,paste, collapse=',') + # finalise edge.maps + edge.maps <- lapply(seq_along(edge.maps), function(k){ x<- edge.maps[[k]]; names(x)<- edge.state[k]; x}) + # make node.states + node.states <- as.character( attr(ph, 'SPLIT')[ ph$edge[,1] ] ) + node.states[is.na(node.states)] <- 'Unknown' + node.states <- matrix(node.states, nrow=nrow(ph$edge), ncol=2) + node.states[,2] <- edge.state + # set list elements + ph[['maps']] <- edge.maps + ph[['mapped.edge']] <- mapped.edge + ph[['node.states']] <- node.states + # set attributes + attr(ph,'class') <- c("simmap", "phylo") + attr(ph, "map.order") <- "right-to-left" + attr(ph, "order") <- "cladewise" + # delete attributes + if(delete.phyloscanner.structures) + { + attr(ph, "SPLIT") <- NULL + attr(ph, "INDIVIDUAL") <- NULL + attr(ph, "BRANCH_COLOURS") <- NULL + attr(ph, "SUBGRAPH_MRCA") <- NULL + } + ph +} + +#' @title Cast SIMMAP tree to phyloscanner tree +#' @export simmap.to.phyloscanner +#' @importFrom phangorn Ancestors +#' @author Oliver Ratmann +#' @usage simmap.to.phyloscanner(ph, delete.simmap.structures) +#' @param ph A \code{SIMMAP} tree with elements \code{maps}, \code{mapped.edge}, \code{node.states}. +#' @param delete.simmap.structures Logical value. If true \code{SIMMAP} elements are removed from the tree. +#' @return Same tree in \code{phyloscanner.tree} format, with attributes \code{SPLIT}, \code{INDIVIDUAL}, \code{BRANCH_COLOURS}, \code{SUBGRAPH_MRCA}. +#' @seealso \code{\link{phyloscanner.to.simmap}}, \code{\link{extract.subgraph}} +#' @example inst/example/ex.cast.to.simmap.R +simmap.to.phyloscanner <- function(ph, delete.simmap.structures=FALSE) +{ + stopifnot( any(class(ph)=='simmap') ) + # state at each node. set root to Unknown by default + node.states <- vector('character', Nnode(ph, internal.only=FALSE)) + node.states[ph$edge[,2]] <- ph$node.states[,2] + node.states[Ntip(ph)+1] <- 'Unknown' + # find subgraph mrcas + subgraphs <- setdiff(sort(unique(node.states)),'Unknown') + # for each subgraph, find one member node + subgraph.member <- sapply(subgraphs, function(x) which(node.states==x)[1] ) + # for each member, descend in tree and find most ancestral member node, which is the mrca of the subgraph + subgraph.ancestors <- phangorn:::Ancestors(ph, subgraph.member) + subgraph.mrcas <- sapply(seq_along(subgraph.ancestors), function(j){ + subgraph.mrca <- subgraph.member[j] + subgraph.name <- names(subgraph.member)[j] + nodes.in.same.subgraph.idx <- which(node.states[subgraph.ancestors[[j]]]==subgraph.name) + if(length(nodes.in.same.subgraph.idx)) + subgraph.mrca <- subgraph.ancestors[[j]][ tail(nodes.in.same.subgraph.idx,1) ] + subgraph.mrca + }) + names(subgraph.mrcas) <- subgraphs + # make phyloscanner attributes + attr(ph, 'SPLIT') <- node.states + tmp <- rep(FALSE, length(node.states)) + tmp[subgraph.mrcas] <- TRUE + attr(ph, 'SUBGRAPH_MRCA') <- tmp + attr(ph, 'INDIVIDUAL') <- gsub('-SPLIT[0-9]+','',node.states) + tmp <- t( rbind( node.states[ ph$edge[,1] ], + node.states[ ph$edge[,2] ], + node.states[ ph$edge[,2] ] ) ) + tmp[tmp[,1]!=tmp[,2], 3] <- 'Unknown' + tmp <- gsub('-SPLIT[0-9]+','',tmp[,3]) + attr(ph, 'BRANCH_COLOURS') <- rep(NA, Nnode(ph, internal.only=FALSE)) + attr(ph, 'BRANCH_COLOURS')[ ph$edge[,2] ] <- tmp + attr(ph, 'SPLIT')[attr(ph, 'SPLIT')=='Unknown'] <- NA + attr(ph, 'INDIVIDUAL')[attr(ph, 'INDIVIDUAL')=='Unknown'] <- NA + attr(ph, 'BRANCH_COLOURS')[attr(ph, 'BRANCH_COLOURS')=='Unknown'] <- NA + attr(ph, 'SPLIT') <- factor(attr(ph, 'SPLIT')) + attr(ph, 'INDIVIDUAL') <- factor(attr(ph, 'INDIVIDUAL')) + attr(ph, 'BRANCH_COLOURS') <- factor(attr(ph, 'BRANCH_COLOURS')) + if(delete.simmap.structures) + { + ph[["maps"]] <- NULL + ph[["mapped.edge"]] <- NULL + ph[["node.states"]] <- NULL + attr(ph, 'map.order') <- NULL + attr(ph, 'class') <- 'phylo' + } + ph +} + +#' @title Extract subgraph from phyloscanner tree +#' @export extract.subgraph +#' @importFrom phangorn Ancestors +#' @author Oliver Ratmann +#' @usage extract.subgraph(ph, mrca) +#' @param ph A tree of class \code{SIMMAP} and \code{phyloscanner}, i.e. with elements \code{maps}, \code{mapped.edge}, \code{node.states} and with attributes \code{SPLIT}, \code{INDIVIDUAL}, \code{BRANCH_COLOURS}, \code{SUBGRAPH_MRCA}. +#' @param mrca Node index of subgraph MRCA +#' @return Subgraph in ape format, with additional elements \code{subgraph.name} (subgraph name in the phyloscanner SPLIT attribute), \code{subgraph.root.edge} (length of the ancestral edge of the subgraph MRCA), \code{subgraph.parent.state} (ancestral state of the parent of subgraph MRCA). +#' @seealso \code{\link{phyloscanner.to.simmap}} +#' @example inst/example/ex.extract.subgraph.R +extract.subgraph<- function(ph, mrca) +{ + stopifnot( any(class(ph)=='simmap') ) + stopifnot( c("SPLIT","INDIVIDUAL","BRANCH_COLOURS","SUBGRAPH_MRCA")%in%names(attributes(ph)) ) + subgraph.name <- as.character(attr(ph, 'SPLIT')[mrca]) + host <- as.character(attr(ph, "INDIVIDUAL")[mrca]) + subgraph.root.edge <- ph$edge.length[ which( ph$edge[,2]==mrca ) ] + subgraph.parent.state <- as.character(attr(ph, "BRANCH_COLOURS")[ ph$edge[ph$edge[,2]==mrca,1] ]) + # it is possible that: + # the original tree had a subgraph rooted in some state X, but dropping all tips of that subgraph + # gives a tree in which the subgraph of mrca is again rooted in a subgraph of state host + if(!is.na(subgraph.parent.state) && subgraph.parent.state==host) + { + cat('\nFound subgraph.parent.state that is of same state. This may not be an error but check. mrca=',mrca) + subgraph.parent.state <- NA + } + stopifnot( is.na(subgraph.parent.state) || subgraph.parent.state!=host ) + if(mrca<=Ntip(ph)) + { + subgraph<- list( edge= matrix(nrow=0, ncol=2), + edge.length=vector('numeric',0), + Nnode=0, + tip.label=ph$tip.label[mrca], + maps= vector('list',0), + mapped.edge= matrix(nrow=0, ncol=0), + node.states= matrix(nrow=0, ncol=2) + ) + #attr(subgraph,'class') <- c( "phylo") + #attr(subgraph, "map.order") <- "right-to-left" + #attr(subgraph, "order") <- "cladewise" + } + if(mrca>Ntip(ph)) + { + subgraph <- phytools:::extract.clade.simmap(ph, mrca) + subgraph.root <- Ntip(subgraph)+1L + descendants <- Descendants(subgraph, subgraph.root, type='all') + tmp <- sapply(descendants, function(x) which(subgraph$edge[,2]==x)) + descendants.states <- subgraph[['node.states']][tmp,2] + descendants.not.in.subgraph <- descendants[ descendants.states!=subgraph.name ] + if(length(descendants.not.in.subgraph)) + { + # find all tips of descendants.not.in.subgraph, and remove the corresponding tips, which will remove the subtree for each descendant + tips.not.in.subgraph <- sort(unique(unlist(Descendants(subgraph, descendants.not.in.subgraph, type='tips')))) + subgraph <- ape:::drop.tip(subgraph, tips.not.in.subgraph) + subgraph[['maps']] <- NULL + subgraph[['mapped.edge']] <- NULL + subgraph[['node.states']] <- NULL + attr(subgraph,'class') <- c( "phylo") + attr(subgraph, "map.order") <- NULL + attr(subgraph, "order") <- NULL + attr(subgraph, "SPLIT") <- NULL + attr(subgraph, "INDIVIDUAL") <- NULL + attr(subgraph, "BRANCH_COLOURS") <- NULL + attr(subgraph, "SUBGRAPH_MRCA") <- NULL + } + } + subgraph[['subgraph.name']]<- subgraph.name + subgraph[['subgraph.root.edge']]<- subgraph.root.edge + subgraph[['subgraph.parent.state']]<- subgraph.parent.state + subgraph +} \ No newline at end of file diff --git a/phyloscannerR/inst/.DS_Store b/phyloscannerR/inst/.DS_Store index 035a1fc5..3fb4dabd 100644 Binary files a/phyloscannerR/inst/.DS_Store and b/phyloscannerR/inst/.DS_Store differ diff --git a/phyloscannerR/inst/.gitignore b/phyloscannerR/inst/.gitignore new file mode 100644 index 00000000..e43b0f98 --- /dev/null +++ b/phyloscannerR/inst/.gitignore @@ -0,0 +1 @@ +.DS_Store diff --git a/phyloscannerR/inst/example/ex.cast.to.simmap.R b/phyloscannerR/inst/example/ex.cast.to.simmap.R new file mode 100644 index 00000000..7775d63e --- /dev/null +++ b/phyloscannerR/inst/example/ex.cast.to.simmap.R @@ -0,0 +1,40 @@ +# +# Example on data from Rakai Community Cohort Study +# +require(phyloscannerR) +ph.phyloscanner <- readRDS(file=system.file(file.path('extdata','ptyr292_1600_to_1849.rds'),package='phyloscannerR')) + +# cast from phyloscanner format to SIMMAP format +# so that the SIMMAP tree still has the phyloscanner attributes +ph.simmap <- phyloscanner.to.simmap(ph.phyloscanner, delete.phyloscanner.structures=FALSE) +# to delete the phyloscanner attributes, use +ph.simmap <- phyloscanner.to.simmap(ph.phyloscanner, delete.phyloscanner.structures=TRUE) +# +# cast from SIMMAP format to phyloscanner format +# similarly, use delete.simmap.structures to keep/delete SIMMAP elements from the tree in phyloscanner format +ph.phyloscanner2 <- simmap.to.phyloscanner(ph.simmap, delete.simmap.structures=TRUE) +# +# test that ph.phyloscanner2 is the same as ph.phyloscanner: +stopifnot(all( ph.phyloscanner$tip.label == ph.phyloscanner2$tip.label )) +stopifnot(all( ph.phyloscanner$edge == ph.phyloscanner2$edge )) +stopifnot(all( ph.phyloscanner$edge.length == ph.phyloscanner2$edge.length )) +stopifnot(all( ph.phyloscanner$Nnode == ph.phyloscanner2$Nnode )) +tmp <- as.character(attr(ph.phyloscanner,'SPLIT')) +tmp2 <- as.character(attr(ph.phyloscanner2,'SPLIT')) +tmp[is.na(tmp)] <- 'Unknown' +tmp2[is.na(tmp2)] <- 'Unknown' +stopifnot( all(tmp==tmp2) ) +tmp <- as.character(attr(ph.phyloscanner,'INDIVIDUAL')) +tmp2 <- as.character(attr(ph.phyloscanner2,'INDIVIDUAL')) +tmp[is.na(tmp)] <- 'Unknown' +tmp2[is.na(tmp2)] <- 'Unknown' +stopifnot( all(tmp==tmp2) ) +tmp <- as.character(attr(ph.phyloscanner,'BRANCH_COLOURS')) +tmp2 <- as.character(attr(ph.phyloscanner2,'BRANCH_COLOURS')) +tmp[is.na(tmp)] <- 'Unknown' +tmp2[is.na(tmp2)] <- 'Unknown' +stopifnot( all(tmp==tmp2) ) +tmp <- attr(ph.phyloscanner,'SUBGRAPH_MRCA') +tmp2 <- attr(ph.phyloscanner2,'SUBGRAPH_MRCA') +stopifnot( all(tmp==tmp2) ) + \ No newline at end of file diff --git a/phyloscannerR/inst/example/ex.extract.subgraph.R b/phyloscannerR/inst/example/ex.extract.subgraph.R new file mode 100644 index 00000000..2c7c73fa --- /dev/null +++ b/phyloscannerR/inst/example/ex.extract.subgraph.R @@ -0,0 +1,23 @@ +# +# Example on data from Rakai Community Cohort Study +# +require(phyloscannerR) +ph.phyloscanner <- readRDS(file=system.file(file.path('extdata','ptyr292_1600_to_1849.rds'),package='phyloscannerR')) + +# add SIMMAP elements to tree +# because internally the extract.subgraph function uses functions written for +# trees in SIMMAP format +ph.phsc.plus.simmap <- phyloscanner.to.simmap(ph.phyloscanner, delete.phyloscanner.structures=FALSE) + +# extract subgraph MRCAS and choose all those corresponding to a particular host +host <- 'RkA07714M' +mrcas <- which( attr(ph.phsc.plus.simmap, 'SUBGRAPH_MRCA') ) +mrcas <- mrcas[ attr(ph.phsc.plus.simmap, 'INDIVIDUAL')[mrcas]==host ] + +# extract subgraphs +subgraphs <- lapply(mrcas, function(mrca) extract.subgraph(ph.phsc.plus.simmap, mrca)) +# the first subgraph consists of just 1 taxon and is therefore hard to represent as an ape object +# note the dummy structures +# the second subgraph consists of 9 taxa and is in standard ape format +# note the extra elements "subgraph.name", "subgraph.root.edge", "subgraph.parent.state" +str( subgraphs[[2]] ) \ No newline at end of file diff --git a/phyloscannerR/inst/example/ex.transmission.networks.R b/phyloscannerR/inst/example/ex.transmission.networks.R new file mode 100644 index 00000000..1866aadf --- /dev/null +++ b/phyloscannerR/inst/example/ex.transmission.networks.R @@ -0,0 +1,200 @@ +\dontrun{ +require(data.table) +require(tidyverse) +require(phyloscannerR) +workdir<- '~/sandbox/DeepSeqProjects' +setwd(workdir) +# PLEASE NOTE: the output from PART 1+2 below is available through the package +# so these steps can be skipped + + +# +# PART 1: run phyloscanner_analyse_trees on a large number of deepseq trees +# +# prepare analysis: specify path to phyloscanner_analyse_trees +prog.phyloscanner_analyse_trees <- '/Users/Oliver/git/phyloscanner/phyloscanner_analyse_trees.R' +# prepare analysis: download the phyloscanner tree of the Rakai population-based analysis +tmp <- "https://datadryad.org/bitstream/handle/10255/dryad.208473/Dataset_S1.tar?sequence=1" +# prepare analysis: specify directories to untar public data and to store results +tree.dir <- "RakaiPopSample_deepseqtrees" +outdir <- 'RakaiPopSample_phyloscanner_analysis_190706' +# prepare analysis: download and untar +download.file(tmp, destfile="Dataset_S1.tar", method="curl") +untar("Dataset_S1.tar", exdir=tree.dir, extras='-xvf') +# prepare analysis: specify valid input arguments to phyloscanner_analyse_trees +valid.input.args <- cmd.phyloscanner.analyse.trees.valid.args(prog.phyloscanner_analyse_trees) +# prepare analysis: set phyloscanner variables, arguments as used for the Rakai population-based analysis +control <- list() +control$allow.mt <- TRUE +control$alignment.file.directory = NULL +control$alignment.file.regex = NULL +control$blacklist.underrepresented = FALSE +control$count.reads.in.parsimony = TRUE +control$distance.threshold <- '0.025 0.05' +control$do.dual.blacklisting = FALSE +control$duplicate.file.directory = NULL +control$duplicate.file.regex = NULL +control$file.name.regex = "^\\D*([0-9]+)_to_([0-9]+)\\D*$" +control$guess.multifurcation.threshold = FALSE +control$max.reads.per.host = 50 +control$min.reads.per.host <- 30 +control$min.tips.per.host <- 1 +control$multifurcation.threshold = 1e-5 +control$multinomial= TRUE +control$norm.constants = NULL +control$norm.ref.file.name = system.file('HIV_DistanceNormalisationOverGenome.csv',package='phyloscannerR') +control$norm.standardise.gag.pol = TRUE +control$no.progress.bars = TRUE +control$outgroup.name = "REF_CPX_AF460972" +control$output.dir = outdir +control$parsimony.blacklist.k = 20 +control$prune.blacklist = FALSE +control$post.hoc.count.blacklisting= TRUE +control$ratio.blacklist.threshold = 0 +control$raw.blacklist.threshold = 20 +control$recombination.file.directory = NULL +control$recombination.file.regex = NULL +control$relaxed.ancestry = TRUE +control$sankoff.k = 20 +control$sankoff.unassigned.switch.threshold = 0 +control$seed = 42 +control$splits.rule = 's' +control$tip.regex = "^(.*)_fq[0-9]+_read_([0-9]+)_count_([0-9]+)$" +control$tree.file.regex = "^ptyr[0-9]+_InWindow_([0-9]+_to_[0-9]+)\\.tree$" +control$use.ff = FALSE +control$user.blacklist.directory = NULL +control$user.blacklist.file.regex = NULL +control$verbosity = 1 +# prepare analysis: list zipped tree files. One zip file contains the viral trees of individuals in one putative transmission network. +df <- tibble(F=list.files(tree.dir)) +df <- df %>% + mutate(TYPE:= gsub('ptyr([0-9]+)_(.*)','\\2', F), + RUN:= as.integer(gsub('ptyr([0-9]+)_(.*)','\\1', F))) %>% + mutate(TYPE:= gsub('^([^\\.]+)\\.[a-z]+$','\\1',TYPE)) %>% + spread(TYPE, F) %>% + set_names(~ str_to_upper(.)) +# make bash scripts: one each for processing the viral trees of individuals in one putative transmission network +cmds <- vector('list',nrow(df)) +for(i in seq_len(nrow(df))) +{ + control$output.string <- paste0('ptyr',df$RUN[i]) + tree.input <- file.path(indir, df$TREES_NEWICK[i]) + cmd <- phyloscannerR:::cmd.phyloscanner.analyse.trees(prog.phyloscanner_analyse_trees, + tree.input, + control, + valid.input.args=valid.input.args) + cmds[[i]] <- cmd +} +# submit bash scripts: make PBS header +hpc.load <- "module load anaconda3/personal" # make third party requirements available +hpc.select <- 1 # number of nodes +hpc.nproc <- 1 # number of processors on node +hpc.walltime <- 23 # walltime +hpc.q <- "pqeelab" # PBS queue +hpc.mem <- "6gb" # RAM +hpc.array <- length(cmds) # number of runs for job array +pbshead <- "#!/bin/sh" +tmp <- paste("#PBS -l walltime=", hpc.walltime, ":59:00,pcput=", hpc.walltime, ":45:00", sep = "") +pbshead <- paste(pbshead, tmp, sep = "\n") +tmp <- paste("#PBS -l select=", hpc.select, ":ncpus=", hpc.nproc,":mem=", hpc.mem, sep = "") +pbshead <- paste(pbshead, tmp, sep = "\n") +pbshead <- paste(pbshead, "#PBS -j oe", sep = "\n") +if(!is.na(hpc.array)) + pbshead <- paste(pbshead, "\n#PBS -J 1-", hpc.array, sep='') +if(!is.na(hpc.q)) + pbshead <- paste(pbshead, paste("#PBS -q", hpc.q), sep = "\n") +pbshead <- paste(pbshead, hpc.load, sep = "\n") +cat(pbshead) +# submit bash scripts: make array job +for(i in 1:length(cmds)) + cmds[[i]]<- paste0(i,')\n',cmds[[i]],';;\n') +cmd <- paste0('case $PBS_ARRAY_INDEX in\n',paste0(cmds, collapse=''),'esac') +cmd <- paste(pbshead,cmd,sep='\n') +# submit bash scripts: submit job +outfile <- gsub(':','',paste("phsc",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),'sh',sep='.')) +outfile <- file.path(tmpdir, outfile) +cat(cmd, file=outfile) +cmd <- paste("qsub", outfile) +cat(system(cmd, intern= TRUE)) + + + +# +# PART 2: find pairs in phylogenetic transmission networks +# + + + +# find network pairs: get meta-data +meta.file <- file.path(workdir,"RakaiPopSample_data","Dataset_S2.csv") +tmp <- "https://datadryad.org/bitstream/handle/10255/dryad.208474/Dataset_S2.csv?sequence=2" +download.file(tmp, destfile=meta.file, method="curl") +dmeta <- as_tibble(read.csv(meta.file, stringsAsFactors=FALSE)) +# find network pairs: process analyse_tree output with file names "ptyr[0-9]+_workspace.rda" +indir <- file.path(workdir,"RakaiPopSample_phyloscanner_analysis_190706") +control <- list( linked.group='close.and.adjacent.cat', + linked.no='not.close.or.nonadjacent', + linked.yes='close.and.adjacent', + conf.cut=0.6, + neff.cut=3, + weight.complex.or.no.ancestry=0.5) +# process phyloscanner runs +batch.regex <- '^ptyr([0-9]+)_.*' +infiles <- tibble(F=list.files(indir, pattern='workspace.rda', full.names=TRUE)) %>% + mutate( PTY_RUN:= as.integer(gsub(batch.regex,'\\1',basename(F))) ) %>% + arrange(PTY_RUN) +dpls <- vector('list', nrow(infiles)) +dcs <- vector('list', nrow(infiles)) +dws <- vector('list', nrow(infiles)) +for(i in seq_len(nrow(infiles))) +{ + if(verbose) cat('\nReading ', infiles$F[i]) + tmp <- load( infiles$F[i] ) + # ensure we have multinomial output in workspace + if(!'dc'%in%tmp) + stop('Cannot find object "dc". Check that Analyze.trees was run with multinomial=TRUE.') + if(!'dwin'%in%tmp) + stop('Cannot find object "dwin". Check that Analyze.trees was run with multinomial=TRUE.') + # find pairs + tmp <- phyloscannerR:::find.pairs.in.networks(dwin, dc, control=control, verbose=TRUE) + # store output for each run + dpls[[i]] <- copy(tmp$network.pairs) + dcs[[i]] <- copy(tmp$relationship.counts) + dws[[i]] <- copy(tmp$windows) + gc() +} +dpl <- do.call('rbind', dpls) +dc <- do.call('rbind', dcs) +dw <- do.call('rbind', dwins) +dpls <- dcs <- dwins <- NULL +# select analysis in which each pair has highest neff +if(verbose) cat('\nIf pairs are in several batches, select batch with most deep-sequence phylogenies...') +tmp <- dc %>% + filter( CATEGORISATION==control$linked.group & TYPE==control$linked.yes ) %>% + group_by(H1,H2) %>% + summarise(PTY_RUN= PTY_RUN[which.max(N_EFF)]) %>% + ungroup() +dc <- tmp %>% left_join(dc, by=c('H1','H2','PTY_RUN')) +dw <- tmp %>% left_join(dw, by=c('H1','H2','PTY_RUN')) +dpl <- tmp %>% left_join(dpl, by=c('H1','H2','PTY_RUN')) +if(verbose) cat('\nLeft with pairs between whom linkage is not excluded phylogenetically, n=', nrow(dpls)) +# find network pairs: save +outfile <- file.path(workdir, "RakaiPopSample_phyloscanner_analysis_190706", "Rakai_phscnetworks_allpairs_190706.RData") +save(dpl, dc, dw, file=outfile) + + + +# +# PART 3: build transmission networks from constituting pairs +# + + +infile <- system.file(file.path('extdata','Rakai_phscnetworks_allpairs_190706.RData'),package='phyloscannerR') +load(infile) +# loads phyloscanner relationship counts 'dc' +control <- list(linked.group='close.and.adjacent.cat',linked.no='not.close.or.nonadjacent',linked.yes='close.and.adjacent', neff.cut=3) +tmp <- find.networks(dc, control=control, verbose=TRUE) +dnet <- copy(tmp$transmission.networks) +dchain <- copy(tmp$most.likely.transmission.chains) + +} diff --git a/phyloscannerR/inst/example/ex.transmission.networks.plotting.R b/phyloscannerR/inst/example/ex.transmission.networks.plotting.R new file mode 100644 index 00000000..08858f22 --- /dev/null +++ b/phyloscannerR/inst/example/ex.transmission.networks.plotting.R @@ -0,0 +1,57 @@ +\dontrun{ +require(data.table) +require(tidyverse) +require(phyloscannerR) + +# +# reconstruct transmission networks and most likely transmission chain +# +infile <- system.file(file.path('extdata','Rakai_phscnetworks_allpairs_190706.RData'),package='phyloscannerR') +load(infile) # loads phyloscanner relationship counts 'dc' +tmp <- find.networks(dc, neff.cut=3, verbose=TRUE) +dnet <- copy(tmp$transmission.networks) +dchain <- copy(tmp$most.likely.transmission.chains) + +# get meta data +meta.file <- file.path(workdir,"RakaiPopSample_data","Dataset_S2.csv") +tmp <- "https://datadryad.org/bitstream/handle/10255/dryad.208474/Dataset_S2.csv?sequence=2" +download.file(tmp, destfile=meta.file, method="curl") +dmeta <- as_tibble(read.csv(meta.file, stringsAsFactors=FALSE)) + +# +# plot network with host names, and male/female in blue/pink nodes, +# and highlight pairings with >60% support for phylogenetic linkage +# +idclus <- sort(unique(dnet$IDCLU)) +di <- copy(dmeta) +setnames(di, 'ID', 'H') +df <- dnet %>% + filter(IDCLU == 34) %>% + select(-c(H1_SEX,H2_SEX)) +control<- list() +control$point.size = 10 +control$edge.gap = 0.04 +control$edge.size = 2 +control$curvature = -0.2 +control$arrow = arrow(length = unit(0.04, "npc"), type = "open") +control$curv.shift = 0.06 +control$label.size = 3 +control$node.label = "H" +control$node.fill = "SEX" +control$node.shape = NA_character_ +control$node.shape.values = c(`NA` = 16) +control$node.fill.values = c(F = "hotpink2", M = "steelblue2") +control$threshold.linked = 0.6 +p <- plot.network(df, di, control) + +# +# plot most likely chain with host names, and male/female in blue/pink nodes, +# and highlight pairings with >60% support for phylogenetic linkage +# +control$layout <- p$layout +di <- copy(dmeta) +setnames(di, 'ID', 'H') +df <- subset(dchain, IDCLU==34) +p2 <- plot.chain(df, di, control) + +} \ No newline at end of file diff --git a/phyloscannerR/inst/extdata/Rakai_phscnetworks_allpairs_190706.RData b/phyloscannerR/inst/extdata/Rakai_phscnetworks_allpairs_190706.RData new file mode 100644 index 00000000..e7980f77 Binary files /dev/null and b/phyloscannerR/inst/extdata/Rakai_phscnetworks_allpairs_190706.RData differ diff --git a/phyloscannerR/inst/extdata/ptyr292_1600_to_1849.rds b/phyloscannerR/inst/extdata/ptyr292_1600_to_1849.rds new file mode 100644 index 00000000..103f37b2 Binary files /dev/null and b/phyloscannerR/inst/extdata/ptyr292_1600_to_1849.rds differ diff --git a/phyloscannerR/man/cmd.phyloscanner.analyse.trees.Rd b/phyloscannerR/man/cmd.phyloscanner.analyse.trees.Rd new file mode 100644 index 00000000..e1baa7e3 --- /dev/null +++ b/phyloscannerR/man/cmd.phyloscanner.analyse.trees.Rd @@ -0,0 +1,133 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/command_line_utilities.R +\name{cmd.phyloscanner.analyse.trees} +\alias{cmd.phyloscanner.analyse.trees} +\title{Make script file for a phyloscanner analysis on a tree or set of trees} +\usage{ +cmd.phyloscanner.analyse.trees(prog.phyloscanner_analyse_trees, tree.input, + control, + valid.input.args = cmd.phyloscanner.analyse.trees.valid.args(prog.phyloscanner_analyse_trees)) +} +\arguments{ +\item{prog.phyloscanner_analyse_trees}{The full file name of \code{phyloscanner_analyse_trees.R}.} + +\item{tree.input}{One of the following: the name of a single tree file (Newick or NEXUS format); the directory containing all input trees; a zip file containing input trees.} + +\item{control}{List of input arguments to \link{\code{phyloscanner_analyse_trees}}.} + +\item{valid.input.args}{Vector of valid input arguments.} +} +\value{ +A character string of UNIX commands. +} +\description{ +This function makes a UNIX script file to call \code{phyloscanner_analyse_trees.R}. Usually, this is useful to parallelise computations; see the Examples. +} +\examples{ +\dontrun{ +require(data.table) +require(tidyverse) +require(phyloscannerR) + +# specify path to phyloscanner_analyse_trees +prog.phyloscanner_analyse_trees <- '/Users/Oliver/git/phyloscanner/phyloscanner_analyse_trees.R' +# specify out directory +outdir <- '/Users/Oliver/sandbox/DeepSeqProjects/RakaiPopSample_phsc_out190512' +# specify valid input arguments to phyloscanner_analyse_trees +valid.input.args <- cmd.phyloscanner.analyse.trees.valid.args(prog.phyloscanner_analyse_trees) + +# set phyloscanner variables +# arguments as used for the Rakai population-based analysis +control <- list() +control$allow.mt <- TRUE +control$alignment.file.directory = NULL +control$alignment.file.regex = NULL +control$blacklist.underrepresented = FALSE +control$count.reads.in.parsimony = TRUE +control$distance.threshold <- '0.025 0.05' +control$do.dual.blacklisting = FALSE +control$duplicate.file.directory = NULL +control$duplicate.file.regex = NULL +control$file.name.regex = "^\\\\D*([0-9]+)_to_([0-9]+)\\\\D*$" +control$guess.multifurcation.threshold = FALSE +control$max.reads.per.host = 50 +control$min.reads.per.host <- 30 +control$min.tips.per.host <- 1 +control$multifurcation.threshold = 1e-5 +control$multinomial= TRUE +control$norm.constants = NULL +control$norm.ref.file.name = system.file('HIV_DistanceNormalisationOverGenome.csv',package='phyloscannerR') +control$norm.standardise.gag.pol = TRUE +control$no.progress.bars = TRUE +control$outgroup.name = "REF_CPX_AF460972" +control$output.dir = outdir +control$parsimony.blacklist.k = 20 +control$prune.blacklist = FALSE +control$post.hoc.count.blacklisting= TRUE +control$ratio.blacklist.threshold = 0 +control$raw.blacklist.threshold = 20 +control$recombination.file.directory = NULL +control$recombination.file.regex = NULL +control$relaxed.ancestry = TRUE +control$sankoff.k = 20 +control$sankoff.unassigned.switch.threshold = 0 +control$seed = 42 +control$splits.rule = 's' +control$tip.regex = "^(.*)_fq[0-9]+_read_([0-9]+)_count_([0-9]+)$" +control$tree.file.regex = "^ptyr[0-9]+_InWindow_([0-9]+_to_[0-9]+)\\\\.tree$" +control$use.ff = FALSE +control$user.blacklist.directory = NULL +control$user.blacklist.file.regex = NULL +control$verbosity = 1 + +# +# Example 1: make bash for one file +# +tree.input <- system.file(file.path('extdata','Rakai_run192_trees.zip'),package='phyloscannerR') +control$output.string <- 'Rakai_run192' +cmd <- cmd.phyloscanner.analyse.trees(prog.phyloscanner_analyse_trees, + tree.input, + control, + valid.input.args=valid.input.args) +cat(cmd) + +# +# Example 2: make bash for many files +# +# download the phyloscanner tree of the Rakai population-based analysis +tmp <- "https://datadryad.org/bitstream/handle/10255/dryad.208473/Dataset_S1.tar?sequence=1" +# specify directory to untar public data +tree.dir <- "RakaiPopSample_deepseqtrees" +# download and untar +download.file(tmp, destfile="Dataset_S1.tar", method="curl") +untar("Dataset_S1.tar", exdir=tree.dir, extras='-xvf') +# list zipped tree files. One zip file contains the viral trees of individuals in one putative transmission network. +df <- tibble(F=list.files(tree.dir)) +df <- df \%>\% + mutate(TYPE:= gsub('ptyr([0-9]+)_(.*)','\\\\2', F), + RUN:= as.integer(gsub('ptyr([0-9]+)_(.*)','\\\\1', F))) \%>\% + mutate(TYPE:= gsub('^([^\\\\.]+)\\\\.[a-z]+$','\\\\1',TYPE)) \%>\% + spread(TYPE, F) \%>\% + set_names(~ str_to_upper(.)) +# make one bash script for processing the viral trees of individuals in one putative transmission network. +cmds <- vector('list',nrow(df)) +for(i in seq_len(nrow(df))) +{ + control$output.string <- paste0('ptyr',df$RUN[i]) + tree.input <- file.path(indir, df$TREES_NEWICK[i]) + cmd <- cmd.phyloscanner.analyse.trees(prog.phyloscanner_analyse_trees, + tree.input, + control, + valid.input.args=valid.input.args) + cmds[[i]] <- cmd +} +# output +cat(cmds[[100]]) +} +} +\seealso{ +\link{\code{phyloscanner.analyse.trees}}, \link{\code{cmd.phyloscanner.analyse.trees.valid.args}} +} +\author{ +Oliver Ratmann +} diff --git a/phyloscannerR/man/cmd.phyloscanner.analyse.trees.valid.args.Rd b/phyloscannerR/man/cmd.phyloscanner.analyse.trees.valid.args.Rd new file mode 100644 index 00000000..9d4e986e --- /dev/null +++ b/phyloscannerR/man/cmd.phyloscanner.analyse.trees.valid.args.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/command_line_utilities.R +\name{cmd.phyloscanner.analyse.trees.valid.args} +\alias{cmd.phyloscanner.analyse.trees.valid.args} +\title{Obtain valid input arguments for a phyloscanner analysis on a tree or set of trees} +\usage{ +cmd.phyloscanner.analyse.trees.valid.args(prog.phyloscanner_analyse_trees) +} +\arguments{ +\item{prog.phyloscanner_analyse_trees}{The full file name of \code{phyloscanner_analyse_trees.R}.} +} +\description{ +Obtain valid input arguments for a phyloscanner analysis on a tree or set of trees +} +\seealso{ +\link{\code{cmd.phyloscanner.analyse.trees}} +} diff --git a/phyloscannerR/man/count.pairwise.relationships.Rd b/phyloscannerR/man/count.pairwise.relationships.Rd index b72d7415..7e7fb7a8 100644 --- a/phyloscannerR/man/count.pairwise.relationships.Rd +++ b/phyloscannerR/man/count.pairwise.relationships.Rd @@ -26,7 +26,7 @@ require(phyloscannerR) # continue Rakai example, # load phyloscanner output from 'phyloscanner.analyse.trees' # -file <- system.file(file.path('extdata','ptyr192_phsc_analyse_trees_output.R'),package='phyloscannerR') +file <- system.file(file.path('extdata','ptyr192_phsc_analyse_trees_output.Rdata'),package='phyloscannerR') load(file) #loads 'phsc', output from 'phyloscanner.analyse.trees' # use distance thresholds found in analysis of Rakai couples close.threshold <- 0.025 diff --git a/phyloscannerR/man/extract.subgraph.Rd b/phyloscannerR/man/extract.subgraph.Rd new file mode 100644 index 00000000..d099557b --- /dev/null +++ b/phyloscannerR/man/extract.subgraph.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tree_utility_functions.R +\name{extract.subgraph} +\alias{extract.subgraph} +\title{Extract subgraph from phyloscanner tree} +\usage{ +extract.subgraph(ph, mrca) +} +\arguments{ +\item{ph}{A tree of class \code{SIMMAP} and \code{phyloscanner}, i.e. with elements \code{maps}, \code{mapped.edge}, \code{node.states} and with attributes \code{SPLIT}, \code{INDIVIDUAL}, \code{BRANCH_COLOURS}, \code{SUBGRAPH_MRCA}.} + +\item{mrca}{Node index of subgraph MRCA} +} +\value{ +Subgraph in ape format, with additional elements \code{subgraph.name} (subgraph name in the phyloscanner SPLIT attribute), \code{subgraph.root.edge} (length of the ancestral edge of the subgraph MRCA), \code{subgraph.parent.state} (ancestral state of the parent of subgraph MRCA). +} +\description{ +Extract subgraph from phyloscanner tree +} +\examples{ +# +# Example on data from Rakai Community Cohort Study +# +require(phyloscannerR) +ph.phyloscanner <- readRDS(file=system.file(file.path('extdata','ptyr292_1600_to_1849.rds'),package='phyloscannerR')) + +# add SIMMAP elements to tree +# because internally the extract.subgraph function uses functions written for +# trees in SIMMAP format +ph.phsc.plus.simmap <- phyloscanner.to.simmap(ph.phyloscanner, delete.phyloscanner.structures=FALSE) + +# extract subgraph MRCAS and choose all those corresponding to a particular host +host <- 'RkA07714M' +mrcas <- which( attr(ph.phsc.plus.simmap, 'SUBGRAPH_MRCA') ) +mrcas <- mrcas[ attr(ph.phsc.plus.simmap, 'INDIVIDUAL')[mrcas]==host ] + +# extract subgraphs +subgraphs <- lapply(mrcas, function(mrca) extract.subgraph(ph.phsc.plus.simmap, mrca)) +# the first subgraph consists of just 1 taxon and is therefore hard to represent as an ape object +# note the dummy structures +# the second subgraph consists of 9 taxa and is in standard ape format +# note the extra elements "subgraph.name", "subgraph.root.edge", "subgraph.parent.state" +str( subgraphs[[2]] ) +} +\seealso{ +\code{\link{phyloscanner.to.simmap}} +} +\author{ +Oliver Ratmann +} diff --git a/phyloscannerR/man/find.most.likely.chains.RBGLedmonds.Rd b/phyloscannerR/man/find.most.likely.chains.RBGLedmonds.Rd new file mode 100644 index 00000000..1a2fa6f0 --- /dev/null +++ b/phyloscannerR/man/find.most.likely.chains.RBGLedmonds.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/transmission_network_functions.R +\name{find.most.likely.chains.RBGLedmonds} +\alias{find.most.likely.chains.RBGLedmonds} +\title{Find most likely transmission chains} +\usage{ +find.most.likely.chains.RBGLedmonds(rtnn, + weight.complex.or.no.ancestry = 0.5, verbose = 0) +} +\arguments{ +\item{rtnn}{tibble encoding the three edges of a phyloscanner transmission network. Must have columns 'H1','H2','IDCLU','TYPE','SCORE','K_EFF'.} + +\item{weight.complex.or.no.ancestry}{weight given to score complex.or.no.ancestry, default: 50%} +} +\value{ +tibble encoding the most likely chain. Has columns 'H1','H2','IDCLU', 'LINK_12', 'LINK_21' (either 1 or 0 for a link in the corresponding direction), and 'MX_PROB_12', 'MX_PROB_21' (associated posterior probabilities) +} +\description{ +This function finds the most likely transmission +chain among all chains spanning the nodes in a specified transmission network. +The transmission network consists of at most three edges between a set of +individuals (directed edge in either direction, and undirected edge). Chains +are defined as spanning graphs through the set of nodes, without loops and with +in-degree equal to one for all nodes, except the start node. Each directed edge +in a chain has a weight, which corresponds to the phylogenetic evidence of transmission +in this direction. It is set to the phyloscanner score for transmission in this +direction plus half the phyloscanner score of the undirected edge, for transmission +with direction unclear. The probability of the entire chain is given by the product +of the phyloscanner scores along each edge in the chain. +Edmonds algorithm is used to solve for the most probable chain. +} +\seealso{ +\code{\link{find.networks}} +} +\author{ +Oliver Ratmann +} +\keyword{internal} diff --git a/phyloscannerR/man/find.networks.Rd b/phyloscannerR/man/find.networks.Rd new file mode 100644 index 00000000..221eb4f1 --- /dev/null +++ b/phyloscannerR/man/find.networks.Rd @@ -0,0 +1,264 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/transmission_network_functions.R +\name{find.networks} +\alias{find.networks} +\title{Find phylogenetic transmission networks and most likely transmission chain} +\usage{ +find.networks(dc, control = list(linked.group = "close.and.adjacent.cat", + linked.no = "not.close.or.nonadjacent", linked.yes = + "close.and.adjacent", neff.cut = 3, weight.complex.or.no.ancestry = 0.5), + verbose = TRUE) +} +\arguments{ +\item{dc}{Summary of phylogenetic relationship counts for each pair, stored as tibble.} + +\item{control}{List of control variables: +\itemize{ + \item{\code{linked.group}} Phyloscanner classification used to identify pairs in networks. Default 'close.and.adjacent.cat'. + \item{\code{linked.no}} Phyloscanner classification type quantifying that pairs are not linked. Default 'not.close.or.nonadjacent'. + \item{\code{linked.yes}} Phyloscanner classification type quantifying that pairs are linked. Default 'close.and.adjacent'. + \item{\code{neff.cut}} Threshold on the minimum number of deep-sequence phylogenies with sufficient reads from two individuals to make any phylogenetic inferences. Default: 3. + \item{\code{weight.complex.or.no.ancestry}} Weight given to score complex.or.no.ancestry. Default: 50%. +}} + +\item{verbose}{Flag to switch on/off verbose mode. Default: TRUE.} +} +\value{ +list of two R objects +\itemize{ + \item{\code{transmission.networks}} is a tibble that describes the edge list of pairs of individuals in a network, and corresponding phyloscanner scores + \item{\code{most.likely.transmission.chains}} is a tibble that describes the edge list of pairs of individuals in the most likely chain, and corresponding phyloscanner scores +} +See description. +} +\description{ +This function computes transmission networks from phyloscanner output of a population-based deep sequence sample. +A transmission network is defined as a set of individuals between whom phylogenetic linkage is not excluded. +Every individual in the network has at least one partner in the network between whom evidence for being phylogenetically unlinked is below a threshold. +These pairs of individuals are identified with a separate function, \code{\link{find.pairs.in.networks}}. +Due to the nature of the deep-sequence +data, there are up to three edges between pairs of individuals, giving the strength of evidence +of spread in each direction (two possibilities) and the strength of evidence for phylogenetic linkage with +the direction remaining unclear. Some of these pairs have limited evidence for phylogenetic linkage. The networks +are best interpreted as partially sampled transmission chain. + +This function also finds the most likely transmission +chain among all chains spanning the nodes in a specified transmission network. +The transmission network consists of at most three edges between a set of +individuals (directed edge in either direction, and undirected edge). Chains +are defined as spanning graphs through the set of nodes, without loops and with +in-degree equal to one for all nodes, except the start node. Each directed edge +in a chain has a weight, which corresponds to the phylogenetic evidence of transmission +in this direction. It is set to the phyloscanner score for transmission in this +direction plus half the phyloscanner score of the undirected edge, for transmission +with direction unclear. The probability of the entire chain is given by the product +of the phyloscanner scores along each edge in the chain. +} +\examples{ +\dontrun{ +require(data.table) +require(tidyverse) +require(phyloscannerR) +workdir<- '~/sandbox/DeepSeqProjects' +setwd(workdir) +# PLEASE NOTE: the output from PART 1+2 below is available through the package +# so these steps can be skipped + + +# +# PART 1: run phyloscanner_analyse_trees on a large number of deepseq trees +# +# prepare analysis: specify path to phyloscanner_analyse_trees +prog.phyloscanner_analyse_trees <- '/Users/Oliver/git/phyloscanner/phyloscanner_analyse_trees.R' +# prepare analysis: download the phyloscanner tree of the Rakai population-based analysis +tmp <- "https://datadryad.org/bitstream/handle/10255/dryad.208473/Dataset_S1.tar?sequence=1" +# prepare analysis: specify directories to untar public data and to store results +tree.dir <- "RakaiPopSample_deepseqtrees" +outdir <- 'RakaiPopSample_phyloscanner_analysis_190706' +# prepare analysis: download and untar +download.file(tmp, destfile="Dataset_S1.tar", method="curl") +untar("Dataset_S1.tar", exdir=tree.dir, extras='-xvf') +# prepare analysis: specify valid input arguments to phyloscanner_analyse_trees +valid.input.args <- cmd.phyloscanner.analyse.trees.valid.args(prog.phyloscanner_analyse_trees) +# prepare analysis: set phyloscanner variables, arguments as used for the Rakai population-based analysis +control <- list() +control$allow.mt <- TRUE +control$alignment.file.directory = NULL +control$alignment.file.regex = NULL +control$blacklist.underrepresented = FALSE +control$count.reads.in.parsimony = TRUE +control$distance.threshold <- '0.025 0.05' +control$do.dual.blacklisting = FALSE +control$duplicate.file.directory = NULL +control$duplicate.file.regex = NULL +control$file.name.regex = "^\\\\D*([0-9]+)_to_([0-9]+)\\\\D*$" +control$guess.multifurcation.threshold = FALSE +control$max.reads.per.host = 50 +control$min.reads.per.host <- 30 +control$min.tips.per.host <- 1 +control$multifurcation.threshold = 1e-5 +control$multinomial= TRUE +control$norm.constants = NULL +control$norm.ref.file.name = system.file('HIV_DistanceNormalisationOverGenome.csv',package='phyloscannerR') +control$norm.standardise.gag.pol = TRUE +control$no.progress.bars = TRUE +control$outgroup.name = "REF_CPX_AF460972" +control$output.dir = outdir +control$parsimony.blacklist.k = 20 +control$prune.blacklist = FALSE +control$post.hoc.count.blacklisting= TRUE +control$ratio.blacklist.threshold = 0 +control$raw.blacklist.threshold = 20 +control$recombination.file.directory = NULL +control$recombination.file.regex = NULL +control$relaxed.ancestry = TRUE +control$sankoff.k = 20 +control$sankoff.unassigned.switch.threshold = 0 +control$seed = 42 +control$splits.rule = 's' +control$tip.regex = "^(.*)_fq[0-9]+_read_([0-9]+)_count_([0-9]+)$" +control$tree.file.regex = "^ptyr[0-9]+_InWindow_([0-9]+_to_[0-9]+)\\\\.tree$" +control$use.ff = FALSE +control$user.blacklist.directory = NULL +control$user.blacklist.file.regex = NULL +control$verbosity = 1 +# prepare analysis: list zipped tree files. One zip file contains the viral trees of individuals in one putative transmission network. +df <- tibble(F=list.files(tree.dir)) +df <- df \%>\% + mutate(TYPE:= gsub('ptyr([0-9]+)_(.*)','\\\\2', F), + RUN:= as.integer(gsub('ptyr([0-9]+)_(.*)','\\\\1', F))) \%>\% + mutate(TYPE:= gsub('^([^\\\\.]+)\\\\.[a-z]+$','\\\\1',TYPE)) \%>\% + spread(TYPE, F) \%>\% + set_names(~ str_to_upper(.)) +# make bash scripts: one each for processing the viral trees of individuals in one putative transmission network +cmds <- vector('list',nrow(df)) +for(i in seq_len(nrow(df))) +{ + control$output.string <- paste0('ptyr',df$RUN[i]) + tree.input <- file.path(indir, df$TREES_NEWICK[i]) + cmd <- phyloscannerR:::cmd.phyloscanner.analyse.trees(prog.phyloscanner_analyse_trees, + tree.input, + control, + valid.input.args=valid.input.args) + cmds[[i]] <- cmd +} +# submit bash scripts: make PBS header +hpc.load <- "module load anaconda3/personal" # make third party requirements available +hpc.select <- 1 # number of nodes +hpc.nproc <- 1 # number of processors on node +hpc.walltime <- 23 # walltime +hpc.q <- "pqeelab" # PBS queue +hpc.mem <- "6gb" # RAM +hpc.array <- length(cmds) # number of runs for job array +pbshead <- "#!/bin/sh" +tmp <- paste("#PBS -l walltime=", hpc.walltime, ":59:00,pcput=", hpc.walltime, ":45:00", sep = "") +pbshead <- paste(pbshead, tmp, sep = "\\n") +tmp <- paste("#PBS -l select=", hpc.select, ":ncpus=", hpc.nproc,":mem=", hpc.mem, sep = "") +pbshead <- paste(pbshead, tmp, sep = "\\n") +pbshead <- paste(pbshead, "#PBS -j oe", sep = "\\n") +if(!is.na(hpc.array)) + pbshead <- paste(pbshead, "\\n#PBS -J 1-", hpc.array, sep='') +if(!is.na(hpc.q)) + pbshead <- paste(pbshead, paste("#PBS -q", hpc.q), sep = "\\n") +pbshead <- paste(pbshead, hpc.load, sep = "\\n") +cat(pbshead) +# submit bash scripts: make array job +for(i in 1:length(cmds)) + cmds[[i]]<- paste0(i,')\\n',cmds[[i]],';;\\n') +cmd <- paste0('case $PBS_ARRAY_INDEX in\\n',paste0(cmds, collapse=''),'esac') +cmd <- paste(pbshead,cmd,sep='\\n') +# submit bash scripts: submit job +outfile <- gsub(':','',paste("phsc",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),'sh',sep='.')) +outfile <- file.path(tmpdir, outfile) +cat(cmd, file=outfile) +cmd <- paste("qsub", outfile) +cat(system(cmd, intern= TRUE)) + + + +# +# PART 2: find pairs in phylogenetic transmission networks +# + + + +# find network pairs: get meta-data +meta.file <- file.path(workdir,"RakaiPopSample_data","Dataset_S2.csv") +tmp <- "https://datadryad.org/bitstream/handle/10255/dryad.208474/Dataset_S2.csv?sequence=2" +download.file(tmp, destfile=meta.file, method="curl") +dmeta <- as_tibble(read.csv(meta.file, stringsAsFactors=FALSE)) +# find network pairs: process analyse_tree output with file names "ptyr[0-9]+_workspace.rda" +indir <- file.path(workdir,"RakaiPopSample_phyloscanner_analysis_190706") +control <- list( linked.group='close.and.adjacent.cat', + linked.no='not.close.or.nonadjacent', + linked.yes='close.and.adjacent', + conf.cut=0.6, + neff.cut=3, + weight.complex.or.no.ancestry=0.5) +# process phyloscanner runs +batch.regex <- '^ptyr([0-9]+)_.*' +infiles <- tibble(F=list.files(indir, pattern='workspace.rda', full.names=TRUE)) \%>\% + mutate( PTY_RUN:= as.integer(gsub(batch.regex,'\\\\1',basename(F))) ) \%>\% + arrange(PTY_RUN) +dpls <- vector('list', nrow(infiles)) +dcs <- vector('list', nrow(infiles)) +dws <- vector('list', nrow(infiles)) +for(i in seq_len(nrow(infiles))) +{ + if(verbose) cat('\\nReading ', infiles$F[i]) + tmp <- load( infiles$F[i] ) + # ensure we have multinomial output in workspace + if(!'dc'\%in\%tmp) + stop('Cannot find object "dc". Check that Analyze.trees was run with multinomial=TRUE.') + if(!'dwin'\%in\%tmp) + stop('Cannot find object "dwin". Check that Analyze.trees was run with multinomial=TRUE.') + # find pairs + tmp <- phyloscannerR:::find.pairs.in.networks(dwin, dc, control=control, verbose=TRUE) + # store output for each run + dpls[[i]] <- copy(tmp$network.pairs) + dcs[[i]] <- copy(tmp$relationship.counts) + dws[[i]] <- copy(tmp$windows) + gc() +} +dpl <- do.call('rbind', dpls) +dc <- do.call('rbind', dcs) +dw <- do.call('rbind', dwins) +dpls <- dcs <- dwins <- NULL +# select analysis in which each pair has highest neff +if(verbose) cat('\\nIf pairs are in several batches, select batch with most deep-sequence phylogenies...') +tmp <- dc \%>\% + filter( CATEGORISATION==control$linked.group & TYPE==control$linked.yes ) \%>\% + group_by(H1,H2) \%>\% + summarise(PTY_RUN= PTY_RUN[which.max(N_EFF)]) \%>\% + ungroup() +dc <- tmp \%>\% left_join(dc, by=c('H1','H2','PTY_RUN')) +dw <- tmp \%>\% left_join(dw, by=c('H1','H2','PTY_RUN')) +dpl <- tmp \%>\% left_join(dpl, by=c('H1','H2','PTY_RUN')) +if(verbose) cat('\\nLeft with pairs between whom linkage is not excluded phylogenetically, n=', nrow(dpls)) +# find network pairs: save +outfile <- file.path(workdir, "RakaiPopSample_phyloscanner_analysis_190706", "Rakai_phscnetworks_allpairs_190706.RData") +save(dpl, dc, dw, file=outfile) + + + +# +# PART 3: build transmission networks from constituting pairs +# + + +infile <- system.file(file.path('extdata','Rakai_phscnetworks_allpairs_190706.RData'),package='phyloscannerR') +load(infile) +# loads phyloscanner relationship counts 'dc' +control <- list(linked.group='close.and.adjacent.cat',linked.no='not.close.or.nonadjacent',linked.yes='close.and.adjacent', neff.cut=3) +tmp <- find.networks(dc, control=control, verbose=TRUE) +dnet <- copy(tmp$transmission.networks) +dchain <- copy(tmp$most.likely.transmission.chains) + +} +} +\seealso{ +\code{\link{find.pairs.in.networks}}, \code{\link{plot.network}}, \code{\link{plot.chain}} +} +\author{ +Oliver Ratmann +} diff --git a/phyloscannerR/man/find.pairs.in.networks.Rd b/phyloscannerR/man/find.pairs.in.networks.Rd new file mode 100644 index 00000000..c7e2e227 --- /dev/null +++ b/phyloscannerR/man/find.pairs.in.networks.Rd @@ -0,0 +1,248 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/transmission_network_functions.R +\name{find.pairs.in.networks} +\alias{find.pairs.in.networks} +\title{Find pairs of individuals between whom linkage is not excluded phylogenetically} +\usage{ +find.pairs.in.networks(dwin, dc, control = list(linked.group = + "close.and.adjacent.cat", linked.no = "not.close.or.nonadjacent", + linked.yes = "close.and.adjacent", conf.cut = 0.6, neff.cut = 3), + dmeta = NULL, verbose = TRUE) +} +\arguments{ +\item{dwin}{A \code{data.frame} describing pairwise relationships between the hosts in each tree; normally output of \code{classify.pairwise.relationships}} + +\item{dc}{A \code{data.frame} summarising pairwise relationships between the hosts across all trees; normally output of \code{count.pairwise.relationships}} + +\item{control}{List of control variables: +\itemize{ + \item{\code{linked.group}} Phyloscanner classification used to identify pairs in networks. Default 'close.and.adjacent.cat'. + \item{\code{linked.no}} Phyloscanner classification type quantifying that pairs are not linked. Default 'not.close.or.nonadjacent'. + \item{\code{linked.yes}} Phyloscanner classification type quantifying that pairs are linked. Default 'close.and.adjacent'. + \item{\code{conf.cut}} Threshold on the proportion of deep-sequence phylogenies with distant/disconnected subgraphs above which pairs are considered phylogenetically unlinked. Default: 0.6 + \item{\code{neff.cut}} Threshold on the minimum number of deep-sequence phylogenies with sufficient reads from two individuals to make any phylogenetic inferences. Default: 3. +}} + +\item{dmeta}{Optional individual-level meta-data that is to be added to output. Can be NULL.} + +\item{verbose}{Flag to switch on/off verbose mode. Default: TRUE.} +} +\value{ +Three R objects are generated: +\itemize{ + \item{\code{network.pairs}} is a tibble that describes pairs of individuals between whom linkage is not excluded phylogenetically. + \item{\code{relationship.counts}} is a tibble that summarises the phylogenetic relationship counts for each pair. + \item{\code{windows}} is a tibble that describes the basic phyloscanner statistics (distance, adjacency, paths between subgraphs) in each deep-sequence phylogeny for each pair. +} +} +\description{ +This function identifies pairs of individuals between whom linkage is not excluded phylogenetically in a large number of phyloscanner analyses, and provides detailed information on them. +} +\examples{ +\dontrun{ +require(data.table) +require(tidyverse) +require(phyloscannerR) +workdir<- '~/sandbox/DeepSeqProjects' +setwd(workdir) +# PLEASE NOTE: the output from PART 1+2 below is available through the package +# so these steps can be skipped + + +# +# PART 1: run phyloscanner_analyse_trees on a large number of deepseq trees +# +# prepare analysis: specify path to phyloscanner_analyse_trees +prog.phyloscanner_analyse_trees <- '/Users/Oliver/git/phyloscanner/phyloscanner_analyse_trees.R' +# prepare analysis: download the phyloscanner tree of the Rakai population-based analysis +tmp <- "https://datadryad.org/bitstream/handle/10255/dryad.208473/Dataset_S1.tar?sequence=1" +# prepare analysis: specify directories to untar public data and to store results +tree.dir <- "RakaiPopSample_deepseqtrees" +outdir <- 'RakaiPopSample_phyloscanner_analysis_190706' +# prepare analysis: download and untar +download.file(tmp, destfile="Dataset_S1.tar", method="curl") +untar("Dataset_S1.tar", exdir=tree.dir, extras='-xvf') +# prepare analysis: specify valid input arguments to phyloscanner_analyse_trees +valid.input.args <- cmd.phyloscanner.analyse.trees.valid.args(prog.phyloscanner_analyse_trees) +# prepare analysis: set phyloscanner variables, arguments as used for the Rakai population-based analysis +control <- list() +control$allow.mt <- TRUE +control$alignment.file.directory = NULL +control$alignment.file.regex = NULL +control$blacklist.underrepresented = FALSE +control$count.reads.in.parsimony = TRUE +control$distance.threshold <- '0.025 0.05' +control$do.dual.blacklisting = FALSE +control$duplicate.file.directory = NULL +control$duplicate.file.regex = NULL +control$file.name.regex = "^\\\\D*([0-9]+)_to_([0-9]+)\\\\D*$" +control$guess.multifurcation.threshold = FALSE +control$max.reads.per.host = 50 +control$min.reads.per.host <- 30 +control$min.tips.per.host <- 1 +control$multifurcation.threshold = 1e-5 +control$multinomial= TRUE +control$norm.constants = NULL +control$norm.ref.file.name = system.file('HIV_DistanceNormalisationOverGenome.csv',package='phyloscannerR') +control$norm.standardise.gag.pol = TRUE +control$no.progress.bars = TRUE +control$outgroup.name = "REF_CPX_AF460972" +control$output.dir = outdir +control$parsimony.blacklist.k = 20 +control$prune.blacklist = FALSE +control$post.hoc.count.blacklisting= TRUE +control$ratio.blacklist.threshold = 0 +control$raw.blacklist.threshold = 20 +control$recombination.file.directory = NULL +control$recombination.file.regex = NULL +control$relaxed.ancestry = TRUE +control$sankoff.k = 20 +control$sankoff.unassigned.switch.threshold = 0 +control$seed = 42 +control$splits.rule = 's' +control$tip.regex = "^(.*)_fq[0-9]+_read_([0-9]+)_count_([0-9]+)$" +control$tree.file.regex = "^ptyr[0-9]+_InWindow_([0-9]+_to_[0-9]+)\\\\.tree$" +control$use.ff = FALSE +control$user.blacklist.directory = NULL +control$user.blacklist.file.regex = NULL +control$verbosity = 1 +# prepare analysis: list zipped tree files. One zip file contains the viral trees of individuals in one putative transmission network. +df <- tibble(F=list.files(tree.dir)) +df <- df \%>\% + mutate(TYPE:= gsub('ptyr([0-9]+)_(.*)','\\\\2', F), + RUN:= as.integer(gsub('ptyr([0-9]+)_(.*)','\\\\1', F))) \%>\% + mutate(TYPE:= gsub('^([^\\\\.]+)\\\\.[a-z]+$','\\\\1',TYPE)) \%>\% + spread(TYPE, F) \%>\% + set_names(~ str_to_upper(.)) +# make bash scripts: one each for processing the viral trees of individuals in one putative transmission network +cmds <- vector('list',nrow(df)) +for(i in seq_len(nrow(df))) +{ + control$output.string <- paste0('ptyr',df$RUN[i]) + tree.input <- file.path(indir, df$TREES_NEWICK[i]) + cmd <- phyloscannerR:::cmd.phyloscanner.analyse.trees(prog.phyloscanner_analyse_trees, + tree.input, + control, + valid.input.args=valid.input.args) + cmds[[i]] <- cmd +} +# submit bash scripts: make PBS header +hpc.load <- "module load anaconda3/personal" # make third party requirements available +hpc.select <- 1 # number of nodes +hpc.nproc <- 1 # number of processors on node +hpc.walltime <- 23 # walltime +hpc.q <- "pqeelab" # PBS queue +hpc.mem <- "6gb" # RAM +hpc.array <- length(cmds) # number of runs for job array +pbshead <- "#!/bin/sh" +tmp <- paste("#PBS -l walltime=", hpc.walltime, ":59:00,pcput=", hpc.walltime, ":45:00", sep = "") +pbshead <- paste(pbshead, tmp, sep = "\\n") +tmp <- paste("#PBS -l select=", hpc.select, ":ncpus=", hpc.nproc,":mem=", hpc.mem, sep = "") +pbshead <- paste(pbshead, tmp, sep = "\\n") +pbshead <- paste(pbshead, "#PBS -j oe", sep = "\\n") +if(!is.na(hpc.array)) + pbshead <- paste(pbshead, "\\n#PBS -J 1-", hpc.array, sep='') +if(!is.na(hpc.q)) + pbshead <- paste(pbshead, paste("#PBS -q", hpc.q), sep = "\\n") +pbshead <- paste(pbshead, hpc.load, sep = "\\n") +cat(pbshead) +# submit bash scripts: make array job +for(i in 1:length(cmds)) + cmds[[i]]<- paste0(i,')\\n',cmds[[i]],';;\\n') +cmd <- paste0('case $PBS_ARRAY_INDEX in\\n',paste0(cmds, collapse=''),'esac') +cmd <- paste(pbshead,cmd,sep='\\n') +# submit bash scripts: submit job +outfile <- gsub(':','',paste("phsc",paste(strsplit(date(),split=' ')[[1]],collapse='_',sep=''),'sh',sep='.')) +outfile <- file.path(tmpdir, outfile) +cat(cmd, file=outfile) +cmd <- paste("qsub", outfile) +cat(system(cmd, intern= TRUE)) + + + +# +# PART 2: find pairs in phylogenetic transmission networks +# + + + +# find network pairs: get meta-data +meta.file <- file.path(workdir,"RakaiPopSample_data","Dataset_S2.csv") +tmp <- "https://datadryad.org/bitstream/handle/10255/dryad.208474/Dataset_S2.csv?sequence=2" +download.file(tmp, destfile=meta.file, method="curl") +dmeta <- as_tibble(read.csv(meta.file, stringsAsFactors=FALSE)) +# find network pairs: process analyse_tree output with file names "ptyr[0-9]+_workspace.rda" +indir <- file.path(workdir,"RakaiPopSample_phyloscanner_analysis_190706") +control <- list( linked.group='close.and.adjacent.cat', + linked.no='not.close.or.nonadjacent', + linked.yes='close.and.adjacent', + conf.cut=0.6, + neff.cut=3, + weight.complex.or.no.ancestry=0.5) +# process phyloscanner runs +batch.regex <- '^ptyr([0-9]+)_.*' +infiles <- tibble(F=list.files(indir, pattern='workspace.rda', full.names=TRUE)) \%>\% + mutate( PTY_RUN:= as.integer(gsub(batch.regex,'\\\\1',basename(F))) ) \%>\% + arrange(PTY_RUN) +dpls <- vector('list', nrow(infiles)) +dcs <- vector('list', nrow(infiles)) +dws <- vector('list', nrow(infiles)) +for(i in seq_len(nrow(infiles))) +{ + if(verbose) cat('\\nReading ', infiles$F[i]) + tmp <- load( infiles$F[i] ) + # ensure we have multinomial output in workspace + if(!'dc'\%in\%tmp) + stop('Cannot find object "dc". Check that Analyze.trees was run with multinomial=TRUE.') + if(!'dwin'\%in\%tmp) + stop('Cannot find object "dwin". Check that Analyze.trees was run with multinomial=TRUE.') + # find pairs + tmp <- phyloscannerR:::find.pairs.in.networks(dwin, dc, control=control, verbose=TRUE) + # store output for each run + dpls[[i]] <- copy(tmp$network.pairs) + dcs[[i]] <- copy(tmp$relationship.counts) + dws[[i]] <- copy(tmp$windows) + gc() +} +dpl <- do.call('rbind', dpls) +dc <- do.call('rbind', dcs) +dw <- do.call('rbind', dwins) +dpls <- dcs <- dwins <- NULL +# select analysis in which each pair has highest neff +if(verbose) cat('\\nIf pairs are in several batches, select batch with most deep-sequence phylogenies...') +tmp <- dc \%>\% + filter( CATEGORISATION==control$linked.group & TYPE==control$linked.yes ) \%>\% + group_by(H1,H2) \%>\% + summarise(PTY_RUN= PTY_RUN[which.max(N_EFF)]) \%>\% + ungroup() +dc <- tmp \%>\% left_join(dc, by=c('H1','H2','PTY_RUN')) +dw <- tmp \%>\% left_join(dw, by=c('H1','H2','PTY_RUN')) +dpl <- tmp \%>\% left_join(dpl, by=c('H1','H2','PTY_RUN')) +if(verbose) cat('\\nLeft with pairs between whom linkage is not excluded phylogenetically, n=', nrow(dpls)) +# find network pairs: save +outfile <- file.path(workdir, "RakaiPopSample_phyloscanner_analysis_190706", "Rakai_phscnetworks_allpairs_190706.RData") +save(dpl, dc, dw, file=outfile) + + + +# +# PART 3: build transmission networks from constituting pairs +# + + +infile <- system.file(file.path('extdata','Rakai_phscnetworks_allpairs_190706.RData'),package='phyloscannerR') +load(infile) +# loads phyloscanner relationship counts 'dc' +control <- list(linked.group='close.and.adjacent.cat',linked.no='not.close.or.nonadjacent',linked.yes='close.and.adjacent', neff.cut=3) +tmp <- find.networks(dc, control=control, verbose=TRUE) +dnet <- copy(tmp$transmission.networks) +dchain <- copy(tmp$most.likely.transmission.chains) + +} +} +\seealso{ +\code{\link{phyloscanner.analyse.trees}}, \code{\link{cmd.phyloscanner.analyse.trees}} +} +\author{ +Oliver Ratmann +} diff --git a/phyloscannerR/man/phyloscanner.analyse.trees.Rd b/phyloscannerR/man/phyloscanner.analyse.trees.Rd index 556a3b84..c127f23d 100644 --- a/phyloscannerR/man/phyloscanner.analyse.trees.Rd +++ b/phyloscannerR/man/phyloscanner.analyse.trees.Rd @@ -261,3 +261,6 @@ phsc <- phyloscanner.analyse.trees(tree.file.directory, ) } } +\seealso{ +\code{\link{find.pairs.in.networks}}, \code{\link{find.networks}} +} diff --git a/phyloscannerR/man/phyloscanner.to.simmap.Rd b/phyloscannerR/man/phyloscanner.to.simmap.Rd new file mode 100644 index 00000000..233e5833 --- /dev/null +++ b/phyloscannerR/man/phyloscanner.to.simmap.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tree_utility_functions.R +\name{phyloscanner.to.simmap} +\alias{phyloscanner.to.simmap} +\title{Cast phyloscanner tree to SIMMAP tree} +\usage{ +phyloscanner.to.simmap(ph, delete.phyloscanner.structures) +} +\arguments{ +\item{ph}{A \code{phyloscanner.tree} with attributes \code{SPLIT}, \code{INDIVIDUAL}, \code{BRANCH_COLOURS}, \code{SUBGRAPH_MRCA}.} + +\item{delete.phyloscanner.structures}{Logical value. If true \code{phyloscanner} attributes are removed from the tree.} +} +\value{ +Same tree in \code{SIMMAP} format, with elements \code{maps}, \code{mapped.edge}, \code{node.states}. +} +\description{ +Cast phyloscanner tree to SIMMAP tree +} +\examples{ +# +# Example on data from Rakai Community Cohort Study +# +require(phyloscannerR) +ph.phyloscanner <- readRDS(file=system.file(file.path('extdata','ptyr292_1600_to_1849.rds'),package='phyloscannerR')) + +# cast from phyloscanner format to SIMMAP format +# so that the SIMMAP tree still has the phyloscanner attributes +ph.simmap <- phyloscanner.to.simmap(ph.phyloscanner, delete.phyloscanner.structures=FALSE) +# to delete the phyloscanner attributes, use +ph.simmap <- phyloscanner.to.simmap(ph.phyloscanner, delete.phyloscanner.structures=TRUE) +# +# cast from SIMMAP format to phyloscanner format +# similarly, use delete.simmap.structures to keep/delete SIMMAP elements from the tree in phyloscanner format +ph.phyloscanner2 <- simmap.to.phyloscanner(ph.simmap, delete.simmap.structures=TRUE) +# +# test that ph.phyloscanner2 is the same as ph.phyloscanner: +stopifnot(all( ph.phyloscanner$tip.label == ph.phyloscanner2$tip.label )) +stopifnot(all( ph.phyloscanner$edge == ph.phyloscanner2$edge )) +stopifnot(all( ph.phyloscanner$edge.length == ph.phyloscanner2$edge.length )) +stopifnot(all( ph.phyloscanner$Nnode == ph.phyloscanner2$Nnode )) +tmp <- as.character(attr(ph.phyloscanner,'SPLIT')) +tmp2 <- as.character(attr(ph.phyloscanner2,'SPLIT')) +tmp[is.na(tmp)] <- 'Unknown' +tmp2[is.na(tmp2)] <- 'Unknown' +stopifnot( all(tmp==tmp2) ) +tmp <- as.character(attr(ph.phyloscanner,'INDIVIDUAL')) +tmp2 <- as.character(attr(ph.phyloscanner2,'INDIVIDUAL')) +tmp[is.na(tmp)] <- 'Unknown' +tmp2[is.na(tmp2)] <- 'Unknown' +stopifnot( all(tmp==tmp2) ) +tmp <- as.character(attr(ph.phyloscanner,'BRANCH_COLOURS')) +tmp2 <- as.character(attr(ph.phyloscanner2,'BRANCH_COLOURS')) +tmp[is.na(tmp)] <- 'Unknown' +tmp2[is.na(tmp2)] <- 'Unknown' +stopifnot( all(tmp==tmp2) ) +tmp <- attr(ph.phyloscanner,'SUBGRAPH_MRCA') +tmp2 <- attr(ph.phyloscanner2,'SUBGRAPH_MRCA') +stopifnot( all(tmp==tmp2) ) + +} +\seealso{ +\code{\link{simmap.to.phyloscanner}}, \code{\link{extract.subgraph}} +} +\author{ +Oliver Ratmann +} diff --git a/phyloscannerR/man/plot.chain.Rd b/phyloscannerR/man/plot.chain.Rd new file mode 100644 index 00000000..762a3191 --- /dev/null +++ b/phyloscannerR/man/plot.chain.Rd @@ -0,0 +1,89 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting_functions.R +\name{plot.chain} +\alias{plot.chain} +\title{Plot transmission chain} +\usage{ +plot.chain(df, di, control) +} +\arguments{ +\item{df}{tibble with the following columns 'H1', 'H2', 'CATEGORISATION', 'TYPE', 'SCORE'} + +\item{di}{tibble with meta-data to customize the plot with columns 'H', node.shape, node.label, node.fill} + +\item{control}{list of plotting control variables; see examples.} +} +\value{ +ggplot object +} +\description{ +This function plots a transmission chain, showing one edge with two labels: +L indicates the proportion of deep-sequence phylogenies in whom the two individuals are phylogenetically close and adjacent. +D indicates the proportion of deep-sequence phylogenies that support the indicated direction of transmission, out of all +deep-sequence phylogenies that support either direction of transmission. +} +\examples{ +\dontrun{ +require(data.table) +require(tidyverse) +require(phyloscannerR) + +# +# reconstruct transmission networks and most likely transmission chain +# +infile <- system.file(file.path('extdata','Rakai_phscnetworks_allpairs_190706.RData'),package='phyloscannerR') +load(infile) # loads phyloscanner relationship counts 'dc' +tmp <- find.networks(dc, neff.cut=3, verbose=TRUE) +dnet <- copy(tmp$transmission.networks) +dchain <- copy(tmp$most.likely.transmission.chains) + +# get meta data +meta.file <- file.path(workdir,"RakaiPopSample_data","Dataset_S2.csv") +tmp <- "https://datadryad.org/bitstream/handle/10255/dryad.208474/Dataset_S2.csv?sequence=2" +download.file(tmp, destfile=meta.file, method="curl") +dmeta <- as_tibble(read.csv(meta.file, stringsAsFactors=FALSE)) + +# +# plot network with host names, and male/female in blue/pink nodes, +# and highlight pairings with >60\% support for phylogenetic linkage +# +idclus <- sort(unique(dnet$IDCLU)) +di <- copy(dmeta) +setnames(di, 'ID', 'H') +df <- dnet \%>\% + filter(IDCLU == 34) \%>\% + select(-c(H1_SEX,H2_SEX)) +control<- list() +control$point.size = 10 +control$edge.gap = 0.04 +control$edge.size = 2 +control$curvature = -0.2 +control$arrow = arrow(length = unit(0.04, "npc"), type = "open") +control$curv.shift = 0.06 +control$label.size = 3 +control$node.label = "H" +control$node.fill = "SEX" +control$node.shape = NA_character_ +control$node.shape.values = c(`NA` = 16) +control$node.fill.values = c(F = "hotpink2", M = "steelblue2") +control$threshold.linked = 0.6 +p <- plot.network(df, di, control) + +# +# plot most likely chain with host names, and male/female in blue/pink nodes, +# and highlight pairings with >60\% support for phylogenetic linkage +# +control$layout <- p$layout +di <- copy(dmeta) +setnames(di, 'ID', 'H') +df <- subset(dchain, IDCLU==34) +p2 <- plot.chain(df, di, control) + +} +} +\seealso{ +\code{\link{find.networks}}, \code{\link{plot.network}} +} +\author{ +Oliver Ratmann +} diff --git a/phyloscannerR/man/plot.network.Rd b/phyloscannerR/man/plot.network.Rd new file mode 100644 index 00000000..f2e0da61 --- /dev/null +++ b/phyloscannerR/man/plot.network.Rd @@ -0,0 +1,87 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting_functions.R +\name{plot.network} +\alias{plot.network} +\title{Plot transmission network} +\usage{ +plot.network(df, di, control) +} +\arguments{ +\item{df}{tibble with the following columns 'H1', 'H2', 'CATEGORISATION', 'TYPE', 'SCORE'} + +\item{di}{tibble with meta-data to customize the plot with columns 'H', node.shape, node.label, node.fill} + +\item{control}{list of plotting control variables; see examples.} +} +\value{ +ggplot object +} +\description{ +This function plots a phylogenetic transmission network, showing three types of edges: +two directed edges respectively in the 1->2 and 2->1 direction, and an undirected edge that represents phylogenetic support of close and adjacent individuals without evidence into the direction transmission. +} +\examples{ +\dontrun{ +require(data.table) +require(tidyverse) +require(phyloscannerR) + +# +# reconstruct transmission networks and most likely transmission chain +# +infile <- system.file(file.path('extdata','Rakai_phscnetworks_allpairs_190706.RData'),package='phyloscannerR') +load(infile) # loads phyloscanner relationship counts 'dc' +tmp <- find.networks(dc, neff.cut=3, verbose=TRUE) +dnet <- copy(tmp$transmission.networks) +dchain <- copy(tmp$most.likely.transmission.chains) + +# get meta data +meta.file <- file.path(workdir,"RakaiPopSample_data","Dataset_S2.csv") +tmp <- "https://datadryad.org/bitstream/handle/10255/dryad.208474/Dataset_S2.csv?sequence=2" +download.file(tmp, destfile=meta.file, method="curl") +dmeta <- as_tibble(read.csv(meta.file, stringsAsFactors=FALSE)) + +# +# plot network with host names, and male/female in blue/pink nodes, +# and highlight pairings with >60\% support for phylogenetic linkage +# +idclus <- sort(unique(dnet$IDCLU)) +di <- copy(dmeta) +setnames(di, 'ID', 'H') +df <- dnet \%>\% + filter(IDCLU == 34) \%>\% + select(-c(H1_SEX,H2_SEX)) +control<- list() +control$point.size = 10 +control$edge.gap = 0.04 +control$edge.size = 2 +control$curvature = -0.2 +control$arrow = arrow(length = unit(0.04, "npc"), type = "open") +control$curv.shift = 0.06 +control$label.size = 3 +control$node.label = "H" +control$node.fill = "SEX" +control$node.shape = NA_character_ +control$node.shape.values = c(`NA` = 16) +control$node.fill.values = c(F = "hotpink2", M = "steelblue2") +control$threshold.linked = 0.6 +p <- plot.network(df, di, control) + +# +# plot most likely chain with host names, and male/female in blue/pink nodes, +# and highlight pairings with >60\% support for phylogenetic linkage +# +control$layout <- p$layout +di <- copy(dmeta) +setnames(di, 'ID', 'H') +df <- subset(dchain, IDCLU==34) +p2 <- plot.chain(df, di, control) + +} +} +\seealso{ +\code{\link{find.networks}}, \code{\link{plot.chain}} +} +\author{ +Oliver Ratmann +} diff --git a/phyloscannerR/man/produce.pairwise.graphs2.Rd b/phyloscannerR/man/produce.pairwise.graphs2.Rd index fcf2a337..5a6a10e6 100644 --- a/phyloscannerR/man/produce.pairwise.graphs2.Rd +++ b/phyloscannerR/man/produce.pairwise.graphs2.Rd @@ -39,16 +39,29 @@ deep-sequence phylogeny across the genome. The genomic position on the x-axis in # remember that you can specify dwin to save computing time, if you have it already computed # \dontrun{ +# +# Example 1 +# file <- system.file(file.path('extdata','ptyr192_phsc_analyse_trees_output.RData'),package='phyloscannerR') load(file) #loads 'phsc', output from 'phyloscanner.analyse.trees' hosts <- c('RkA05868F','RkA05968M','RkA00369F','RkA01344M') inclusion <- "both" -tmp <- produce.pairwise.graphs2(phsc, hosts=hosts, inclusion = "both") +tmp <- phyloscannerR:::produce.pairwise.graphs2(phsc, hosts=hosts, inclusion = "both") +tmp$graph + +# +# Example 2 +# +infile <- system.file(file.path('extdata','Rakai_phscnetworks_allpairs_190706.RData'),package='phyloscannerR') +load(infile) +# loaded "dpl" (pairs of individuals between whom linkage not excluded), "dc" (phyloscanner classification counts), "dw" (phyloscanner classifications per window) +hosts <- c('RkA05868F','RkA05968M','RkA00369F','RkA01344M') +tmp <- phyloscannerR:::produce.pairwise.graphs2(hosts=hosts, dwin=dw, inclusion = "both") tmp$graph } } \seealso{ -\link{\code{classify.pairwise.relationships}} +\code{\link{classify.pairwise.relationships}} } \author{ Oliver Ratmann diff --git a/phyloscannerR/man/simmap.to.phyloscanner.Rd b/phyloscannerR/man/simmap.to.phyloscanner.Rd new file mode 100644 index 00000000..b9727f30 --- /dev/null +++ b/phyloscannerR/man/simmap.to.phyloscanner.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tree_utility_functions.R +\name{simmap.to.phyloscanner} +\alias{simmap.to.phyloscanner} +\title{Cast SIMMAP tree to phyloscanner tree} +\usage{ +simmap.to.phyloscanner(ph, delete.simmap.structures) +} +\arguments{ +\item{ph}{A \code{SIMMAP} tree with elements \code{maps}, \code{mapped.edge}, \code{node.states}.} + +\item{delete.simmap.structures}{Logical value. If true \code{SIMMAP} elements are removed from the tree.} +} +\value{ +Same tree in \code{phyloscanner.tree} format, with attributes \code{SPLIT}, \code{INDIVIDUAL}, \code{BRANCH_COLOURS}, \code{SUBGRAPH_MRCA}. +} +\description{ +Cast SIMMAP tree to phyloscanner tree +} +\examples{ +# +# Example on data from Rakai Community Cohort Study +# +require(phyloscannerR) +ph.phyloscanner <- readRDS(file=system.file(file.path('extdata','ptyr292_1600_to_1849.rds'),package='phyloscannerR')) + +# cast from phyloscanner format to SIMMAP format +# so that the SIMMAP tree still has the phyloscanner attributes +ph.simmap <- phyloscanner.to.simmap(ph.phyloscanner, delete.phyloscanner.structures=FALSE) +# to delete the phyloscanner attributes, use +ph.simmap <- phyloscanner.to.simmap(ph.phyloscanner, delete.phyloscanner.structures=TRUE) +# +# cast from SIMMAP format to phyloscanner format +# similarly, use delete.simmap.structures to keep/delete SIMMAP elements from the tree in phyloscanner format +ph.phyloscanner2 <- simmap.to.phyloscanner(ph.simmap, delete.simmap.structures=TRUE) +# +# test that ph.phyloscanner2 is the same as ph.phyloscanner: +stopifnot(all( ph.phyloscanner$tip.label == ph.phyloscanner2$tip.label )) +stopifnot(all( ph.phyloscanner$edge == ph.phyloscanner2$edge )) +stopifnot(all( ph.phyloscanner$edge.length == ph.phyloscanner2$edge.length )) +stopifnot(all( ph.phyloscanner$Nnode == ph.phyloscanner2$Nnode )) +tmp <- as.character(attr(ph.phyloscanner,'SPLIT')) +tmp2 <- as.character(attr(ph.phyloscanner2,'SPLIT')) +tmp[is.na(tmp)] <- 'Unknown' +tmp2[is.na(tmp2)] <- 'Unknown' +stopifnot( all(tmp==tmp2) ) +tmp <- as.character(attr(ph.phyloscanner,'INDIVIDUAL')) +tmp2 <- as.character(attr(ph.phyloscanner2,'INDIVIDUAL')) +tmp[is.na(tmp)] <- 'Unknown' +tmp2[is.na(tmp2)] <- 'Unknown' +stopifnot( all(tmp==tmp2) ) +tmp <- as.character(attr(ph.phyloscanner,'BRANCH_COLOURS')) +tmp2 <- as.character(attr(ph.phyloscanner2,'BRANCH_COLOURS')) +tmp[is.na(tmp)] <- 'Unknown' +tmp2[is.na(tmp2)] <- 'Unknown' +stopifnot( all(tmp==tmp2) ) +tmp <- attr(ph.phyloscanner,'SUBGRAPH_MRCA') +tmp2 <- attr(ph.phyloscanner2,'SUBGRAPH_MRCA') +stopifnot( all(tmp==tmp2) ) + +} +\seealso{ +\code{\link{phyloscanner.to.simmap}}, \code{\link{extract.subgraph}} +} +\author{ +Oliver Ratmann +} diff --git a/phyloscannerR/man/write.annotated.tree.Rd b/phyloscannerR/man/write.annotated.tree.Rd index 8e65b610..4790e1ee 100644 --- a/phyloscannerR/man/write.annotated.tree.Rd +++ b/phyloscannerR/man/write.annotated.tree.Rd @@ -4,9 +4,9 @@ \alias{write.annotated.tree} \title{Write the phylogeny with reconstructed host annotations to file} \usage{ -write.annotated.tree(ptree, file.name, format = c("pdf", "nex"), - pdf.scale.bar.width = 0.01, pdf.w = 50, pdf.hm = 0.15, - verbose = F) +write.annotated.tree(ptree, file.name, format = c("pdf", "nex", + "ggplot"), tree.colours = NULL, pdf.scale.bar.width = 0.01, + pdf.w = 50, pdf.hm = 0.15, verbose = F) } \arguments{ \item{file.name}{The name of the output file} diff --git a/phyloscannerR/vignettes/.DS_Store b/phyloscannerR/vignettes/.DS_Store new file mode 100644 index 00000000..a9928580 Binary files /dev/null and b/phyloscannerR/vignettes/.DS_Store differ diff --git a/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_dc.png b/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_dc.png new file mode 100644 index 00000000..c845b57e Binary files /dev/null and b/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_dc.png differ diff --git a/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_dchain.png b/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_dchain.png new file mode 100644 index 00000000..a80da697 Binary files /dev/null and b/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_dchain.png differ diff --git a/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_dchainplot.pdf b/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_dchainplot.pdf new file mode 100644 index 00000000..3042afa5 Binary files /dev/null and b/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_dchainplot.pdf differ diff --git a/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_dnet.png b/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_dnet.png new file mode 100644 index 00000000..5dc0986a Binary files /dev/null and b/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_dnet.png differ diff --git a/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_dnetplot.pdf b/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_dnetplot.pdf new file mode 100644 index 00000000..9af12438 Binary files /dev/null and b/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_dnetplot.pdf differ diff --git a/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_dpl.png b/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_dpl.png new file mode 100644 index 00000000..a7c681aa Binary files /dev/null and b/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_dpl.png differ diff --git a/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_dw.png b/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_dw.png new file mode 100644 index 00000000..f5a5e633 Binary files /dev/null and b/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_dw.png differ diff --git a/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_findnetsoutput.png b/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_findnetsoutput.png new file mode 100644 index 00000000..9a7e0c54 Binary files /dev/null and b/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_findnetsoutput.png differ diff --git a/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_findpairsoutput.png b/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_findpairsoutput.png new file mode 100644 index 00000000..bb0c667e Binary files /dev/null and b/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_findpairsoutput.png differ diff --git a/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_phyloscan.pdf b/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_phyloscan.pdf new file mode 100644 index 00000000..7244c67d Binary files /dev/null and b/phyloscannerR/vignettes/figures/UVRI.03.reconstruct_transmission_networks_phyloscan.pdf differ diff --git a/phyloscannerR/vignettes/phyloflows_aggregating.pdf b/phyloscannerR/vignettes/phyloflows_aggregating.pdf new file mode 100644 index 00000000..08feec11 Binary files /dev/null and b/phyloscannerR/vignettes/phyloflows_aggregating.pdf differ diff --git a/phyloscannerR/vignettes/phyloflows_diagnostics.pdf b/phyloscannerR/vignettes/phyloflows_diagnostics.pdf new file mode 100644 index 00000000..d36c374f Binary files /dev/null and b/phyloscannerR/vignettes/phyloflows_diagnostics.pdf differ diff --git a/phyloscannerR/vignettes/phyloflows_first_example.pdf b/phyloscannerR/vignettes/phyloflows_first_example.pdf new file mode 100644 index 00000000..c07639c8 Binary files /dev/null and b/phyloscannerR/vignettes/phyloflows_first_example.pdf differ diff --git a/phyloscannerR/vignettes/phyloflows_keyquantities.pdf b/phyloscannerR/vignettes/phyloflows_keyquantities.pdf new file mode 100644 index 00000000..6fa278bb Binary files /dev/null and b/phyloscannerR/vignettes/phyloflows_keyquantities.pdf differ diff --git a/phyloscannerR/vignettes/phyloscannerR_reconstruct_transmission_networks.Rmd b/phyloscannerR/vignettes/phyloscannerR_reconstruct_transmission_networks.Rmd new file mode 100644 index 00000000..2c1726d8 --- /dev/null +++ b/phyloscannerR/vignettes/phyloscannerR_reconstruct_transmission_networks.Rmd @@ -0,0 +1,373 @@ +--- +title: "*phyloscannerR*: reconstructing transmission networks" +author: "Oliver Ratmann and Matthew Hall" +date: "2019-09-02" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Vignette Title} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +urlcolor: blue +--- + +```{r setup, echo=FALSE, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + tidy.opts=list(width.cutoff=80), + tidy=TRUE, + fig.pos= "h", + comment = "#>" +) +``` +```{r, include=TRUE, eval=TRUE, echo=FALSE, tidy=TRUE, results='hide',message=FALSE, warning=FALSE} +require(knitr) +require(kableExtra) +``` + +## Introduction +In this vignette, we describe how transmission networks can be +reconstructed from *phyloscanner* output with the R package `phyloscannerR`. + +This will involve the following steps: + +1. Find pairs of individuals between whom phylogenetic linkage cannot be + excluded based on the distance and topological relationship of viral reads + from both individuals. This will use output from a previous *phyloscanner* + analysis. +2. Inspect the *phyloscanner* statistics for each deep-sequence tree for these + pairs. +3. Inspect the *phyloscanner* relationship counts across all deep-sequence + tree for these pairs. +4. Reconstruct distinct transmission networks between these pairs, and + reconstruct the most likely transmission chain for each network. +5. Plot the transmission networks and the transmission chains. + +We will introduce the following functions from the `phyloscannerR` package: + +* `find.pairs.in.networks` +* `produce.pairwise.graphs2` +* `find.networks` +* `plot.network` +* `plot.chain` + +## Data +The data that we will be using in this vignette is from the MRC UVRI cohort in +south-eastern Uganda, and was generated by the PANGEA consortium. + +## Getting started +We assume that output from a previous *phyloscanner* analysis is available, +containing files that end in `*workspace.rda`. We will need only these files to +reconstruct the transmission networks. Let us get started. + +```{r, include=TRUE, eval=FALSE, echo=TRUE} +# load packages +require(tidyverse) +require(RBGL) +require(igraph) +require(network) +require(ggnet) +require(phyloscannerR) + +# directory with phyloscanner output containing the *workspace.rda file +home <- '/Users/Oliver/Box Sync/OR_Work/2019/2019_PANGEA_BBosa' +indir <- file.path(home,'MRCPopSample_phsc_stage2_output_newali_300_HKC_phsc') + +# output files +outdir <- file.path(home, 'MRCPopSample_phsc_stage2_output_newali_300_HKC_analysis') +file.pairs <- file.path(outdir, 'MRCUVRI_phscallpairs_190827.rda') +file.nets <- file.path(outdir, 'MRCUVRI_phscnetworks_190827.rda') +``` + +## Pairs of individuals between whom linkage is not excluded +We will now processes the *phyloscanner* output in `indir`. We will use the +*phyloscanner* specification as used for [the +Rakai analysis](https://www.nature.com/articles/s41467-019-09139-4). The next +lines of code specify a list of `control` options, and then use function +`find.pairs.in.networks`. + +```{r, include=TRUE, eval=FALSE, echo=TRUE, tidy=FALSE, results='hide'} +# control options +# +# phyloscanner produces several different phylogenetic classifications +# schemes; for example by phylogenetic distance only; or using phylogenetic +# distance and topologic adjacency (`close.and.adjacent.cat`); or phylogenetic +# distance and topologic contiguity. We select one of them. +# +control <- list( linked.group='close.and.adjacent.cat', +# ( use classification based on phylogenetic distance and topological adjacency ) + linked.no='not.close.or.nonadjacent', +# ( pairs are interpreted to be unlinked +# if classified as 'not.close.or.nonadjacent' ) + linked.yes='close.and.adjacent', +# ( pairs are interpreted to be linked +# if classified as 'close.or.nonadjacent' ) + conf.cut=0.6, +# ( threshold on the proportion of deep-sequence phylogenies +# pairs unlinked in more than this are discarded ) + neff.cut=3 +# ( threshold on the effective number of deep-sequence phylogenies +# pairs with fewer data are discarded ) + ) + +tmp <- find.pairs.in.networks( indir, + batch.regex='^ptyr([0-9]+)_.*', +# ( batch.regex identifies the batch number from the +# file names of *phyloscanner* output. Typically, this batch number corresponds +# to the analysis of a particular transmission network. ) + control=control, + verbose=TRUE) + +dpl <- copy(tmp$network.pairs) +# the pairs +dc <- copy(tmp$relationship.counts) +# their relationship counts summed over phylogenies +dw <- copy(tmp$windows) +# their relationship stats for all phylogenies + +#save(dpl, dc, dw, file=file.pairs) +``` + + + +## Output of `find.pairs.in.networks` + +Let us have a look at the selected pairs: +```{r, out.width="\\linewidth", include=TRUE, fig.align="center", echo=FALSE} +knitr::include_graphics("figures/UVRI.03.reconstruct_transmission_networks_dpl.png") +``` + +Here, + +1. Columns `H1` and `H2` list the individual ID of the individuals between whom + linkage is not rejected. +2. Column `PTY_RUN` lists the batch number of the phyloscanner analysis + specified through `batch.regex`. +3. Column `NEFF` gives the total number of deep-sequence phylogenies for the + pair, and `KEFF` gives the total number of phylogenies in which the two + individuals have virus that is `close.and.adjacent`. + +Now let us have a look at the relationship stats for all phylogenies: +```{r, out.width="\\linewidth", include=TRUE, fig.align="center", echo=FALSE} +knitr::include_graphics("figures/UVRI.03.reconstruct_transmission_networks_dw.png") +``` + +Here, + +1. Columns `H1`, `H2` and `PTY_RUN` are as before. +2. Column `TREE_ID` gives the identifier of the phylogeny for which the + *phyloscanner* statistics are evaluated. +3. Columns `ADJACENT`, `CONTIGUOUS`, `PATHS12`, `PATHS21`, `ANCESTRY`, + `PATRISTIC_DISTANCE` give the basic *phyloscanner* statistics that describe + the relationship of virus from two individuals in this phylogeny. +4. The remaining columns list all the different classification schemes that + *phyloscanner* produces by default (e.g. `PROXIMITY-3_WAY_CAT`), and the + classifications in rows (e.g. `close`). + +Now let us have a look at the relationship counts summed over all +phylogenies: +```{r, out.width="\\linewidth", include=TRUE, fig.align="center", echo=FALSE} +knitr::include_graphics("figures/UVRI.03.reconstruct_transmission_networks_dc.png") +``` + +Here, + +1. Columns `H1`, `H2` and `PTY_RUN` are as before. +2. Column `CATEGORISATION` then lists one of the classification schemes and + column `TYPE` all of the classifications in this scheme. +3. For each classification, `NEFF` gives the total number of deep-sequence + phylogenies for the pair, and `KEFF` gives the total number of phylogenies in + which the two individuals have that classification. + +## Plotting the phylogenetic relationships between pairs of individuals +At this point, we can easily visualise the phylogenetic relationships between +two individuals across the genome. We call these plots phyloscans: +```{r, include=TRUE, eval=FALSE, echo=TRUE, tidy=TRUE} +# plot phyloscans of all likely pairs +hosts <- dw %>% select(H1, H2) %>% + gather('HOST_TYPE','H') %>% + select(-HOST_TYPE) %>% + distinct() %>% + arrange(H) %>% + pull(H) +# plot phyloscans of one pair +hosts <- dw[1,] %>% select(H1, H2) %>% + gather('HOST_TYPE','H') %>% + select(-HOST_TYPE) %>% + pull(H) +tmp <- copy(dw) +tmp <- produce.pairwise.graphs2(NULL, hosts=hosts, dwin=tmp, inclusion = "both") +tmp$graph +``` +```{r, out.width="\\linewidth", include=TRUE, fig.align="center", echo=FALSE, out.width = "100%"} +knitr::include_graphics("figures/UVRI.03.reconstruct_transmission_networks_phyloscan.pdf") +``` + + +## Reconstructing transmission networks +We can now reconstruct transmission networks, which are defined as sets of +individuals between whom phylogenetic linkage is not excluded. Between any two +individuals in a network, there are three weighted edges that describe the +phylogenetic support for transmission in the one direction, in the other +direction, and support for phylogenetic linkage without evidence for +directionality. + +The same function also reconstructs the most likely transmission chain for each +network. A transmission chain is defined as the directed graph with nodes of +indegree 1 and arbitrary outdegree that connects all individuals and has the +largest product of edge weights. Full details are given in the [the Rakai +paper](https://www.nature.com/articles/s41467-019-09139-4). + +Finally, from the transmission chains it is straightforward to extract highly +supported pairs. + +Here is the code: +```{r, include=TRUE, eval=FALSE, echo=TRUE, tidy=FALSE, results='hide'} +# construct networks between pairs using the same +# control options as before +tmp <- find.networks(dc, control=control, verbose=TRUE) +# extract networks and the transmission chains within them +dnet <- copy(tmp$transmission.networks) +dchain <- copy(tmp$most.likely.transmission.chains) + +# construct highly supported pairs +conf.cut <- 0.6 +dconfpairs <- dchain %>% + filter( SCORE_LINKED>conf.cut & + pmax(SCORE_DIR_12, SCORE_DIR_21)>conf.cut) %>% + select(-c(SCORE_NW_12, SCORE_NW_12, SCORE_NW_AMB)) + +# save +#save(dpl, dc, dw, dnet, dchain, dconfpairs, file=file.nets) +``` + + + +## Output from reconstructing transmission networks +Let us have a look at the transmission networks: +```{r, out.width="\\linewidth", include=TRUE, fig.align="center", echo=FALSE} +knitr::include_graphics("figures/UVRI.03.reconstruct_transmission_networks_dnet.png") +``` + +Here, + +1. Columns `H1` and `H2` list as before the individual ID of the individuals + between whom linkage is not rejected. +2. Column `PTY_RUN` lists as before the batch number of the phyloscanner + analysis specified through `batch.regex`. +3. Column `IDCLU` gives an identifier for each reconstructed transmission + network. +4. Column `TYPE` gives the possible classifications of the phylogenetic + relationship types between the individuals. Using the default options in + `control`, this is evidence for transmission in either direction (`12` and + `21`), support for phylogenetic linkage without evidence for directionality + (`complex.or.no.ancestry`), and evidence for no phylogenetic linkage + (`not.close.or.nonadjacent`). +5. Column `SCORE` gives the *phyloscanner* score for each classification, which + is defined by `KEFF` divided by `NEFF`. + + +Let us plot one of these networks. The code to do this is as follows: + +```{r, include=TRUE, eval=FALSE, echo=TRUE, tidy=FALSE, fig.align="center"} +# plot all networks +idclus <- sort(unique(dnet$IDCLU)) +# find IDs of all networks +control<- list() +control$point.size = 10 +control$edge.gap = 0.04 +control$edge.size = 2 +control$curvature = -0.2 +control$arrow = arrow(length = unit(0.04, "npc"), type = "open") +control$curv.shift = 0.06 +control$label.size = 3 +# the above options may need to be changed, depending on the +# size of the networks and the size of your pdf output +control$node.label = "H" +# specify the column in 'di' below that should be used as nodel label +control$node.fill = NA_character_ +control$node.fill.values = c(`NA` = "steelblue2") +# specify the background colour for each node +control$node.shape = NA_character_ +control$node.shape.values = c(`NA` = 16) +# specify the shape for each node +control$threshold.linked = 0.6 +# edges will be highlighted in darkgrey if 'SCORE_LINKED' is above this threshold +pns <- lapply(seq_along(idclus), function(i) + { + idclu <- idclus[i] + df <- dnet %>% + filter(IDCLU == idclu) + di <- df %>% + select(H1, H2) %>% + gather('HOST_TYPE','H') %>% + select(-HOST_TYPE) %>% + distinct() + p <- plot.network(df, di, control) + p + }) +pns[[10]] +``` +```{r, out.width="\\linewidth", include=TRUE, fig.align="center", echo=FALSE, out.width = "80%"} +knitr::include_graphics("figures/UVRI.03.reconstruct_transmission_networks_dnetplot.pdf") +``` + +## Output from reconstructing transmission chains +Now, let us have a look at the corresponding transmission chains: +```{r, out.width="\\linewidth", include=TRUE, fig.align="center", echo=FALSE} +knitr::include_graphics("figures/UVRI.03.reconstruct_transmission_networks_dchain.png") +``` + +Here, + +1. Columns `H1`, `H2`, `PTY_RUN` and `IDCLU` are as before. +2. Column `LINK_12` states if there is a directed edge from `H1` to `H2` in the + most likely transmission chain, and `LINK_21` states if there is a directed + edge in the other direction. +3. Column `SCORE_LINKED` gives the *phyloscanner* score for phylogenetic + linkage. Using the default options, this is the sum of `KEFF` for `12`, `21` + and `complex.or.no.ancestry`, divided by `NEFF`. +4. Column `SCORE_DIR_12` gives the *phyloscanner* score for transmission + direction `H1` to `H2` among phylogenies supporting phylogenetic linkage. + Column `SCORE_DIR_21` gives the *phyloscanner* score for transmission in the + opposite direction. + +We can also plot the transmission chains. The code to do this is as follows: + +```{r, include=TRUE, eval=FALSE, echo=TRUE, tidy=FALSE} +# plot corresponding most likely chains +pcs <- lapply(seq_along(idclus), function(i) + { + idclu <- idclus[i] + control$layout <- pns[[i]][['layout']] + df <- dchain %>% + filter(IDCLU == idclu) + di <- df %>% + select(H1, H2) %>% + gather('HOST_TYPE','H') %>% + select(-HOST_TYPE) %>% + distinct() + p <- plot.chain(df, di, control) + p + }) +pcs[[10]] +``` +```{r, out.width="\\linewidth", include=TRUE, fig.align="center", echo=FALSE, out.width = "80%"} +knitr::include_graphics("figures/UVRI.03.reconstruct_transmission_networks_dchainplot.pdf") +``` + + +## Final notes + +1. It is also easy to add any further individual-level meta-data to the analysis + output. Specify the `dmeta` input variable `find.pairs.in.networks`. + \ No newline at end of file diff --git a/phyloscannerR/vignettes/phyloscannerR_reconstruct_transmission_networks.pdf b/phyloscannerR/vignettes/phyloscannerR_reconstruct_transmission_networks.pdf new file mode 100644 index 00000000..5343f54b Binary files /dev/null and b/phyloscannerR/vignettes/phyloscannerR_reconstruct_transmission_networks.pdf differ