Skip to content

Commit

Permalink
Merge remote-tracking branch 'unstable/master'
Browse files Browse the repository at this point in the history
  • Loading branch information
eitanbanks committed Sep 16, 2011
2 parents e63d9d8 + 57b3efa commit 3cd9f3f
Show file tree
Hide file tree
Showing 434 changed files with 19,932 additions and 14,329 deletions.
427 changes: 244 additions & 183 deletions build.xml

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions ivy.xml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
<dependency org="net.sf" name="picard" rev="latest.integration"/>
<dependency org="edu.mit.broad" name="picard-private-parts" rev="latest.integration"/>

<!-- Tribble -->
<dependency org="org.broad" name="tribble" rev="latest.integration"/>

<dependency org="log4j" name="log4j" rev="1.2.15">
<!-- Don't include javax.mail here in default, only used in scala->default by commons-email -->
<exclude org="javax.mail" />
Expand All @@ -30,6 +33,9 @@

<!-- Dependencies for the graph aligner -->
<dependency org="org.jgrapht" name="jgrapht-jdk1.5" rev="0.7.3"/>

<!-- Dependencies for the html walker documentation -->
<dependency org="org.freemarker" name="freemarker" rev="2.3.18"/>

<!-- Commons Dependencies -->
<dependency org="org.apache.commons" name="commons-email" rev="1.2"/>
Expand Down
169 changes: 169 additions & 0 deletions public/R/queueJobReport.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
library(gsalib)
require("ggplot2")
require("gplots")

#
# Standard command line switch. Can be loaded interactively for development
# or executed with RScript
#
# Usage when run through Rscript: <inputFileName> [outputPDF]
# With no arguments the hard-coded development defaults below are used.
args <- commandArgs(TRUE)
onCMDLine <- !is.na(args[1])
if (!onCMDLine) {
  # interactive development defaults; outputPDF stays NA
  #inputFileName = "~/Desktop/broadLocal/GATK/unstable/report.txt"
  inputFileName <- "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/[email protected]"
  #inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt"
  outputPDF <- NA
} else {
  # first argument is the report to read, second the PDF to write
  inputFileName <- args[1]
  outputPDF <- args[2]
}

# Unit label pasted onto the time-axis titles of the Gantt and scheduling plots.
RUNTIME_UNITS = "(sec)"
# Factor that runtime, startTime and doneTime are multiplied by in convertUnits().
# NOTE(review): 1/1000 implies the raw report values are in milliseconds -- confirm against the report writer.
ORIGINAL_UNITS_TO_SECONDS = 1/1000

#
# Helper function to aggregate all of the jobs in the report across all tables
#
# report:  list of data frames, each containing at least the columns
#          jobName, startTime, analysisName, doneTime and exechosts
# returns: one data frame holding just those five columns, with the rows of
#          every table stacked in list order
allJobsFromReport <- function(report) {
  jobColumns <- c("jobName", "startTime", "analysisName", "doneTime", "exechosts")
  # keep only the shared columns so tables with different annotations can be stacked
  perTable <- lapply(report, function(jobTable) jobTable[, jobColumns])
  do.call(rbind, perTable)
}

#
# Creates segmentation plots of time (x) vs. job (y) with segments for the duration of the job
#
# gatkReport:  list of job tables, as consumed by allJobsFromReport()
# sortOverall: TRUE  -> order jobs by analysis name, then by start time
#              FALSE -> order jobs by start time only
# Prints one ggplot to the active graphics device; stops with an error when
# the report contains no jobs.
plotJobsGantt <- function(gatkReport, sortOverall) {
  allJobs <- allJobsFromReport(gatkReport)
  # fail with a readable message instead of an obscure error further down
  if (is.null(allJobs) || nrow(allJobs) == 0) {
    stop("plotJobsGantt: the report contains no jobs to plot", call. = FALSE)
  }

  if (sortOverall) {
    title <- "All jobs, by analysis, by start time"
    jobOrder <- order(allJobs$analysisName, allJobs$startTime, decreasing = TRUE)
  } else {
    title <- "All jobs, sorted by start time"
    jobOrder <- order(allJobs$startTime, decreasing = TRUE)
  }
  allJobs <- allJobs[jobOrder, ]

  # rows are in descending order, so the earliest job gets the largest index
  allJobs$index <- seq_len(nrow(allJobs))

  # express all times relative to the start of the first job
  minTime <- min(allJobs$startTime)
  allJobs$relStartTime <- allJobs$startTime - minTime
  allJobs$relDoneTime <- allJobs$doneTime - minTime
  allJobs$ganttName <- paste(allJobs$jobName, "@", allJobs$exechosts)
  maxRelTime <- max(allJobs$relDoneTime)

  p <- ggplot(data = allJobs, aes(x = relStartTime, y = index, color = analysisName))
  # one arrow per job, from its start time to its done time
  p <- p + geom_segment(aes(xend = relDoneTime, yend = index), size = 2,
                        arrow = arrow(length = unit(0.1, "cm")))
  # label each job just past the end of its segment
  p <- p + geom_text(aes(x = relDoneTime, label = ganttName, hjust = -0.2), size = 2)
  # 10% extra on the x axis -- presumably so the labels after the last job still fit
  p <- p + xlim(0, maxRelTime * 1.1)
  p <- p + xlab(paste("Start time (relative to first job)", RUNTIME_UNITS))
  p <- p + ylab("Job")
  # opts() is kept because the rest of this file uses the same ggplot2 API
  p <- p + opts(title = title)
  print(p)
}

#
# Plots scheduling efficiency at job events
#
# gatkReport: list of job tables, as consumed by allJobsFromReport()
# At every job start/done event, counts how many jobs are pending, running
# and done, then prints a three-facet line plot of those counts over time.
# Stops with an error when the report contains no jobs.
plotProgressByTime <- function(gatkReport) {
  allJobs <- allJobsFromReport(gatkReport)
  # fail with a readable message instead of an obscure error further down
  if (is.null(allJobs) || nrow(allJobs) == 0) {
    stop("plotProgressByTime: the report contains no jobs to plot", call. = FALSE)
  }
  nJobs <- nrow(allJobs)

  # express all times relative to the start of the first job
  minTime <- min(allJobs$startTime)
  relStart <- allJobs$startTime - minTime
  relDone <- allJobs$doneTime - minTime

  # every start and every completion is an event at which the counts are sampled
  times <- sort(c(relStart, relDone))

  # p(starts, ends, t) returns a logical vector over jobs; count the TRUEs at each event time
  countJobs <- function(p) {
    vapply(times, function(time) sum(p(relStart, relDone, time)), integer(1))
  }

  pending <- countJobs(function(s, e, t) s > t)  # not started yet at time t
  done <- countJobs(function(s, e, t) e < t)     # already finished at time t
  running <- nJobs - pending - done              # everything else

  d <- data.frame(times = times, pending = pending, running = running, done = done)

  p <- ggplot(data = melt(d, id.vars = c("times")), aes(x = times, y = value, color = variable))
  p <- p + facet_grid(variable ~ ., scales = "free")
  p <- p + geom_line(size = 2)
  p <- p + xlab(paste("Time since start of first job", RUNTIME_UNITS))
  # opts() is kept because the rest of this file uses the same ggplot2 API
  p <- p + opts(title = "Job scheduling")
  print(p)
}

#
# Creates tables for each job in this group
#
# Columns present in every job table; any other column is treated as a
# group-specific annotation by plotGroup().
standardColumns <- c("jobName", "startTime", "formattedStartTime", "analysisName", "intermediate", "exechosts", "formattedDoneTime", "doneTime", "runtime")

# groupTable: data frame of the jobs of one analysis group. It must contain
#             jobName, runtime and analysisName; the code also reads an
#             "iteration" column (used for sorting and as the cast column).
# Emits, in order: a full per-job table, a per-iteration runtime table,
# optionally a runtime histogram, and a mean/sd table over all iterations.
plotGroup <- function(groupTable) {
  analysisLabel <- unique(groupTable$analysisName)[1]
  groupAnnotations <- setdiff(names(groupTable), standardColumns)

  jobs <- groupTable[, c("jobName", groupAnnotations, "runtime")]
  jobs <- jobs[order(jobs$iteration, jobs$jobName, decreasing = FALSE), ]
  moreThanOneJob <- nrow(jobs) > 1

  # page 1: every job with all of its annotations
  textplot(jobs, show.rownames = FALSE)
  title(paste("Job summary for", analysisLabel, "full itemization"), cex = 3)

  # page 2: one row per combination of annotation values, one column per iteration
  byIteration <- cast(melt(jobs, id.vars = groupAnnotations, measure.vars = c("runtime")), ... ~ iteration, fun.aggregate = mean)
  textplot(as.data.frame(byIteration), show.rownames = FALSE)
  title(paste("Job summary for", analysisLabel, "itemizing each iteration"), cex = 3)

  # runtime histogram, only when there is a single annotation column
  if (moreThanOneJob && length(groupAnnotations) == 1) {
    # todo -- how do we group by annotations?
    histogram <- ggplot(data = jobs, aes(x = runtime)) + geom_histogram()
    histogram <- histogram + xlab("runtime in seconds") + ylab("No. of jobs")
    histogram <- histogram + opts(title = paste("Job runtime histogram for", analysisLabel))
    print(histogram)
  }

  # final page: mean and sd of runtime, collapsing the iteration column
  groupAnnotationsNoIteration <- setdiff(groupAnnotations, "iteration")
  if (moreThanOneJob) {
    overall <- cast(melt(jobs, id.vars = groupAnnotationsNoIteration, measure.vars = c("runtime")), ... ~ ., fun.aggregate = c(mean, sd))
    textplot(as.data.frame(overall), show.rownames = FALSE)
    title(paste("Job summary for", analysisLabel, "averaging over all iterations"), cex = 3)
  }
}

# print out some useful basic information: a banner line followed by the
# path of the report file being summarized (held in inputFileName, despite
# the "Project" label)
print("Report")
print(paste("Project :", inputFileName))

# gatkReportData: list of job tables, each with numeric runtime, startTime
#                 and doneTime columns
# returns: the same list with those three columns multiplied by
#          ORIGINAL_UNITS_TO_SECONDS; all other columns are left untouched
convertUnits <- function(gatkReportData) {
  lapply(gatkReportData, function(jobTable) {
    jobTable$runtime <- jobTable$runtime * ORIGINAL_UNITS_TO_SECONDS
    jobTable$startTime <- jobTable$startTime * ORIGINAL_UNITS_TO_SECONDS
    jobTable$doneTime <- jobTable$doneTime * ORIGINAL_UNITS_TO_SECONDS
    jobTable
  })
}


# read the table and convert its time columns to seconds
gatkReportData <- gsa.read.gatkreport(inputFileName)
gatkReportData <- convertUnits(gatkReportData)
#print(summary(gatkReportData))

# only open a PDF device when an output path was supplied; otherwise the
# plots go to whatever device is active
writeToPDF <- !is.na(outputPDF)
if (writeToPDF) {
  pdf(outputPDF, height = 8.5, width = 11)
}

# the finally clause closes the PDF device even when a plot function fails,
# so an error no longer leaves the device open; the error itself still propagates
tryCatch({
  plotJobsGantt(gatkReportData, TRUE)
  plotJobsGantt(gatkReportData, FALSE)
  plotProgressByTime(gatkReportData)
  for (group in gatkReportData) {
    plotGroup(group)
  }
}, finally = {
  if (writeToPDF) {
    dev.off()
  }
})
45 changes: 42 additions & 3 deletions public/R/src/gsalib/R/gsa.read.gatkreport.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,20 @@
assign(tableName, d, envir=tableEnv);
}

# Split one fixed-width line of text into its trimmed column values.
#
# line:         a single character string
# columnStarts: 1-based character positions at which the 2nd, 3rd, ... columns
#               begin (the first column always starts at position 1); may be
#               empty/NULL for a single-column line
# returns:      an unnamed character vector with one element per column,
#               leading and trailing whitespace removed
.gsa.splitFixedWidth <- function(line, columnStarts) {
  # each column runs from its start to just before the next start; the last one runs to the end of the line
  starts <- c(1, columnStarts)
  stops <- c(columnStarts - 1, nchar(line))

  # substring() is vectorized over starts/stops, so no sapply()[,1] is needed;
  # that form failed with "incorrect number of dimensions" for single-column lines
  fields <- substring(line, starts, stops)
  gsub("^[[:space:]]+|[[:space:]]+$", "", fields)
}

# Load all GATKReport tables from a file
gsa.read.gatkreport <- function(filename) {
con = file(filename, "r", blocking = TRUE);
Expand All @@ -31,9 +45,10 @@ gsa.read.gatkreport <- function(filename) {
tableName = NA;
tableHeader = c();
tableRows = c();
version = NA;

for (line in lines) {
if (length(grep("^##:GATKReport.v0.1[[:space:]]+", line, ignore.case=TRUE)) > 0) {
if (length(grep("^##:GATKReport.v", line, ignore.case=TRUE)) > 0) {
headerFields = unlist(strsplit(line, "[[:space:]]+"));

if (!is.na(tableName)) {
Expand All @@ -43,13 +58,37 @@ gsa.read.gatkreport <- function(filename) {
tableName = headerFields[2];
tableHeader = c();
tableRows = c();

# For differences in versions see
# $STING_HOME/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java
if (length(grep("^##:GATKReport.v0.1[[:space:]]+", line, ignore.case=TRUE)) > 0) {
version = "v0.1";

} else if (length(grep("^##:GATKReport.v0.2[[:space:]]+", line, ignore.case=TRUE)) > 0) {
version = "v0.2";
columnStarts = c();

}

} else if (length(grep("^[[:space:]]*$", line)) > 0 | length(grep("^[[:space:]]*#", line)) > 0) {
# do nothing
} else if (!is.na(tableName)) {
row = unlist(strsplit(line, "[[:space:]]+"));

if (version == "v0.1") {
row = unlist(strsplit(line, "[[:space:]]+"));

} else if (version == "v0.2") {
if (length(tableHeader) == 0) {
headerChars = unlist(strsplit(line, ""));
# Find the first position of non space characters, excluding the first character
columnStarts = intersect(grep("[[:space:]]", headerChars, invert=TRUE), grep("[[:space:]]", headerChars) + 1);
}

row = .gsa.splitFixedWidth(line, columnStarts);
}

if (length(tableHeader) == 0) {
tableHeader = row;
tableHeader = row;
} else {
tableRows = rbind(tableRows, row);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@

package net.sf.picard.reference;

import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSourceProgressListener;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;

import static net.sf.picard.reference.FastaSequenceIndexBuilder.Status.*;
Expand All @@ -39,8 +38,8 @@
* Produces fai file with same output as samtools faidx
*/
public class FastaSequenceIndexBuilder {
public File fastaFile;
ReferenceDataSourceProgressListener progress; // interface that provides a method for updating user on progress of reading file
final public File fastaFile;
final boolean printProgress;

// keep track of location in file
long bytesRead, endOfLastLine, lastTimestamp, fileLength; // initialized to -1 to keep 0-indexed position in file;
Expand All @@ -55,10 +54,10 @@ public class FastaSequenceIndexBuilder {
public enum Status { NONE, CONTIG, FIRST_SEQ_LINE, SEQ_LINE, COMMENT }
Status status = Status.NONE; // keeps state of what is currently being read. better to use int instead of enum?

public FastaSequenceIndexBuilder(File fastaFile, ReferenceDataSourceProgressListener progress) {
this.progress = progress;
public FastaSequenceIndexBuilder(File fastaFile, boolean printProgress) {
this.fastaFile = fastaFile;
fileLength = fastaFile.length();
this.printProgress = printProgress;
}

/**
Expand Down Expand Up @@ -252,8 +251,8 @@ private void finishReadingContig(FastaSequenceIndex sequenceIndex) {

if (System.currentTimeMillis() - lastTimestamp > 10000) {
int percentProgress = (int) (100*bytesRead/fileLength);
if (progress != null)
progress.percentProgress(percentProgress);
if (printProgress)
System.out.println(String.format("PROGRESS UPDATE: file is %d percent complete", percentProgress));
lastTimestamp = System.currentTimeMillis();
}
}
Expand Down
Loading

0 comments on commit 3cd9f3f

Please sign in to comment.