Merge remote-tracking branch 'unstable/master'

eroller · Sep 16, 2011 · 3cd9f3f · 3cd9f3f
2 parents e63d9d8 + 57b3efa
commit 3cd9f3f
Show file tree

Hide file tree

Showing 434 changed files with 19,932 additions and 14,329 deletions.
diff --git a/build.xml b/build.xml
diff --git a/ivy.xml b/ivy.xml
@@ -12,6 +12,9 @@
     <dependency org="net.sf" name="picard" rev="latest.integration"/>
     <dependency org="edu.mit.broad" name="picard-private-parts" rev="latest.integration"/>
 
+    <!-- Tribble -->
+    <dependency org="org.broad" name="tribble" rev="latest.integration"/>
+
     <dependency org="log4j" name="log4j" rev="1.2.15">
       <!-- Don't include javax.mail here in default, only used in scala->default by commons-email -->
       <exclude org="javax.mail" />
@@ -30,6 +33,9 @@
 
     <!-- Dependencies for the graph aligner -->  
     <dependency org="org.jgrapht" name="jgrapht-jdk1.5" rev="0.7.3"/>
+
+    <!-- Dependencies for the html walker documentation -->  
+    <dependency org="org.freemarker" name="freemarker" rev="2.3.18"/>
 
     <!-- Commons Dependencies -->  
     <dependency org="org.apache.commons" name="commons-email" rev="1.2"/>

diff --git a/public/R/queueJobReport.R b/public/R/queueJobReport.R
@@ -0,0 +1,169 @@
+library(gsalib)
+require("ggplot2")
+require("gplots")
+
+#
+# Standard command line switch.  Can be loaded interactively for development
+# or executed with Rscript
+#
+# Positional arguments: [1] report file to read, [2] PDF file to write
+args <- commandArgs(TRUE)
+onCMDLine <- !is.na(args[1])
+if ( !onCMDLine ) {
+  # No arguments were given: use a hard-coded report and leave outputPDF unset
+  #inputFileName = "~/Desktop/broadLocal/GATK/unstable/report.txt"
+  inputFileName <- "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/[email protected]"
+  #inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt"
+  outputPDF <- NA
+} else {
+  inputFileName <- args[1]
+  outputPDF <- args[2]
+}
+
+# Label appended to the time axis titles, and the factor the report's time
+# columns are multiplied by to express them in seconds
+RUNTIME_UNITS <- "(sec)"
+ORIGINAL_UNITS_TO_SECONDS <- 1/1000
+
+# 
+# Helper function to aggregate all of the jobs in the report across all tables
+#
+allJobsFromReport <- function(report) {
+  # report: list of tables; each must contain the job columns selected below.
+  # Returns a single table with those columns, stacking the rows of every table.
+  jobColumns <- c("jobName", "startTime", "analysisName", "doneTime", "exechosts")
+  perTable <- lapply(report, function(tbl) tbl[, jobColumns])
+  do.call("rbind", perTable)
+}
+
+#
+# Creates segmentation plots of time (x) vs. job (y) with segments for the duration of the job
+#
+plotJobsGantt <- function(gatkReport, sortOverall) {
+  # gatkReport:  list of report tables, combined through allJobsFromReport()
+  # sortOverall: if TRUE, rows are ordered by analysis name and then start time;
+  #              otherwise by start time alone
+  # Draws one arrowed segment per job, from its start to its done time (both
+  # relative to the earliest start), labels it "<jobName> @ <exechosts>" and
+  # prints the plot to the current graphics device.
+  allJobs = allJobsFromReport(gatkReport)
+  if ( sortOverall ) {
+    title = "All jobs, by analysis, by start time"
+    allJobs = allJobs[order(allJobs$analysisName, allJobs$startTime, decreasing=TRUE), ]
+  } else {
+    title = "All jobs, sorted by start time"
+    allJobs = allJobs[order(allJobs$startTime, decreasing=TRUE), ]
+  }
+  # seq_len() rather than 1:nrow(): an empty table gives an empty index, not c(1, 0)
+  allJobs$index = seq_len(nrow(allJobs))
+  minTime = min(allJobs$startTime)
+  allJobs$relStartTime = allJobs$startTime - minTime
+  allJobs$relDoneTime = allJobs$doneTime - minTime
+  allJobs$ganttName = paste(allJobs$jobName, "@", allJobs$exechosts)
+  maxRelTime = max(allJobs$relDoneTime)
+  p <- ggplot(data=allJobs, aes(x=relStartTime, y=index, color=analysisName))
+  p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=2, arrow=arrow(length = unit(0.1, "cm")))
+  p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
+  # widen the x axis by 10% so the labels drawn after each segment's end fit
+  p <- p + xlim(0, maxRelTime * 1.1)
+  p <- p + xlab(paste("Start time (relative to first job)", RUNTIME_UNITS))
+  p <- p + ylab("Job")
+  p <- p + opts(title=title)
+  print(p)
+}
+
+#
+# Plots scheduling efficiency at job events
+#
+plotProgressByTime <- function(gatkReport) {
+  # gatkReport: list of report tables, combined through allJobsFromReport()
+  # For every job start/done event, counts how many jobs are pending, running
+  # and done at that moment, then prints one line-plot facet per state.
+  allJobs = allJobsFromReport(gatkReport)
+  nJobs = nrow(allJobs)
+  allJobs = allJobs[order(allJobs$startTime, decreasing=FALSE),]
+  allJobs$index = seq_len(nJobs)
+
+  minTime = min(allJobs$startTime)
+  allJobs$relStartTime = allJobs$startTime - minTime
+  allJobs$relDoneTime = allJobs$doneTime - minTime
+
+  # every start and done event, in chronological order
+  times = sort(c(allJobs$relStartTime, allJobs$relDoneTime))
+
+  # Number of jobs for which p(start, done, time) holds, at each event time.
+  # vapply() allocates the result once instead of growing it with c() per event.
+  countJobs <- function(p) {
+    s = allJobs$relStartTime
+    e = allJobs$relDoneTime
+    vapply(times, function(time) sum(p(s, e, time)), integer(1))
+  }
+
+  pending = countJobs(function(s, e, t) s > t)
+  done = countJobs(function(s, e, t) e < t)
+  # whatever is neither pending nor done at an event time is running
+  running = nJobs - pending - done
+
+  d = data.frame(times=times, pending=pending, running=running, done=done)
+
+  p <- ggplot(data=melt(d, id.vars=c("times")), aes(x=times, y=value, color=variable))
+  p <- p + facet_grid(variable ~ ., scales="free")
+  p <- p + geom_line(size=2)
+  p <- p + xlab(paste("Time since start of first job", RUNTIME_UNITS))
+  p <- p + opts(title = "Job scheduling")
+  print(p)
+}
+
+# 
+# Creates tables for each job in this group
+#
+# Columns treated as standard job fields; any other column in a group table is
+# considered a group-specific annotation by plotGroup()
+standardColumns = c("jobName", "startTime", "formattedStartTime", "analysisName", "intermediate", "exechosts", "formattedDoneTime", "doneTime", "runtime")
+plotGroup <- function(groupTable) {
+  # groupTable: one report table (one row per job)
+  # Emits, on the current graphics device: a text table of every job with its
+  # annotations, a table of mean runtime with iterations as columns, optionally
+  # a runtime histogram, and a table of runtime mean/sd over all iterations.
+  # NOTE(review): the ordering and the `... ~ iteration` cast below reference an
+  # `iteration` column -- confirm every group table provides one.
+  name = unique(groupTable$analysisName)[1]
+  groupAnnotations = setdiff(names(groupTable), standardColumns)
+  jobs = groupTable[,c("jobName", groupAnnotations, "runtime")]
+  jobs = jobs[order(jobs$iteration, jobs$jobName, decreasing=FALSE), ]
+  # hoisted: used by both plots that need more than one job
+  hasSeveralJobs = nrow(jobs) > 1
+
+  # create a table showing each job and all annotations
+  textplot(jobs, show.rownames=FALSE)
+  title(paste("Job summary for", name, "full itemization"), cex=3)
+
+  # create the table for each combination of values in the group, listing iterations in the columns
+  # (stored as runtimeSummary rather than `sum`, which would mask base::sum)
+  runtimeSummary = cast(melt(jobs, id.vars=groupAnnotations, measure.vars=c("runtime")), ... ~ iteration, fun.aggregate=mean)
+  textplot(as.data.frame(runtimeSummary), show.rownames=FALSE)
+  title(paste("Job summary for", name, "itemizing each iteration"), cex=3)
+
+  # histogram of job times by groupAnnotations
+  if ( length(groupAnnotations) == 1 && hasSeveralJobs ) {
+    # todo -- how do we group by annotations?
+    p <- ggplot(data=jobs, aes(x=runtime)) + geom_histogram()
+    p <- p + xlab("runtime in seconds") + ylab("No. of jobs")
+    p <- p + opts(title=paste("Job runtime histogram for", name))
+    print(p)
+  }
+
+  # as above, but averaging over all iterations
+  groupAnnotationsNoIteration = setdiff(groupAnnotations, "iteration")
+  if ( hasSeveralJobs ) {
+    runtimeSummary = cast(melt(jobs, id.vars=groupAnnotationsNoIteration, measure.vars=c("runtime")), ... ~ ., fun.aggregate=c(mean, sd))
+    textplot(as.data.frame(runtimeSummary), show.rownames=FALSE)
+    title(paste("Job summary for", name, "averaging over all iterations"), cex=3)
+  }
+}
+
+# announce which report is being processed
+print("Report")
+print(paste("Project          :", inputFileName))
+
+# Rescales the runtime, startTime and doneTime columns of every table in the
+# report by ORIGINAL_UNITS_TO_SECONDS; returns the list of rescaled tables
+convertUnits <- function(gatkReportData) {
+  lapply(gatkReportData, function(tbl) {
+    tbl$runtime <- tbl$runtime * ORIGINAL_UNITS_TO_SECONDS
+    tbl$startTime <- tbl$startTime * ORIGINAL_UNITS_TO_SECONDS
+    tbl$doneTime <- tbl$doneTime * ORIGINAL_UNITS_TO_SECONDS
+    tbl
+  })
+}
+
+
+# read the table and rescale its time columns
+gatkReportData <- gsa.read.gatkreport(inputFileName)
+gatkReportData <- convertUnits(gatkReportData)
+#print(summary(gatkReportData))
+
+# plots go to a PDF only when an output path was supplied; otherwise they are
+# drawn on whatever graphics device is current
+if ( ! is.na(outputPDF) ) {
+  pdf(outputPDF, height=8.5, width=11)
+}
+
+# finally= closes the PDF device even when one of the plots fails, so a failed
+# run does not leave the device open; the error itself still propagates
+tryCatch({
+  plotJobsGantt(gatkReportData, TRUE)
+  plotJobsGantt(gatkReportData, FALSE)
+  plotProgressByTime(gatkReportData)
+  for ( group in gatkReportData ) {
+    plotGroup(group)
+  }
+}, finally = {
+  if ( ! is.na(outputPDF) ) {
+    dev.off()
+  }
+})
diff --git a/public/R/src/gsalib/R/gsa.read.gatkreport.R b/public/R/src/gsalib/R/gsa.read.gatkreport.R
@@ -20,6 +20,20 @@
     assign(tableName, d, envir=tableEnv);
 }
 
+# Read a fixed width line of text into a list.
+.gsa.splitFixedWidth <- function(line, columnStarts) {
+    # Field i spans starts[i]..stops[i]; the last field runs to the end of the line.
+    starts = c(1, columnStarts);
+    stops = c(columnStarts - 1, nchar(line));
+
+    # substring() is vectorized over starts/stops, so no sapply()/[,1] is needed;
+    # that matrix indexing failed for single-column tables (empty columnStarts).
+    fields = substring(line, starts, stops);
+
+    # Strip the leading/trailing whitespace around each extracted value.
+    gsub("^[[:space:]]+|[[:space:]]+$", "", fields);
+}
+
 # Load all GATKReport tables from a file
 gsa.read.gatkreport <- function(filename) {
     con = file(filename, "r", blocking = TRUE);
@@ -31,9 +45,10 @@ gsa.read.gatkreport <- function(filename) {
     tableName = NA;
     tableHeader = c();
     tableRows = c();
+    version = NA;
 
     for (line in lines) {
-        if (length(grep("^##:GATKReport.v0.1[[:space:]]+", line, ignore.case=TRUE)) > 0) {
+        if (length(grep("^##:GATKReport.v", line, ignore.case=TRUE)) > 0) {
             headerFields = unlist(strsplit(line, "[[:space:]]+"));
 
             if (!is.na(tableName)) {
@@ -43,13 +58,37 @@ gsa.read.gatkreport <- function(filename) {
             tableName = headerFields[2];
             tableHeader = c();
             tableRows = c();
+
+            # For differences in versions see
+            #   $STING_HOME/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java
+            if (length(grep("^##:GATKReport.v0.1[[:space:]]+", line, ignore.case=TRUE)) > 0) {
+                version = "v0.1";
+
+            } else if (length(grep("^##:GATKReport.v0.2[[:space:]]+", line, ignore.case=TRUE)) > 0) {
+                version = "v0.2";
+                columnStarts = c();
+
+            }
+
         } else if (length(grep("^[[:space:]]*$", line)) > 0 | length(grep("^[[:space:]]*#", line)) > 0) {
             # do nothing
         } else if (!is.na(tableName)) {
-            row = unlist(strsplit(line, "[[:space:]]+"));
+
+            if (version == "v0.1") {
+                row = unlist(strsplit(line, "[[:space:]]+"));
+
+            } else if (version == "v0.2") {
+                if (length(tableHeader) == 0) {
+                    headerChars = unlist(strsplit(line, ""));
+                    # Find the first position of non space characters, excluding the first character
+                    columnStarts = intersect(grep("[[:space:]]", headerChars, invert=TRUE), grep("[[:space:]]", headerChars) + 1);
+                }
+
+                row = .gsa.splitFixedWidth(line, columnStarts);
+            }
 
             if (length(tableHeader) == 0) {
-                tableHeader = row;
+              tableHeader = row;
             } else {
                 tableRows = rbind(tableRows, row);
             }

diff --git a/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java b/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java
@@ -25,7 +25,6 @@
 
 package net.sf.picard.reference;
 
-import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSourceProgressListener;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 
 import static net.sf.picard.reference.FastaSequenceIndexBuilder.Status.*;
@@ -39,8 +38,8 @@
  * Produces fai file with same output as samtools faidx
  */
 public class FastaSequenceIndexBuilder {
-    public File fastaFile;
-    ReferenceDataSourceProgressListener progress;  // interface that provides a method for updating user on progress of reading file
+    final public File fastaFile;
+    final boolean printProgress;
 
     // keep track of location in file
     long bytesRead, endOfLastLine, lastTimestamp, fileLength;  // initialized to -1 to keep 0-indexed position in file;
@@ -55,10 +54,10 @@ public class FastaSequenceIndexBuilder {
     public enum Status { NONE, CONTIG, FIRST_SEQ_LINE, SEQ_LINE, COMMENT }
     Status status = Status.NONE; // keeps state of what is currently being read. better to use int instead of enum?
 
-    public FastaSequenceIndexBuilder(File fastaFile, ReferenceDataSourceProgressListener progress) {
-        this.progress = progress;
+    public FastaSequenceIndexBuilder(File fastaFile, boolean printProgress) {
         this.fastaFile = fastaFile;
         fileLength = fastaFile.length();
+        this.printProgress = printProgress;
     }
 
     /**
@@ -252,8 +251,8 @@ private void finishReadingContig(FastaSequenceIndex sequenceIndex) {
 
         if (System.currentTimeMillis() - lastTimestamp > 10000) {
             int percentProgress = (int) (100*bytesRead/fileLength);
-            if (progress != null)
-                progress.percentProgress(percentProgress);
+            if (printProgress)
+                System.out.println(String.format("PROGRESS UPDATE: file is %d percent complete", percentProgress));
             lastTimestamp = System.currentTimeMillis();
         }
     }