Skip to content

Commit cd5bb8f

Browse files
author
Yossi Farjoun
authored
CrosscheckFingerprint speedup, extra functionality and NIO enabling (broadinstitute#1086)
Getting CrosscheckFingerprints to run on Paths, and a few more changes that were needed to crosscheck 20K samples. - several speed-ups to the fingerprinting code, especially Crosscheck - added an option to map sample names before crosschecking. This option makes it possible to change what the algorithm considers "expected". It is only available when checking INPUT against SECOND_INPUT. One can map either INPUT or SECOND_INPUT names (or both). Some sanity checks are included. - modified the build to make cloud support simpler (compile cloudJar to get cloud support bundled)
1 parent 3984f99 commit cd5bb8f

29 files changed

+1094
-392
lines changed

README.md

+31-2
Original file line numberDiff line numberDiff line change
@@ -83,9 +83,38 @@ During development in Picard, it is sometimes necessary to build locally against
8383

8484
#### Releasing Picard
8585

86-
Full instructions on how to create a new release of Picard are [here](https://github.com/broadinstitute/picard/wiki/How-to-release-Picard)
86+
Full instructions on how to create a new release of
87+
Picard are [here](https://github.com/broadinstitute/picard/wiki/How-to-release-Picard)
8788

88-
----
89+
#### Path providers
90+
91+
Picard has limited support for reading from Path providers.
92+
Currently only Google's API is supported, and only a few tools support this.
93+
To run with this support you need to compile the cloudJar target with gradle:
94+
```bash
95+
./gradlew cloudJar
96+
97+
```
98+
then run picard as follows:
99+
100+
```bash
101+
java -jar build/libs/picardcloud.jar <Picard arguments starting from program>
102+
```
103+
For example:
104+
105+
```bash
106+
java -jar build/libs/picardcloud.jar CrosscheckFingerprints \
107+
I=gs://sample1.vcf \
108+
I=gs://sample2.vcf \
109+
CROSSCHECK_BY=FILE \
110+
H=Haplotype_db.txt \
111+
O=crosscheck.out
112+
```
113+
114+
Alternatively, you can run the tool via [GATK](https://software.broadinstitute.org/gatk/download/), which bundles the Google-Cloud
115+
jar, and should thus "Just Work".
116+
117+
#### GA4GH API
89118

90119
It's also possible to build a version of Picard that supports reading from
91120
GA4GH API, e.g. Google Genomics:

build.gradle

+23
Original file line numberDiff line numberDiff line change
@@ -48,26 +48,40 @@ jacoco {
4848
}
4949

5050
final htsjdkVersion = System.getProperty('htsjdk.version', '2.14.3')
51+
final googleNioVersion= "0.28.0-alpha:shaded"
5152

5253
// Get the jdk files we need to run javaDoc. We need to use these during compile, testCompile,
5354
// test execution, and gatkDoc generation, but we don't want them as part of the runtime
5455
// classpath and we don't want to redistribute them in the uber jar.
5556
final javadocJDKFiles = files(((URLClassLoader) ToolProvider.getSystemToolClassLoader()).getURLs())
5657

58+
configurations {
59+
cloudConfiguration {
60+
extendsFrom runtime
61+
}
62+
}
63+
5764
dependencies {
5865
compile('com.intel.gkl:gkl:0.8.2') {
5966
exclude module: 'htsjdk'
6067
}
6168
compile 'com.google.guava:guava:15.0'
6269
compile 'com.github.samtools:htsjdk:' + htsjdkVersion
6370
compile 'org.broadinstitute:barclay:2.0.0'
71+
configurations.cloudConfiguration {
72+
compile ('com.google.cloud:google-cloud-nio:' + googleNioVersion)
73+
}
74+
75+
compileOnly 'com.google.cloud:google-cloud-nio:' + googleNioVersion
6476

6577
// javadoc utilities; compile/test only to prevent redistribution of sdk jars
6678
compileOnly(javadocJDKFiles)
6779
testCompile(javadocJDKFiles)
6880

6981
testCompile 'org.testng:testng:6.9.10'
7082
testCompile 'org.apache.commons:commons-lang3:3.6'
83+
84+
7185
}
7286

7387
configurations.all {
@@ -192,6 +206,15 @@ shadowJar {
192206
}
193207
}
194208

209+
210+
211+
task cloudJar(type: com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar) {
212+
configurations = [project.configurations.cloudConfiguration]
213+
from project.sourceSets.main.output
214+
archiveName 'picardcloud.jar'
215+
}
216+
217+
195218
// Create picardBarclay.jar, which is identical to picard.jar, but contains a .properties
196219
// file that tells Picard to use the Barclay command line parser instead of the Picard
197220
// command line parser.

src/main/java/picard/cmdline/CommandLineProgram.java

+14-1
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
import picard.cmdline.argumentcollections.OptionalReferenceArgumentCollection;
5252
import picard.cmdline.argumentcollections.ReferenceArgumentCollection;
5353
import picard.cmdline.argumentcollections.RequiredReferenceArgumentCollection;
54+
import picard.nio.PathHelper;
5455
import picard.util.PropertyUtils;
5556

5657
import java.io.File;
@@ -243,21 +244,33 @@ public int instanceMain(final String[] argv) {
243244
System.setProperty("java.io.tmpdir", f.getAbsolutePath()); // in loop so that last one takes effect
244245
}
245246

247+
PathHelper.initilizeAll();
248+
246249
if (!QUIET) {
247250
System.err.println("[" + new Date() + "] " + commandLine);
248251

249252
// Output a one liner about who/where and what software/os we're running on
250253
try {
254+
final StringBuilder pathProvidersBuilder = new StringBuilder();
255+
256+
for (PathHelper.PathProviders providers : PathHelper.PathProviders.values()) {
257+
pathProvidersBuilder.append(String.format("Provider %s is%s available; ",
258+
providers.name(), providers.isAvailable() ? "" : " not"));
259+
}
260+
final int lastSpacePos = pathProvidersBuilder.lastIndexOf(" ");
261+
pathProvidersBuilder.delete(lastSpacePos, lastSpacePos + 1);
262+
251263
final boolean usingIntelDeflater = (BlockCompressedOutputStream.getDefaultDeflaterFactory() instanceof IntelDeflaterFactory &&
252264
((IntelDeflaterFactory)BlockCompressedOutputStream.getDefaultDeflaterFactory()).usingIntelDeflater());
253265
final boolean usingIntelInflater = (BlockGunzipper.getDefaultInflaterFactory() instanceof IntelInflaterFactory &&
254266
((IntelInflaterFactory)BlockGunzipper.getDefaultInflaterFactory()).usingIntelInflater());
255267
final String msg = String.format(
256-
"[%s] Executing as %s@%s on %s %s %s; %s %s; Deflater: %s; Inflater: %s; Picard version: %s",
268+
"[%s] Executing as %s@%s on %s %s %s; %s %s; Deflater: %s; Inflater: %s; %s Picard version: %s",
257269
new Date(), System.getProperty("user.name"), InetAddress.getLocalHost().getHostName(),
258270
System.getProperty("os.name"), System.getProperty("os.version"), System.getProperty("os.arch"),
259271
System.getProperty("java.vm.name"), System.getProperty("java.runtime.version"),
260272
usingIntelDeflater ? "Intel" : "Jdk", usingIntelInflater ? "Intel" : "Jdk",
273+
pathProvidersBuilder.toString(),
261274
getCommandLineParser().getVersion());
262275
System.err.println(msg);
263276
}

src/main/java/picard/fingerprint/CheckFingerprint.java

+44-31
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@
4747
import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup;
4848

4949
import java.io.File;
50+
import java.io.IOException;
51+
import java.nio.file.Path;
5052
import java.util.Collections;
5153
import java.util.List;
5254

@@ -169,7 +171,7 @@ public class CheckFingerprint extends CommandLineProgram {
169171
@Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input file SAM/BAM or VCF. If a VCF is used, " +
170172
"it must have at least one sample. If there are more than one samples in the VCF, the parameter OBSERVED_SAMPLE_ALIAS must " +
171173
"be provided in order to indicate which sample's data to use. If there are no samples in the VCF, an exception will be thrown.")
172-
public File INPUT;
174+
public String INPUT;
173175

174176
@Argument(optional = true, doc = "If the input is a VCF, this parameters used to select which sample's data in the VCF to use.")
175177
public String OBSERVED_SAMPLE_ALIAS;
@@ -187,7 +189,7 @@ public class CheckFingerprint extends CommandLineProgram {
187189

188190
@Argument(shortName = "G", doc = "File of genotypes (VCF) to be used in comparison. May contain " +
189191
"any number of genotypes; CheckFingerprint will use only those that are usable for fingerprinting.")
190-
public File GENOTYPES;
192+
public String GENOTYPES;
191193

192194
@Argument(shortName = "SAMPLE_ALIAS", optional = true, doc = "This parameter can be used to specify which sample's genotypes to use from the " +
193195
"expected VCF file (the GENOTYPES file). If it is not supplied, the sample name from the input " +
@@ -212,10 +214,8 @@ public class CheckFingerprint extends CommandLineProgram {
212214
public static final String FINGERPRINT_SUMMARY_FILE_SUFFIX = "fingerprinting_summary_metrics";
213215
public static final String FINGERPRINT_DETAIL_FILE_SUFFIX = "fingerprinting_detail_metrics";
214216

215-
// Stock main method
216-
public static void main(final String[] args) {
217-
new CheckFingerprint().instanceMainWithExit(args);
218-
}
217+
private Path inputPath;
218+
private Path genotypesPath;
219219

220220
@Override
221221
protected int doWork() {
@@ -229,28 +229,34 @@ protected int doWork() {
229229
outputSummaryMetricsFile = new File(OUTPUT + FINGERPRINT_SUMMARY_FILE_SUFFIX);
230230
}
231231

232-
IOUtil.assertFileIsReadable(INPUT);
232+
try {
233+
inputPath = IOUtil.getPath(INPUT);
234+
genotypesPath = IOUtil.getPath(GENOTYPES);
235+
} catch (IOException e) {
236+
throw new IllegalArgumentException(e);
237+
}
238+
IOUtil.assertFileIsReadable(inputPath);
233239
IOUtil.assertFileIsReadable(HAPLOTYPE_MAP);
234-
IOUtil.assertFileIsReadable(GENOTYPES);
240+
IOUtil.assertFileIsReadable(genotypesPath);
235241
IOUtil.assertFileIsWritable(outputDetailMetricsFile);
236242
IOUtil.assertFileIsWritable(outputSummaryMetricsFile);
237243

238244
final FingerprintChecker checker = new FingerprintChecker(HAPLOTYPE_MAP);
239245
List<FingerprintResults> results;
240246

241247
String observedSampleAlias = null;
242-
final boolean isBamOrSamFile = isBamOrSamFile(INPUT);
248+
final boolean isBamOrSamFile = isBamOrSam(inputPath);
243249
if (isBamOrSamFile) {
244-
SequenceUtil.assertSequenceDictionariesEqual(SAMSequenceDictionaryExtractor.extractDictionary(INPUT.toPath()), SAMSequenceDictionaryExtractor.extractDictionary(GENOTYPES.toPath()), true);
245-
SequenceUtil.assertSequenceDictionariesEqual(SAMSequenceDictionaryExtractor.extractDictionary(INPUT.toPath()), checker.getHeader().getSequenceDictionary(), true);
250+
SequenceUtil.assertSequenceDictionariesEqual(SAMSequenceDictionaryExtractor.extractDictionary(inputPath), SAMSequenceDictionaryExtractor.extractDictionary(genotypesPath), true);
251+
SequenceUtil.assertSequenceDictionariesEqual(SAMSequenceDictionaryExtractor.extractDictionary(inputPath), checker.getHeader().getSequenceDictionary(), true);
246252

247253
// Verify that there's only one sample in the SAM/BAM.
248-
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT.toPath());
254+
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(inputPath);
249255
for (final SAMReadGroupRecord rec : in.getFileHeader().getReadGroups()) {
250256
if (observedSampleAlias == null) {
251257
observedSampleAlias = rec.getSample();
252258
} else if (!observedSampleAlias.equals(rec.getSample())) {
253-
throw new PicardException("INPUT SAM/BAM file must not contain data from multiple samples.");
259+
throw new PicardException("inputPath SAM/BAM file must not contain data from multiple samples.");
254260
}
255261
}
256262
CloserUtil.close(in);
@@ -261,28 +267,28 @@ protected int doWork() {
261267
}
262268

263269
results = checker.checkFingerprints(
264-
Collections.singletonList(INPUT),
265-
Collections.singletonList(GENOTYPES),
270+
Collections.singletonList(inputPath),
271+
Collections.singletonList(genotypesPath),
266272
EXPECTED_SAMPLE_ALIAS,
267273
IGNORE_READ_GROUPS);
268274
} else { // Input is a VCF
269275
// Note that FingerprintChecker.loadFingerprints() verifies that the VCF's Sequence Dictionaries agree with that of the Haplotype Map File
270276

271277
// Verify that there is only one sample in the VCF
272-
final VCFFileReader fileReader = new VCFFileReader(INPUT, false);
278+
final VCFFileReader fileReader = new VCFFileReader(inputPath, false);
273279
final VCFHeader fileHeader = fileReader.getFileHeader();
274280
if (fileHeader.getNGenotypeSamples() < 1) {
275-
throw new PicardException("INPUT VCF file must contain at least one sample.");
281+
throw new PicardException("inputPath VCF file must contain at least one sample.");
276282
}
277283
if ((fileHeader.getNGenotypeSamples() > 1) && (OBSERVED_SAMPLE_ALIAS == null)) {
278-
throw new PicardException("INPUT VCF file contains multiple samples and yet the OBSERVED_SAMPLE_ALIAS parameter is not set.");
284+
throw new PicardException("inputPath VCF file contains multiple samples and yet the OBSERVED_SAMPLE_ALIAS parameter is not set.");
279285
}
280286
// set observedSampleAlias to the parameter, if set. Otherwise, if here, this must be a single sample VCF, get its sample
281287
observedSampleAlias = (OBSERVED_SAMPLE_ALIAS != null) ? OBSERVED_SAMPLE_ALIAS : fileHeader.getGenotypeSamples().get(0);
282288

283289
// Now verify that observedSampleAlias is, in fact, in the VCF
284290
if (!fileHeader.getGenotypeSamples().contains(observedSampleAlias)) {
285-
throw new PicardException("INPUT VCF file does not contain OBSERVED_SAMPLE_ALIAS: " + observedSampleAlias);
291+
throw new PicardException("inputPath VCF file does not contain OBSERVED_SAMPLE_ALIAS: " + observedSampleAlias);
286292
}
287293

288294
if (OBSERVED_SAMPLE_ALIAS == null) {
@@ -295,9 +301,9 @@ protected int doWork() {
295301
EXPECTED_SAMPLE_ALIAS = observedSampleAlias;
296302
}
297303

298-
results = checker.checkFingerprints(
299-
Collections.singletonList(INPUT),
300-
Collections.singletonList(GENOTYPES),
304+
results = checker.checkFingerprintsFromPaths(
305+
Collections.singletonList(inputPath),
306+
Collections.singletonList(genotypesPath),
301307
observedSampleAlias,
302308
EXPECTED_SAMPLE_ALIAS);
303309
}
@@ -368,19 +374,26 @@ protected int doWork() {
368374
}
369375

370376
protected String[] customCommandLineValidation() {
371-
IOUtil.assertFileIsReadable(INPUT);
372377

373-
boolean isBamOrSamFile = isBamOrSamFile(INPUT);
374-
if (!isBamOrSamFile && IGNORE_READ_GROUPS) {
375-
return new String[]{"The parameter IGNORE_READ_GROUPS can only be used with BAM/SAM inputs."};
376-
}
377-
if (isBamOrSamFile && OBSERVED_SAMPLE_ALIAS != null) {
378-
return new String[]{"The parameter OBSERVED_SAMPLE_ALIAS can only be used with a VCF input."};
378+
try {
379+
final boolean isBamOrSamFile = isBamOrSam(IOUtil.getPath(INPUT));
380+
if (!isBamOrSamFile && IGNORE_READ_GROUPS) {
381+
return new String[]{"The parameter IGNORE_READ_GROUPS can only be used with BAM/SAM inputs."};
382+
}
383+
if (isBamOrSamFile && OBSERVED_SAMPLE_ALIAS != null) {
384+
return new String[]{"The parameter OBSERVED_SAMPLE_ALIAS can only be used with a VCF input."};
385+
}
386+
} catch (IOException e) {
387+
e.printStackTrace();
379388
}
380389
return super.customCommandLineValidation();
381390
}
382391

383-
static boolean isBamOrSamFile(final File f) {
384-
return (BamFileIoUtils.isBamFile(f) || f.getName().endsWith(IOUtil.SAM_FILE_EXTENSION));
392+
static boolean isBamOrSam(final File f) {
393+
return isBamOrSam(f.toPath());
394+
}
395+
396+
static boolean isBamOrSam(final Path p) {
397+
return (p.toUri().getRawPath().endsWith(BamFileIoUtils.BAM_FILE_EXTENSION) || p.toUri().getRawPath().endsWith(IOUtil.SAM_FILE_EXTENSION));
385398
}
386399
}

0 commit comments

Comments
 (0)