From 4af10d9d23276369aa56d0191723d7f37d05068a Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 15 Mar 2018 14:28:19 +0000 Subject: [PATCH 01/27] Support for custom -i/-e perl script filtering This feature can be enabled by supplying the --enable-perl-filters option to configure at compile time. Note this changes the license from MIT to GPL, therefore it is not enabled by default. An example can be found in misc/demo-flt.pl, use for example as bcftools query -f '%CHROM:%POS \t %CSQ\n' -i 'perl:misc/demo-flt.pl; perl.severity(INFO/CSQ) > 10' --- INSTALL | 15 +++ Makefile | 5 +- config.mk.in | 2 + configure.ac | 19 +++ doc/bcftools.txt | 32 +++-- filter.c | 269 ++++++++++++++++++++++++++++++++++--- misc/demo-flt.pl | 82 +++++++++++ plugins/fill-from-fasta.mk | 2 +- plugins/setGT.mk | 2 +- test/perl-flt.vcf | 41 ++++++ 10 files changed, 439 insertions(+), 30 deletions(-) create mode 100644 misc/demo-flt.pl create mode 100644 test/perl-flt.vcf diff --git a/INSTALL b/INSTALL index 82493fcee..2f9046acf 100644 --- a/INSTALL +++ b/INSTALL @@ -76,6 +76,21 @@ to an HTSlib source tree or installation in DIR; or --with-htslib=system to use a system-installed HTSlib. +Optional Compilation with Perl +============================== + +The '-i' and '-e' options can take external perl scripts for a more +sophisticated filtering. This option can be enabled by supplying the +--enable-perl-filters option to configure before running make: + + ./configure --enable-perl-filters + +Note that enabling this option changes the license from MIT to GPL +because bcftools need to be built with + + perl -MExtUtils::Embed -e ccopts -e ldopts + + Optional Compilation with GSL ============================= diff --git a/Makefile b/Makefile index 5cb007431..d254feacb 100644 --- a/Makefile +++ b/Makefile @@ -135,7 +135,7 @@ ifdef USE_GPL endif bcftools: $(OBJS) $(HTSLIB) - $(CC) $(DYNAMIC_FLAGS) -pthread $(ALL_LDFLAGS) -o $@ $(OBJS) $(HTSLIB_LIB) -lm $(ALL_LIBS) $(GSL_LIBS) + $(CC) $(DYNAMIC_FLAGS) -pthread $(ALL_LDFLAGS) -o $@ $(OBJS) $(HTSLIB_LIB) -lm $(ALL_LIBS) $(GSL_LIBS) $(PERL_LIBS) # Plugin rules ifneq "$(PLUGINS_ENABLED)" "no" @@ -212,7 +212,8 @@ ccall.o: ccall.c $(htslib_kfunc_h) $(call_h) kmin.h $(prob1_h) convert.o: convert.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(convert_h) tsv2vcf.o: tsv2vcf.c $(tsv2vcf_h) em.o: em.c $(htslib_vcf_h) kmin.h $(call_h) -filter.o: filter.c $(htslib_khash_str2int_h) $(filter_h) $(bcftools_h) $(htslib_hts_defs_h) $(htslib_vcfutils_h) +filter.o: filter.c config.h $(htslib_khash_str2int_h) $(filter_h) $(bcftools_h) $(htslib_hts_defs_h) $(htslib_vcfutils_h) + $(CC) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CPPFLAGS) $(PERL_CFLAGS) -c -o $@ $< gvcf.o: gvcf.c gvcf.h $(call_h) kmin.o: kmin.c kmin.h mcall.o: mcall.c $(htslib_kfunc_h) $(call_h) diff --git a/config.mk.in b/config.mk.in index 8b543687a..7e0c84061 100644 --- a/config.mk.in +++ b/config.mk.in @@ -45,6 +45,8 @@ DYNAMIC_FLAGS = @CC_RDYNAMIC_OPT@ USE_GPL = @USE_GPL@ GSL_LIBS = @GSL_LIBS@ +PERL_CFLAGS = @PERL_CCOPTS@ +PERL_LIBS = @PERL_LIBS@ PLATFORM = @PLATFORM@ PLUGINS_ENABLED = @enable_bcftools_plugins@ diff --git a/configure.ac b/configure.ac index 256264f74..a6adcc808 100644 --- a/configure.ac +++ b/configure.ac @@ -66,6 +66,12 @@ AC_ARG_WITH([bcf-plugin-path], [with_bcf_plugin_path=$with_bcf_plugin_dir]) AC_SUBST([PLUGINPATH], $with_bcf_plugin_path) +AC_ARG_ENABLE([perl-filters], + [AS_HELP_STRING([--enable-perl-filters], + [do not build support for PERL scripts in -i/-e filtering expressions])], + [], [enable_perl_filters=no]) +AC_SUBST(enable_perl_filters) + AC_ARG_ENABLE([libgsl], [AS_HELP_STRING([--enable-libgsl], [build options that require the GNU scientific library (changes bcftools license to GPL, see INSTALL for details)])], @@ -199,6 +205,19 @@ Either configure with --disable-libgsl or resolve this error to build bcftools. AC_SUBST([GSL_LIBS]) ]) +AS_IF([test "$enable_perl_filters" != "no" ], [dnl + if ! perl -MExtUtils::Embed -e ccopts -e ldopts &>/dev/null ; then + AC_MSG_ERROR([Cannot `perl -MExtUtils::Embed -e ccopts -e ldopts`]) + fi + AC_DEFINE([ENABLE_PERL_FILTERS], 1, [Define if BCFtools should enable for support PERL scripts in -i/-e filtering expressions.]) + PERL_CCOPTS="`perl -MExtUtils::Embed -e ccopts`" + PERL_LIBS="`perl -MExtUtils::Embed -e ldopts`" + AC_SUBST([PERL_CCOPTS]) + AC_SUBST([PERL_LIBS]) + USE_GPL=1 + AC_SUBST([USE_GPL]) +]) + AC_CONFIG_FILES([config.mk]) AC_OUTPUT diff --git a/doc/bcftools.txt b/doc/bcftools.txt index 94c085f58..deebfd1c4 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -1255,7 +1255,10 @@ And similarly here, the second is filtered: [[gtcheck]] === bcftools gtcheck ['OPTIONS'] [*-g* 'genotypes.vcf.gz'] 'query.vcf.gz' -Checks sample identity or, without *-g*, multi-sample cross-check is performed. +Checks sample identity. The program can operate in two modes. If the *-g* +option is given, the identity of the *-s* sample from 'query.vcf.gz' +is checked against the samples in the *-g* file. +Without the *-g* option, multi-sample cross-check of samples in 'query.vcf.gz' is performed. *-a, --all-sites*:: output for all sites @@ -1319,17 +1322,16 @@ Checks sample identity or, without *-g*, multi-sample cross-check is performed. *-G*, the value '1' can be used instead; the discordance value then gives exactly the number of differing genotypes. - SM, Average Discordance;; - Average discordance between sample 'a' and all other samples. + ERR, error rate;; + Pairwise error rate calculated as number of differences divided + by the total number of comparisons. - SM, Average Depth;; - Average depth at evaluated sites, or 1 if FORMAT/DP field is not - present. - - SM, Average Number of sites;; - The average number of sites used to calculate the discordance. In - other words, the average number of non-missing PLs/genotypes seen - both samples. + CLUSTER, TH, DOT;; + In presence of multiple samples, related samples and outliers can be + identified by clustering samples by error rate. A simple hierarchical + clustering based on minimization of standard deviation is used. This is + useful to detect sample swaps, for example in situations where one + sample has been sequenced in multiple runs. // MD, Maximum Deviation;; // The maximum absolute deviation from average score of the sample @@ -2769,6 +2771,14 @@ number of samples with missing genotype; fraction of samples with missing genoty N_ALT, N_SAMPLES, AC, MAC, AF, MAF, AN, N_MISSING, F_MISSING +* custom perl filtering. Note that this command is not compiled in by default, see +the section *Optional Compilation with Perl* in the INSTALL file for help +and misc/demo-flt.pl for a working example. The demo defined the perl subroutine +"severity" which can be invoked from the command line as follows: + + perl:path/to/script.pl; perl.severity(INFO/CSQ) > 3 + + .Notes: diff --git a/filter.c b/filter.c index 53e8dc774..e3a9cb759 100644 --- a/filter.c +++ b/filter.c @@ -27,18 +27,29 @@ THE SOFTWARE. */ #include #include #include -#include #include #include -#include "filter.h" -#include "bcftools.h" #include #include +#include "config.h" +#include "filter.h" +#include "bcftools.h" + +#if ENABLE_PERL_FILTERS +# define filter_t perl_filter_t +# include +# include +# undef filter_t +# define my_perl perl +#endif + #ifndef __FUNCTION__ # define __FUNCTION__ __func__ #endif +static filter_ninit = 0; + uint64_t bcf_double_missing = 0x7ff0000000000001; uint64_t bcf_double_vector_end = 0x7ff0000000000002; static inline void bcf_double_set(double *ptr, uint64_t value) @@ -63,6 +74,7 @@ typedef struct _token_t { // read-only values, same for all VCF lines int tok_type; // one of the TOK_* keys below + int nargs; // used only TOK_PERLSUB, the first argument is the name of the subroutine char *key; // set only for string constants, otherwise NULL char *tag; // for debugging and printout only, VCF tag name double threshold; // filtering threshold @@ -100,6 +112,9 @@ struct _filter_t float *tmpf; kstring_t tmps; int max_unpack, mtmpi, mtmpf, nsamples; +#if ENABLE_PERL_FILTERS + PerlInterpreter *perl; +#endif }; @@ -130,11 +145,12 @@ struct _filter_t #define TOK_LEN 24 #define TOK_FUNC 25 #define TOK_CNT 26 +#define TOK_PERLSUB 27 -// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 -// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l -static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8}; -#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^f" +// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 +// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p +static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8}; +#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcp" static int filters_next_token(char **str, int *len) { @@ -169,6 +185,7 @@ static int filters_next_token(char **str, int *len) if ( !strncasecmp(tmp,"INFO/",5) ) tmp += 5; if ( !strncasecmp(tmp,"FORMAT/",7) ) tmp += 7; if ( !strncasecmp(tmp,"FMT/",4) ) tmp += 4; + if ( !strncasecmp(tmp,"PERL.",5) ) { (*str) += 5; return TOK_PERLSUB; } if ( tmp[0]=='@' ) // file name { @@ -250,6 +267,53 @@ static int filters_next_token(char **str, int *len) return TOK_VAL; } + +/* + Simple path expansion, expands ~/, ~user, $var. The result must be freed by the caller. + + Based on jkb's staden code with some adjustements. + https://sourceforge.net/p/staden/code/HEAD/tree/staden/trunk/src/Misc/getfile.c#l123 +*/ +char *expand_path(char *path) +{ +#ifdef _WIN32 + return strdup(path); // windows expansion: todo +#endif + + kstring_t str = {0,0,0}; + int i; + + if ( path[0] == '~' ) + { + if ( !path[1] || path[1] == '/' ) + { + // ~ or ~/path + kputs(getenv("HOME"), &str); + if ( path[1] ) kputs(path+1, &str); + } + else + { + // user name: ~pd3/path + char *end = path; + while ( *end && *end!='/' ) end++; + kputsn(path+1, end-path-1, &str); + struct passwd *pwentry = getpwnam(str.s); + str.l = 0; + + if ( !pwentry ) kputsn(path, end-path, &str); + else kputs(pwentry->pw_dir, &str); + kputs(end, &str); + } + return str.s; + } + if ( path[0] == '$' ) + { + char *var = getenv(path+1); + if ( var ) path = var; + } + return strdup(path); +} + static void filters_set_qual(filter_t *flt, bcf1_t *line, token_t *tok) { float *ptr = &line->qual; @@ -1711,13 +1775,11 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) tok->tag = (char*) calloc(len+1,sizeof(char)); memcpy(tok->tag,str,len); tok->tag[len] = 0; - wordexp_t wexp; - wordexp(tok->tag+1, &wexp, 0); - if ( !wexp.we_wordc ) error("No such file: %s\n", tok->tag+1); + char *fname = expand_path(tok->tag); int i, n; - char **list = hts_readlist(wexp.we_wordv[0], 1, &n); - if ( !list ) error("Could not read: %s\n", wexp.we_wordv[0]); - wordfree(&wexp); + char **list = hts_readlist(fname, 1, &n); + if ( !list ) error("Could not read: %s\n", fname); + free(fname); tok->hash = khash_str2int_init(); for (i=0; inargs == nstack ); + PerlInterpreter *perl = flt->perl; + + dSP; + ENTER; + SAVETMPS; + + PUSHMARK(SP); + int i,j; + for (i=1; iis_str ) + XPUSHs(sv_2mortal(newSVpvn(tok->str_value.s,tok->str_value.l))); + else if ( tok->nvalues==1 ) + XPUSHs(sv_2mortal(newSVnv(tok->values[0]))); + else if ( tok->nvalues>1 ) + { + AV *av = newAV(); + for (j=0; jnvalues; j++) av_push(av, newSVnv(tok->values[j])); + SV *rv = newRV_inc((SV*)av); + XPUSHs(rv); + } + else + { + bcf_double_set_missing(tok->values[0]); + XPUSHs(sv_2mortal(newSVnv(tok->values[0]))); + } + } + PUTBACK; + + // A possible future todo: provide a means to select samples and indexes, + // expressions like this don't work yet + // perl.filter(FMT/AD)[1:0] + + int nret = call_pv(stack[0]->str_value.s, G_ARRAY); + + SPAGAIN; + + rtok->nvalues = nret; + hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values); + for (i=nret; i>0; i--) + { + rtok->values[i-1] = (double) POPn; + if ( isnan(rtok->values[i-1]) ) bcf_double_set_missing(rtok->values[i-1]); + } + + PUTBACK; + FREETMPS; + LEAVE; + +#else + error("\nPerl filtering requires running `configure --enable-perl-filters` at compile time.\n\n"); +#endif + return nstack; +} +static void perl_init(filter_t *filter, char **str) +{ + char *beg = *str; + while ( *beg && isspace(*beg) ) beg++; + if ( !*beg ) return; + if ( strncasecmp("perl:", beg, 5) ) return; +#if ENABLE_PERL_FILTERS + beg += 5; + char *end = beg; + while ( *end && *end!=';' ) end++; // for now not escaping semicolons + *str = end+1; + + if ( ++filter_ninit == 1 ) + { + // must be executed only once, even for multiple filters; first time here + PERL_SYS_INIT3(0, NULL, NULL); + } + + filter->perl = perl_alloc(); + PerlInterpreter *perl = filter->perl; + + if ( !perl ) error("perl_alloc failed\n"); + perl_construct(perl); + + // name of the perl script to run + char *rmme = (char*) calloc(end - beg + 1,1); + memcpy(rmme, beg, end - beg); + char *argv[] = { "", "" }; + argv[1] = expand_path(rmme); + free(rmme); + + PL_origalen = 1; // don't allow $0 change + int ret = perl_parse(filter->perl, NULL, 2, argv, NULL); + PL_exit_flags |= PERL_EXIT_DESTRUCT_END; + if ( ret ) error("Failed to parse: %s\n", argv[1]); + free(argv[1]); + + perl_run(perl); +#else + error("\nPerl filtering requires running `configure --enable-perl-filters` at compile time.\n\n"); +#endif +} +static void perl_destroy(filter_t *filter) +{ +#if ENABLE_PERL_FILTERS + if ( !filter->perl ) return; + + PerlInterpreter *perl = filter->perl; + perl_destruct(perl); + perl_free(perl); + if ( --filter_ninit <= 0 ) + { + // similarly to PERL_SYS_INIT3, can must be executed only once? todo: test + PERL_SYS_TERM(); + } +#endif +} // Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm @@ -2008,6 +2187,8 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) int nout = 0, mout = 0; // filter tokens, RPN token_t *out = NULL; char *tmp = filter->str; + perl_init(filter, &tmp); + int last_op = -1; while ( *tmp ) { @@ -2016,7 +2197,7 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) if ( ret==-1 ) error("Missing quotes in: %s\n", str); // fprintf(stderr,"token=[%c] .. [%s] %d\n", TOKEN_STRING[ret], tmp, len); - // int i; for (i=0; ithreshold = -1.0; ret = TOK_MULT; } + else if ( ret==TOK_PERLSUB ) + { + tmp += len; + char *beg = tmp; + while ( *beg && ((isalnum(*beg) && !ispunct(*beg)) || *beg=='_') ) beg++; + if ( *beg!='(' ) error("Could not parse the expression: %s\n", str); + char *end = beg; + while ( *end && *end!=')' ) end++; + if ( !*end ) error("Could not parse the expression: %s\n", str); + kstring_t rmme = {0,0,0}; + + // the subroutine name + kputc('"', &rmme); + kputsn(tmp, beg-tmp, &rmme); + kputc('"', &rmme); + nout++; + hts_expand0(token_t, nout, mout, out); + filters_init1(filter, rmme.s, rmme.l, &out[nout-1]); + + // subroutine arguments + rmme.l = 0; + kputsn(beg+1, end-beg-1, &rmme); + int i, nargs; + char **rmme_list = hts_readlist(rmme.s, 0, &nargs); + for (i=0; itok_type = TOK_PERLSUB; + tok->nargs = nargs + 1; + tok->hdr_id = -1; + tok->pass_site = -1; + tok->threshold = -1.0; + + tmp = end + 1; + continue; + } else { while ( nops>0 && op_prec[ret] < op_prec[ops[nops-1]] ) @@ -2073,7 +2300,10 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) { nout++; hts_expand0(token_t, nout, mout, out); - filters_init1(filter, tmp, len, &out[nout-1]); + if ( tmp[len-1]==',' ) + filters_init1(filter, tmp, len-1, &out[nout-1]); + else + filters_init1(filter, tmp, len, &out[nout-1]); tmp += len; } last_op = ret; @@ -2240,6 +2470,7 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) void filter_destroy(filter_t *filter) { + perl_destroy(filter); int i; for (i=0; infilters; i++) { @@ -2300,6 +2531,14 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples) if ( --nargs > 0 ) nstack -= nargs; continue; } + else if ( filter->filters[i].tok_type == TOK_PERLSUB ) + { + int nargs = filter->filters[i].nargs; + perl_exec(filter, line, &filter->filters[i], filter->flt_stack, nstack); + filter->flt_stack[nstack-nargs] = &filter->filters[i]; + nstack -= nargs - 1; + continue; + } if ( nstack<2 ) error("Error occurred while processing the filter \"%s\" (1:%d)\n", filter->str,nstack); // too few values left on the stack diff --git a/misc/demo-flt.pl b/misc/demo-flt.pl new file mode 100644 index 000000000..802dd7e77 --- /dev/null +++ b/misc/demo-flt.pl @@ -0,0 +1,82 @@ +# This demo code shows how to write a custom perl code and use it in +# the -i/-e filtering expressions. +# +# In this example we want to take variant consequences generated by `bcftools csq`, +# rank them by severity, filter by the most severe, and print the list using the +# following command: +# +# bcftools query -f '%CHROM:%POS \t %CSQ\n' -i 'perl:misc/demo-flt.pl; perl.severity(INFO/CSQ) > 10' test/perl-flt.vcf +# + + +# There can be multiple subroutines in the same script and they +# can be referenced from the command line by prefixing "perl.subroutine_name()" +# +sub severity +{ + # Arbitrary number of arguments can be provided. + my (@args) = @_; + + # Note that string arrays are passed to perl in the form of a single + # comma-separated string, but numeric arrays must be dereferenced, for + # example as shown in this example: + # + # for my $arg (@args) + # { + # if ( ref($arg) eq 'ARRAY' ) { print "array: ".join(',',@$arg)."\n"; } + # else { print "scalar: $arg\n"; } + # } + # + + # In our case there should be only one parameter passed to the subroutine; + # check if this is the case + if ( scalar @args != 1 ) { error("Incorrect use, expected one argument, got ".scalar @args."!\n"); } + + # Create a lookup table from consequences to an arbitrary severity score + my %severity = + ( + "intergenic" => 1, + "intron" => 2, + "non_coding" => 3, + "5_prime_utr" => 4, + "3_prime_utr" => 5, + "stop_retained" => 6, + "synonymous" => 7, + "coding_sequence" => 8, + "missense" => 9, + "splice_region" => 10, + "inframe_altering" => 11, + "inframe_deletion" => 12, + "inframe_insertion" => 13, + "splice_acceptor" => 14, + "splice_donor" => 15, + "stop_lost" => 16, + "stop_gained" => 17, + "start_lost" => 18, + "frameshift" => 19, + ); + + # Split multiple consequences into an array + my @csq = split(/,/, $args[0]); + + # Find the most severe consequence. The consequence string may look like this: + # inframe_deletion|ABC|ENST00000000002|protein_coding|-|5YV>5T|144TAC>T+148TA>T + my $max = 0; + for my $csq (@csq) + { + my @fields = split(/\|/, $csq); + my $string = $fields[0]; + my $score = exists($severity{$string}) ? $severity{$string} : 0; + if ( $max < $score ) { $max = $score; } + } + + return $max; +} + +sub error +{ + my (@msg) = @_; + print STDERR @msg; + exit; +} + diff --git a/plugins/fill-from-fasta.mk b/plugins/fill-from-fasta.mk index 688df7124..8014961ce 100644 --- a/plugins/fill-from-fasta.mk +++ b/plugins/fill-from-fasta.mk @@ -1,2 +1,2 @@ plugins/fill-from-fasta.so: plugins/fill-from-fasta.c version.h version.c filter.h filter.c - $(CC) $(PLUGIN_FLAGS) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CPPFLAGS) $(LDFLAGS) -o $@ filter.c version.c $< $(LIBS) + $(CC) $(PLUGIN_FLAGS) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CPPFLAGS) $(PERL_CFLAGS) $(LDFLAGS) -o $@ filter.c version.c $< $(LIBS) diff --git a/plugins/setGT.mk b/plugins/setGT.mk index 75afe5122..2d97acfff 100644 --- a/plugins/setGT.mk +++ b/plugins/setGT.mk @@ -1,2 +1,2 @@ plugins/setGT.so: plugins/setGT.c version.h version.c filter.h filter.c - $(CC) $(PLUGIN_FLAGS) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CPPFLAGS) $(LDFLAGS) -o $@ filter.c version.c $< $(LIBS) + $(CC) $(PLUGIN_FLAGS) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CPPFLAGS) $(PERL_CFLAGS) $(LDFLAGS) -o $@ filter.c version.c $< $(LIBS) diff --git a/test/perl-flt.vcf b/test/perl-flt.vcf new file mode 100644 index 000000000..fe598af4c --- /dev/null +++ b/test/perl-flt.vcf @@ -0,0 +1,41 @@ +##fileformat=VCFv4.2 +##FORMAT= +##reference=file://some/path/human_g1k_v37.fasta +##INFO= +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SmplAAA SmplBBB +1 90 . C T . . CSQ=5_prime_utr|XYZ|ENST00000000001|protein_coding GT 1|0 1|1 +1 102 . C T 1 . CSQ=synonymous|XYZ|ENST00000000001|protein_coding|+|1Y|102C>T GT 1|0 1|0 +1 103 . G A 1 . CSQ=missense|XYZ|ENST00000000001|protein_coding|+|2V>2I|103G>A GT 1|0 0|0 +1 103 . G C 1 . CSQ=missense|XYZ|ENST00000000001|protein_coding|+|2V>2L|103G>C GT 0|0 1|0 +1 107 . G A 1 . CSQ=missense|XYZ|ENST00000000001|protein_coding|+|3R>3Q|107G>A+108T>A GT 1|0 1|0 +1 108 . T A 1 . CSQ=splice_region|XYZ|ENST00000000001|protein_coding,@107 GT 1|0 1|0 +1 121 . ACG A . . CSQ=inframe_deletion|XYZ|ENST00000000001|protein_coding|+|5TY>5I|121ACG>A+124TA>T,splice_region|XYZ|ENST00000000001|protein_coding GT 1|0 1|0 +1 124 . TA T . . CSQ=@121 GT 1|0 1|0 +1 128 . T C 1 . CSQ=missense|XYZ|ENST00000000001|protein_coding|+|7V>6A|128T>C+129A>C,splice_region|XYZ|ENST00000000001|protein_coding GT 1|0 0/0 +1 129 . A C 1 . CSQ=splice_region|XYZ|ENST00000000001|protein_coding,@128 GT 1|0 0/0 +1 140 . TA AACG . . CSQ=inframe_insertion|XYZ|ENST00000000001|protein_coding|+|8LR>7QRR|140TA>AACG+142C>CC,splice_region|XYZ|ENST00000000001|protein_coding GT 1|0 0|0 +1 142 . C CC . . CSQ=splice_region|XYZ|ENST00000000001|protein_coding,@140 GT 1|0 0|0 +1 145 . AC TA . . CSQ=stop_gained|XYZ|ENST00000000001|protein_coding|+|10T>10*|145AC>TA GT 1|0 0|0 +1 160 . TA T . . CSQ=*frameshift|XYZ|ENST00000000001|protein_coding|+|12YVRT>12SYV|160TA>T,splice_region|XYZ|ENST00000000001|protein_coding GT 1|0 0|0 +1 190 . C T . . CSQ=3_prime_utr|XYZ|ENST00000000001|protein_coding GT 1|0 0|0 +2 97 . A C . . CSQ=3_prime_utr|ABC|ENST00000000002|protein_coding GT 1|0 0|0 +2 105 . AC A . . CSQ=@121 GT 1|0 0|0 +2 121 . AC A . . CSQ=frameshift|ABC|ENST00000000002|protein_coding|-|11VVRTY>11*|105AC>A+121AC>A,splice_region|ABC|ENST00000000002|protein_coding GT 1|0 0|0 +2 126 . C CTT . . CSQ=@127 GT 1|0 0|0 +2 127 . G GG . . CSQ=inframe_insertion|ABC|ENST00000000002|protein_coding|-|9T>8TK|126C>CTT+127G>GG GT 1|0 0|0 +2 144 . TAC T . . CSQ=@148 GT 1|0 0|0 +2 148 . TA T . . CSQ=inframe_deletion|ABC|ENST00000000002|protein_coding|-|5YV>5T|144TAC>T+148TA>T,splice_region|ABC|ENST00000000002|protein_coding GT 1|0 0|0 +2 164 . T G . . CSQ=missense|ABC|ENST00000000002|protein_coding|-|3T>3P|164T>G GT 1|0 0|0 +2 165 . A C . . CSQ=synonymous|ABC|ENST00000000002|protein_coding|-|2R|165A>C GT 1|0 0|0 +2 169 . A G . . CSQ=@170 GT 1|0 0|0 +2 170 . C T . . CSQ=missense|ABC|ENST00000000002|protein_coding|-|1V>1T|169A>G+170C>T GT 1|0 0|0 +2 199 . G T . . CSQ=5_prime_utr|ABC|ENST00000000002|protein_coding GT 1|0 0|0 +3 20 . T A . . CSQ=non_coding|mir-007||lincRNA GT 1|0 0|0 +3 109 . ACGTACGT A 1 . CSQ=splice_acceptor|QWRTY|ENST00000000003|protein_coding GT 1|0 0|0 +3 113 . A T . . CSQ=splice_region|QWRTY|ENST00000000003|protein_coding GT 1|0 0|0 +3 120 . T A . . CSQ=intron|QWRTY||protein_coding GT 1|0 0|0 +3 152 . T A . . CSQ=splice_region|QWRTY|ENST00000000003|protein_coding GT 1|0 0|0 +3 159 . G A . . CSQ=splice_donor|QWRTY|ENST00000000003|protein_coding GT 1|0 0|0 From b41f87bb11f4d57fdbbc15c77946a600edf6f989 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 15 Mar 2018 15:09:05 +0000 Subject: [PATCH 02/27] Correctly transfer Number=G annotations, the number of alleles may be different and they can come in different order --- test/annotate11.vcf | 11 ++ test/annotate15.out | 17 +++ test/annots11.vcf | 15 +++ vcfannotate.c | 255 +++++++++++++++++++++++++++++++++++++++++++- vcmp.c | 27 ++++- vcmp.h | 1 + 6 files changed, 322 insertions(+), 4 deletions(-) create mode 100644 test/annotate11.vcf create mode 100644 test/annotate15.out create mode 100644 test/annots11.vcf diff --git a/test/annotate11.vcf b/test/annotate11.vcf new file mode 100644 index 000000000..1da05f8c4 --- /dev/null +++ b/test/annotate11.vcf @@ -0,0 +1,11 @@ +##fileformat=VCFv4.2 +##contig= +##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT B A +1 3000001 . C T . . . GT . . +1 3000002 . C T . . . GT 1|1 0 +1 3000003 . C CT,CTT . . . GT 1|1 0 +X 3000001 . C T . . . GT . . +X 3000002 . C T . . . GT 1|1 0 +X 3000003 . C CT,CTT . . . GT 1|1 0 diff --git a/test/annotate15.out b/test/annotate15.out new file mode 100644 index 000000000..e3365edb4 --- /dev/null +++ b/test/annotate15.out @@ -0,0 +1,17 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta +##FORMAT= +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT B A +1 3000001 . C T . . . GT:AD:AF:PL:GL .:8,9:8,9:10,11,12:10,11,12 .:1,2:1,2:3,4:3,4 +1 3000002 . C T . . . GT:AD:AF:PL:GL 0/1:10,12:10,12:10,13,15:10,13,15 1/0:1,3:1,3:1,4,6:1,4,6 +1 3000003 . C CT,CTT . . . GT:AD:AF:PL:GL 0/1:4,.,5:4,.,5:4,.,.,5,.,6:4,.,.,5,.,6 1/0:1,.,2:1,.,2:1,.,.,2,.,3:1,.,.,2,.,3 +X 3000001 . C T . . . GT:AD:AF:PL:GL .:8,9:8,9:10,11,12:10,11,12 .:1,2:1,2:3,4:3,4 +X 3000002 . C T . . . GT:AD:AF:PL:GL 0/1:10,12:10,12:10,13,15:10,13,15 0:1,3:1,3:1,3:1,3 +X 3000003 . C CT,CTT . . . GT:AD:AF:PL:GL 0/1:4,.,5:4,.,5:4,.,.,5,.,6:4,.,.,5,.,6 0:1,.,2:1,.,2:1,.,2:1,.,2 diff --git a/test/annots11.vcf b/test/annots11.vcf new file mode 100644 index 000000000..2a81bfa87 --- /dev/null +++ b/test/annots11.vcf @@ -0,0 +1,15 @@ +##fileformat=VCFv4.2 +##contig= +##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B +1 3000001 . C T . . . GT:AD:AF:PL:GL .:1,2:1,2:3,4:3,4 .:8,9:8,9:10,11,12:10,11,12 +1 3000002 . C CT,T . . . GT:AD:AF:PL:GL 1/0:1,2,3:1,2,3:1,2,3,4,5,6:1,2,3,4,5,6 0/1:10,11,12:10,11,12:10,11,12,13,14,15:10,11,12,13,14,15 +1 3000003 . C CTT . . . GT:AD:AF:PL:GL 1/0:1,2:1,2:1,2,3:1,2,3 0/1:4,5:4,5:4,5,6:4,5,6 +X 3000001 . C T . . . GT:AD:AF:PL:GL .:1,2:1,2:3,4:3,4 .:8,9:8,9:10,11,12:10,11,12 +X 3000002 . C CT,T . . . GT:AD:AF:PL:GL 0:1,2,3:1,2,3:1,2,3:1,2,3 0/1:10,11,12:10,11,12:10,11,12,13,14,15:10,11,12,13,14,15 +X 3000003 . C CTT . . . GT:AD:AF:PL:GL 0:1,2:1,2:1,2:1,2 0/1:4,5:4,5:4,5,6:4,5,6 diff --git a/vcfannotate.c b/vcfannotate.c index abea98f90..3de9eb5cf 100644 --- a/vcfannotate.c +++ b/vcfannotate.c @@ -114,6 +114,7 @@ typedef struct _args_t int nsmpl_annot; int *sample_map, nsample_map, sample_is_file; // map[idst] -> isrc + uint8_t *src_smpl_pld, *dst_smpl_pld; // for Number=G format fields int mtmpi, mtmpf, mtmps; int mtmpi2, mtmpf2, mtmps2; int mtmpi3, mtmpf3, mtmps3; @@ -1113,21 +1114,261 @@ static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void return core_setter_format_str(args,line,col,args->tmpp); } +static int determine_ploidy(int nals, int *vals, int nvals1, uint8_t *smpl, int nsmpl) +{ + int i, j, ndip = nals*(nals+1)/2, max_ploidy = 0; + for (i=0; ifiles->readers[1].header,rec,col->hdr_key_src,&args->tmpi,&args->mtmpi); if ( nsrc==-3 ) return 0; // the tag is not present if ( nsrc<=0 ) return 1; // error - return core_setter_format_int(args,line,col,args->tmpi,nsrc/bcf_hdr_nsamples(args->files->readers[1].header)); + int nsmpl_src = bcf_hdr_nsamples(args->files->readers[1].header); + int nsrc1 = nsrc / nsmpl_src; + if ( col->number!=BCF_VL_G && col->number!=BCF_VL_R && col->number!=BCF_VL_A ) + return core_setter_format_int(args,line,col,args->tmpi,nsrc1); + + // create mapping from src to dst genotypes, haploid and diploid version + int nmap_hap = col->number==BCF_VL_G || col->number==BCF_VL_R ? rec->n_allele : rec->n_allele - 1; + int *map_hap = vcmp_map_ARvalues(args->vcmp,nmap_hap,line->n_allele,line->d.allele,rec->n_allele,rec->d.allele); + if ( !map_hap ) error("REF alleles not compatible at %s:%d\n"); + + int i, j; + if ( rec->n_allele==line->n_allele ) + { + // alleles unchanged? + for (i=0; in_allele; i++) if ( map_hap[i]!=i ) break; + if ( i==rec->n_allele ) + return core_setter_format_int(args,line,col,args->tmpi,nsrc1); + } + + int nsmpl_dst = rec->n_sample; + int ndst = bcf_get_format_int32(args->hdr,line,col->hdr_key_dst,&args->tmpi2,&args->mtmpi2); + int ndst1 = ndst / nsmpl_dst; + if ( ndst <= 0 ) + { + if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present + if ( col->number==BCF_VL_G ) + ndst1 = line->n_allele*(line->n_allele+1)/2; + else + ndst1 = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele; + hts_expand(int, ndst1*nsmpl_dst, args->mtmpi2, args->tmpi2); + for (i=0; itmpi2 + i*ndst1; + for (j=0; jnumber==BCF_VL_G ) + { + map_dip = vcmp_map_dipGvalues(args->vcmp, &nmap_dip); + if ( !args->src_smpl_pld ) + { + args->src_smpl_pld = (uint8_t*) malloc(nsmpl_src); + args->dst_smpl_pld = (uint8_t*) malloc(nsmpl_dst); + } + int pld_src = determine_ploidy(rec->n_allele, args->tmpi, nsrc1, args->src_smpl_pld, nsmpl_src); + if ( pld_src<0 ) + error("Unexpected number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),rec->pos+1); + int pld_dst = determine_ploidy(line->n_allele, args->tmpi2, ndst1, args->dst_smpl_pld, nsmpl_dst); + if ( pld_dst<0 ) + error("Unexpected number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),line->pos+1); + + int ndst1_new = pld_dst==1 ? line->n_allele : line->n_allele*(line->n_allele+1)/2; + if ( ndst1_new != ndst1 ) + { + if ( ndst1 ) error("todo: %s ndst1!=ndst .. %d %d at %s:%d\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),line->pos+1); + ndst1 = ndst1_new; + hts_expand(int32_t, ndst1*nsmpl_dst, args->mtmpi2, args->tmpi2); + } + } + else if ( !ndst1 ) + { + ndst1 = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele; + hts_expand(int32_t, ndst1*nsmpl_dst, args->mtmpi2, args->tmpi2); + } + + for (i=0; isample_map ? args->sample_map[i] : i; + int32_t *ptr_src = args->tmpi + i*nsrc1; + int32_t *ptr_dst = args->tmpi2 + ii*ndst1; + + if ( col->number==BCF_VL_G ) + { + if ( args->src_smpl_pld[ii] > 0 && args->dst_smpl_pld[i] > 0 && args->src_smpl_pld[ii]!=args->dst_smpl_pld[i] ) + error("Sample ploidy differs at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); + if ( !args->dst_smpl_pld[i] ) + for (j=0; jnumber!=BCF_VL_G || args->src_smpl_pld[i]==1 ) + { + for (j=0; j=0 ) ptr_dst[k] = ptr_src[j]; + } + if ( col->number==BCF_VL_G ) + for (j=line->n_allele; j=0 ) ptr_dst[k] = ptr_src[j]; + } + } + } + return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,nsmpl_dst*ndst1); } static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { + bcf1_t *rec = (bcf1_t*) data; int nsrc = bcf_get_format_float(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf); if ( nsrc==-3 ) return 0; // the tag is not present if ( nsrc<=0 ) return 1; // error - return core_setter_format_real(args,line,col,args->tmpf,nsrc/bcf_hdr_nsamples(args->files->readers[1].header)); + int nsmpl_src = bcf_hdr_nsamples(args->files->readers[1].header); + int nsrc1 = nsrc / nsmpl_src; + if ( col->number!=BCF_VL_G && col->number!=BCF_VL_R && col->number!=BCF_VL_A ) + return core_setter_format_real(args,line,col,args->tmpf,nsrc1); + + // create mapping from src to dst genotypes, haploid and diploid version + int nmap_hap = col->number==BCF_VL_G || col->number==BCF_VL_R ? rec->n_allele : rec->n_allele - 1; + int *map_hap = vcmp_map_ARvalues(args->vcmp,nmap_hap,line->n_allele,line->d.allele,rec->n_allele,rec->d.allele); + if ( !map_hap ) error("REF alleles not compatible at %s:%d\n"); + + int i, j; + if ( rec->n_allele==line->n_allele ) + { + // alleles unchanged? + for (i=0; in_allele; i++) if ( map_hap[i]!=i ) break; + if ( i==rec->n_allele ) + return core_setter_format_real(args,line,col,args->tmpf,nsrc1); + } + + int nsmpl_dst = rec->n_sample; + int ndst = bcf_get_format_float(args->hdr,line,col->hdr_key_dst,&args->tmpf2,&args->mtmpf2); + int ndst1 = ndst / nsmpl_dst; + if ( ndst <= 0 ) + { + if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present + if ( col->number==BCF_VL_G ) + ndst1 = line->n_allele*(line->n_allele+1)/2; + else + ndst1 = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele; + hts_expand(float, ndst1*nsmpl_dst, args->mtmpf2, args->tmpf2); + for (i=0; itmpf2 + i*ndst1; + for (j=0; jnumber==BCF_VL_G ) + { + map_dip = vcmp_map_dipGvalues(args->vcmp, &nmap_dip); + if ( !args->src_smpl_pld ) + { + args->src_smpl_pld = (uint8_t*) malloc(nsmpl_src); + args->dst_smpl_pld = (uint8_t*) malloc(nsmpl_dst); + } + int pld_src = determine_ploidy(rec->n_allele, args->tmpi, nsrc1, args->src_smpl_pld, nsmpl_src); + if ( pld_src<0 ) + error("Unexpected number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),rec->pos+1); + int pld_dst = determine_ploidy(line->n_allele, args->tmpi2, ndst1, args->dst_smpl_pld, nsmpl_dst); + if ( pld_dst<0 ) + error("Unexpected number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),line->pos+1); + + int ndst1_new = pld_dst==1 ? line->n_allele : line->n_allele*(line->n_allele+1)/2; + if ( ndst1_new != ndst1 ) + { + if ( ndst1 ) error("todo: %s ndst1!=ndst .. %d %d at %s:%d\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),line->pos+1); + ndst1 = ndst1_new; + hts_expand(float, ndst1*nsmpl_dst, args->mtmpf2, args->tmpf2); + } + } + else if ( !ndst1 ) + { + ndst1 = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele; + hts_expand(float, ndst1*nsmpl_dst, args->mtmpf2, args->tmpf2); + } + + for (i=0; isample_map ? args->sample_map[i] : i; + float *ptr_src = args->tmpf + i*nsrc1; + float *ptr_dst = args->tmpf2 + ii*ndst1; + + if ( col->number==BCF_VL_G ) + { + if ( args->src_smpl_pld[ii] > 0 && args->dst_smpl_pld[i] > 0 && args->src_smpl_pld[ii]!=args->dst_smpl_pld[i] ) + error("Sample ploidy differs at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1); + if ( !args->dst_smpl_pld[i] ) + for (j=0; jnumber!=BCF_VL_G || args->src_smpl_pld[i]==1 ) + { + for (j=0; j=0 ) + { + if ( bcf_float_is_missing(ptr_src[j]) ) bcf_float_set_missing(ptr_dst[k]); + else if ( bcf_float_is_vector_end(ptr_src[j]) ) bcf_float_set_vector_end(ptr_dst[k]); + else ptr_dst[k] = ptr_src[j]; + } + } + if ( col->number==BCF_VL_G ) + for (j=line->n_allele; j=0 ) + { + if ( bcf_float_is_missing(ptr_src[j]) ) bcf_float_set_missing(ptr_dst[k]); + else if ( bcf_float_is_vector_end(ptr_src[j]) ) bcf_float_set_vector_end(ptr_dst[k]); + else ptr_dst[k] = ptr_src[j]; + } + } + } + } + return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,nsmpl_dst*ndst1); + } static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) @@ -1458,6 +1699,8 @@ static void init_columns(args_t *args) case BCF_HT_STR: col->setter = vcf_setter_format_str; has_fmt_str = 1; break; default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id)); } + hdr_id = bcf_hdr_id2int(tgts_hdr, BCF_DT_ID, hrec->vals[k]); + col->number = bcf_hdr_id2length(tgts_hdr,BCF_HL_FMT,hdr_id); } } else if ( !strncasecmp("FORMAT/",str.s, 7) || !strncasecmp("FMT/",str.s,4) ) @@ -1506,6 +1749,12 @@ static void init_columns(args_t *args) case BCF_HT_STR: col->setter = args->tgts_is_vcf ? vcf_setter_format_str : setter_format_str; has_fmt_str = 1; break; default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id)); } + if ( args->tgts_is_vcf ) + { + bcf_hdr_t *tgts_hdr = args->files->readers[1].header; + hdr_id = bcf_hdr_id2int(tgts_hdr, BCF_DT_ID, col->hdr_key_src); + col->number = bcf_hdr_id2length(tgts_hdr,BCF_HL_FMT,hdr_id); + } } else { @@ -1697,6 +1946,8 @@ static void destroy_data(args_t *args) free(args->tmpp2); free(args->tmpi3); free(args->tmpf3); + free(args->src_smpl_pld); + free(args->dst_smpl_pld); if ( args->set_ids ) convert_destroy(args->set_ids); if ( args->filter ) diff --git a/vcmp.c b/vcmp.c index 8d04b8942..7d3b0f9ea 100644 --- a/vcmp.c +++ b/vcmp.c @@ -26,6 +26,7 @@ THE SOFTWARE. */ #include #include #include +#include #include #include "vcmp.h" @@ -34,7 +35,8 @@ struct _vcmp_t char *dref; int ndref, mdref; // ndref: positive when ref1 longer, negative when ref2 is longer int nmatch; - int *map, mmap; + int *map, mmap, nmap; + int *map_dip, mmap_dip, nmap_dip; }; vcmp_t *vcmp_init() @@ -44,6 +46,7 @@ vcmp_t *vcmp_init() void vcmp_destroy(vcmp_t *vcmp) { + free(vcmp->map_dip); free(vcmp->map); free(vcmp->dref); free(vcmp); @@ -120,7 +123,8 @@ int *vcmp_map_ARvalues(vcmp_t *vcmp, int n, int nals1, char **als1, int nals2, c { if ( vcmp_set_ref(vcmp,als1[0],als2[0]) < 0 ) return NULL; - vcmp->map = (int*) realloc(vcmp->map,sizeof(int)*n); + vcmp->nmap = n; + hts_expand(int, vcmp->nmap, vcmp->mmap, vcmp->map); int i, ifrom = n==nals2 ? 0 : 1; for (i=ifrom; imap; } +int *vcmp_map_dipGvalues(vcmp_t *vcmp, int *nmap) +{ + vcmp->nmap_dip = vcmp->nmap*(vcmp->nmap+1)/2; + hts_expand(int, vcmp->nmap_dip, vcmp->mmap_dip, vcmp->map_dip); + + int i, j, k = 0; + for (i=0; inmap; i++) + { + for (j=0; j<=i; j++) + { + vcmp->map_dip[k] = vcmp->map[i]>=0 && vcmp->map[j]>=0 ? bcf_alleles2gt(vcmp->map[i],vcmp->map[j]) : -1; + k++; + } + } + *nmap = k; + return vcmp->map_dip; +} + + diff --git a/vcmp.h b/vcmp.h index 031770404..9c6370ce2 100644 --- a/vcmp.h +++ b/vcmp.h @@ -58,5 +58,6 @@ int vcmp_find_allele(vcmp_t *vcmp, char **als1, int nals1, char *al2); */ int *vcmp_map_ARvalues(vcmp_t *vcmp, int number, int nals1, char **als1, int nals2, char **als2); +int *vcmp_map_dipGvalues(vcmp_t *vcmp, int *nmap); #endif From 3a63b849081e19557a7c06570e962efc8cbb1d44 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 15 Mar 2018 15:11:17 +0000 Subject: [PATCH 03/27] New +contrast plugin --- plugins/contrast.c | 364 +++++++++++++++++++++++++++++++++++++++++++++ test/contrast.out | 11 ++ test/contrast.vcf | 8 + test/test.pl | 2 + 4 files changed, 385 insertions(+) create mode 100644 plugins/contrast.c create mode 100644 test/contrast.out create mode 100644 test/contrast.vcf diff --git a/plugins/contrast.c b/plugins/contrast.c new file mode 100644 index 000000000..71cb1723f --- /dev/null +++ b/plugins/contrast.c @@ -0,0 +1,364 @@ +/* The MIT License + + Copyright (c) 2018 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ + +#include +#include +#include +#include +#include // for isatty +#include +#include +#include +#include +#include +#include "bcftools.h" +#include "filter.h" + + +// Logic of the filters: include or exclude sites which match the filters? +#define FLT_INCLUDE 1 +#define FLT_EXCLUDE 2 + +typedef struct +{ + int argc, filter_logic, regions_is_file, targets_is_file, output_type; + char **argv, *output_fname, *fname, *regions, *targets, *filter_str; + char *bg_samples_str, *novel_samples_str; + int *bg_smpl, *novel_smpl, nbg_smpl, nnovel_smpl; + filter_t *filter; + bcf_srs_t *sr; + bcf_hdr_t *hdr, *hdr_out; + htsFile *out_fh; + int32_t *gts; + int mgts; + uint32_t *bg_gts; + int nbg_gts, mbg_gts, ntotal, nskipped, ntested, nnovel_al, nnovel_gt; + kstring_t novel_als_smpl, novel_gts_smpl; +} +args_t; + +args_t args; + +const char *about(void) +{ + return "Find novel alleles and genotypes in two groups of samples.\n"; +} + +static const char *usage_text(void) +{ + return + "\n" + "About: Finds novel alleles and genotypes in two groups of samples. Adds\n" + " an annotation which lists samples with a novel allele (INFO/NOVELAL)\n" + " or a novel genotype (INFO/NOVELGT)\n" + "Usage: bcftools +contrast [Plugin Options]\n" + "Plugin options:\n" + " -0, --bg-samples list of background samples\n" + " -1, --novel-samples list of samples where novel allele or genotype are expected\n" + " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" + " -i, --include EXPR include sites and samples for which the expression is true\n" + " -o, --output FILE output file name [stdout]\n" + " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" + " -r, --regions REG restrict to comma-separated list of regions\n" + " -R, --regions-file FILE restrict to regions listed in a file\n" + " -t, --targets REG similar to -r but streams rather than index-jumps\n" + " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" + "\n" + "Example:\n" + " # Test if any of the samples a,b is different from the samples c,d,e\n" + " bcftools +contrast -0 c,d,e -1 a,b file.bcf\n" + "\n"; +} + +static void init_data(args_t *args) +{ + args->sr = bcf_sr_init(); + if ( args->regions ) + { + args->sr->require_index = 1; + if ( bcf_sr_set_regions(args->sr, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n",args->regions); + } + if ( args->targets && bcf_sr_set_targets(args->sr, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->targets); + if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); + args->hdr = bcf_sr_get_header(args->sr,0); + args->hdr_out = bcf_hdr_dup(args->hdr); + bcf_hdr_append(args->hdr_out, "##INFO="); + bcf_hdr_append(args->hdr_out, "##INFO="); + + if ( args->filter_str ) + args->filter = filter_init(args->hdr, args->filter_str); + + int i; + char **smpl = hts_readlist(args->bg_samples_str, 0, &args->nbg_smpl); + args->bg_smpl = (int*) malloc(sizeof(int)*args->nbg_smpl); + for (i=0; inbg_smpl; i++) + { + args->bg_smpl[i] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, smpl[i]); + if ( args->bg_smpl[i]<0 ) error("The sample not present in the VCF: \"%s\"\n", smpl[i]); + free(smpl[i]); + } + free(smpl); + + smpl = hts_readlist(args->novel_samples_str, 0, &args->nnovel_smpl); + args->novel_smpl = (int*) malloc(sizeof(int)*args->nnovel_smpl); + for (i=0; innovel_smpl; i++) + { + args->novel_smpl[i] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, smpl[i]); + if ( args->novel_smpl[i]<0 ) error("The sample not present in the VCF: \"%s\"\n", smpl[i]); + free(smpl[i]); + } + free(smpl); + + args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); + if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); + bcf_hdr_write(args->out_fh, args->hdr_out); +} +static void destroy_data(args_t *args) +{ + bcf_hdr_destroy(args->hdr_out); + hts_close(args->out_fh); + free(args->novel_als_smpl.s); + free(args->novel_gts_smpl.s); + free(args->gts); + free(args->bg_gts); + free(args->bg_smpl); + free(args->novel_smpl); + if ( args->filter ) filter_destroy(args->filter); + bcf_sr_destroy(args->sr); + free(args); +} +static inline int binary_search(uint32_t val, uint32_t *dat, int ndat) +{ + int i = -1, imin = 0, imax = ndat - 1; + while ( imin<=imax ) + { + i = (imin+imax)/2; + if ( dat[i] < val ) imin = i + 1; + else if ( dat[i] > val ) imax = i - 1; + else return 1; + } + return 0; +} +static inline void binary_insert(uint32_t val, uint32_t **dat, int *ndat, int *mdat) +{ + int i = -1, imin = 0, imax = *ndat - 1; + while ( imin<=imax ) + { + i = (imin+imax)/2; + if ( (*dat)[i] < val ) imin = i + 1; + else if ( (*dat)[i] > val ) imax = i - 1; + else return; + } + while ( i>=0 && (*dat)[i]>val ) i--; + + (*ndat)++; + hts_expand(uint32_t, (*ndat), (*mdat), (*dat)); + + if ( *ndat > 1 ) + memmove(*dat + i + 1, *dat + i, sizeof(uint32_t)*(*ndat - i - 1)); + + (*dat)[i+1] = val; +} +static int process_record(args_t *args, bcf1_t *rec) +{ + args->ntotal++; + + static int warned = 0; + int ngts = bcf_get_genotypes(args->hdr, rec, &args->gts, &args->mgts); + ngts /= rec->n_sample; + if ( ngts>2 ) error("todo: ploidy=%d\n", ngts); + + args->nbg_gts = 0; + uint32_t bg_als = 0; + int i,j; + for (i=0; inbg_smpl; i++) + { + uint32_t gt = 0; + int32_t *ptr = args->gts + args->bg_smpl[i]*ngts; + for (j=0; j 31 ) + { + if ( !warned ) + { + fprintf(stderr,"Too many alleles (>32) at %s:%d, skipping. (todo?)\n", bcf_seqname(args->hdr,rec),rec->pos+1); + warned = 1; + } + args->nskipped++; + return -1; + } + bg_als |= 1<bg_gts, &args->nbg_gts, &args->mbg_gts); + } + if ( !bg_als ) + { + // all are missing + args->nskipped++; + return -1; + } + + args->novel_als_smpl.l = 0; + args->novel_gts_smpl.l = 0; + + int has_gt = 0; + for (i=0; innovel_smpl; i++) + { + int novel_al = 0; + uint32_t gt = 0; + int32_t *ptr = args->gts + args->novel_smpl[i]*ngts; + for (j=0; j 31 ) + { + if ( !warned ) + { + fprintf(stderr,"Too many alleles (>32) at %s:%d, skipping. (todo?)\n", bcf_seqname(args->hdr,rec),rec->pos+1); + warned = 1; + } + args->nskipped++; + return -1; + } + if ( !(bg_als & (1<hdr->samples[ args->novel_smpl[i] ]; + if ( novel_al ) + { + if ( args->novel_als_smpl.l ) kputc(',', &args->novel_als_smpl); + kputs(smpl, &args->novel_als_smpl); + } + else if ( !binary_search(gt, args->bg_gts, args->nbg_gts) ) + { + if ( args->novel_gts_smpl.l ) kputc(',', &args->novel_gts_smpl); + kputs(smpl, &args->novel_gts_smpl); + } + } + if ( !has_gt ) + { + // all are missing + args->nskipped++; + return -1; + } + if ( args->novel_als_smpl.l ) + { + bcf_update_info_string(args->hdr_out, rec, "NOVELAL", args->novel_als_smpl.s); + args->nnovel_al++; + } + if ( args->novel_gts_smpl.l ) + { + bcf_update_info_string(args->hdr_out, rec, "NOVELGT", args->novel_gts_smpl.s); + args->nnovel_gt++; + } + args->ntested++; + return 0; +} + +int run(int argc, char **argv) +{ + args_t *args = (args_t*) calloc(1,sizeof(args_t)); + args->argc = argc; args->argv = argv; + args->output_fname = "-"; + static struct option loptions[] = + { + {"bg-samples",required_argument,0,'0'}, + {"novel-samples",required_argument,0,'1'}, + {"include",required_argument,0,'i'}, + {"exclude",required_argument,0,'e'}, + {"output",required_argument,NULL,'o'}, + {"output-type",required_argument,NULL,'O'}, + {"regions",1,0,'r'}, + {"regions-file",1,0,'R'}, + {"targets",1,0,'t'}, + {"targets-file",1,0,'T'}, + {NULL,0,NULL,0} + }; + int c; + while ((c = getopt_long(argc, argv, "O:o:i:e:r:R:t:T:0:1:",loptions,NULL)) >= 0) + { + switch (c) + { + case '0': args->bg_samples_str = optarg; break; + case '1': args->novel_samples_str = optarg; break; + case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 't': args->targets = optarg; break; + case 'T': args->targets = optarg; args->targets_is_file = 1; break; + case 'r': args->regions = optarg; break; + case 'R': args->regions = optarg; args->regions_is_file = 1; break; + case 'o': args->output_fname = optarg; break; + case 'O': + switch (optarg[0]) { + case 'b': args->output_type = FT_BCF_GZ; break; + case 'u': args->output_type = FT_BCF; break; + case 'z': args->output_type = FT_VCF_GZ; break; + case 'v': args->output_type = FT_VCF; break; + default: error("The output type \"%s\" not recognised\n", optarg); + }; + break; + case 'h': + case '?': + default: error("%s", usage_text()); break; + } + } + if ( optind==argc ) + { + if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin + else { error(usage_text()); } + } + else if ( optind+1!=argc ) error(usage_text()); + else args->fname = argv[optind]; + + init_data(args); + + while ( bcf_sr_next_line(args->sr) ) + { + bcf1_t *rec = bcf_sr_get_line(args->sr,0); + if ( args->filter ) + { + int pass = filter_test(args->filter, rec, NULL); + if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; + if ( !pass ) continue; + } + process_record(args, rec); + bcf_write(args->out_fh, args->hdr_out, rec); + } + + fprintf(stderr,"Total/processed/skipped/novel_allele/novel_gt:\t%d\t%d\t%d\t%d\t%d\n", args->ntotal, args->ntested, args->nskipped, args->nnovel_al, args->nnovel_gt); + destroy_data(args); + + return 0; +} diff --git a/test/contrast.out b/test/contrast.out new file mode 100644 index 000000000..7707bc004 --- /dev/null +++ b/test/contrast.out @@ -0,0 +1,11 @@ +##fileformat=VCFv4.2 +##FILTER= +##FORMAT= +##contig= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT a b c +1 100 . A G . . . GT 0/0 0/0 0/0 +1 101 . A G . . NOVELAL=c GT 0/0 0/0 0/1 +1 102 . A G . . NOVELGT=c GT 0/0 0/1 1/1 +1 103 . A G . . NOVELAL=c GT 1/1 1/1 0/1 diff --git a/test/contrast.vcf b/test/contrast.vcf new file mode 100644 index 000000000..59941d274 --- /dev/null +++ b/test/contrast.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.2 +##FORMAT= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT a b c +1 100 . A G . . . GT 0/0 0/0 0/0 +1 101 . A G . . . GT 0/0 0/0 0/1 +1 102 . A G . . . GT 0/0 0/1 1/1 +1 103 . A G . . . GT 1/1 1/1 0/1 diff --git a/test/test.pl b/test/test.pl index e6a862b04..f2d801176 100755 --- a/test/test.pl +++ b/test/test.pl @@ -249,6 +249,7 @@ test_vcf_annotate($opts,in=>'annotate2',vcf=>'annots2',out=>'annotate12.out',args=>'-c AAA:=IINT,FMT/BBB:=FMT/FINT'); test_vcf_annotate($opts,in=>'annotate2',vcf=>'annots2',out=>'annotate13.out',args=>'-x INFO -c INFO/IINT'); test_vcf_annotate($opts,in=>'annotate2',vcf=>'annots2',out=>'annotate14.out',args=>q[-x INFO -c INFO/IINT -e'POS=3000001' -k]); +test_vcf_annotate($opts,in=>'annotate11',vcf=>'annots11',out=>'annotate15.out',args=>q[-c FMT]); test_vcf_plugin($opts,in=>'plugin1',out=>'missing2ref.out',cmd=>'+missing2ref --no-version'); test_vcf_plugin($opts,in=>'plugin1',out=>'missing2ref.out',cmd=>'+setGT --no-version',args=>'-- -t . -n 0'); test_vcf_plugin($opts,in=>'setGT',out=>'setGT.1.out',cmd=>'+setGT --no-version',args=>'-- -t q -n 0 -i \'GT~"." && FMT/DP=30 && GQ=150\''); @@ -285,6 +286,7 @@ test_vcf_plugin($opts,in=>'mendelian',out=>'mendelian.1.out',cmd=>'+mendelian',args=>'-t mom1,dad1,child1 -d'); test_vcf_plugin($opts,in=>'mendelian',out=>'mendelian.2.out',cmd=>'+mendelian',args=>'-t mom1,dad1,child1 -l+'); test_vcf_plugin($opts,in=>'mendelian',out=>'mendelian.3.out',cmd=>'+mendelian',args=>'-t mom1,dad1,child1 -lx'); +test_vcf_plugin($opts,in=>'contrast',out=>'contrast.out',cmd=>'+contrast',args=>'-0 a,b -1 c'); test_vcf_concat($opts,in=>['concat.1.a','concat.1.b'],out=>'concat.1.vcf.out',do_bcf=>0,args=>''); test_vcf_concat($opts,in=>['concat.1.a','concat.1.b'],out=>'concat.1.bcf.out',do_bcf=>1,args=>''); test_vcf_concat($opts,in=>['concat.2.a','concat.2.b'],out=>'concat.2.vcf.out',do_bcf=>0,args=>'-a'); From c320cd89fe93b3c2180c49e9ed5e6415ddae4f10 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 15 Mar 2018 15:11:38 +0000 Subject: [PATCH 04/27] Print informative message if no arguments are given --- vcfnorm.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vcfnorm.c b/vcfnorm.c index ead1e8951..ac0321954 100644 --- a/vcfnorm.c +++ b/vcfnorm.c @@ -1944,9 +1944,7 @@ int main_vcfnorm(int argc, char *argv[]) default: error("Unknown argument: %s\n", optarg); } } - if ( argc>optind+1 ) usage(); - if ( !args->ref_fname && !args->mrows_op && !args->rmdup ) usage(); - if ( !args->ref_fname && args->check_ref&CHECK_REF_FIX ) error("Expected --fasta-ref with --check-ref s\n"); + char *fname = NULL; if ( optind>=argc ) { @@ -1955,6 +1953,9 @@ int main_vcfnorm(int argc, char *argv[]) } else fname = argv[optind]; + if ( !args->ref_fname && !args->mrows_op && !args->rmdup ) error("Expected -f, -m, -D or -d option\n"); + if ( !args->ref_fname && args->check_ref&CHECK_REF_FIX ) error("Expected --fasta-ref with --check-ref s\n"); + if ( args->region ) { if ( bcf_sr_set_regions(args->files, args->region,region_is_file)<0 ) From 76babbc42c03e81b132cf3eaabcbb215df464e57 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 15 Mar 2018 15:19:37 +0000 Subject: [PATCH 05/27] Fix the warning string, `-D` is in fact an alias of `-d none` --- vcfnorm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcfnorm.c b/vcfnorm.c index ac0321954..fb2aadb42 100644 --- a/vcfnorm.c +++ b/vcfnorm.c @@ -1924,7 +1924,7 @@ int main_vcfnorm(int argc, char *argv[]) break; case 'o': args->output_fname = optarg; break; case 'D': - fprintf(stderr,"Warning: `-D` is functional but deprecated, replaced by `-d both`.\n"); + fprintf(stderr,"Warning: `-D` is functional but deprecated, replaced by and alias of `-d none`.\n"); args->rmdup = BCF_SR_PAIR_EXACT; break; case 's': args->strict_filter = 1; break; From 4dca79a0d64035b7ae0a1b54a00469a7903ef54b Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 15 Mar 2018 15:48:47 +0000 Subject: [PATCH 06/27] Sort records with the same chr:pos lexicographically by ref,alt --- test/sort.out | 32 +++++++++++++++++--------------- test/sort.vcf | 4 +++- test/test.pl | 4 ++-- vcfsort.c | 20 +++++++++++++++++--- 4 files changed, 39 insertions(+), 21 deletions(-) diff --git a/test/sort.out b/test/sort.out index fa723cda8..1c31ceb7b 100644 --- a/test/sort.out +++ b/test/sort.out @@ -1,15 +1,17 @@ -1 101 -1 102 -1 103 -1 104 -1 105 -2 101 -2 102 -2 103 -2 104 -2 105 -3 101 -3 102 -3 103 -3 104 -3 105 +1 101 T,C,A +1 101 T,CC,A +1 101 T,TC +1 102 T,C +1 103 T,C +1 104 T,C +1 105 T,C +2 101 T,C +2 102 T,C +2 103 T,C +2 104 T,C +2 105 T,C +3 101 T,C +3 102 T,C +3 103 T,C +3 104 T,C +3 105 T,C diff --git a/test/sort.vcf b/test/sort.vcf index 649bd01e8..b86c97bdb 100644 --- a/test/sort.vcf +++ b/test/sort.vcf @@ -18,4 +18,6 @@ 1 104 . T C . . . 1 103 . T C . . . 1 102 . T C . . . -1 101 . T C . . . +1 101 . T TC . . . +1 101 . T C,A . . . +1 101 . T CC,A . . . diff --git a/test/test.pl b/test/test.pl index f2d801176..6c7e54b15 100755 --- a/test/test.pl +++ b/test/test.pl @@ -231,8 +231,8 @@ test_vcf_filter($opts,in=>'filter.5',out=>'filter.24.out',args=>q[-i'AD[0:0]=="."'],fmt=>'%POS[\\t%AD]\\n'); test_vcf_filter($opts,in=>'filter.5',out=>'filter.25.out',args=>q[-i'AD[0:0]!="."'],fmt=>'%POS[\\t%AD]\\n'); test_vcf_filter($opts,in=>'filter.5',out=>'filter.26.out',args=>q[-i'QUAL=="."'],fmt=>'%POS\\t%QUAL\\n'); -test_vcf_sort($opts,in=>'sort',out=>'sort.out',args=>q[-m 0],fmt=>'%CHROM\\t%POS\\n'); -test_vcf_sort($opts,in=>'sort',out=>'sort.out',args=>q[-m 1000],fmt=>'%CHROM\\t%POS\\n'); +test_vcf_sort($opts,in=>'sort',out=>'sort.out',args=>q[-m 0],fmt=>'%CHROM\\t%POS\\t%REF,%ALT\\n'); +test_vcf_sort($opts,in=>'sort',out=>'sort.out',args=>q[-m 1000],fmt=>'%CHROM\\t%POS\\t%REF,%ALT\\n'); test_vcf_regions($opts,in=>'regions'); test_vcf_annotate($opts,in=>'annotate',tab=>'annotate',out=>'annotate.out',args=>'-c CHROM,POS,REF,ALT,ID,QUAL,INFO/T_INT,INFO/T_FLOAT,INDEL'); test_vcf_annotate($opts,in=>'annotate',tab=>'annotate2',out=>'annotate2.out',args=>'-c CHROM,FROM,TO,T_STR'); diff --git a/vcfsort.c b/vcfsort.c index e41b628c9..78f7adbe2 100644 --- a/vcfsort.c +++ b/vcfsort.c @@ -67,6 +67,21 @@ int cmp_bcf_pos(const void *aptr, const void *bptr) if ( a->rid > b->rid ) return 1; if ( a->pos < b->pos ) return -1; if ( a->pos > b->pos ) return 1; + + // Sort the same chr:pos records lexicographically by ref,alt. + // This will be called rarely so should not slow the sorting down + // noticeably. + + if ( !a->unpacked ) bcf_unpack(a, BCF_UN_STR); + if ( !b->unpacked ) bcf_unpack(b, BCF_UN_STR); + int i, ret; + for (i=0; in_allele; i++) + { + if ( i >= b->n_allele ) return 1; + int ret = strcasecmp(a->d.allele[i],b->d.allele[i]); + if ( ret ) return ret; + } + if ( a->n_allele < b->n_allele ) return -1; return 0; } @@ -138,9 +153,8 @@ static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr) { blk_t *a = *aptr; blk_t *b = *bptr; - if ( a->rec->rid < b->rec->rid ) return 1; - if ( a->rec->rid > b->rec->rid ) return 0; - if ( a->rec->pos < b->rec->pos ) return 1; + int ret = cmp_bcf_pos(&a->rec, &b->rec); + if ( ret < 0 ) return 1; return 0; } KHEAP_INIT(blk, blk_t*, blk_is_smaller) From 347a508c54b1de7a0ac005830ecad81b5f7fe2cd Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 15 Mar 2018 15:50:19 +0000 Subject: [PATCH 07/27] Updated documentation --- doc/bcftools.1 | 65 ++++++++++++++++++++++++++++++----------------- doc/bcftools.html | 59 +++++++++++++++++++++++++----------------- doc/bcftools.txt | 24 +++++++++-------- 3 files changed, 91 insertions(+), 57 deletions(-) diff --git a/doc/bcftools.1 b/doc/bcftools.1 index c0e70b8c2..dd40d8d75 100644 --- a/doc/bcftools.1 +++ b/doc/bcftools.1 @@ -2,12 +2,12 @@ .\" Title: bcftools .\" Author: [see the "AUTHORS" section] .\" Generator: DocBook XSL Stylesheets v1.76.1 -.\" Date: 2018-02-12 +.\" Date: 2018-03-15 15:49 GMT .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "BCFTOOLS" "1" "2018\-02\-12" "\ \&" "\ \&" +.TH "BCFTOOLS" "1" "2018\-03\-15 15:49 GMT" "\ \&" "\ \&" .\" ----------------------------------------------------------------- .\" * Define some portability stuff .\" ----------------------------------------------------------------- @@ -41,7 +41,7 @@ Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatica BCFtools is designed to work on a stream\&. It regards an input file "\-" as the standard input (stdin) and outputs to the standard output (stdout)\&. Several commands can thus be combined with Unix pipes\&. .SS "VERSION" .sp -This manual page was last updated \fB2018\-02\-12\fR and refers to bcftools git version \fB1\&.7\fR\&. +This manual page was last updated \fB2018\-03\-15 15:49 GMT\fR and refers to bcftools git version \fB1\&.7\-7\-g4dca79a+\fR\&. .SS "BCF1" .sp The BCF1 format output by versions of samtools <= 0\&.1\&.19 is \fBnot\fR compatible with this version of bcftools\&. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0\&.1\&.19 to convert to VCF, which can then be read by this version of bcftools\&. @@ -2096,7 +2096,7 @@ see .RE .SS "bcftools gtcheck [\fIOPTIONS\fR] [\-g \fIgenotypes\&.vcf\&.gz\fR] \fIquery\&.vcf\&.gz\fR" .sp -Checks sample identity or, without \fB\-g\fR, multi\-sample cross\-check is performed\&. +Checks sample identity\&. The program can operate in two modes\&. If the \fB\-g\fR option is given, the identity of the \fB\-s\fR sample from \fIquery\&.vcf\&.gz\fR is checked against the samples in the \fB\-g\fR file\&. Without the \fB\-g\fR option, multi\-sample cross\-check of samples in \fIquery\&.vcf\&.gz\fR is performed\&. .PP \fB\-a, \-\-all\-sites\fR .RS 4 @@ -2227,21 +2227,14 @@ is used for the unseen genotypes\&. With can be used instead; the discordance value then gives exactly the number of differing genotypes\&. .RE .PP -SM, Average Discordance +ERR, error rate .RS 4 -Average discordance between sample -\fIa\fR -and all other samples\&. -.RE -.PP -SM, Average Depth -.RS 4 -Average depth at evaluated sites, or 1 if FORMAT/DP field is not present\&. +Pairwise error rate calculated as number of differences divided by the total number of comparisons\&. .RE .PP -SM, Average Number of sites +CLUSTER, TH, DOT .RS 4 -The average number of sites used to calculate the discordance\&. In other words, the average number of non\-missing PLs/genotypes seen both samples\&. +In presence of multiple samples, related samples and outliers can be identified by clustering samples by error rate\&. A simple hierarchical clustering based on minimization of standard deviation is used\&. This is useful to detect sample swaps, for example in situations where one sample has been sequenced in multiple runs\&. .RE .RE .SS "bcftools index [\fIOPTIONS\fR] \fIin\&.bcf\fR|\fIin\&.vcf\&.gz\fR" @@ -2986,7 +2979,7 @@ and can swap alleles and will update genotypes (GT) and AC counts, but will not attempt to fix PL or other fields\&. .RE .PP -\fB\-d, \-\-rm\-dup\fR \fIsnps\fR|\fIindels\fR|\fIboth\fR|\fIany\fR +\fB\-d, \-\-rm\-dup\fR \fIsnps\fR|\fIindels\fR|\fIboth\fR|\fIall\fR|\fInone\fR .RS 4 If a record is present multiple times, output only the first instance, see \fB\-\-collapse\fR @@ -2997,8 +2990,7 @@ in \fB\-D, \-\-remove\-duplicates\fR .RS 4 If a record is present in multiple files, output only the first instance\&. Alias for -\fB\-d none\fR\&. Requires -\fB\-a, \-\-allow\-overlaps\fR\&. +\fB\-d none\fR, deprecated\&. .RE .PP \fB\-f, \-\-fasta\-ref\fR \fIFILE\fR @@ -4680,15 +4672,17 @@ TYPE!~"snp" .sp -1 .IP \(bu 2.3 .\} -array subscripts (0\-based), "*" for any field, "\-" to indicate a range\&. Note that for querying FORMAT vectors, the colon ":" can be used to select a sample and a subfield +array subscripts (0\-based), "*" for any element, "\-" to indicate a range\&. Note that for querying FORMAT vectors, the colon ":" can be used to select a sample and an element of the vector, as shown in the examples below .sp .if n \{\ .RS 4 .\} .nf -(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0\&.3 -DP4[*] == 0 -CSQ[*] ~ "missense_variant\&.*deleterious" +INFO/AF[0] > 0\&.3 \&.\&. first AF value bigger than 0\&.3 +FORMAT/AD[0:0] > 30 \&.\&. first AD value of the first sample bigger than 30 +FORMAT/AD[0:1] \&.\&. first sample, second AD value +FORMAT/AD[1:0] \&.\&. second sample, first AD value +DP4[*] == 0 \&.\&. any DP4 value FORMAT/DP[0] > 30 \&.\&. DP of the first sample bigger than 30 FORMAT/DP[1\-3] > 10 \&.\&. samples 2\-4 FORMAT/DP[1\-] < 7 \&.\&. all samples but the first @@ -4696,6 +4690,8 @@ FORMAT/DP[0,2\-4] > 20 \&.\&. samples 1, 3\-5 FORMAT/AD[0:1] \&.\&. first sample, second AD field FORMAT/AD[0:*], AD[0:] or AD[0] \&.\&. first sample, any AD field FORMAT/AD[*:1] or AD[:1] \&.\&. any sample, second AD field +(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0\&.3 +CSQ[*] ~ "missense_variant\&.*deleterious" .fi .if n \{\ .RE @@ -4743,6 +4739,29 @@ N_ALT, N_SAMPLES, AC, MAC, AF, MAF, AN, N_MISSING, F_MISSING .RE .\} .RE +.sp +.RS 4 +.ie n \{\ +\h'-04'\(bu\h'+03'\c +.\} +.el \{\ +.sp -1 +.IP \(bu 2.3 +.\} +custom perl filtering\&. Note that this command is not compiled in by default, see the section +\fBOptional Compilation with Perl\fR +in the INSTALL file for help and misc/demo\-flt\&.pl for a working example\&. The demo defined the perl subroutine "severity" which can be invoked from the command line as follows: +.sp +.if n \{\ +.RS 4 +.\} +.nf +perl:path/to/script\&.pl; perl\&.severity(INFO/CSQ) > 3 +.fi +.if n \{\ +.RE +.\} +.RE .PP \fBNotes:\fR .sp @@ -4776,7 +4795,7 @@ Variables and function names are case\-insensitive, but not tag names\&. For exa .sp -1 .IP \(bu 2.3 .\} -When querying multiple subfields, all subfields are tested and the OR logic is used on the result\&. For example, when querying "TAG=1,2,3,4", it will be evaluated as follows: +When querying multiple values, all elements are tested and the OR logic is used on the result\&. For example, when querying "TAG=1,2,3,4", it will be evaluated as follows: .sp .if n \{\ .RS 4 diff --git a/doc/bcftools.html b/doc/bcftools.html index b01c7c336..58331e5dd 100644 --- a/doc/bcftools.html +++ b/doc/bcftools.html @@ -1,6 +1,6 @@ -bcftools

Name

bcftools — utilities for variant calling and manipulating VCFs and BCFs.

Synopsis

bcftools [--version|--version-only] [--help] [COMMAND] [OPTIONS]

DESCRIPTION

BCFtools is a set of utilities that manipulate variant calls in the Variant +bcftools

Name

bcftools — utilities for variant calling and manipulating VCFs and BCFs.

Synopsis

bcftools [--version|--version-only] [--help] [COMMAND] [OPTIONS]

DESCRIPTION

BCFtools is a set of utilities that manipulate variant calls in the Variant Call Format (VCF) and its binary counterpart BCF. All commands work transparently with both VCFs and BCFs, both uncompressed and BGZF-compressed.

Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatically even when streaming from a pipe. Indexed VCF and BCF @@ -8,7 +8,7 @@ work in most, but not all situations. In general, whenever multiple VCFs are read simultaneously, they must be indexed and therefore also compressed.

BCFtools is designed to work on a stream. It regards an input file "-" as the standard input (stdin) and outputs to the standard output (stdout). Several -commands can thus be combined with Unix pipes.

VERSION

This manual page was last updated 2018-02-12 and refers to bcftools git version 1.7.

BCF1

The BCF1 format output by versions of samtools <= 0.1.19 is not +commands can thus be combined with Unix pipes.

VERSION

This manual page was last updated 2018-03-15 15:49 GMT and refers to bcftools git version 1.7-7-g4dca79a+.

BCF1

The BCF1 format output by versions of samtools <= 0.1.19 is not compatible with this version of bcftools. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0.1.19 to convert to VCF, which can then be read by @@ -1232,7 +1232,10 @@ --threads INT

see Common Options -

bcftools gtcheck [OPTIONS] [-g genotypes.vcf.gz] query.vcf.gz

Checks sample identity or, without -g, multi-sample cross-check is performed.

+

bcftools gtcheck [OPTIONS] [-g genotypes.vcf.gz] query.vcf.gz

Checks sample identity. The program can operate in two modes. If the -g +option is given, the identity of the -s sample from query.vcf.gz +is checked against the samples in the -g file. +Without the -g option, multi-sample cross-check of samples in query.vcf.gz is performed.

-a, --all-sites
output for all sites @@ -1304,20 +1307,18 @@ -G, the value 1 can be used instead; the discordance value then gives exactly the number of differing genotypes.
-SM, Average Discordance +ERR, error rate
- Average discordance between sample a and all other samples. + Pairwise error rate calculated as number of differences divided + by the total number of comparisons.
-SM, Average Depth +CLUSTER, TH, DOT
- Average depth at evaluated sites, or 1 if FORMAT/DP field is not - present. -
-SM, Average Number of sites -
- The average number of sites used to calculate the discordance. In - other words, the average number of non-missing PLs/genotypes seen - both samples. + In presence of multiple samples, related samples and outliers can be + identified by clustering samples by error rate. A simple hierarchical + clustering based on minimization of standard deviation is used. This is + useful to detect sample swaps, for example in situations where one + sample has been sequenced in multiple runs.

bcftools index [OPTIONS] in.bcf|in.vcf.gz

Creates index for bgzip compressed VCF/BCF files for random access. CSI (coordinate-sorted index) is created by default. The CSI format supports indexing of chromosomes up to length 2^31. TBI (tabix index) @@ -1778,7 +1779,7 @@ can swap alleles and will update genotypes (GT) and AC counts, but will not attempt to fix PL or other fields.

--d, --rm-dup snps|indels|both|any +-d, --rm-dup snps|indels|both|all|none
If a record is present multiple times, output only the first instance, see --collapse in Common Options. @@ -1786,7 +1787,7 @@ -D, --remove-duplicates
If a record is present in multiple files, output only the first instance. - Alias for -d none. Requires -a, --allow-overlaps. + Alias for -d none, deprecated.
-f, --fasta-ref FILE
@@ -2707,18 +2708,23 @@ TYPE~"snp" TYPE!="snp" TYPE!~"snp"
  • -array subscripts (0-based), "*" for any field, "-" to indicate a range. Note that -for querying FORMAT vectors, the colon ":" can be used to select a sample and a subfield -

    (DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0.3
    -DP4[*] == 0
    -CSQ[*] ~ "missense_variant.*deleterious"
    +array subscripts (0-based), "*" for any element, "-" to indicate a range. Note that
    +for querying FORMAT vectors, the colon ":" can be used to select a sample and an
    +element of the vector, as shown in the examples below
    +

    INFO/AF[0] > 0.3             .. first AF value bigger than 0.3
    +FORMAT/AD[0:0] > 30          .. first AD value of the first sample bigger than 30
    +FORMAT/AD[0:1]               .. first sample, second AD value
    +FORMAT/AD[1:0]               .. second sample, first AD value
    +DP4[*] == 0                  .. any DP4 value
     FORMAT/DP[0]   > 30         .. DP of the first sample bigger than 30
     FORMAT/DP[1-3] > 10         .. samples 2-4
     FORMAT/DP[1-]  < 7          .. all samples but the first
     FORMAT/DP[0,2-4] > 20       .. samples 1, 3-5
     FORMAT/AD[0:1]              .. first sample, second AD field
     FORMAT/AD[0:*], AD[0:] or AD[0] .. first sample, any AD field
    -FORMAT/AD[*:1] or AD[:1]        .. any sample, second AD field
  • +FORMAT/AD[*:1] or AD[:1] .. any sample, second AD field +(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0.3 +CSQ[*] ~ "missense_variant.*deleterious"

  • function on FORMAT tags (over samples) and INFO tags (over vector fields)

    MAX, MIN, AVG, SUM, STRLEN, ABS, COUNT
  • variables calculated on the fly if not present: number of alternate alleles; @@ -2726,14 +2732,19 @@ AC but is always smaller than 0.5); frequency of alternate alleles (AF=AC/AN); frequency of minor alleles (MAF=MAC/AN); number of alleles in called genotypes; number of samples with missing genotype; fraction of samples with missing genotype -

    N_ALT, N_SAMPLES, AC, MAC, AF, MAF, AN, N_MISSING, F_MISSING
  • Notes:

    • +

      N_ALT, N_SAMPLES, AC, MAC, AF, MAF, AN, N_MISSING, F_MISSING
    • +custom perl filtering. Note that this command is not compiled in by default, see +the section Optional Compilation with Perl in the INSTALL file for help +and misc/demo-flt.pl for a working example. The demo defined the perl subroutine +"severity" which can be invoked from the command line as follows: +

      perl:path/to/script.pl; perl.severity(INFO/CSQ) > 3

    Notes:

    • String comparisons and regular expressions are case-insensitive
    • Variables and function names are case-insensitive, but not tag names. For example, "qual" can be used instead of "QUAL", "strlen()" instead of "STRLEN()" , but not "dp" instead of "DP".
    • -When querying multiple subfields, all subfields are tested and the OR logic is +When querying multiple values, all elements are tested and the OR logic is used on the result. For example, when querying "TAG=1,2,3,4", it will be evaluated as follows:

      -i 'TAG[*]=1'   .. true, the record will be printed
       -i 'TAG[*]!=1'  .. true
      diff --git a/doc/bcftools.txt b/doc/bcftools.txt
      index deebfd1c4..a5633be8b 100644
      --- a/doc/bcftools.txt
      +++ b/doc/bcftools.txt
      @@ -1809,13 +1809,13 @@ the *<>* option is supplied.
           can swap alleles and will update genotypes (GT) and AC counts,
           but will not attempt to fix PL or other fields.
       
      -*-d, --rm-dup* 'snps'|'indels'|'both'|'any'::
      +*-d, --rm-dup* 'snps'|'indels'|'both'|'all'|'none'::
           If a record is present multiple times, output only the first instance,
           see *--collapse* in *<>*.
       
       *-D, --remove-duplicates*::
           If a record is present in multiple files, output only the first instance.
      -    Alias for *-d none*.  Requires *-a, --allow-overlaps*.
      +    Alias for *-d none*, deprecated.
       
       *-f, --fasta-ref* 'FILE'[[fasta_ref]]::
           reference sequence. Supplying this option will turn on left-alignment
      @@ -2745,12 +2745,15 @@ to require that all alleles are of the given type. Compare
               TYPE!="snp"
               TYPE!~"snp"
       
      -* array subscripts (0-based), "*" for any field, "-" to indicate a range. Note that
      -for querying FORMAT vectors, the colon ":" can be used to select a sample and a subfield
      -
      -        (DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0.3
      -        DP4[*] == 0
      -        CSQ[*] ~ "missense_variant.*deleterious"
      +* array subscripts (0-based), "*" for any element, "-" to indicate a range. Note that
      +for querying FORMAT vectors, the colon ":" can be used to select a sample and an
      +element of the vector, as shown in the examples below
      +    
      +        INFO/AF[0] > 0.3             .. first AF value bigger than 0.3
      +        FORMAT/AD[0:0] > 30          .. first AD value of the first sample bigger than 30
      +        FORMAT/AD[0:1]               .. first sample, second AD value
      +        FORMAT/AD[1:0]               .. second sample, first AD value
      +        DP4[*] == 0                  .. any DP4 value
               FORMAT/DP[0]   > 30         .. DP of the first sample bigger than 30
               FORMAT/DP[1-3] > 10         .. samples 2-4
               FORMAT/DP[1-]  < 7          .. all samples but the first
      @@ -2758,6 +2761,8 @@ for querying FORMAT vectors, the colon ":" can be used to select a sample and a
               FORMAT/AD[0:1]              .. first sample, second AD field
               FORMAT/AD[0:*], AD[0:] or AD[0] .. first sample, any AD field
               FORMAT/AD[*:1] or AD[:1]        .. any sample, second AD field
      +        (DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0.3
      +        CSQ[*] ~ "missense_variant.*deleterious"
       
       * function on FORMAT tags (over samples) and INFO tags (over vector fields)
       
      @@ -2779,14 +2784,13 @@ and misc/demo-flt.pl for a working example. The demo defined the perl subroutine
               perl:path/to/script.pl; perl.severity(INFO/CSQ) > 3
       
       
      -
       .Notes:
       
       * String comparisons and regular expressions are case-insensitive
       * Variables and function names are case-insensitive, but not tag names. For example,
       "qual" can be used instead of "QUAL", "strlen()" instead of "STRLEN()" , but
       not "dp" instead of "DP".
      -* When querying multiple subfields, all subfields are tested and the OR logic is
      +* When querying multiple values, all elements are tested and the OR logic is
       used on the result. For example, when querying "TAG=1,2,3,4", it will be evaluated as follows:
       
           -i 'TAG[*]=1'   .. true, the record will be printed
      
      From 5c6d98ed9167696cfb3f591843a4e0f71332880a Mon Sep 17 00:00:00 2001
      From: Petr Danecek 
      Date: Thu, 15 Mar 2018 16:14:23 +0000
      Subject: [PATCH 08/27] Fix compilation
      
      ---
       filter.c | 2 ++
       1 file changed, 2 insertions(+)
      
      diff --git a/filter.c b/filter.c
      index e3a9cb759..0b4053841 100644
      --- a/filter.c
      +++ b/filter.c
      @@ -27,6 +27,8 @@ THE SOFTWARE.  */
       #include 
       #include 
       #include 
      +#include 
      +#include 
       #include 
       #include 
       #include 
      
      From 96b9067d1d028295e8cbf0560235c4b215f6f71f Mon Sep 17 00:00:00 2001
      From: Petr Danecek 
      Date: Sat, 17 Mar 2018 09:35:23 +0000
      Subject: [PATCH 09/27] Support for -i/-e filtering in roh and updated scripts
      
      ---
       misc/plot-roh.py | 96 ++++++++++++++++++++++++++++--------------------
       misc/run-roh.pl  | 21 +++++++++--
       vcfroh.c         | 61 ++++++++++++++++++++++++------
       3 files changed, 125 insertions(+), 53 deletions(-)
      
      diff --git a/misc/plot-roh.py b/misc/plot-roh.py
      index 546099047..41d9a212a 100755
      --- a/misc/plot-roh.py
      +++ b/misc/plot-roh.py
      @@ -1,4 +1,4 @@
      -#!/usr/bin/python
      +#!/usr/bin/env python
       #  
       #  The MIT License
       #  
      @@ -41,36 +41,39 @@ def usage(msg=None):
               print '   -s, --samples                   List of samples to show, rename or group: "name[\\tnew_name[\\tgroup]]"'
               print '   -h, --help                            This usage text'
               print 'Matplotlib options:'
      -        print '   +adj, --adjust      Set plot adjust [bottom=0.18,left=0.07,right=0.98]'
      -        print '   +dpi, --dpi         Set bitmap DPI [150]'
      -        print '   +sxt, --show-xticks      Show x-ticks (genomic coordinate)'
      -        print '   +xlb, --xlabel      Set x-label'
      -        print '   +xli, --xlimit      Extend x-range by this fraction [0.05]'
      +        print '   +adj, --adjust           Set plot adjust [bottom=0.18,left=0.07,right=0.98]'
      +        print '   +dpi, --dpi              Set bitmap DPI [150]'
      +        print '   +sxt, --show-xticks           Show x-ticks (genomic coordinate)'
      +        print '   +twh, --track-wh     Set track width and height [20,1]'
      +        print '   +xlb, --xlabel           Set x-label'
      +        print '   +xli, --xlimit           Extend x-range by this fraction [0.05]'
           else:
               print msg
           sys.exit(1)
       
      -dir         = None
      -regs        = None
      -min_length  = 0
      -min_markers = 0
      -min_qual    = 0
      -interactive = False
      -sample_file = None
      -highlight   = None
      -outfile     = None
      -adjust      = 'bottom=0.18,left=0.07,right=0.98'
      -dpi         = 150
      -xlim        = 0.05
      -show_xticks = False
      -xlabel      = None
      +dir          = None
      +plot_regions = None
      +min_length   = 0
      +min_markers  = 0
      +min_qual     = 0
      +interactive  = False
      +sample_file  = None
      +highlight    = None
      +outfile      = None
      +adjust       = 'bottom=0.18,left=0.07,right=0.98'
      +dpi          = 150
      +xlim         = 0.05
      +show_xticks  = False
      +xlabel       = None
      +track_width  = 20
      +track_height = 1
       
       if len(sys.argv) < 2: usage()
       args = sys.argv[1:]
       while len(args):
           if args[0]=='-r' or args[0]=='--region': 
               args = args[1:]
      -        regs = args[0]
      +        plot_regions = args[0]
           elif args[0]=='-i' or args[0]=='--interactive': 
               interactive = True
           elif args[0]=='-l' or args[0]=='--min-length': 
      @@ -99,11 +102,16 @@ def usage(msg=None):
           elif args[0]=='+dpi' or args[0]=='--dpi': 
               args = args[1:]
               dpi = float(args[0])
      +    elif args[0]=='+sxt' or args[0]=='--show-xticks': 
      +        show_xticks = True
      +    elif args[0]=='+twh' or args[0]=='--track-wh': 
      +        args = args[1:]
      +        (track_width,track_height) = args[0].split(',')
      +        track_height = -float(track_height)    # will be used as if negative, no auto-magic
      +        track_width  = float(track_width)
           elif args[0]=='+xlb' or args[0]=='--xlabel': 
               args = args[1:]
               xlabel = args[0]
      -    elif args[0]=='+sxt' or args[0]=='--show-xticks': 
      -        show_xticks = True
           elif args[0]=='+xli' or args[0]=='--xlimit': 
               args = args[1:]
               xlim = float(args[0])
      @@ -119,17 +127,21 @@ def wrap_hash(**args): return args
       
       
       import matplotlib as mpl
      -for gui in ['TKAgg','GTKAgg','Qt4Agg','WXAgg','MacOSX']:
      -    try:
      -        mpl.use(gui,warn=False, force=True)
      -        import matplotlib.pyplot as plt
      -        import matplotlib.patches as patches
      -        break
      -    except:
      -        continue
      +if interactive==False:
      +    mpl.use('Agg')
      +    import matplotlib.pyplot as plt
      +    import matplotlib.patches as patches
      +else:
      +    for gui in ['TKAgg','GTKAgg','Qt4Agg','WXAgg','MacOSX']:
      +        try:
      +            mpl.use(gui,warn=False, force=True)
      +            import matplotlib.pyplot as plt
      +            import matplotlib.patches as patches
      +            break
      +        except:
      +            continue
       
       cols = [ '#337ab7', '#5cb85c', '#5bc0de', '#f0ad4e', '#d9534f', 'grey', 'black' ]
      -mpl.rcParams['axes.color_cycle'] = cols
       
       globstr = os.path.join(dir, '*.txt.gz')
       fnames = glob.glob(globstr)
      @@ -272,7 +284,7 @@ def parse_samples(fname,highlight):
           if highlight==None: groups = None
           return (samples,groups,smpl2y)
       
      -regs = parse_regions(regs)
      +plot_regions = parse_regions(plot_regions)
       (samples,groups,smpl2y) = parse_samples(sample_file,highlight)
       
       dat_gt = {}
      @@ -285,7 +297,7 @@ def parse_samples(fname,highlight):
               if row[0]=='GT':
                   chr  = row[1]
                   pos  = int(row[2])
      -            reg  = region_overlap(regs,chr,pos,pos)
      +            reg  = region_overlap(plot_regions,chr,pos,pos)
                   if reg==None: continue
                   for i in range(3,len(row),2):
                       smpl = row[i]
      @@ -317,7 +329,8 @@ def parse_samples(fname,highlight):
                   if length < min_length: continue
                   if nmark < min_markers : continue
                   if qual < min_qual : continue
      -            reg = region_overlap(regs,chr,beg,end)
      +            reg = region_overlap(plot_regions,chr,beg,end)
      +            if reg==None: continue
                   if chr not in dat_rg: dat_rg[chr] = {}
                   if smpl not in dat_rg[chr]: dat_rg[chr][smpl] = []
                   if reg!=None:
      @@ -352,9 +365,11 @@ def parse_samples(fname,highlight):
           off += max_pos + off_sep
           off_list.append(off)
       
      -height = len(smpl2y)
      -if len(smpl2y)>5: heigth = 5
      -wh = 20,height
      +wh = track_width,len(smpl2y)
      +if track_height < 0:
      +    wh = track_width,-track_height*len(smpl2y)
      +elif len(smpl2y)>5: 
      +    wh = track_width,5
       
       def bignum(num):
           s = str(num); out = ''; slen = len(s)
      @@ -379,6 +394,7 @@ def format_coord(x, y):
       xtick_lbl = []
       xtick_pos = []
       max_x = 0
      +min_x = -1
       for chr in dat_gt:
           off  = off_hash[chr]
           icol = 0
      @@ -394,6 +410,7 @@ def format_coord(x, y):
                       rect = patches.Rectangle((rg[0]+off,3*y+0.5), rg[1]-rg[0]+1, 2, color='#d9534f')
                       ax1.add_patch(rect)
               ax1.plot([x[0]+off for x in dat_gt[chr][smpl]],[x[1]+3*y for x in dat_gt[chr][smpl]],'.',color=cols[icol])
      +        if min_x==-1 or min_x > dat_gt[chr][smpl][0][0]+off: min_x = dat_gt[chr][smpl][0][0]+off
               if max_x < dat_gt[chr][smpl][-1][0]+off: max_x = dat_gt[chr][smpl][-1][0]+off
               if max < dat_gt[chr][smpl][-1][0]: max = dat_gt[chr][smpl][-1][0]
               icol += 1
      @@ -408,7 +425,8 @@ def format_coord(x, y):
               ytick_pos.append(3*smpl2y[smpl]+1)
           break
       if xlim!=0:
      -    ax1.set_xlim(0,max_x+xlim*max_x)
      +    if min_x==-1: min_x = 0
      +    ax1.set_xlim(min_x,max_x+xlim*max_x)
       lbl_pos = 3*(len(smpl2y)-1)
       ax1.annotate('   HomAlt ',xy=(max_x,lbl_pos-1),xycoords='data',va='center')
       ax1.annotate('   Het',xy=(max_x,lbl_pos-2),xycoords='data',va='center')
      diff --git a/misc/run-roh.pl b/misc/run-roh.pl
      index 8c76bf842..a70ea9639 100755
      --- a/misc/run-roh.pl
      +++ b/misc/run-roh.pl
      @@ -51,6 +51,8 @@ sub error
               "Options:\n",
               "   -a, --af-annots       Allele frequency annotations [1000GP-AFs/AFs.tab.gz]\n",
               "   -i, --indir            Input directory with VCF files\n",
      +        "       --include         Select sites for which the expression is true\n",
      +        "       --exclude         Exclude sites for which the epxression is true\n",
               "   -l, --min-length       Filter input regions shorter than this [1e6]\n",
               "   -m, --genmap           Directory with genetic map in IMPUTE2 format (optional)\n",
               "   -M, --rec-rate       constant recombination rate per bp (optional)\n",
      @@ -74,6 +76,8 @@ sub parse_params
           };
           while (defined(my $arg=shift(@ARGV)))
           {
      +        if (                 $arg eq '--include' ) { $$opts{include_expr}=shift(@ARGV); next }
      +        if (                 $arg eq '--exclude' ) { $$opts{exclude_expr}=shift(@ARGV); next }
               if ( $arg eq '-q' || $arg eq '--min-qual' ) { $$opts{min_qual}=shift(@ARGV); next }
               if ( $arg eq '-l' || $arg eq '--min-length' ) { $$opts{min_length}=shift(@ARGV); next }
               if ( $arg eq '-n' || $arg eq '--min-markers' ) { $$opts{min_markers}=shift(@ARGV); next }
      @@ -92,6 +96,8 @@ sub parse_params
           if ( ! -e "$$opts{af_annots}.tbi" ) { error("The annotation file is not indexed: $$opts{af_annots}.tbi\n"); }
           if ( ! -e "$$opts{af_annots}.hdr" ) { error("The annotation file has no header: $$opts{af_annots}.hdr\n"); }
           if ( exists($$opts{genmap}) && ! -d "$$opts{genmap}" ) { error("The directory with genetic maps does not exist: $$opts{genmap}\n"); }
      +    if ( exists($$opts{include_expr}) ) { $$opts{include_expr} =~ s/\'/\'\\\'\'/g; $$opts{inc_exc} .= qq[ -i '$$opts{include_expr}']; }
      +    if ( exists($$opts{exclude_expr}) ) { $$opts{exclude_expr} =~ s/\'/\'\\\'\'/g; $$opts{inc_exc} .= qq[ -e '$$opts{exclude_expr}']; }
           return $opts;
       }
       
      @@ -196,10 +202,19 @@ sub run_roh
               my $outfile = "$$opts{outdir}/$`.bcf";
               push @files,$outfile;
               if ( -e $outfile ) { next; }
      -        cmd(
      +
      +        my $cmd = 
                   "bcftools annotate --rename-chrs $chr_fname '$$opts{indir}/$file' -Ou | " .
      -            "bcftools annotate -c CHROM,POS,REF,ALT,AF1KG -h $$opts{af_annots}.hdr -a $$opts{af_annots} -Ob -o $outfile.part && " .
      -            "mv $outfile.part $outfile",%$opts);
      +            "bcftools annotate -c CHROM,POS,REF,ALT,AF1KG -h $$opts{af_annots}.hdr -a $$opts{af_annots} ";
      +
      +        if ( exists($$opts{inc_exc}) )
      +        {
      +            $cmd .= " -Ou | bcftools view $$opts{inc_exc} ";
      +        }
      +        $cmd .= "-Ob -o $outfile.part && ";
      +        $cmd .= "mv $outfile.part $outfile";
      +
      +        cmd($cmd, %$opts);
           }
           closedir($dh) or error("close failed: $$opts{indir}");
       
      diff --git a/vcfroh.c b/vcfroh.c
      index 626f97538..a96a968b2 100644
      --- a/vcfroh.c
      +++ b/vcfroh.c
      @@ -35,6 +35,7 @@ THE SOFTWARE.  */
       #include "bcftools.h"
       #include "HMM.h"
       #include "smpl_ilist.h"
      +#include "filter.h"
       
       #define STATE_HW 0        // normal state, follows Hardy-Weinberg allele frequencies
       #define STATE_AZ 1        // autozygous state
      @@ -43,6 +44,11 @@ THE SOFTWARE.  */
       #define OUTPUT_RG (1<<2)
       #define OUTPUT_GZ (1<<3)
       
      +// Logic of the filters: include or exclude sites which match the filters?
      +#define FLT_INCLUDE 1
      +#define FLT_EXCLUDE 2
      +
      +
       /** Genetic map */
       typedef struct
       {
      @@ -94,6 +100,9 @@ typedef struct _args_t
           int32_t skip_rid, prev_rid, prev_pos;
       
           int ntot;                   // some stats to detect if things didn't go wrong
      +    int nno_af;                 // number of sites rejected because AF could not be determined
      +    int nfiltered;              // .. because of filters
      +    int nnot_biallelic, ndup;
           smpl_t *smpl;               // HMM data for each sample
           smpl_ilist_t *af_smpl;      // list of samples to estimate AF from (--estimate-AF)
           smpl_ilist_t *roh_smpl;     // list of samples to analyze (--samples, --samples-file)
      @@ -103,6 +112,10 @@ typedef struct _args_t
           int argc, fake_PLs, snps_only, vi_training, samples_is_file, output_type, skip_homref, n_threads;
           BGZF *out;
           kstring_t str;
      +
      +    int filter_logic;
      +    filter_t *filter;
      +    char *filter_str;
       }
       args_t;
       
      @@ -125,6 +138,9 @@ static void init_data(args_t *args)
       
           if ( !bcf_hdr_nsamples(args->hdr) ) error("No samples in the VCF?\n");
       
      +    if ( args->filter_str )
      +        args->filter = filter_init(args->hdr, args->filter_str);
      +
           if ( !args->fake_PLs )
           {
               args->pl_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL");
      @@ -318,6 +334,7 @@ static void init_data(args_t *args)
       
       static void destroy_data(args_t *args)
       {
      +    if ( args->filter ) filter_destroy(args->filter);
           if ( bgzf_close(args->out)!=0 ) error("Error: close failed .. %s\n", args->output_fname);
           int i;
           for (i=0; iroh_smpl->n; i++)
      @@ -854,8 +871,9 @@ int process_line(args_t *args, bcf1_t *line, int ial)
                   alt_freq = (double) AC/AN;
           }
       
      -    if ( ret<0 ) return ret;
      -    if ( alt_freq==0.0 ) return -1;
      +    if ( args->dflt_AF>0 && (ret<0 || alt_freq==0.0) ) alt_freq = args->dflt_AF;
      +    else if ( ret<0 ) { args->nno_af++; return ret; }
      +    else if ( alt_freq==0.0 ) { args->nno_af++; return -1; }
       
           int irr = bcf_alleles2gt(0,0), ira = bcf_alleles2gt(0,ial), iaa = bcf_alleles2gt(ial,ial);
           if ( args->fake_PLs )
      @@ -970,12 +988,11 @@ static void vcfroh(args_t *args, bcf1_t *line)
               for (i=0; iroh_smpl->n; i++) flush_viterbi(args, i);
               return; 
           }
      -    args->ntot++;
       
           // Skip unwanted lines, for simplicity we consider only biallelic sites 
           if ( line->rid == args->skip_rid ) return;
      -    if ( line->n_allele==1 ) return;    // no ALT allele
      -    if ( line->n_allele > 3 ) return;   // cannot be bi-allelic, even with <*>
      +    if ( line->n_allele==1 ) { args->nnot_biallelic++; return; }   // no ALT allele
      +    if ( line->n_allele > 3 ) { args->nnot_biallelic++; return; }   // cannot be bi-allelic, even with <*>
       
           // This can be raw callable VCF with the symbolic unseen allele <*>
           int ial = 0;
      @@ -983,7 +1000,7 @@ static void vcfroh(args_t *args, bcf1_t *line)
               if ( !strcmp("<*>",line->d.allele[i]) ) { ial = i; break; }
           if ( ial==0 )    // normal VCF, the symbolic allele is not present
           {
      -        if ( line->n_allele!=2 ) return;    // not biallelic
      +        if ( line->n_allele!=2 ) { args->nnot_biallelic++; return; }   // not biallelic
               ial = 1;
           }
           else
      @@ -1017,7 +1034,11 @@ static void vcfroh(args_t *args, bcf1_t *line)
               args->prev_pos = line->pos;
               skip_rid = load_genmap(args, bcf_seqname(args->hdr,line));
           }
      -    else if ( args->prev_pos == line->pos ) return;     // skip duplicate positions
      +    else if ( args->prev_pos == line->pos ) 
      +    {
      +        args->ndup++;
      +        return;     // skip duplicate positions
      +    }
       
           if ( skip_rid )
           {
      @@ -1044,14 +1065,16 @@ static void usage(args_t *args)
           fprintf(stderr, "General Options:\n");
           fprintf(stderr, "        --AF-dflt               if AF is not known, use this allele frequency [skip]\n");
           fprintf(stderr, "        --AF-tag                  use TAG for allele frequency\n");
      -    fprintf(stderr, "        --AF-file                read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
      +    fprintf(stderr, "        --AF-file                read allele frequencies from file (CHR\\tPOS\\tREF\\tALT\\tAF)\n");
           fprintf(stderr, "    -b  --buffer-size       buffer size and the number of overlapping sites, 0 for unlimited [0]\n");
           fprintf(stderr, "                                           If the first number is negative, it is interpreted as the maximum memory to\n");
           fprintf(stderr, "                                           use, in MB. The default overlap is set to roughly 1%% of the buffer size.\n");
           fprintf(stderr, "    -e, --estimate-AF [TAG],     estimate AF from FORMAT/TAG (GT or PL) of all samples (\"-\") or samples listed\n");
           fprintf(stderr, "                                            in . If TAG is not given, the frequency is estimated from GT by default\n");
      +    fprintf(stderr, "        --exclude                exclude sites for which the expression is true\n");
           fprintf(stderr, "    -G, --GTs-only              use GTs and ignore PLs, instead using  for PL of the two least likely genotypes.\n");
           fprintf(stderr, "                                           Safe value to use is 30 to account for GT errors.\n");
      +    fprintf(stderr, "        --include                select sites for which the expression is true\n");
           fprintf(stderr, "    -i, --ignore-homref                skip hom-ref genotypes (0/0)\n");
           fprintf(stderr, "    -I, --skip-indels                  skip indels as their genotypes are enriched for errors\n");
           fprintf(stderr, "    -m, --genetic-map            genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\"\n");
      @@ -1091,6 +1114,8 @@ int main_vcfroh(int argc, char *argv[])
               {"AF-tag",1,0,0},
               {"AF-file",1,0,1},
               {"AF-dflt",1,0,2},
      +        {"include",1,0,3},
      +        {"exclude",1,0,4},
               {"buffer-size",1,0,'b'},
               {"ignore-homref",0,0,'i'},
               {"estimate-AF",1,0,'e'},
      @@ -1123,6 +1148,8 @@ int main_vcfroh(int argc, char *argv[])
                       args->dflt_AF = strtod(optarg,&tmp);
                       if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg);
                       break;
      +            case 3: args->filter_str = optarg; args->filter_logic = FLT_INCLUDE; break;
      +            case 4: args->filter_str = optarg; args->filter_logic = FLT_EXCLUDE; break;
                   case 'o': args->output_fname = optarg; break;
                   case 'O': 
                       if ( strchr(optarg,'s') || strchr(optarg,'S') ) args->output_type |= OUTPUT_ST;
      @@ -1206,16 +1233,28 @@ int main_vcfroh(int argc, char *argv[])
           init_data(args);
           while ( bcf_sr_next_line(args->files) )
           {
      -        vcfroh(args, args->files->readers[0].buffer[0]);
      +        args->ntot++;
      +        bcf1_t *line = bcf_sr_get_line(args->files,0);
      +        if ( args->filter )
      +        {
      +            int pass = filter_test(args->filter, line, NULL);
      +            if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
      +            if ( !pass ) { args->nfiltered++; continue; }
      +        }
      +        vcfroh(args, line);
           }
           vcfroh(args, NULL);
           int i, nmin = 0;
           for (i=0; iroh_smpl->n; i++)
               if ( !i || args->smpl[i].nused < nmin ) nmin = args->smpl[i].nused;
      -    fprintf(stderr,"Number of lines total/processed: %d/%d\n", args->ntot,nmin);
      +    if ( args->af_fname )
      +        fprintf(stderr,"Number of lines overlapping with --AF-file/processed: %d/%d\n", args->ntot,nmin);
      +    else
      +        fprintf(stderr,"Number of lines total/processed: %d/%d\n", args->ntot,nmin);
      +    fprintf(stderr,"Number of lines filtered/no AF/not biallelic/dup: %d/%d/%d/%d\n", args->nfiltered,args->nno_af,args->nnot_biallelic,args->ndup);
           if ( nmin==0 )
           {
      -        fprintf(stderr,"No usable sites were found.");
      +        fprintf(stderr,"No usable sites were found.\n", args->nno_af);
               if ( !naf_opts && !args->dflt_AF ) fprintf(stderr, " Consider using one of the AF options.\n");
           }
           destroy_data(args);
      
      From 9224bf1b35da00ccd65e6ae9b6b0b56acbe7a2dd Mon Sep 17 00:00:00 2001
      From: Petr Danecek 
      Date: Sat, 17 Mar 2018 09:37:04 +0000
      Subject: [PATCH 10/27] Update documentation
      
      ---
       doc/bcftools.txt | 8 ++++++++
       1 file changed, 8 insertions(+)
      
      diff --git a/doc/bcftools.txt b/doc/bcftools.txt
      index a5633be8b..44a15373f 100644
      --- a/doc/bcftools.txt
      +++ b/doc/bcftools.txt
      @@ -2312,11 +2312,19 @@ Transition probabilities:
           If neither *-e* nor the other *--AF-...* options are given, the allele frequency is
           estimated from AC and AN counts which are already present in the INFO field.
       
      +*--exclude* 'EXPRESSION'::
      +    exclude sites for which 'EXPRESSION' is true. For valid expressions see
      +    *<>*.
      +
       *-G, --GTs-only* 'FLOAT'::
           use genotypes (FORMAT/GT fields) ignoring genotype likelihoods (FORMAT/PL),
           setting PL of unseen genotypes to 'FLOAT'. Safe value to use is 30 to
           account for GT errors.
       
      +*--include* 'EXPRESSION'::
      +    include only sites for which 'EXPRESSION' is true. For valid expressions see
      +    *<>*.
      +
       *-I, --skip-indels*::
           skip indels as their genotypes are usually enriched for errors
       
      
      From 32122e95e789bcf65278aa3ca99bdee9dcbde5e6 Mon Sep 17 00:00:00 2001
      From: Petr Danecek 
      Date: Mon, 19 Mar 2018 12:52:31 +0000
      Subject: [PATCH 11/27] Fix a small memory leak
      
      ---
       consensus.c | 1 +
       1 file changed, 1 insertion(+)
      
      diff --git a/consensus.c b/consensus.c
      index d67a05219..4d555c88c 100644
      --- a/consensus.c
      +++ b/consensus.c
      @@ -280,6 +280,7 @@ static void init_region(args_t *args, char *line)
                   else to--;
               }
           }
      +    free(args->chr);
           args->chr = strdup(line);
           args->rid = bcf_hdr_name2id(args->hdr,line);
           if ( args->rid<0 ) fprintf(stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname);
      
      From 6400857948e0d8beef598651489ce5181252c423 Mon Sep 17 00:00:00 2001
      From: Petr Danecek 
      Date: Mon, 19 Mar 2018 12:53:16 +0000
      Subject: [PATCH 12/27] Support for vcf,vcf.gz regidx parser. Fixes #745
      
      ---
       regidx.c | 13 ++++++++++++-
       regidx.h |  1 +
       2 files changed, 13 insertions(+), 1 deletion(-)
      
      diff --git a/regidx.c b/regidx.c
      index 9b2c66d71..d7d8a812a 100644
      --- a/regidx.c
      +++ b/regidx.c
      @@ -232,6 +232,10 @@ regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f fr
                       parser = regidx_parse_bed;
                   else if ( len>=4 && !strcasecmp(".bed",fname+len-4) )
                       parser = regidx_parse_bed;
      +            else if ( len>=4 && !strcasecmp(".vcf",fname+len-4) )
      +                parser = regidx_parse_vcf;
      +            else if ( len>=4 && !strcasecmp(".vcf.gz",fname+len-7) )
      +                parser = regidx_parse_vcf;
                   else
                       parser = regidx_parse_tab;
               }
      @@ -488,13 +492,20 @@ int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, uint32_t
           {
               ss = se+1;
               *end = strtod(ss, &se);
      -        if ( ss==se ) *end = *beg;
      +        if ( ss==se || (*se && !isspace(*se)) ) *end = *beg;
               else if ( *end==0 ) { fprintf(stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; }
               else (*end)--;
           }
           return 0;
       }
       
      +int regidx_parse_vcf(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
      +{
      +    int ret = regidx_parse_tab(line, chr_beg, chr_end, beg, end, payload, usr);
      +    if ( !ret ) *end = *beg;
      +    return ret;
      +}
      +
       int regidx_parse_reg(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
       {
           char *ss = (char*) line;
      diff --git a/regidx.h b/regidx.h
      index fe0a897e4..eec978b6c 100644
      --- a/regidx.h
      +++ b/regidx.h
      @@ -106,6 +106,7 @@ typedef void (*regidx_free_f)(void *payload);
       int regidx_parse_bed(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*);   // CHROM or whitespace-sepatated CHROM,FROM,TO (0-based,right-open)
       int regidx_parse_tab(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*);   // CHROM or whitespace-separated CHROM,POS (1-based, inclusive)
       int regidx_parse_reg(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*);   // CHROM, CHROM:POS, CHROM:FROM-TO, CHROM:FROM- (1-based, inclusive)
      +int regidx_parse_vcf(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*);
       
       /*
        *  regidx_init() - creates new index
      
      From fb94d291e30b2237f0645051119a55d737d81355 Mon Sep 17 00:00:00 2001
      From: Petr Danecek 
      Date: Mon, 19 Mar 2018 13:08:30 +0000
      Subject: [PATCH 13/27] Fix a copy-and-paste error
      
      ---
       regidx.c | 2 +-
       1 file changed, 1 insertion(+), 1 deletion(-)
      
      diff --git a/regidx.c b/regidx.c
      index d7d8a812a..3e4620ff1 100644
      --- a/regidx.c
      +++ b/regidx.c
      @@ -234,7 +234,7 @@ regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f fr
                       parser = regidx_parse_bed;
                   else if ( len>=4 && !strcasecmp(".vcf",fname+len-4) )
                       parser = regidx_parse_vcf;
      -            else if ( len>=4 && !strcasecmp(".vcf.gz",fname+len-7) )
      +            else if ( len>=7 && !strcasecmp(".vcf.gz",fname+len-7) )
                       parser = regidx_parse_vcf;
                   else
                       parser = regidx_parse_tab;
      
      From d78e524c53aa50cfb52765eb4197f1028b5621c9 Mon Sep 17 00:00:00 2001
      From: Petr Danecek 
      Date: Mon, 19 Mar 2018 13:31:04 +0000
      Subject: [PATCH 14/27] Fix --haplegendsample on purely haploid genotypes.
       Resolves #730
      
      ---
       convert.c                     | 23 ++++++++++++++++++++++-
       test/convert.hap-missing.haps |  5 +++++
       test/convert.hap-missing.vcf  |  9 +++++++++
       test/test.pl                  |  1 +
       4 files changed, 37 insertions(+), 1 deletion(-)
       create mode 100644 test/convert.hap-missing.haps
       create mode 100644 test/convert.hap-missing.vcf
      
      diff --git a/convert.c b/convert.c
      index d7ed0abc1..dce4529fd 100644
      --- a/convert.c
      +++ b/convert.c
      @@ -760,12 +760,33 @@ static void process_gt_to_hap(convert_t *convert, bcf1_t *line, fmt_t *fmt, int
       
           if ( fmt_gt->type!=BCF_BT_INT8 )    // todo: use BRANCH_INT if the VCF is valid
               error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
      +    if ( fmt_gt->n!=1 && fmt_gt->n!=2 )
      +        error("Uh, ploidy of %d not supported, see %s:%d\n", fmt_gt->n, bcf_seqname(convert->header, line), line->pos+1);
       
           int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n;
           for (i=0; insamples; i++)
           {
               ptr += fmt_gt->n;
      -        if ( ptr[0]==2 )
      +        if ( fmt_gt->n==1 ) // haploid genotypes
      +        {
      +            if ( ptr[0]==2 ) /* 0 */
      +            {
      +                str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
      +            }
      +            else if ( ptr[0]==bcf_int32_missing )   /* . */
      +            {
      +                str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
      +            }
      +            else if ( ptr[0]==4 ) /* 1 */
      +            {
      +                str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
      +            }
      +            else
      +            {
      +                kputw(bcf_gt_allele(ptr[0]),str); str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
      +            }
      +        }
      +        else if ( ptr[0]==2 )
               {
                   if ( ptr[1]==3 ) /* 0|0 */
                   {
      diff --git a/test/convert.hap-missing.haps b/test/convert.hap-missing.haps
      new file mode 100644
      index 000000000..952e13c5e
      --- /dev/null
      +++ b/test/convert.hap-missing.haps
      @@ -0,0 +1,5 @@
      +0 0 0 0 1 0
      +0 - 0 - 1 -
      +? ? ? ? ? ?
      +? ? ? ? ? ?
      +0 0 0 0 1 0
      diff --git a/test/convert.hap-missing.vcf b/test/convert.hap-missing.vcf
      new file mode 100644
      index 000000000..eac4d155c
      --- /dev/null
      +++ b/test/convert.hap-missing.vcf
      @@ -0,0 +1,9 @@
      +##fileformat=VCFv4.1
      +##FORMAT=
      +##contig=
      +#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA00001	NA00002	NA00003
      +X	2699450	.	A	C	.	.	.	GT	0|0	0|0	1|0
      +X	2699451	.	A	C	.	.	.	GT	0	0	1
      +X	2699968	.	A	G	.	.	.	GT	.|.	0|.	1|.
      +X	2699970	.	T	C	.	.	.	GT	.|0	0|.	1|.
      +X	2699990	.	C		.	.	.	GT	0|0	0|0	1|0
      diff --git a/test/test.pl b/test/test.pl
      index 6c7e54b15..7ce31e3bf 100755
      --- a/test/test.pl
      +++ b/test/test.pl
      @@ -320,6 +320,7 @@
       test_vcf_convert_hs2vcf($opts,h=>'convert.hs.gt.hap',s=>'convert.hs.gt.samples',out=>'convert.gt.noHead.vcf',args=>'--hapsample2vcf');
       test_vcf_convert($opts,in=>'convert',out=>'convert.hs.hap',args=>'--hapsample -,.');
       test_vcf_convert($opts,in=>'convert',out=>'convert.hs.sample',args=>'--hapsample .,-');
      +test_vcf_convert($opts,in=>'convert.hap-missing',out=>'convert.hap-missing.haps',args=>'--haplegendsample -,.,.');
       test_vcf_convert_gvcf($opts,in=>'convert.gvcf',out=>'convert.gvcf.out',fa=>'gvcf.fa',args=>'--gvcf2vcf');
       test_vcf_convert_tsv2vcf($opts,in=>'convert.23andme',out=>'convert.23andme.vcf',args=>'-c ID,CHROM,POS,AA -s SAMPLE1',fai=>'23andme');
       test_vcf_consensus($opts,in=>'consensus',out=>'consensus.1.out',fa=>'consensus.fa',mask=>'consensus.tab',args=>'');
      
      From ba084390383af977cf8e9d07b0fd224b24c9f594 Mon Sep 17 00:00:00 2001
      From: Petr Danecek 
      Date: Mon, 19 Mar 2018 13:55:15 +0000
      Subject: [PATCH 15/27] Add threading support for BCF reheader. Resolves #722
      
      ---
       reheader.c | 31 ++++++++++++++++++++++++++++---
       1 file changed, 28 insertions(+), 3 deletions(-)
      
      diff --git a/reheader.c b/reheader.c
      index 277606900..863c4ff46 100644
      --- a/reheader.c
      +++ b/reheader.c
      @@ -1,6 +1,6 @@
       /*  reheader.c -- reheader subcommand.
       
      -    Copyright (C) 2014-2017 Genome Research Ltd.
      +    Copyright (C) 2014-2018 Genome Research Ltd.
       
           Author: Petr Danecek 
       
      @@ -36,6 +36,7 @@ THE SOFTWARE.  */
       #include 
       #include  // for hts_get_bgzfp()
       #include 
      +#include 
       #include "bcftools.h"
       #include "khash_str2str.h"
       
      @@ -44,7 +45,8 @@ typedef struct _args_t
           char **argv, *fname, *samples_fname, *header_fname, *output_fname;
           htsFile *fp;
           htsFormat type;
      -    int argc;
      +    htsThreadPool *threads;
      +    int argc, n_threads;
       }
       args_t;
       
      @@ -369,6 +371,16 @@ static bcf_hdr_t *strip_header(bcf_hdr_t *src, bcf_hdr_t *dst)
       static void reheader_bcf(args_t *args, int is_compressed)
       {
           htsFile *fp = args->fp;
      +
      +    if ( args->n_threads > 0 )
      +    {
      +        args->threads = calloc(1, sizeof(*args->threads));
      +        if ( !args->threads ) error("Could not allocate memory\n");
      +        if ( !(args->threads->pool = hts_tpool_init(args->n_threads)) ) error("Could not initialize threading\n");
      +        BGZF *bgzf = hts_get_bgzfp(fp);
      +        if ( bgzf ) bgzf_thread_pool(bgzf, args->threads->pool, args->threads->qsize);
      +    }
      +
           bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to read the header: %s\n", args->fname);
           kstring_t htxt = {0,0,0};
           bcf_hdr_format(hdr, 1, &htxt);
      @@ -396,6 +408,11 @@ static void reheader_bcf(args_t *args, int is_compressed)
           // write the header and the body
           htsFile *fp_out = hts_open(args->output_fname ? args->output_fname : "-",is_compressed ? "wb" : "wbu");
           if ( !fp_out ) error("%s: %s\n", args->output_fname ? args->output_fname : "-", strerror(errno));
      +    if ( args->threads )
      +    {
      +        BGZF *bgzf = hts_get_bgzfp(fp_out);
      +        if ( bgzf ) bgzf_thread_pool(bgzf, args->threads->pool, args->threads->qsize);
      +    }
           bcf_hdr_write(fp_out, hdr_out);
       
           bcf1_t *rec = bcf_init();
      @@ -450,6 +467,11 @@ static void reheader_bcf(args_t *args, int is_compressed)
           hts_close(fp);
           bcf_hdr_destroy(hdr_out);
           bcf_hdr_destroy(hdr);
      +    if ( args->threads )
      +    {
      +        hts_tpool_destroy(args->threads->pool);
      +        free(args->threads);
      +    }
       }
       
       
      @@ -463,6 +485,7 @@ static void usage(args_t *args)
           fprintf(stderr, "    -h, --header      new header\n");
           fprintf(stderr, "    -o, --output      write output to a file [standard output]\n");
           fprintf(stderr, "    -s, --samples     new sample names\n");
      +    fprintf(stderr, "        --threads      number of extra compression threads (BCF only) [0]\n");
           fprintf(stderr, "\n");
           exit(1);
       }
      @@ -472,18 +495,20 @@ int main_reheader(int argc, char *argv[])
           int c;
           args_t *args  = (args_t*) calloc(1,sizeof(args_t));
           args->argc    = argc; args->argv = argv;
      -
      +    
           static struct option loptions[] =
           {
               {"output",1,0,'o'},
               {"header",1,0,'h'},
               {"samples",1,0,'s'},
      +        {"threads",1,NULL,1},
               {0,0,0,0}
           };
           while ((c = getopt_long(argc, argv, "s:h:o:",loptions,NULL)) >= 0)
           {
               switch (c)
               {
      +            case  1 : args->n_threads = strtol(optarg, 0, 0); break;
                   case 'o': args->output_fname = optarg; break;
                   case 's': args->samples_fname = optarg; break;
                   case 'h': args->header_fname = optarg; break;
      
      From 799c73979db70e90b7f59d6c22b0d9846963f69c Mon Sep 17 00:00:00 2001
      From: Petr Danecek 
      Date: Mon, 19 Mar 2018 14:37:41 +0000
      Subject: [PATCH 16/27] Exit nicely on non-existent FMT/TAGs. Resolves #696
      
      ---
       vcfannotate.c | 1 +
       1 file changed, 1 insertion(+)
      
      diff --git a/vcfannotate.c b/vcfannotate.c
      index 3de9eb5cf..06f56c6b3 100644
      --- a/vcfannotate.c
      +++ b/vcfannotate.c
      @@ -1720,6 +1720,7 @@ static void init_columns(args_t *args)
                   if ( args->tgts_is_vcf )
                   {
                       bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL);
      +                if ( !hrec ) error("No such annotation \"%s\" in %s\n", key_src,args->targets_fname);
                       tmp.l = 0;
                       bcf_hrec_format_rename(hrec, key_dst, &tmp);
                       bcf_hdr_append(args->hdr_out, tmp.s);
      
      From 2c6162eb565f318c9dfe77becea6298d900aaab9 Mon Sep 17 00:00:00 2001
      From: Petr Danecek 
      Date: Tue, 20 Mar 2018 14:59:51 +0000
      Subject: [PATCH 17/27] Annotate with 1-based R2 coordinate, not 0-based.
       Resolves #760
      
      ---
       plugins/prune.c | 3 ++-
       1 file changed, 2 insertions(+), 1 deletion(-)
      
      diff --git a/plugins/prune.c b/plugins/prune.c
      index 5c4f506d6..8b1ee52b1 100644
      --- a/plugins/prune.c
      +++ b/plugins/prune.c
      @@ -182,8 +182,9 @@ static void process(args_t *args)
               if ( ld_rec && args->info_r2 )
               {
                   float tmp = ld_val;
      +            int32_t tmp_pos = ld_rec->pos + 1;
                   bcf_update_info_float(args->hdr, rec, args->info_r2, &tmp, 1);
      -            bcf_update_info_int32(args->hdr, rec, args->info_pos, &ld_rec->pos, 1);
      +            bcf_update_info_int32(args->hdr, rec, args->info_pos, &tmp_pos, 1);
               }
           }
           sr->buffer[0] = vcfbuf_push(args->vcfbuf, rec, 1);
      
      From 0d1ee0baa1ce180893a733cb9e904e12c9f8a96c Mon Sep 17 00:00:00 2001
      From: Petr Danecek 
      Date: Thu, 22 Mar 2018 17:24:56 +0000
      Subject: [PATCH 18/27] The indexed file must be BGZF-compressed. Resolves #761
      
      ---
       vcfindex.c | 1 +
       1 file changed, 1 insertion(+)
      
      diff --git a/vcfindex.c b/vcfindex.c
      index 807fedd5f..c19587dc5 100644
      --- a/vcfindex.c
      +++ b/vcfindex.c
      @@ -213,6 +213,7 @@ int main_vcfindex(int argc, char *argv[])
               // check for truncated files, allow only with -f
               BGZF *fp = bgzf_open(fname, "r");
               if ( !fp ) error("index: failed to open %s\n", fname);
      +        if ( bgzf_compression(fp)!=2 ) error("index: the file is not BGZF compressed, cannot index: %s\n", fname);
               if ( bgzf_check_EOF(fp)!=1 ) error("index: the input is probably truncated, use -f to index anyway: %s\n", fname);
               if ( bgzf_close(fp)!=0 ) error("index: close failed: %s\n", fname);
           }
      
      From 9d94f7bfded90baf04a9dc9c0942c69120f53116 Mon Sep 17 00:00:00 2001
      From: Petr Danecek 
      Date: Thu, 22 Mar 2018 17:31:19 +0000
      Subject: [PATCH 19/27] Add options for simpler ploidy usage
      
      ---
       plugins/fixploidy.c | 58 ++++++++++++++++++++++++++++++---------------
       1 file changed, 39 insertions(+), 19 deletions(-)
      
      diff --git a/plugins/fixploidy.c b/plugins/fixploidy.c
      index 04cc07677..039d0f48b 100644
      --- a/plugins/fixploidy.c
      +++ b/plugins/fixploidy.c
      @@ -38,6 +38,7 @@ static int *sample2sex = NULL;
       static int n_sample = 0, nsex = 0, *sex2ploidy = NULL;
       static int32_t ngt_arr = 0, *gt_arr = NULL, *gt_arr2 = NULL, ngt_arr2 = 0;
       static ploidy_t *ploidy = NULL;
      +static int force_ploidy = -1;
       
       const char *about(void)
       {
      @@ -54,9 +55,11 @@ const char *usage(void)
               "   run \"bcftools plugin\" for a list of common options\n"
               "\n"
               "Plugin options:\n"
      -        "   -p, --ploidy    space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n"
      -        "   -s, --sex       list of samples, \"NAME SEX\"\n"
      -        "   -t, --tags      VCF tags to fix [GT]\n"
      +        "   -d, --default-ploidy   default ploidy for regions unlisted in -p [2]\n"
      +        "   -f, --force-ploidy     ignore -p, set the same ploidy for all genotypes\n"
      +        "   -p, --ploidy          space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n"
      +        "   -s, --sex             list of samples, \"NAME SEX\"\n"
      +        "   -t, --tags            VCF tags to fix [GT]\n"
               "\n"
               "Example:\n"
               "   # Default ploidy, if -p not given. Unlisted regions have ploidy 2\n"
      @@ -109,21 +112,32 @@ void set_samples(char *fname, bcf_hdr_t *hdr, ploidy_t *ploidy, int *sample2sex)
       
       int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
       {
      -    int c;
      +    int c, default_ploidy = 2;
           char *tags_str = "GT";
           char *ploidy_fname = NULL, *sex_fname = NULL;
       
           static struct option loptions[] =
           {
      +        {"default-ploidy",1,0,'d'},
      +        {"force-ploidy",1,0,'f'},
               {"ploidy",1,0,'p'},
               {"sex",1,0,'s'},
               {"tags",1,0,'t'},
               {0,0,0,0}
           };
      -    while ((c = getopt_long(argc, argv, "?ht:s:p:",loptions,NULL)) >= 0)
      +    char *tmp;
      +    while ((c = getopt_long(argc, argv, "?ht:s:p:d:f:",loptions,NULL)) >= 0)
           {
               switch (c) 
               {
      +            case 'd': 
      +                default_ploidy = strtod(optarg,&tmp);
      +                if ( *tmp ) error("Could not parse: -d %s\n", optarg);
      +                break;
      +            case 'f': 
      +                force_ploidy = strtod(optarg,&tmp);
      +                if ( *tmp ) error("Could not parse: -f %s\n", optarg);
      +                break;
                   case 'p': ploidy_fname = optarg; break;
                   case 's': sex_fname = optarg; break;
                   case 't': tags_str = optarg; break;
      @@ -140,8 +154,8 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
           out_hdr    = out;
       
           if ( ploidy_fname )
      -        ploidy = ploidy_init(ploidy_fname, 2);
      -    else
      +        ploidy = ploidy_init(ploidy_fname, default_ploidy);
      +    else if ( force_ploidy==-1 )
           {
               ploidy = ploidy_init_string(
                       "X 1 60000 M 1\n"
      @@ -151,14 +165,17 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
                       "MT 1 16569 M 1\n"
                       "MT 1 16569 F 1\n", 2);
           }
      -    if ( !ploidy ) return -1;
      -
      -    // add default sex in case it was not included
      -    int i, dflt_sex_id = ploidy_add_sex(ploidy, "F");
      -    for (i=0; ipos+1);
       
      -    ploidy_query(ploidy, (char*)bcf_seqname(in_hdr,rec), rec->pos, sex2ploidy,NULL,&max_ploidy);
      +    if ( force_ploidy==-1 )
      +        ploidy_query(ploidy, (char*)bcf_seqname(in_hdr,rec), rec->pos, sex2ploidy,NULL,&max_ploidy);
      +    else
      +        max_ploidy = force_ploidy;
       
           ngts /= n_sample;
           if ( ngts < max_ploidy )
      @@ -183,7 +203,7 @@ bcf1_t *process(bcf1_t *rec)
               hts_expand(int32_t,max_ploidy*n_sample,ngt_arr2,gt_arr2);
               for (i=0; i
      Date: Thu, 22 Mar 2018 17:34:13 +0000
      Subject: [PATCH 20/27] Do not rely on FMT/GT size in BCF, instead check the
       maximum ploidy. Fixes #705
      
      ---
       vcfmerge.c | 40 +++++++++++++++++++++++++++++++++++++---
       1 file changed, 37 insertions(+), 3 deletions(-)
      
      diff --git a/vcfmerge.c b/vcfmerge.c
      index 31f5dad51..a69e2cb06 100644
      --- a/vcfmerge.c
      +++ b/vcfmerge.c
      @@ -1281,6 +1281,37 @@ void update_AN_AC(bcf_hdr_t *hdr, bcf1_t *line)
           free(tmp);
       }
       
      +static inline int max_used_gt_ploidy(bcf_fmt_t *fmt, int nsmpl)
      +{
      +    int i,j, max_ploidy = 0;
      +
      +    #define BRANCH(type_t, vector_end) { \
      +        type_t *ptr  = (type_t*) fmt->p; \
      +        for (i=0; in; j++) \
      +                if ( ptr[j]==vector_end ) break; \
      +            if ( j==fmt->n ) \
      +            { \
      +                /* all fields were used */ \
      +                if ( max_ploidy < j ) max_ploidy = j; \
      +                break; \
      +            } \
      +            if ( max_ploidy < j ) max_ploidy = j; \
      +            ptr += fmt->n; \
      +        } \
      +    }
      +    switch (fmt->type)
      +    {
      +        case BCF_BT_INT8:  BRANCH(int8_t,   bcf_int8_vector_end); break;
      +        case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
      +        case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
      +        default: error("Unexpected case: %d\n", fmt->type);
      +    }
      +    #undef BRANCH
      +    return max_ploidy;
      +}
      +
       void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
       {
           bcf_srs_t *files = args->files;
      @@ -1291,9 +1322,12 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
           int nsize = 0, msize = sizeof(int32_t);
           for (i=0; inreaders; i++)
           {
      -        if ( !fmt_map[i] ) continue;
      -        if ( fmt_map[i]->n > nsize ) nsize = fmt_map[i]->n;
      +        bcf_fmt_t *fmt = fmt_map[i];
      +        if ( !fmt ) continue;
      +        int pld = max_used_gt_ploidy(fmt_map[i], bcf_hdr_nsamples(bcf_sr_get_header(args->files,i)));
      +        if ( nsize < pld ) nsize = pld;
           }
      +    if ( nsize==0 ) nsize = 1;
       
           if ( ma->ntmp_arr < nsamples*nsize*msize )
           {
      @@ -1311,7 +1345,7 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
               int32_t *tmp  = (int32_t *) ma->tmp_arr + ismpl*nsize;
               int irec = ma->buf[i].cur;
       
      -        int j, k;
      +        int j,k;
               if ( !fmt_ori )
               {
                   // missing values: assume maximum ploidy
      
      From f4928e69f24a9c100413fa2df727feadd95cc057 Mon Sep 17 00:00:00 2001
      From: Petr Danecek 
      Date: Thu, 22 Mar 2018 17:35:18 +0000
      Subject: [PATCH 21/27] Added a new option for passing parameters to bcftools
       roh
      
      ---
       misc/run-roh.pl | 5 ++++-
       1 file changed, 4 insertions(+), 1 deletion(-)
      
      diff --git a/misc/run-roh.pl b/misc/run-roh.pl
      index a70ea9639..c980c9b44 100755
      --- a/misc/run-roh.pl
      +++ b/misc/run-roh.pl
      @@ -59,6 +59,7 @@ sub error
               "   -n, --min-markers      Filter input regions with fewer marker than this [100]\n",
               "   -o, --outdir           Output directory\n",
               "   -q, --min-qual         Filter input regions with quality smaller than this [10]\n",
      +        "       --roh-args      Extra arguments to pass to bcftools roh\n",
               "   -s, --silent                Quiet output, do not print commands\n",
               "   -h, -?, --help              This help message\n",
               "\n";
      @@ -73,9 +74,11 @@ sub parse_params
               min_length  => 1e6,
               min_markers => 100,
               min_qual    => 10,
      +        roh_args    => '',
           };
           while (defined(my $arg=shift(@ARGV)))
           {
      +        if (                 $arg eq '--roh-args' ) { $$opts{roh_args}=shift(@ARGV); next }
               if (                 $arg eq '--include' ) { $$opts{include_expr}=shift(@ARGV); next }
               if (                 $arg eq '--exclude' ) { $$opts{exclude_expr}=shift(@ARGV); next }
               if ( $arg eq '-q' || $arg eq '--min-qual' ) { $$opts{min_qual}=shift(@ARGV); next }
      @@ -224,7 +227,7 @@ sub run_roh
           for my $file (@files)
           {
               if ( -e "$file.txt.gz" ) { next; }
      -        my @out = cmd("bcftools roh --AF-tag AF1KG $genmap $file -Orz -o $file.txt.gz.part 2>&1 | tee -a $file.log",%$opts);
      +        my @out = cmd("bcftools roh $$opts{roh_args} --AF-tag AF1KG $genmap $file -Orz -o $file.txt.gz.part 2>&1 | tee -a $file.log",%$opts);
               for my $line (@out)
               {
                   if ( !($line=~m{total/processed:\s+(\d+)/(\d+)}) ) { next; }
      
      From ed286e95783a19fbd67b1e9258dbbc4644fcd9ce Mon Sep 17 00:00:00 2001
      From: Petr Danecek 
      Date: Thu, 29 Mar 2018 17:00:40 +0100
      Subject: [PATCH 22/27] Set all genotypes to phased with +setGT -t a -n p
      
      ---
       plugins/setGT.c | 17 ++++++++++++++++-
       1 file changed, 16 insertions(+), 1 deletion(-)
      
      diff --git a/plugins/setGT.c b/plugins/setGT.c
      index efa2bcb1e..15976622a 100644
      --- a/plugins/setGT.c
      +++ b/plugins/setGT.c
      @@ -195,7 +195,7 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
               {
                   case 'i': args->filter_str = optarg; args->filter_logic = FLT_INCLUDE; break;
                   case 'e': args->filter_str = optarg; args->filter_logic = FLT_EXCLUDE; break;
      -            case 'n': args->new_mask = bcf_gt_phased(0); 
      +            case 'n': args->new_mask = 0;
                       if ( strchr(optarg,'.') ) args->new_mask |= GT_MISSING;
                       if ( strchr(optarg,'0') ) args->new_mask |= GT_REF;
                       if ( strchr(optarg,'M') ) args->new_mask |= GT_MAJOR;
      @@ -232,6 +232,19 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
           return 0;
       }
       
      +static inline int phase_gt(int32_t *ptr, int ngts)
      +{
      +    int j, changed = 0;
      +    for (j=0; jnew_mask>_UNPHASED )
                       changed += unphase_gt(ptr, ngts);
      +            else if ( args->new_mask==GT_PHASED )
      +                changed += phase_gt(ptr, ngts);
                   else
                       changed += set_gt(ptr, ngts, args->new_gt);
               }
      
      From fd12d5127794883df930ae1c4eb734c9ff1df6a2 Mon Sep 17 00:00:00 2001
      From: Petr Danecek 
      Date: Thu, 29 Mar 2018 17:10:13 +0100
      Subject: [PATCH 23/27] Prevent dropping AC,AN INFO fields on -i/-e MAF,AF
       queries
      
      The query command asks htslib to skip the expensive parsing of INFO and FORMAT
      fields when it is not needed in the formatting and the filtering expression.
      This backfired for fields calculated on the fly, commands like
      
          bcftools query -e 'MAF[0]<0.05' -f '%POS\n'
      
      would not bee able to calculate MAF because INFO/AC, INFO/AN is not accessible.
      
      This is now fixed, but with a caveat: if the header contains INFO/AC,AN line
      but the annotations are not filled in the data lines, the program will not
      be able to determine AC, AN from FORMAT/GT. In such case, fill-tags can
      be used to update the INFO annotations or the tags can be stripped from the
      header using `annotate` or `reheader`.
      
      Resolves #762.
      ---
       filter.c | 17 +++++++++++++++++
       1 file changed, 17 insertions(+)
      
      diff --git a/filter.c b/filter.c
      index 0b4053841..fe17634f3 100644
      --- a/filter.c
      +++ b/filter.c
      @@ -1748,6 +1748,18 @@ static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx,
               for (i=0; inidxs; i++) if ( tok->idxs[i] ) tok->nuidxs++;
           }
       }
      +static int max_ac_an_unpack(bcf_hdr_t *hdr)
      +{
      +    int hdr_id = bcf_hdr_id2int(hdr,BCF_DT_ID,"AC");
      +    if ( hdr_id<0 ) return BCF_UN_FMT;
      +    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,hdr_id) ) return BCF_UN_FMT;
      +
      +    hdr_id = bcf_hdr_id2int(hdr,BCF_DT_ID,"AN");
      +    if ( hdr_id<0 ) return BCF_UN_FMT;
      +    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,hdr_id) ) return BCF_UN_FMT;
      +
      +    return BCF_UN_INFO;
      +}
       static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
       {
           tok->tok_type  = TOK_VAL;
      @@ -1980,6 +1992,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
           }
           else if ( !strcasecmp(tmp.s,"AN") )
           {
      +        filter->max_unpack |= BCF_UN_FMT;
               tok->setter = &filters_set_an;
               tok->tag = strdup("AN");
               free(tmp.s);
      @@ -1987,6 +2000,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
           }
           else if ( !strcasecmp(tmp.s,"AC") )
           {
      +        filter->max_unpack |= BCF_UN_FMT;
               tok->setter = &filters_set_ac;
               tok->tag = strdup("AC");
               free(tmp.s);
      @@ -1994,6 +2008,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
           }
           else if ( !strcasecmp(tmp.s,"MAC") )
           {
      +        filter->max_unpack |= max_ac_an_unpack(filter->hdr);
               tok->setter = &filters_set_mac;
               tok->tag = strdup("MAC");
               free(tmp.s);
      @@ -2001,6 +2016,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
           }
           else if ( !strcasecmp(tmp.s,"AF") )
           {
      +        filter->max_unpack |= max_ac_an_unpack(filter->hdr);
               tok->setter = &filters_set_af;
               tok->tag = strdup("AF");
               free(tmp.s);
      @@ -2008,6 +2024,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
           }
           else if ( !strcasecmp(tmp.s,"MAF") )
           {
      +        filter->max_unpack |= max_ac_an_unpack(filter->hdr);
               tok->setter = &filters_set_maf;
               tok->tag = strdup("MAF");
               free(tmp.s);
      
      From e12f43d709feca601d15ab0f93f9f16142c388b6 Mon Sep 17 00:00:00 2001
      From: Petr Danecek 
      Date: Thu, 29 Mar 2018 17:28:08 +0100
      Subject: [PATCH 24/27] Apply ed286e9 in all the other branches
      
      ---
       plugins/setGT.c | 4 ++++
       1 file changed, 4 insertions(+)
      
      diff --git a/plugins/setGT.c b/plugins/setGT.c
      index 15976622a..80943ff08 100644
      --- a/plugins/setGT.c
      +++ b/plugins/setGT.c
      @@ -361,6 +361,8 @@ bcf1_t *process(bcf1_t *rec)
       
                   if ( args->new_mask>_UNPHASED )
                       changed += unphase_gt(ptr, ngts);
      +            else if ( args->new_mask==GT_PHASED )
      +                changed += phase_gt(ptr, ngts);
                   else
                       changed += set_gt(ptr, ngts, args->new_gt);
               }
      @@ -392,6 +394,8 @@ bcf1_t *process(bcf1_t *rec)
                   if ( !args->smpl_pass[i] ) continue;
                   if ( args->new_mask>_UNPHASED )
                       changed += unphase_gt(args->gts + i*ngts, ngts);
      +            else if ( args->new_mask==GT_PHASED )
      +                changed += phase_gt(args->gts + i*ngts, ngts);
                   else
                       changed += set_gt(args->gts + i*ngts, ngts, args->new_gt);
               }
      
      From 4479210a0c0182db2e35330eab865cd30b60cceb Mon Sep 17 00:00:00 2001
      From: Petr Danecek 
      Date: Sat, 31 Mar 2018 09:25:12 +0100
      Subject: [PATCH 25/27] Fix bug in indexed ALT access and let STRLEN(".")
       evaluate to 0
      
      - a spurious line was reset the index to 0 after first access
        so that querying ALT[9] resulted in querying ALT[0] after
        the first line
      
      - strlen(ALT[9])==1 was always true because missing value is
        filled. Arguably, the missing string has zero length.
      
      Fixes #767
      ---
       filter.c | 6 ++++--
       1 file changed, 4 insertions(+), 2 deletions(-)
      
      diff --git a/filter.c b/filter.c
      index fe17634f3..1c7881ee4 100644
      --- a/filter.c
      +++ b/filter.c
      @@ -922,7 +922,6 @@ static void filters_set_alt_string(filter_t *flt, bcf1_t *line, token_t *tok)
                   kputs(line->d.allele[tok->idx + 1], &tok->str_value);
               else
                   kputc('.', &tok->str_value);
      -        tok->idx = 0;
           }
           else if ( tok->idx==-2 )
           {
      @@ -1184,7 +1183,10 @@ static int func_strlen(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **sta
           }
           else
           {
      -        rtok->values[0] = strlen(tok->str_value.s);
      +        if ( !tok->str_value.s[1] && tok->str_value.s[0]=='.' )
      +            rtok->values[0] = 0;
      +        else
      +            rtok->values[0] = strlen(tok->str_value.s);
               rtok->nvalues = 1;
           }
           return 1;
      
      From 591bf02a9463b69ca8a07bec097fceebfd704f2f Mon Sep 17 00:00:00 2001
      From: Petr Danecek 
      Date: Sat, 31 Mar 2018 09:37:03 +0100
      Subject: [PATCH 26/27] Expand docs to include the csq --gff-annot file format
       description
      
      ---
       csq.c             |  2 +-
       doc/bcftools.1    | 61 +++++++++++++++++++++++++++++++++++++++++++----
       doc/bcftools.html | 46 +++++++++++++++++++++++++++++++----
       doc/bcftools.txt  | 33 ++++++++++++++++++++++++-
       4 files changed, 132 insertions(+), 10 deletions(-)
      
      diff --git a/csq.c b/csq.c
      index b36ebc32d..848d1eaa8 100644
      --- a/csq.c
      +++ b/csq.c
      @@ -71,7 +71,7 @@
                   A .. gene line with a supported biotype
                           A.ID=~/^gene:/
       
      -            B .. transcript line referencing A
      +            B .. transcript line referencing A with supported biotype
                           B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/
       
                   C .. corresponding CDS, exon, and UTR lines:
      diff --git a/doc/bcftools.1 b/doc/bcftools.1
      index dd40d8d75..92c4e10f8 100644
      --- a/doc/bcftools.1
      +++ b/doc/bcftools.1
      @@ -2,12 +2,12 @@
       .\"     Title: bcftools
       .\"    Author: [see the "AUTHORS" section]
       .\" Generator: DocBook XSL Stylesheets v1.76.1 
      -.\"      Date: 2018-03-15 15:49 GMT
      +.\"      Date: 2018-03-30 14:15 BST
       .\"    Manual: \ \&
       .\"    Source: \ \&
       .\"  Language: English
       .\"
      -.TH "BCFTOOLS" "1" "2018\-03\-15 15:49 GMT" "\ \&" "\ \&"
      +.TH "BCFTOOLS" "1" "2018\-03\-30 14:15 BST" "\ \&" "\ \&"
       .\" -----------------------------------------------------------------
       .\" * Define some portability stuff
       .\" -----------------------------------------------------------------
      @@ -41,7 +41,7 @@ Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatica
       BCFtools is designed to work on a stream\&. It regards an input file "\-" as the standard input (stdin) and outputs to the standard output (stdout)\&. Several commands can thus be combined with Unix pipes\&.
       .SS "VERSION"
       .sp
      -This manual page was last updated \fB2018\-03\-15 15:49 GMT\fR and refers to bcftools git version \fB1\&.7\-7\-g4dca79a+\fR\&.
      +This manual page was last updated \fB2018\-03\-30 14:15 BST\fR and refers to bcftools git version \fB1\&.7\-25\-ge12f43d+\fR\&.
       .SS "BCF1"
       .sp
       The BCF1 format output by versions of samtools <= 0\&.1\&.19 is \fBnot\fR compatible with this version of bcftools\&. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0\&.1\&.19 to convert to VCF, which can then be read by this version of bcftools\&.
      @@ -1796,8 +1796,45 @@ reference sequence in fasta format (required)
       \fB\-g, \-\-gff\-annot\fR \fIFILE\fR
       .RS 4
       GFF3 annotation file (required), such as
      -ftp://ftp\&.ensembl\&.org/pub/current_gff3/homo_sapiens/
      +ftp://ftp\&.ensembl\&.org/pub/current_gff3/homo_sapiens\&. An example of a minimal working GFF file:
       .RE
      +.sp
      +.if n \{\
      +.RS 4
      +.\}
      +.nf
      +    # The program looks for "CDS", "exon", "three_prime_UTR" and "five_prime_UTR" lines,
      +    # looks up their parent transcript (determined from the "Parent=transcript:" attribute),
      +    # the gene (determined from the transcript\*(Aqs "Parent=gene:" attribute), and the biotype
      +    # (the most interesting is "protein_coding")\&.
      +    #
      +    # Attributes required for
      +    #   gene lines:
      +    #   \- ID=gene:
      +    #   \- biotype=
      +    #   \- Name=      [optional]
      +    #
      +    #   transcript lines:
      +    #   \- ID=transcript:
      +    #   \- Parent=gene:
      +    #   \- biotype=
      +    #
      +    #   other lines (CDS, exon, five_prime_UTR, three_prime_UTR):
      +    #   \- Parent=transcript:
      +    #
      +    # Supported biotypes:
      +    #   \- see the function gff_parse_biotype() in bcftools/csq\&.c
      +
      +    1   ignored_field  gene            21  2148  \&.   \-   \&.   ID=gene:GeneId;biotype=protein_coding;Name=GeneName
      +    1   ignored_field  transcript      21  2148  \&.   \-   \&.   ID=transcript:TranscriptId;Parent=gene:GeneId;biotype=protein_coding
      +    1   ignored_field  three_prime_UTR 21  2054  \&.   \-   \&.   Parent=transcript:TranscriptId
      +    1   ignored_field  exon            21  2148  \&.   \-   \&.   Parent=transcript:TranscriptId
      +    1   ignored_field  CDS             21  2148  \&.   \-   1   Parent=transcript:TranscriptId
      +    1   ignored_field  five_prime_UTR  210 2148  \&.   \-   \&.   Parent=transcript:TranscriptId
      +.fi
      +.if n \{\
      +.RE
      +.\}
       .PP
       \fB\-i, \-\-include\fR \fIEXPRESSION\fR
       .RS 4
      @@ -3856,12 +3893,28 @@ nor the other
       options are given, the allele frequency is estimated from AC and AN counts which are already present in the INFO field\&.
       .RE
       .PP
      +\fB\-\-exclude\fR \fIEXPRESSION\fR
      +.RS 4
      +exclude sites for which
      +\fIEXPRESSION\fR
      +is true\&. For valid expressions see
      +\fBEXPRESSIONS\fR\&.
      +.RE
      +.PP
       \fB\-G, \-\-GTs\-only\fR \fIFLOAT\fR
       .RS 4
       use genotypes (FORMAT/GT fields) ignoring genotype likelihoods (FORMAT/PL), setting PL of unseen genotypes to
       \fIFLOAT\fR\&. Safe value to use is 30 to account for GT errors\&.
       .RE
       .PP
      +\fB\-\-include\fR \fIEXPRESSION\fR
      +.RS 4
      +include only sites for which
      +\fIEXPRESSION\fR
      +is true\&. For valid expressions see
      +\fBEXPRESSIONS\fR\&.
      +.RE
      +.PP
       \fB\-I, \-\-skip\-indels\fR
       .RS 4
       skip indels as their genotypes are usually enriched for errors
      diff --git a/doc/bcftools.html b/doc/bcftools.html
      index 58331e5dd..b722cb27a 100644
      --- a/doc/bcftools.html
      +++ b/doc/bcftools.html
      @@ -1,6 +1,6 @@
       
       
      -bcftools

      Name

      bcftools — utilities for variant calling and manipulating VCFs and BCFs.

      Synopsis

      bcftools [--version|--version-only] [--help] [COMMAND] [OPTIONS]

      DESCRIPTION

      BCFtools is a set of utilities that manipulate variant calls in the Variant +bcftools

      Name

      bcftools — utilities for variant calling and manipulating VCFs and BCFs.

      Synopsis

      bcftools [--version|--version-only] [--help] [COMMAND] [OPTIONS]

      DESCRIPTION

      BCFtools is a set of utilities that manipulate variant calls in the Variant Call Format (VCF) and its binary counterpart BCF. All commands work transparently with both VCFs and BCFs, both uncompressed and BGZF-compressed.

      Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatically even when streaming from a pipe. Indexed VCF and BCF @@ -8,7 +8,7 @@ work in most, but not all situations. In general, whenever multiple VCFs are read simultaneously, they must be indexed and therefore also compressed.

      BCFtools is designed to work on a stream. It regards an input file "-" as the standard input (stdin) and outputs to the standard output (stdout). Several -commands can thus be combined with Unix pipes.

      VERSION

      This manual page was last updated 2018-03-15 15:49 GMT and refers to bcftools git version 1.7-7-g4dca79a+.

      BCF1

      The BCF1 format output by versions of samtools <= 0.1.19 is not +commands can thus be combined with Unix pipes.

      VERSION

      This manual page was last updated 2018-03-30 14:15 BST and refers to bcftools git version 1.7-25-ge12f43d+.

      BCF1

      The BCF1 format output by versions of samtools <= 0.1.19 is not compatible with this version of bcftools. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0.1.19 to convert to VCF, which can then be read by @@ -1036,8 +1036,36 @@

      -g, --gff-annot FILE
      - GFF3 annotation file (required), such as ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/ -
      + GFF3 annotation file (required), such as ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens. + An example of a minimal working GFF file: +
          # The program looks for "CDS", "exon", "three_prime_UTR" and "five_prime_UTR" lines,
      +    # looks up their parent transcript (determined from the "Parent=transcript:" attribute),
      +    # the gene (determined from the transcript's "Parent=gene:" attribute), and the biotype
      +    # (the most interesting is "protein_coding").
      +    #
      +    # Attributes required for
      +    #   gene lines:
      +    #   - ID=gene:<gene_id>
      +    #   - biotype=<biotype>
      +    #   - Name=<gene_name>      [optional]
      +    #
      +    #   transcript lines:
      +    #   - ID=transcript:<transcript_id>
      +    #   - Parent=gene:<gene_id>
      +    #   - biotype=<biotype>
      +    #
      +    #   other lines (CDS, exon, five_prime_UTR, three_prime_UTR):
      +    #   - Parent=transcript:<transcript_id>
      +    #
      +    # Supported biotypes:
      +    #   - see the function gff_parse_biotype() in bcftools/csq.c
      +
      +    1   ignored_field  gene            21  2148  .   -   .   ID=gene:GeneId;biotype=protein_coding;Name=GeneName
      +    1   ignored_field  transcript      21  2148  .   -   .   ID=transcript:TranscriptId;Parent=gene:GeneId;biotype=protein_coding
      +    1   ignored_field  three_prime_UTR 21  2054  .   -   .   Parent=transcript:TranscriptId
      +    1   ignored_field  exon            21  2148  .   -   .   Parent=transcript:TranscriptId
      +    1   ignored_field  CDS             21  2148  .   -   1   Parent=transcript:TranscriptId
      +    1   ignored_field  five_prime_UTR  210 2148  .   -   .   Parent=transcript:TranscriptId
      -i, --include EXPRESSION
      include only sites for which EXPRESSION is true. For valid expressions see @@ -2279,12 +2307,22 @@ If neither -e nor the other --AF-… options are given, the allele frequency is estimated from AC and AN counts which are already present in the INFO field.
      +--exclude EXPRESSION +
      + exclude sites for which EXPRESSION is true. For valid expressions see + EXPRESSIONS. +
      -G, --GTs-only FLOAT
      use genotypes (FORMAT/GT fields) ignoring genotype likelihoods (FORMAT/PL), setting PL of unseen genotypes to FLOAT. Safe value to use is 30 to account for GT errors.
      +--include EXPRESSION +
      + include only sites for which EXPRESSION is true. For valid expressions see + EXPRESSIONS. +
      -I, --skip-indels
      skip indels as their genotypes are usually enriched for errors diff --git a/doc/bcftools.txt b/doc/bcftools.txt index 44a15373f..7d4578fe3 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -1064,7 +1064,38 @@ output VCF and are ignored for the prediction analysis. reference sequence in fasta format (required) *-g, --gff-annot* 'FILE':: - GFF3 annotation file (required), such as ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/ + GFF3 annotation file (required), such as ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens. + An example of a minimal working GFF file: +---- + # The program looks for "CDS", "exon", "three_prime_UTR" and "five_prime_UTR" lines, + # looks up their parent transcript (determined from the "Parent=transcript:" attribute), + # the gene (determined from the transcript's "Parent=gene:" attribute), and the biotype + # (the most interesting is "protein_coding"). + # + # Attributes required for + # gene lines: + # - ID=gene: + # - biotype= + # - Name= [optional] + # + # transcript lines: + # - ID=transcript: + # - Parent=gene: + # - biotype= + # + # other lines (CDS, exon, five_prime_UTR, three_prime_UTR): + # - Parent=transcript: + # + # Supported biotypes: + # - see the function gff_parse_biotype() in bcftools/csq.c + + 1 ignored_field gene 21 2148 . - . ID=gene:GeneId;biotype=protein_coding;Name=GeneName + 1 ignored_field transcript 21 2148 . - . ID=transcript:TranscriptId;Parent=gene:GeneId;biotype=protein_coding + 1 ignored_field three_prime_UTR 21 2054 . - . Parent=transcript:TranscriptId + 1 ignored_field exon 21 2148 . - . Parent=transcript:TranscriptId + 1 ignored_field CDS 21 2148 . - 1 Parent=transcript:TranscriptId + 1 ignored_field five_prime_UTR 210 2148 . - . Parent=transcript:TranscriptId +---- *-i, --include* 'EXPRESSION':: include only sites for which 'EXPRESSION' is true. For valid expressions see From ad7d585f09d83dc58c5c80c3dd6d0a318abfa259 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Sat, 31 Mar 2018 09:49:17 +0100 Subject: [PATCH 27/27] Update NEWS --- NEWS | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/NEWS b/NEWS index ced627f20..454960de5 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,20 @@ ## Release a.b +## Release 1.8 (April 2018) + +* `-i, -e` filtering: Support for custom perl scripts + +* `+contrast`: New plugin to annotate genotype differences between groups of samples + +* `+fixploidy`: New options for simpler ploidy usage + +* `+setGT`: Target genotypes can be set to phased by giving `--new-gt p` + +* `run-roh.pl`: Allow to pass options directly to `bcftools roh` + +* Number of bug fixes + + ## Release 1.7 (February 2018) * `-i, -e` filtering: Major revamp, improved filtering by FORMAT fields