From 4af10d9d23276369aa56d0191723d7f37d05068a Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Thu, 15 Mar 2018 14:28:19 +0000
Subject: [PATCH 01/27] Support for custom -i/-e perl script filtering

This feature can be enabled by supplying the --enable-perl-filters
option to configure at compile time. Note this changes the license
from MIT to GPL, therefore it is not enabled by default.

An example can be found in misc/demo-flt.pl, use for example as

    bcftools query -f '%CHROM:%POS \t %CSQ\n' -i 'perl:misc/demo-flt.pl; perl.severity(INFO/CSQ) > 10'
---
 INSTALL                    |  15 +++
 Makefile                   |   5 +-
 config.mk.in               |   2 +
 configure.ac               |  19 +++
 doc/bcftools.txt           |  32 +++--
 filter.c                   | 269 ++++++++++++++++++++++++++++++++++---
 misc/demo-flt.pl           |  82 +++++++++++
 plugins/fill-from-fasta.mk |   2 +-
 plugins/setGT.mk           |   2 +-
 test/perl-flt.vcf          |  41 ++++++
 10 files changed, 439 insertions(+), 30 deletions(-)
 create mode 100644 misc/demo-flt.pl
 create mode 100644 test/perl-flt.vcf

diff --git a/INSTALL b/INSTALL
index 82493fcee..2f9046acf 100644
--- a/INSTALL
+++ b/INSTALL
@@ -76,6 +76,21 @@ to an HTSlib source tree or installation in DIR; or --with-htslib=system
 to use a system-installed HTSlib.
 
 
+Optional Compilation with Perl
+==============================
+
+The '-i' and '-e' options can take external perl scripts for a more
+sophisticated filtering. This option can be enabled by supplying the
+--enable-perl-filters option to configure before running make:
+
+    ./configure --enable-perl-filters
+
+Note that enabling this option changes the license from MIT to GPL
+because bcftools need to be built with
+
+    perl -MExtUtils::Embed -e ccopts -e ldopts
+
+
 Optional Compilation with GSL
 =============================
 
diff --git a/Makefile b/Makefile
index 5cb007431..d254feacb 100644
--- a/Makefile
+++ b/Makefile
@@ -135,7 +135,7 @@ ifdef USE_GPL
 endif
 
 bcftools: $(OBJS) $(HTSLIB)
-	$(CC) $(DYNAMIC_FLAGS) -pthread $(ALL_LDFLAGS) -o $@ $(OBJS) $(HTSLIB_LIB) -lm $(ALL_LIBS) $(GSL_LIBS)
+	$(CC) $(DYNAMIC_FLAGS) -pthread $(ALL_LDFLAGS) -o $@ $(OBJS) $(HTSLIB_LIB) -lm $(ALL_LIBS) $(GSL_LIBS) $(PERL_LIBS)
 
 # Plugin rules
 ifneq "$(PLUGINS_ENABLED)" "no"
@@ -212,7 +212,8 @@ ccall.o: ccall.c $(htslib_kfunc_h) $(call_h) kmin.h $(prob1_h)
 convert.o: convert.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(convert_h)
 tsv2vcf.o: tsv2vcf.c $(tsv2vcf_h)
 em.o: em.c $(htslib_vcf_h) kmin.h $(call_h)
-filter.o: filter.c $(htslib_khash_str2int_h) $(filter_h) $(bcftools_h) $(htslib_hts_defs_h) $(htslib_vcfutils_h)
+filter.o: filter.c config.h $(htslib_khash_str2int_h) $(filter_h) $(bcftools_h) $(htslib_hts_defs_h) $(htslib_vcfutils_h)
+	$(CC) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CPPFLAGS) $(PERL_CFLAGS) -c -o $@ $<
 gvcf.o: gvcf.c gvcf.h $(call_h)
 kmin.o: kmin.c kmin.h
 mcall.o: mcall.c $(htslib_kfunc_h) $(call_h)
diff --git a/config.mk.in b/config.mk.in
index 8b543687a..7e0c84061 100644
--- a/config.mk.in
+++ b/config.mk.in
@@ -45,6 +45,8 @@ DYNAMIC_FLAGS = @CC_RDYNAMIC_OPT@
 
 USE_GPL = @USE_GPL@
 GSL_LIBS = @GSL_LIBS@
+PERL_CFLAGS = @PERL_CCOPTS@
+PERL_LIBS = @PERL_LIBS@
 
 PLATFORM   = @PLATFORM@
 PLUGINS_ENABLED = @enable_bcftools_plugins@
diff --git a/configure.ac b/configure.ac
index 256264f74..a6adcc808 100644
--- a/configure.ac
+++ b/configure.ac
@@ -66,6 +66,12 @@ AC_ARG_WITH([bcf-plugin-path],
   [with_bcf_plugin_path=$with_bcf_plugin_dir])
 AC_SUBST([PLUGINPATH], $with_bcf_plugin_path)
 
+AC_ARG_ENABLE([perl-filters],
+  [AS_HELP_STRING([--enable-perl-filters],
+                  [do not build support for PERL scripts in -i/-e filtering expressions])],
+  [], [enable_perl_filters=no])
+AC_SUBST(enable_perl_filters)
+
 AC_ARG_ENABLE([libgsl],
   [AS_HELP_STRING([--enable-libgsl],
                   [build options that require the GNU scientific library (changes bcftools license to GPL, see INSTALL for details)])],
@@ -199,6 +205,19 @@ Either configure with --disable-libgsl or resolve this error to build bcftools.
   AC_SUBST([GSL_LIBS])
 ])
 
+AS_IF([test "$enable_perl_filters" != "no" ], [dnl
+    if ! perl -MExtUtils::Embed -e ccopts -e ldopts &>/dev/null ; then
+       AC_MSG_ERROR([Cannot `perl -MExtUtils::Embed -e ccopts -e ldopts`])
+    fi
+    AC_DEFINE([ENABLE_PERL_FILTERS], 1, [Define if BCFtools should enable for support PERL scripts in -i/-e filtering expressions.])
+    PERL_CCOPTS="`perl -MExtUtils::Embed -e ccopts`"
+    PERL_LIBS="`perl -MExtUtils::Embed -e ldopts`"
+    AC_SUBST([PERL_CCOPTS])
+    AC_SUBST([PERL_LIBS])
+    USE_GPL=1
+    AC_SUBST([USE_GPL])
+])
+
 AC_CONFIG_FILES([config.mk])
 AC_OUTPUT
 
diff --git a/doc/bcftools.txt b/doc/bcftools.txt
index 94c085f58..deebfd1c4 100644
--- a/doc/bcftools.txt
+++ b/doc/bcftools.txt
@@ -1255,7 +1255,10 @@ And similarly here, the second is filtered:
 
 [[gtcheck]]
 === bcftools gtcheck ['OPTIONS'] [*-g* 'genotypes.vcf.gz'] 'query.vcf.gz'
-Checks sample identity or, without *-g*, multi-sample cross-check is performed.
+Checks sample identity. The program can operate in two modes. If the *-g*
+option is given, the identity of the *-s* sample from 'query.vcf.gz'
+is checked against the samples in the *-g* file.
+Without the *-g* option, multi-sample cross-check of samples in 'query.vcf.gz' is performed.
 
 *-a, --all-sites*::
     output for all sites
@@ -1319,17 +1322,16 @@ Checks sample identity or, without *-g*, multi-sample cross-check is performed.
         *-G*, the value '1' can be used instead; the discordance value then
         gives exactly the number of differing genotypes.
 
-    SM, Average Discordance;;
-        Average discordance between sample 'a' and all other samples.
+    ERR, error rate;;
+        Pairwise error rate calculated as number of differences divided
+        by the total number of comparisons.
 
-    SM, Average Depth;;
-        Average depth at evaluated sites, or 1 if FORMAT/DP field is not
-        present.
-
-    SM, Average Number of sites;;
-        The average number of sites used to calculate the discordance.  In
-        other words, the average number of non-missing PLs/genotypes seen
-        both samples.
+    CLUSTER, TH, DOT;;
+        In presence of multiple samples, related samples and outliers can be
+        identified by clustering samples by error rate. A simple hierarchical
+        clustering based on minimization of standard deviation is used. This is
+        useful to detect sample swaps, for example in situations where one
+        sample has been sequenced in multiple runs.
 
 //    MD, Maximum Deviation;;
 //        The maximum absolute deviation from average score of the sample
@@ -2769,6 +2771,14 @@ number of samples with missing genotype; fraction of samples with missing genoty
 
         N_ALT, N_SAMPLES, AC, MAC, AF, MAF, AN, N_MISSING, F_MISSING
 
+* custom perl filtering. Note that this command is not compiled in by default, see
+the section *Optional Compilation with Perl* in the INSTALL file for help
+and misc/demo-flt.pl for a working example. The demo defined the perl subroutine
+"severity" which can be invoked from the command line as follows:
+
+        perl:path/to/script.pl; perl.severity(INFO/CSQ) > 3
+
+
 
 .Notes:
 
diff --git a/filter.c b/filter.c
index 53e8dc774..e3a9cb759 100644
--- a/filter.c
+++ b/filter.c
@@ -27,18 +27,29 @@ THE SOFTWARE.  */
 #include <strings.h>
 #include <errno.h>
 #include <math.h>
-#include <wordexp.h>
 #include <regex.h>
 #include <htslib/khash_str2int.h>
-#include "filter.h"
-#include "bcftools.h"
 #include <htslib/hts_defs.h>
 #include <htslib/vcfutils.h>
+#include "config.h"
+#include "filter.h"
+#include "bcftools.h"
+
+#if ENABLE_PERL_FILTERS
+#  define filter_t perl_filter_t
+#  include <EXTERN.h>
+#  include <perl.h>
+#  undef filter_t
+#  define my_perl perl
+#endif
+
 
 #ifndef __FUNCTION__
 #  define __FUNCTION__ __func__
 #endif
 
+static filter_ninit = 0;
+
 uint64_t bcf_double_missing    = 0x7ff0000000000001;
 uint64_t bcf_double_vector_end = 0x7ff0000000000002;
 static inline void bcf_double_set(double *ptr, uint64_t value)
@@ -63,6 +74,7 @@ typedef struct _token_t
 {
     // read-only values, same for all VCF lines
     int tok_type;       // one of the TOK_* keys below
+    int nargs;          // used only TOK_PERLSUB, the first argument is the name of the subroutine
     char *key;          // set only for string constants, otherwise NULL
     char *tag;          // for debugging and printout only, VCF tag name
     double threshold;   // filtering threshold
@@ -100,6 +112,9 @@ struct _filter_t
     float   *tmpf;
     kstring_t tmps;
     int max_unpack, mtmpi, mtmpf, nsamples;
+#if ENABLE_PERL_FILTERS
+    PerlInterpreter *perl;
+#endif
 };
 
 
@@ -130,11 +145,12 @@ struct _filter_t
 #define TOK_LEN     24
 #define TOK_FUNC    25
 #define TOK_CNT     26
+#define TOK_PERLSUB 27
 
-//                      0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
-//                        ( ) [ < = > ] ! | &  +  -  *  /  M  m  a  A  O  ~  ^  S  .  l  
-static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8};
-#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^f"
+//                      0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
+//                        ( ) [ < = > ] ! | &  +  -  *  /  M  m  a  A  O  ~  ^  S  .  l  f  c  p
+static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8};
+#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcp"
 
 static int filters_next_token(char **str, int *len)
 {
@@ -169,6 +185,7 @@ static int filters_next_token(char **str, int *len)
     if ( !strncasecmp(tmp,"INFO/",5) ) tmp += 5;
     if ( !strncasecmp(tmp,"FORMAT/",7) ) tmp += 7;
     if ( !strncasecmp(tmp,"FMT/",4) ) tmp += 4;
+    if ( !strncasecmp(tmp,"PERL.",5) ) { (*str) += 5; return TOK_PERLSUB; }
 
     if ( tmp[0]=='@' )  // file name
     {
@@ -250,6 +267,53 @@ static int filters_next_token(char **str, int *len)
     return TOK_VAL;
 }
 
+
+/* 
+    Simple path expansion, expands ~/, ~user, $var. The result must be freed by the caller.
+    
+    Based on jkb's staden code with some adjustements.
+    https://sourceforge.net/p/staden/code/HEAD/tree/staden/trunk/src/Misc/getfile.c#l123
+*/
+char *expand_path(char *path)
+{
+#ifdef _WIN32
+    return strdup(path);    // windows expansion: todo
+#endif
+
+    kstring_t str = {0,0,0};
+    int i;
+
+    if ( path[0] == '~' )
+    {
+        if ( !path[1] || path[1] == '/' )
+        {
+            // ~ or ~/path
+            kputs(getenv("HOME"), &str);
+            if ( path[1] ) kputs(path+1, &str);
+        }
+        else
+        {
+            // user name: ~pd3/path
+            char *end = path;
+            while ( *end && *end!='/' ) end++;
+            kputsn(path+1, end-path-1, &str);
+            struct passwd *pwentry = getpwnam(str.s);
+            str.l = 0;
+
+            if ( !pwentry ) kputsn(path, end-path, &str);
+            else kputs(pwentry->pw_dir, &str);
+            kputs(end, &str);
+        }
+        return str.s;
+    }
+    if ( path[0] == '$' )
+    {
+        char *var = getenv(path+1);
+        if ( var ) path = var;
+    }
+    return strdup(path);
+}
+
 static void filters_set_qual(filter_t *flt, bcf1_t *line, token_t *tok)
 {
     float *ptr = &line->qual;
@@ -1711,13 +1775,11 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
         tok->tag = (char*) calloc(len+1,sizeof(char));
         memcpy(tok->tag,str,len);
         tok->tag[len] = 0;
-        wordexp_t wexp;
-        wordexp(tok->tag+1, &wexp, 0);
-        if ( !wexp.we_wordc ) error("No such file: %s\n", tok->tag+1);
+        char *fname = expand_path(tok->tag);
         int i, n;
-        char **list = hts_readlist(wexp.we_wordv[0], 1, &n);
-        if ( !list ) error("Could not read: %s\n", wexp.we_wordv[0]);
-        wordfree(&wexp);
+        char **list = hts_readlist(fname, 1, &n);
+        if ( !list ) error("Could not read: %s\n", fname);
+        free(fname);
         tok->hash = khash_str2int_init();
         for (i=0; i<n; i++)
         {
@@ -1994,6 +2056,123 @@ static void str_to_lower(char *str)
 {
     while ( *str ) { *str = tolower(*str); str++; }
 }
+static int perl_exec(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
+{
+#if ENABLE_PERL_FILTERS
+
+    assert( rtok->nargs == nstack );
+    PerlInterpreter *perl = flt->perl;
+
+    dSP;
+    ENTER;
+    SAVETMPS;
+
+    PUSHMARK(SP);
+    int i,j;
+    for (i=1; i<nstack; i++)
+    {
+        token_t *tok = stack[i];
+        if ( tok->is_str )
+            XPUSHs(sv_2mortal(newSVpvn(tok->str_value.s,tok->str_value.l)));
+        else if ( tok->nvalues==1 )
+            XPUSHs(sv_2mortal(newSVnv(tok->values[0])));
+        else if ( tok->nvalues>1 )
+        {
+            AV *av = newAV();
+            for (j=0; j<tok->nvalues; j++) av_push(av, newSVnv(tok->values[j]));
+            SV *rv = newRV_inc((SV*)av);
+            XPUSHs(rv);
+        }
+        else
+        {
+            bcf_double_set_missing(tok->values[0]);
+            XPUSHs(sv_2mortal(newSVnv(tok->values[0])));
+        }
+    }
+    PUTBACK;
+
+    // A possible future todo: provide a means to select samples and indexes,
+    // expressions like this don't work yet
+    //          perl.filter(FMT/AD)[1:0]
+
+    int nret = call_pv(stack[0]->str_value.s, G_ARRAY);
+
+    SPAGAIN;
+
+    rtok->nvalues = nret;
+    hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
+    for (i=nret; i>0; i--)
+    {
+        rtok->values[i-1] = (double) POPn;
+        if ( isnan(rtok->values[i-1]) ) bcf_double_set_missing(rtok->values[i-1]);
+    }
+
+    PUTBACK;
+    FREETMPS;
+    LEAVE;
+
+#else
+    error("\nPerl filtering requires running `configure --enable-perl-filters` at compile time.\n\n");
+#endif
+    return nstack;
+}
+static void perl_init(filter_t *filter, char **str)
+{
+    char *beg = *str;
+    while ( *beg && isspace(*beg) ) beg++;
+    if ( !*beg ) return;
+    if ( strncasecmp("perl:", beg, 5) ) return;
+#if ENABLE_PERL_FILTERS
+    beg += 5;
+    char *end = beg;
+    while ( *end && *end!=';' ) end++;  // for now not escaping semicolons
+    *str = end+1;
+
+    if ( ++filter_ninit == 1 )
+    {
+        // must be executed only once, even for multiple filters; first time here
+        PERL_SYS_INIT3(0, NULL, NULL);
+    }
+    
+    filter->perl = perl_alloc();
+    PerlInterpreter *perl = filter->perl;
+
+    if ( !perl ) error("perl_alloc failed\n");
+    perl_construct(perl);
+
+    // name of the perl script to run
+    char *rmme = (char*) calloc(end - beg + 1,1);
+    memcpy(rmme, beg, end - beg);
+    char *argv[] = { "", "" };
+    argv[1] = expand_path(rmme);
+    free(rmme);
+
+    PL_origalen = 1;    // don't allow $0 change
+    int ret = perl_parse(filter->perl, NULL, 2, argv, NULL);
+    PL_exit_flags |= PERL_EXIT_DESTRUCT_END;
+    if ( ret ) error("Failed to parse: %s\n", argv[1]);
+    free(argv[1]);
+
+    perl_run(perl);
+#else
+    error("\nPerl filtering requires running `configure --enable-perl-filters` at compile time.\n\n");
+#endif
+}
+static void perl_destroy(filter_t *filter)
+{
+#if ENABLE_PERL_FILTERS
+    if ( !filter->perl ) return;
+
+    PerlInterpreter *perl = filter->perl;
+    perl_destruct(perl);
+    perl_free(perl);
+    if ( --filter_ninit <= 0  )
+    {
+        // similarly to PERL_SYS_INIT3, can must be executed only once? todo: test
+        PERL_SYS_TERM();
+    }
+#endif
+}
 
 
 // Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm
@@ -2008,6 +2187,8 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
     int nout = 0, mout = 0;                 // filter tokens, RPN
     token_t *out = NULL;
     char *tmp = filter->str;
+    perl_init(filter, &tmp);
+
     int last_op = -1;
     while ( *tmp )
     {
@@ -2016,7 +2197,7 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
         if ( ret==-1 ) error("Missing quotes in: %s\n", str);
 
         // fprintf(stderr,"token=[%c] .. [%s] %d\n", TOKEN_STRING[ret], tmp, len);
-        // int i; for (i=0; i<nops; i++) fprintf(stderr," .%c.", TOKEN_STRING[ops[i]]); fprintf(stderr,"\n");
+        // int i; for (i=0; i<nops; i++) fprintf(stderr," .%c", TOKEN_STRING[ops[i]]); fprintf(stderr,"\n");
 
         if ( ret==TOK_LFT )         // left bracket
         {
@@ -2050,6 +2231,52 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
                 tok->threshold = -1.0;
                 ret = TOK_MULT;
             }
+            else if ( ret==TOK_PERLSUB )
+            {
+                tmp += len;
+                char *beg = tmp;
+                while ( *beg && ((isalnum(*beg) && !ispunct(*beg)) || *beg=='_') ) beg++;
+                if ( *beg!='(' ) error("Could not parse the expression: %s\n", str);
+                char *end = beg;
+                while ( *end && *end!=')' ) end++;
+                if ( !*end ) error("Could not parse the expression: %s\n", str);
+                kstring_t rmme = {0,0,0};
+
+                // the subroutine name
+                kputc('"', &rmme);
+                kputsn(tmp, beg-tmp, &rmme);
+                kputc('"', &rmme);
+                nout++;
+                hts_expand0(token_t, nout, mout, out);
+                filters_init1(filter, rmme.s, rmme.l, &out[nout-1]);
+
+                // subroutine arguments
+                rmme.l = 0;
+                kputsn(beg+1, end-beg-1, &rmme);
+                int i, nargs;
+                char **rmme_list = hts_readlist(rmme.s, 0, &nargs);
+                for (i=0; i<nargs; i++)
+                {
+                    nout++;
+                    hts_expand0(token_t, nout, mout, out);
+                    filters_init1(filter, rmme_list[i], strlen(rmme_list[i]), &out[nout-1]);
+                    free(rmme_list[i]);
+                }
+                free(rmme_list);
+                free(rmme.s);
+
+                nout++;
+                hts_expand0(token_t, nout, mout, out);
+                token_t *tok = &out[nout-1];
+                tok->tok_type  = TOK_PERLSUB;
+                tok->nargs     = nargs + 1;
+                tok->hdr_id    = -1;
+                tok->pass_site = -1;
+                tok->threshold = -1.0;
+
+                tmp = end + 1;
+                continue;
+            }
             else
             {
                 while ( nops>0 && op_prec[ret] < op_prec[ops[nops-1]] )
@@ -2073,7 +2300,10 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
         {
             nout++;
             hts_expand0(token_t, nout, mout, out);
-            filters_init1(filter, tmp, len, &out[nout-1]);
+            if ( tmp[len-1]==',' )
+                filters_init1(filter, tmp, len-1, &out[nout-1]);
+            else
+                filters_init1(filter, tmp, len, &out[nout-1]);
             tmp += len;
         }
         last_op = ret;
@@ -2240,6 +2470,7 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
 
 void filter_destroy(filter_t *filter)
 {
+    perl_destroy(filter);
     int i;
     for (i=0; i<filter->nfilters; i++)
     {
@@ -2300,6 +2531,14 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
             if ( --nargs > 0 ) nstack -= nargs;
             continue;
         }
+        else if ( filter->filters[i].tok_type == TOK_PERLSUB )
+        {
+            int nargs = filter->filters[i].nargs;
+            perl_exec(filter, line, &filter->filters[i], filter->flt_stack, nstack);
+            filter->flt_stack[nstack-nargs] = &filter->filters[i];
+            nstack -= nargs - 1;
+            continue;
+        }
         if ( nstack<2 )
             error("Error occurred while processing the filter \"%s\" (1:%d)\n", filter->str,nstack);  // too few values left on the stack
 
diff --git a/misc/demo-flt.pl b/misc/demo-flt.pl
new file mode 100644
index 000000000..802dd7e77
--- /dev/null
+++ b/misc/demo-flt.pl
@@ -0,0 +1,82 @@
+# This demo code shows how to write a custom perl code and use it in
+# the -i/-e filtering expressions.
+# 
+# In this example we want to take variant consequences generated by `bcftools csq`,
+# rank them by severity, filter by the most severe, and print the list using the
+# following command:
+#
+#   bcftools query -f '%CHROM:%POS \t %CSQ\n' -i 'perl:misc/demo-flt.pl; perl.severity(INFO/CSQ) > 10' test/perl-flt.vcf
+#
+
+
+# There can be multiple subroutines in the same script and they
+# can be referenced from the command line by prefixing "perl.subroutine_name()"
+#
+sub severity
+{
+    # Arbitrary number of arguments can be provided.
+    my (@args) = @_;
+    
+    # Note that string arrays are passed to perl in the form of a single
+    # comma-separated string, but numeric arrays must be dereferenced, for
+    # example as shown in this example:
+    #
+    #   for my $arg (@args)
+    #   {
+    #       if ( ref($arg) eq 'ARRAY' ) { print "array: ".join(',',@$arg)."\n"; }
+    #       else { print "scalar: $arg\n"; }
+    #   }
+    #
+
+    # In our case there should be only one parameter passed to the subroutine;
+    # check if this is the case
+    if ( scalar @args != 1 ) { error("Incorrect use, expected one argument, got ".scalar @args."!\n"); }
+
+    # Create a lookup table from consequences to an arbitrary severity score
+    my %severity = 
+    (
+        "intergenic"        => 1,
+        "intron"            => 2, 
+        "non_coding"        => 3, 
+        "5_prime_utr"       => 4, 
+        "3_prime_utr"       => 5, 
+        "stop_retained"     => 6, 
+        "synonymous"        => 7,
+        "coding_sequence"   => 8,
+        "missense"          => 9, 
+        "splice_region"     => 10, 
+        "inframe_altering"  => 11,
+        "inframe_deletion"  => 12, 
+        "inframe_insertion" => 13, 
+        "splice_acceptor"   => 14, 
+        "splice_donor"      => 15, 
+        "stop_lost"         => 16, 
+        "stop_gained"       => 17, 
+        "start_lost"        => 18, 
+        "frameshift"        => 19, 
+    );
+
+    # Split multiple consequences into an array
+    my @csq = split(/,/, $args[0]);
+
+    # Find the most severe consequence. The consequence string may look like this:
+    #   inframe_deletion|ABC|ENST00000000002|protein_coding|-|5YV>5T|144TAC>T+148TA>T
+    my $max = 0;
+    for my $csq (@csq)
+    {
+        my @fields = split(/\|/, $csq);
+        my $string = $fields[0];
+        my $score  = exists($severity{$string}) ? $severity{$string} : 0;
+        if ( $max < $score ) { $max = $score; }
+    }
+
+    return $max;
+}
+
+sub error
+{
+    my (@msg) = @_;
+    print STDERR @msg;
+    exit;
+}
+
diff --git a/plugins/fill-from-fasta.mk b/plugins/fill-from-fasta.mk
index 688df7124..8014961ce 100644
--- a/plugins/fill-from-fasta.mk
+++ b/plugins/fill-from-fasta.mk
@@ -1,2 +1,2 @@
 plugins/fill-from-fasta.so: plugins/fill-from-fasta.c version.h version.c filter.h filter.c
-	$(CC) $(PLUGIN_FLAGS) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CPPFLAGS) $(LDFLAGS) -o $@ filter.c version.c $< $(LIBS)
+	$(CC) $(PLUGIN_FLAGS) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CPPFLAGS) $(PERL_CFLAGS) $(LDFLAGS) -o $@ filter.c version.c $< $(LIBS)
diff --git a/plugins/setGT.mk b/plugins/setGT.mk
index 75afe5122..2d97acfff 100644
--- a/plugins/setGT.mk
+++ b/plugins/setGT.mk
@@ -1,2 +1,2 @@
 plugins/setGT.so: plugins/setGT.c version.h version.c filter.h filter.c
-	$(CC) $(PLUGIN_FLAGS) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CPPFLAGS) $(LDFLAGS) -o $@ filter.c version.c $< $(LIBS)
+	$(CC) $(PLUGIN_FLAGS) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CPPFLAGS) $(PERL_CFLAGS) $(LDFLAGS) -o $@ filter.c version.c $< $(LIBS)
diff --git a/test/perl-flt.vcf b/test/perl-flt.vcf
new file mode 100644
index 000000000..fe598af4c
--- /dev/null
+++ b/test/perl-flt.vcf
@@ -0,0 +1,41 @@
+##fileformat=VCFv4.2
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##reference=file://some/path/human_g1k_v37.fasta
+##INFO=<ID=CSQ,Number=.,Type=String,Description="Predicted consequence">
+##contig=<ID=1,length=2147483647>
+##contig=<ID=2,length=2147483647>
+##contig=<ID=3,length=2147483647>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SmplAAA	SmplBBB
+1	90	.	C	T	.	.	CSQ=5_prime_utr|XYZ|ENST00000000001|protein_coding	GT	1|0	1|1
+1	102	.	C	T	1	.	CSQ=synonymous|XYZ|ENST00000000001|protein_coding|+|1Y|102C>T	GT	1|0	1|0
+1	103	.	G	A	1	.	CSQ=missense|XYZ|ENST00000000001|protein_coding|+|2V>2I|103G>A	GT	1|0	0|0
+1	103	.	G	C	1	.	CSQ=missense|XYZ|ENST00000000001|protein_coding|+|2V>2L|103G>C	GT	0|0	1|0
+1	107	.	G	A	1	.	CSQ=missense|XYZ|ENST00000000001|protein_coding|+|3R>3Q|107G>A+108T>A	GT	1|0	1|0
+1	108	.	T	A	1	.	CSQ=splice_region|XYZ|ENST00000000001|protein_coding,@107	GT	1|0	1|0
+1	121	.	ACG	A	.	.	CSQ=inframe_deletion|XYZ|ENST00000000001|protein_coding|+|5TY>5I|121ACG>A+124TA>T,splice_region|XYZ|ENST00000000001|protein_coding	GT	1|0	1|0
+1	124	.	TA	T	.	.	CSQ=@121	GT	1|0	1|0
+1	128	.	T	C	1	.	CSQ=missense|XYZ|ENST00000000001|protein_coding|+|7V>6A|128T>C+129A>C,splice_region|XYZ|ENST00000000001|protein_coding	GT	1|0	0/0
+1	129	.	A	C	1	.	CSQ=splice_region|XYZ|ENST00000000001|protein_coding,@128	GT	1|0	0/0
+1	140	.	TA	AACG	.	.	CSQ=inframe_insertion|XYZ|ENST00000000001|protein_coding|+|8LR>7QRR|140TA>AACG+142C>CC,splice_region|XYZ|ENST00000000001|protein_coding	GT	1|0	0|0
+1	142	.	C	CC	.	.	CSQ=splice_region|XYZ|ENST00000000001|protein_coding,@140	GT	1|0	0|0
+1	145	.	AC	TA	.	.	CSQ=stop_gained|XYZ|ENST00000000001|protein_coding|+|10T>10*|145AC>TA	GT	1|0	0|0
+1	160	.	TA	T	.	.	CSQ=*frameshift|XYZ|ENST00000000001|protein_coding|+|12YVRT>12SYV|160TA>T,splice_region|XYZ|ENST00000000001|protein_coding	GT	1|0	0|0
+1	190	.	C	T	.	.	CSQ=3_prime_utr|XYZ|ENST00000000001|protein_coding	GT	1|0	0|0
+2	97	.	A	C	.	.	CSQ=3_prime_utr|ABC|ENST00000000002|protein_coding	GT	1|0	0|0
+2	105	.	AC	A	.	.	CSQ=@121	GT	1|0	0|0
+2	121	.	AC	A	.	.	CSQ=frameshift|ABC|ENST00000000002|protein_coding|-|11VVRTY>11*|105AC>A+121AC>A,splice_region|ABC|ENST00000000002|protein_coding	GT	1|0	0|0
+2	126	.	C	CTT	.	.	CSQ=@127	GT	1|0	0|0
+2	127	.	G	GG	.	.	CSQ=inframe_insertion|ABC|ENST00000000002|protein_coding|-|9T>8TK|126C>CTT+127G>GG	GT	1|0	0|0
+2	144	.	TAC	T	.	.	CSQ=@148	GT	1|0	0|0
+2	148	.	TA	T	.	.	CSQ=inframe_deletion|ABC|ENST00000000002|protein_coding|-|5YV>5T|144TAC>T+148TA>T,splice_region|ABC|ENST00000000002|protein_coding	GT	1|0	0|0
+2	164	.	T	G	.	.	CSQ=missense|ABC|ENST00000000002|protein_coding|-|3T>3P|164T>G	GT	1|0	0|0
+2	165	.	A	C	.	.	CSQ=synonymous|ABC|ENST00000000002|protein_coding|-|2R|165A>C	GT	1|0	0|0
+2	169	.	A	G	.	.	CSQ=@170	GT	1|0	0|0
+2	170	.	C	T	.	.	CSQ=missense|ABC|ENST00000000002|protein_coding|-|1V>1T|169A>G+170C>T	GT	1|0	0|0
+2	199	.	G	T	.	.	CSQ=5_prime_utr|ABC|ENST00000000002|protein_coding	GT	1|0	0|0
+3	20	.	T	A	.	.	CSQ=non_coding|mir-007||lincRNA	GT	1|0	0|0
+3	109	.	ACGTACGT	A	1	.	CSQ=splice_acceptor|QWRTY|ENST00000000003|protein_coding	GT	1|0	0|0
+3	113	.	A	T	.	.	CSQ=splice_region|QWRTY|ENST00000000003|protein_coding	GT	1|0	0|0
+3	120	.	T	A	.	.	CSQ=intron|QWRTY||protein_coding	GT	1|0	0|0
+3	152	.	T	A	.	.	CSQ=splice_region|QWRTY|ENST00000000003|protein_coding	GT	1|0	0|0
+3	159	.	G	A	.	.	CSQ=splice_donor|QWRTY|ENST00000000003|protein_coding	GT	1|0	0|0

From b41f87bb11f4d57fdbbc15c77946a600edf6f989 Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Thu, 15 Mar 2018 15:09:05 +0000
Subject: [PATCH 02/27] Correctly transfer Number=G annotations,

the number of alleles may be different and they can come in different order
---
 test/annotate11.vcf |  11 ++
 test/annotate15.out |  17 +++
 test/annots11.vcf   |  15 +++
 vcfannotate.c       | 255 +++++++++++++++++++++++++++++++++++++++++++-
 vcmp.c              |  27 ++++-
 vcmp.h              |   1 +
 6 files changed, 322 insertions(+), 4 deletions(-)
 create mode 100644 test/annotate11.vcf
 create mode 100644 test/annotate15.out
 create mode 100644 test/annots11.vcf

diff --git a/test/annotate11.vcf b/test/annotate11.vcf
new file mode 100644
index 000000000..1da05f8c4
--- /dev/null
+++ b/test/annotate11.vcf
@@ -0,0 +1,11 @@
+##fileformat=VCFv4.2
+##contig=<ID=1,assembly=b37,length=249250621>
+##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	B	A
+1	3000001	.	C	T	.	.	.	GT	.	.
+1	3000002	.	C	T	.	.	.	GT	1|1	0
+1	3000003	.	C	CT,CTT	.	.	.	GT	1|1	0
+X	3000001	.	C	T	.	.	.	GT	.	.
+X	3000002	.	C	T	.	.	.	GT	1|1	0
+X	3000003	.	C	CT,CTT	.	.	.	GT	1|1	0
diff --git a/test/annotate15.out b/test/annotate15.out
new file mode 100644
index 000000000..e3365edb4
--- /dev/null
+++ b/test/annotate15.out
@@ -0,0 +1,17 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##contig=<ID=1,assembly=b37,length=249250621>
+##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##contig=<ID=X>
+##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic Depth">
+##FORMAT=<ID=AF,Number=R,Type=Float,Description="Genotype likelihoods">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Genotype likelihoods">
+##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype likelihoods">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	B	A
+1	3000001	.	C	T	.	.	.	GT:AD:AF:PL:GL	.:8,9:8,9:10,11,12:10,11,12	.:1,2:1,2:3,4:3,4
+1	3000002	.	C	T	.	.	.	GT:AD:AF:PL:GL	0/1:10,12:10,12:10,13,15:10,13,15	1/0:1,3:1,3:1,4,6:1,4,6
+1	3000003	.	C	CT,CTT	.	.	.	GT:AD:AF:PL:GL	0/1:4,.,5:4,.,5:4,.,.,5,.,6:4,.,.,5,.,6	1/0:1,.,2:1,.,2:1,.,.,2,.,3:1,.,.,2,.,3
+X	3000001	.	C	T	.	.	.	GT:AD:AF:PL:GL	.:8,9:8,9:10,11,12:10,11,12	.:1,2:1,2:3,4:3,4
+X	3000002	.	C	T	.	.	.	GT:AD:AF:PL:GL	0/1:10,12:10,12:10,13,15:10,13,15	0:1,3:1,3:1,3:1,3
+X	3000003	.	C	CT,CTT	.	.	.	GT:AD:AF:PL:GL	0/1:4,.,5:4,.,5:4,.,.,5,.,6:4,.,.,5,.,6	0:1,.,2:1,.,2:1,.,2:1,.,2
diff --git a/test/annots11.vcf b/test/annots11.vcf
new file mode 100644
index 000000000..2a81bfa87
--- /dev/null
+++ b/test/annots11.vcf
@@ -0,0 +1,15 @@
+##fileformat=VCFv4.2
+##contig=<ID=1,assembly=b37,length=249250621>
+##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic Depth">
+##FORMAT=<ID=AF,Number=R,Type=Float,Description="Genotype likelihoods">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Genotype likelihoods">
+##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype likelihoods">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	A	B
+1	3000001	.	C	T	.	.	.	GT:AD:AF:PL:GL	.:1,2:1,2:3,4:3,4	.:8,9:8,9:10,11,12:10,11,12
+1	3000002	.	C	CT,T	.	.	.	GT:AD:AF:PL:GL	1/0:1,2,3:1,2,3:1,2,3,4,5,6:1,2,3,4,5,6	0/1:10,11,12:10,11,12:10,11,12,13,14,15:10,11,12,13,14,15
+1	3000003	.	C	CTT	.	.	.	GT:AD:AF:PL:GL	1/0:1,2:1,2:1,2,3:1,2,3	0/1:4,5:4,5:4,5,6:4,5,6
+X	3000001	.	C	T	.	.	.	GT:AD:AF:PL:GL	.:1,2:1,2:3,4:3,4	.:8,9:8,9:10,11,12:10,11,12
+X	3000002	.	C	CT,T	.	.	.	GT:AD:AF:PL:GL	0:1,2,3:1,2,3:1,2,3:1,2,3	0/1:10,11,12:10,11,12:10,11,12,13,14,15:10,11,12,13,14,15
+X	3000003	.	C	CTT	.	.	.	GT:AD:AF:PL:GL	0:1,2:1,2:1,2:1,2	0/1:4,5:4,5:4,5,6:4,5,6
diff --git a/vcfannotate.c b/vcfannotate.c
index abea98f90..3de9eb5cf 100644
--- a/vcfannotate.c
+++ b/vcfannotate.c
@@ -114,6 +114,7 @@ typedef struct _args_t
 
     int nsmpl_annot;
     int *sample_map, nsample_map, sample_is_file;   // map[idst] -> isrc
+    uint8_t *src_smpl_pld, *dst_smpl_pld;   // for Number=G format fields
     int mtmpi, mtmpf, mtmps;
     int mtmpi2, mtmpf2, mtmps2;
     int mtmpi3, mtmpf3, mtmps3;
@@ -1113,21 +1114,261 @@ static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void
 
     return core_setter_format_str(args,line,col,args->tmpp);
 }
+static int determine_ploidy(int nals, int *vals, int nvals1, uint8_t *smpl, int nsmpl)
+{
+    int i, j, ndip = nals*(nals+1)/2, max_ploidy = 0;
+    for (i=0; i<nsmpl; i++)
+    {
+        int *ptr = vals + i*nvals1;
+        int has_value = 0;
+        for (j=0; j<nvals1; j++)
+        {
+            if ( ptr[j]==bcf_int32_vector_end ) break;
+            if ( ptr[j]!=bcf_int32_missing ) has_value = 1;
+        }
+        if ( has_value )
+        {
+            if ( j==ndip )
+            { 
+                smpl[i] = 2;
+                max_ploidy = 2; 
+            }
+            else if ( j==nals )
+            { 
+                smpl[i] = 1;
+                if ( !max_ploidy ) max_ploidy = 1;
+            }
+            else return -j;
+        }
+        else smpl[i] = 0;
+    }
+    return max_ploidy;
+}
 static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
 {
     bcf1_t *rec = (bcf1_t*) data;
     int nsrc = bcf_get_format_int32(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpi,&args->mtmpi);
     if ( nsrc==-3 ) return 0;    // the tag is not present
     if ( nsrc<=0 ) return 1;     // error
-    return core_setter_format_int(args,line,col,args->tmpi,nsrc/bcf_hdr_nsamples(args->files->readers[1].header));
+    int nsmpl_src = bcf_hdr_nsamples(args->files->readers[1].header);
+    int nsrc1 = nsrc / nsmpl_src;
+    if ( col->number!=BCF_VL_G && col->number!=BCF_VL_R && col->number!=BCF_VL_A )
+        return core_setter_format_int(args,line,col,args->tmpi,nsrc1);
+
+    // create mapping from src to dst genotypes, haploid and diploid version
+    int nmap_hap = col->number==BCF_VL_G || col->number==BCF_VL_R ? rec->n_allele : rec->n_allele - 1;
+    int *map_hap = vcmp_map_ARvalues(args->vcmp,nmap_hap,line->n_allele,line->d.allele,rec->n_allele,rec->d.allele);
+    if ( !map_hap ) error("REF alleles not compatible at %s:%d\n");
+
+    int i, j;
+    if ( rec->n_allele==line->n_allele )
+    {
+        // alleles unchanged?
+        for (i=0; i<rec->n_allele; i++) if ( map_hap[i]!=i ) break;
+        if ( i==rec->n_allele )
+            return core_setter_format_int(args,line,col,args->tmpi,nsrc1);
+    }
+
+    int nsmpl_dst = rec->n_sample;
+    int ndst  = bcf_get_format_int32(args->hdr,line,col->hdr_key_dst,&args->tmpi2,&args->mtmpi2);
+    int ndst1 = ndst / nsmpl_dst;
+    if ( ndst <= 0 )
+    {
+        if ( col->replace==REPLACE_NON_MISSING ) return 0;  // overwrite only if present
+        if ( col->number==BCF_VL_G )
+            ndst1 = line->n_allele*(line->n_allele+1)/2;
+        else
+            ndst1 = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
+        hts_expand(int, ndst1*nsmpl_dst, args->mtmpi2, args->tmpi2);
+        for (i=0; i<nsmpl_dst; i++)
+        {
+            int32_t *dst = args->tmpi2 + i*ndst1;
+            for (j=0; j<ndst1; j++) dst[j] = bcf_int32_missing;
+        }
+    }
+
+    int nmap_dip = 0, *map_dip = NULL;
+    if ( col->number==BCF_VL_G )
+    {
+        map_dip = vcmp_map_dipGvalues(args->vcmp, &nmap_dip);
+        if ( !args->src_smpl_pld )
+        {
+            args->src_smpl_pld = (uint8_t*) malloc(nsmpl_src);
+            args->dst_smpl_pld = (uint8_t*) malloc(nsmpl_dst);
+        }
+        int pld_src = determine_ploidy(rec->n_allele, args->tmpi, nsrc1, args->src_smpl_pld, nsmpl_src);
+        if ( pld_src<0 )
+            error("Unexpected number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),rec->pos+1);
+        int pld_dst = determine_ploidy(line->n_allele, args->tmpi2, ndst1, args->dst_smpl_pld, nsmpl_dst);
+        if ( pld_dst<0 )
+            error("Unexpected number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),line->pos+1);
+
+        int ndst1_new = pld_dst==1 ? line->n_allele : line->n_allele*(line->n_allele+1)/2;
+        if ( ndst1_new != ndst1 )
+        {
+            if ( ndst1 ) error("todo: %s ndst1!=ndst .. %d %d  at %s:%d\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),line->pos+1);
+            ndst1 = ndst1_new;
+            hts_expand(int32_t, ndst1*nsmpl_dst, args->mtmpi2, args->tmpi2);
+        }
+    }
+    else if ( !ndst1 )
+    {
+        ndst1 = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
+        hts_expand(int32_t, ndst1*nsmpl_dst, args->mtmpi2, args->tmpi2);
+    }
+
+    for (i=0; i<nsmpl_dst; i++)
+    {
+        int ii = args->sample_map ? args->sample_map[i] : i;
+        int32_t *ptr_src = args->tmpi + i*nsrc1;
+        int32_t *ptr_dst = args->tmpi2 + ii*ndst1;
+
+        if ( col->number==BCF_VL_G )
+        {
+            if ( args->src_smpl_pld[ii] > 0 && args->dst_smpl_pld[i] > 0 && args->src_smpl_pld[ii]!=args->dst_smpl_pld[i] )
+                error("Sample ploidy differs at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1);
+            if ( !args->dst_smpl_pld[i] )
+                for (j=0; j<ndst1; j++) ptr_dst[j] = bcf_int32_missing;
+        }
+        if ( col->number!=BCF_VL_G || args->src_smpl_pld[i]==1 )
+        {
+            for (j=0; j<nmap_hap; j++)
+            {
+                int k = map_hap[j];
+                if ( k>=0 ) ptr_dst[k] = ptr_src[j];
+            }
+            if ( col->number==BCF_VL_G )
+                for (j=line->n_allele; j<ndst1; j++) ptr_dst[j++] = bcf_int32_vector_end;
+        }
+        else
+        {
+            for (j=0; j<nmap_dip; j++)
+            {
+                int k = map_dip[j];
+                if ( k>=0 ) ptr_dst[k] = ptr_src[j];
+            }
+        }
+    }
+    return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,nsmpl_dst*ndst1);
 }
 static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
 {
+
     bcf1_t *rec = (bcf1_t*) data;
     int nsrc = bcf_get_format_float(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf);
     if ( nsrc==-3 ) return 0;    // the tag is not present
     if ( nsrc<=0 ) return 1;     // error
-    return core_setter_format_real(args,line,col,args->tmpf,nsrc/bcf_hdr_nsamples(args->files->readers[1].header));
+    int nsmpl_src = bcf_hdr_nsamples(args->files->readers[1].header);
+    int nsrc1 = nsrc / nsmpl_src;
+    if ( col->number!=BCF_VL_G && col->number!=BCF_VL_R && col->number!=BCF_VL_A )
+        return core_setter_format_real(args,line,col,args->tmpf,nsrc1);
+
+    // create mapping from src to dst genotypes, haploid and diploid version
+    int nmap_hap = col->number==BCF_VL_G || col->number==BCF_VL_R ? rec->n_allele : rec->n_allele - 1;
+    int *map_hap = vcmp_map_ARvalues(args->vcmp,nmap_hap,line->n_allele,line->d.allele,rec->n_allele,rec->d.allele);
+    if ( !map_hap ) error("REF alleles not compatible at %s:%d\n");
+
+    int i, j;
+    if ( rec->n_allele==line->n_allele )
+    {
+        // alleles unchanged?
+        for (i=0; i<rec->n_allele; i++) if ( map_hap[i]!=i ) break;
+        if ( i==rec->n_allele )
+            return core_setter_format_real(args,line,col,args->tmpf,nsrc1);
+    }
+
+    int nsmpl_dst = rec->n_sample;
+    int ndst  = bcf_get_format_float(args->hdr,line,col->hdr_key_dst,&args->tmpf2,&args->mtmpf2);
+    int ndst1 = ndst / nsmpl_dst;
+    if ( ndst <= 0 )
+    {
+        if ( col->replace==REPLACE_NON_MISSING ) return 0;  // overwrite only if present
+        if ( col->number==BCF_VL_G )
+            ndst1 = line->n_allele*(line->n_allele+1)/2;
+        else
+            ndst1 = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
+        hts_expand(float, ndst1*nsmpl_dst, args->mtmpf2, args->tmpf2);
+        for (i=0; i<nsmpl_dst; i++)
+        {
+            float *dst = args->tmpf2 + i*ndst1;
+            for (j=0; j<ndst1; j++) bcf_float_set_missing(dst[j]);
+        }
+    }
+
+    int nmap_dip = 0, *map_dip = NULL;
+    if ( col->number==BCF_VL_G )
+    {
+        map_dip = vcmp_map_dipGvalues(args->vcmp, &nmap_dip);
+        if ( !args->src_smpl_pld )
+        {
+            args->src_smpl_pld = (uint8_t*) malloc(nsmpl_src);
+            args->dst_smpl_pld = (uint8_t*) malloc(nsmpl_dst);
+        }
+        int pld_src = determine_ploidy(rec->n_allele, args->tmpi, nsrc1, args->src_smpl_pld, nsmpl_src);
+        if ( pld_src<0 )
+            error("Unexpected number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_src, rec->n_allele, bcf_seqname(bcf_sr_get_header(args->files,1),rec),rec->pos+1);
+        int pld_dst = determine_ploidy(line->n_allele, args->tmpi2, ndst1, args->dst_smpl_pld, nsmpl_dst);
+        if ( pld_dst<0 )
+            error("Unexpected number of %s values (%d) for %d alleles at %s:%d\n", col->hdr_key_src,-pld_dst, line->n_allele, bcf_seqname(args->hdr,line),line->pos+1);
+
+        int ndst1_new = pld_dst==1 ? line->n_allele : line->n_allele*(line->n_allele+1)/2;
+        if ( ndst1_new != ndst1 )
+        {
+            if ( ndst1 ) error("todo: %s ndst1!=ndst .. %d %d  at %s:%d\n",col->hdr_key_src,ndst1_new,ndst1,bcf_seqname(args->hdr,line),line->pos+1);
+            ndst1 = ndst1_new;
+            hts_expand(float, ndst1*nsmpl_dst, args->mtmpf2, args->tmpf2);
+        }
+    }
+    else if ( !ndst1 )
+    {
+        ndst1 = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
+        hts_expand(float, ndst1*nsmpl_dst, args->mtmpf2, args->tmpf2);
+    }
+
+    for (i=0; i<nsmpl_dst; i++)
+    {
+        int ii = args->sample_map ? args->sample_map[i] : i;
+        float *ptr_src = args->tmpf + i*nsrc1;
+        float *ptr_dst = args->tmpf2 + ii*ndst1;
+
+        if ( col->number==BCF_VL_G )
+        {
+            if ( args->src_smpl_pld[ii] > 0 && args->dst_smpl_pld[i] > 0 && args->src_smpl_pld[ii]!=args->dst_smpl_pld[i] )
+                error("Sample ploidy differs at %s:%d\n", bcf_seqname(args->hdr,line),line->pos+1);
+            if ( !args->dst_smpl_pld[i] )
+                for (j=0; j<ndst1; j++) bcf_float_set_missing(ptr_dst[j]);
+        }
+        if ( col->number!=BCF_VL_G || args->src_smpl_pld[i]==1 )
+        {
+            for (j=0; j<nmap_hap; j++)
+            {
+                int k = map_hap[j];
+                if ( k>=0 )
+                {
+                    if ( bcf_float_is_missing(ptr_src[j]) ) bcf_float_set_missing(ptr_dst[k]);
+                    else if ( bcf_float_is_vector_end(ptr_src[j]) ) bcf_float_set_vector_end(ptr_dst[k]); 
+                    else ptr_dst[k] = ptr_src[j];
+                }
+            }
+            if ( col->number==BCF_VL_G )
+                for (j=line->n_allele; j<ndst1; j++) { bcf_float_set_vector_end(ptr_dst[j]); j++; }
+        }
+        else
+        {
+            for (j=0; j<nmap_dip; j++)
+            {
+                int k = map_dip[j];
+                if ( k>=0 )
+                {
+                    if ( bcf_float_is_missing(ptr_src[j]) ) bcf_float_set_missing(ptr_dst[k]);
+                    else if ( bcf_float_is_vector_end(ptr_src[j]) ) bcf_float_set_vector_end(ptr_dst[k]);
+                    else ptr_dst[k] = ptr_src[j];
+                }
+            }
+        }
+    }
+    return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,nsmpl_dst*ndst1);
+
 }
 
 static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
@@ -1458,6 +1699,8 @@ static void init_columns(args_t *args)
                         case BCF_HT_STR:    col->setter = vcf_setter_format_str; has_fmt_str = 1; break;
                         default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id));
                     }
+                hdr_id = bcf_hdr_id2int(tgts_hdr, BCF_DT_ID, hrec->vals[k]);
+                col->number = bcf_hdr_id2length(tgts_hdr,BCF_HL_FMT,hdr_id);
             }
         }
         else if ( !strncasecmp("FORMAT/",str.s, 7) || !strncasecmp("FMT/",str.s,4) )
@@ -1506,6 +1749,12 @@ static void init_columns(args_t *args)
                     case BCF_HT_STR:    col->setter = args->tgts_is_vcf ? vcf_setter_format_str  : setter_format_str; has_fmt_str = 1; break;
                     default: error("The type of %s not recognised (%d)\n", str.s,bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id));
                 }
+            if ( args->tgts_is_vcf )
+            {
+                bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
+                hdr_id = bcf_hdr_id2int(tgts_hdr, BCF_DT_ID, col->hdr_key_src);
+                col->number = bcf_hdr_id2length(tgts_hdr,BCF_HL_FMT,hdr_id);
+            }
         }
         else
         {
@@ -1697,6 +1946,8 @@ static void destroy_data(args_t *args)
     free(args->tmpp2);
     free(args->tmpi3);
     free(args->tmpf3);
+    free(args->src_smpl_pld);
+    free(args->dst_smpl_pld);
     if ( args->set_ids )
         convert_destroy(args->set_ids);
     if ( args->filter )
diff --git a/vcmp.c b/vcmp.c
index 8d04b8942..7d3b0f9ea 100644
--- a/vcmp.c
+++ b/vcmp.c
@@ -26,6 +26,7 @@ THE SOFTWARE.  */
 #include <string.h>
 #include <stdlib.h>
 #include <htslib/hts.h>
+#include <htslib/vcf.h>
 #include <ctype.h>
 #include "vcmp.h"
 
@@ -34,7 +35,8 @@ struct _vcmp_t
     char *dref;
     int ndref, mdref;   // ndref: positive when ref1 longer, negative when ref2 is longer
     int nmatch;
-    int *map, mmap;
+    int *map, mmap, nmap;
+    int *map_dip, mmap_dip, nmap_dip;
 };
 
 vcmp_t *vcmp_init()
@@ -44,6 +46,7 @@ vcmp_t *vcmp_init()
 
 void vcmp_destroy(vcmp_t *vcmp)
 {
+    free(vcmp->map_dip);
     free(vcmp->map);
     free(vcmp->dref);
     free(vcmp);
@@ -120,7 +123,8 @@ int *vcmp_map_ARvalues(vcmp_t *vcmp, int n, int nals1, char **als1, int nals2, c
 {
     if ( vcmp_set_ref(vcmp,als1[0],als2[0]) < 0 ) return NULL;
 
-    vcmp->map = (int*) realloc(vcmp->map,sizeof(int)*n);
+    vcmp->nmap = n;
+    hts_expand(int, vcmp->nmap, vcmp->mmap, vcmp->map);
 
     int i, ifrom = n==nals2 ? 0 : 1;
     for (i=ifrom; i<nals2; i++)
@@ -130,3 +134,22 @@ int *vcmp_map_ARvalues(vcmp_t *vcmp, int n, int nals1, char **als1, int nals2, c
     return vcmp->map;
 }
 
+int *vcmp_map_dipGvalues(vcmp_t *vcmp, int *nmap)
+{
+    vcmp->nmap_dip = vcmp->nmap*(vcmp->nmap+1)/2;
+    hts_expand(int, vcmp->nmap_dip, vcmp->mmap_dip, vcmp->map_dip);
+
+    int i, j, k = 0;
+    for (i=0; i<vcmp->nmap; i++)
+    {
+        for (j=0; j<=i; j++)
+        {
+            vcmp->map_dip[k] = vcmp->map[i]>=0 && vcmp->map[j]>=0 ? bcf_alleles2gt(vcmp->map[i],vcmp->map[j]) : -1;
+            k++;
+        }
+    }
+    *nmap = k;
+    return vcmp->map_dip;
+}
+
+
diff --git a/vcmp.h b/vcmp.h
index 031770404..9c6370ce2 100644
--- a/vcmp.h
+++ b/vcmp.h
@@ -58,5 +58,6 @@ int vcmp_find_allele(vcmp_t *vcmp, char **als1, int nals1, char *al2);
  */
 int *vcmp_map_ARvalues(vcmp_t *vcmp, int number, int nals1, char **als1, int nals2, char **als2);
 
+int *vcmp_map_dipGvalues(vcmp_t *vcmp, int *nmap);
 
 #endif

From 3a63b849081e19557a7c06570e962efc8cbb1d44 Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Thu, 15 Mar 2018 15:11:17 +0000
Subject: [PATCH 03/27] New +contrast plugin

---
 plugins/contrast.c | 364 +++++++++++++++++++++++++++++++++++++++++++++
 test/contrast.out  |  11 ++
 test/contrast.vcf  |   8 +
 test/test.pl       |   2 +
 4 files changed, 385 insertions(+)
 create mode 100644 plugins/contrast.c
 create mode 100644 test/contrast.out
 create mode 100644 test/contrast.vcf

diff --git a/plugins/contrast.c b/plugins/contrast.c
new file mode 100644
index 000000000..71cb1723f
--- /dev/null
+++ b/plugins/contrast.c
@@ -0,0 +1,364 @@
+/* The MIT License
+
+   Copyright (c) 2018 Genome Research Ltd.
+
+   Author: Petr Danecek <pd3@sanger.ac.uk>
+   
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <errno.h>
+#include <unistd.h>     // for isatty
+#include <htslib/hts.h>
+#include <htslib/vcf.h>
+#include <htslib/kstring.h>
+#include <htslib/kseq.h>
+#include <htslib/synced_bcf_reader.h>
+#include "bcftools.h"
+#include "filter.h"
+
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+typedef struct
+{
+    int argc, filter_logic, regions_is_file, targets_is_file, output_type;
+    char **argv, *output_fname, *fname, *regions, *targets, *filter_str;
+    char *bg_samples_str, *novel_samples_str;
+    int *bg_smpl, *novel_smpl, nbg_smpl, nnovel_smpl;
+    filter_t *filter;
+    bcf_srs_t *sr;
+    bcf_hdr_t *hdr, *hdr_out;
+    htsFile *out_fh;
+    int32_t *gts;
+    int mgts;
+    uint32_t *bg_gts;
+    int nbg_gts, mbg_gts, ntotal, nskipped, ntested, nnovel_al, nnovel_gt;
+    kstring_t novel_als_smpl, novel_gts_smpl;
+}
+args_t;
+
+args_t args;
+
+const char *about(void)
+{
+    return "Find novel alleles and genotypes in two groups of samples.\n";
+}
+
+static const char *usage_text(void)
+{
+    return 
+        "\n"
+        "About: Finds novel alleles and genotypes in two groups of samples. Adds\n"
+        "       an annotation which lists samples with a novel allele (INFO/NOVELAL)\n"
+        "       or a novel genotype (INFO/NOVELGT)\n"
+        "Usage: bcftools +contrast [Plugin Options]\n"
+        "Plugin options:\n"
+        "   -0, --bg-samples <list>     list of background samples\n"
+        "   -1, --novel-samples <list>  list of samples where novel allele or genotype are expected\n"
+        "   -e, --exclude EXPR          exclude sites and samples for which the expression is true\n"
+        "   -i, --include EXPR          include sites and samples for which the expression is true\n"
+        "   -o, --output FILE           output file name [stdout]\n"
+        "   -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"
+        "   -r, --regions REG           restrict to comma-separated list of regions\n"
+        "   -R, --regions-file FILE     restrict to regions listed in a file\n"
+        "   -t, --targets REG           similar to -r but streams rather than index-jumps\n"
+        "   -T, --targets-file FILE     similar to -R but streams rather than index-jumps\n"
+        "\n"
+        "Example:\n"
+        "   # Test if any of the samples a,b is different from the samples c,d,e\n"
+        "   bcftools +contrast -0 c,d,e -1 a,b file.bcf\n"
+        "\n";
+}
+
+static void init_data(args_t *args)
+{
+    args->sr = bcf_sr_init();
+    if ( args->regions )
+    {
+        args->sr->require_index = 1;
+        if ( bcf_sr_set_regions(args->sr, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n",args->regions);
+    }
+    if ( args->targets && bcf_sr_set_targets(args->sr, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->targets);
+    if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum));
+    args->hdr = bcf_sr_get_header(args->sr,0);
+    args->hdr_out = bcf_hdr_dup(args->hdr);
+    bcf_hdr_append(args->hdr_out, "##INFO=<ID=NOVELAL,Number=.,Type=String,Description=\"List of samples with novel alleles\">");
+    bcf_hdr_append(args->hdr_out, "##INFO=<ID=NOVELGT,Number=.,Type=String,Description=\"List of samples with novel genotypes. Note that only samples w/o a novel allele are listed.\">");
+
+    if ( args->filter_str )
+        args->filter = filter_init(args->hdr, args->filter_str);
+
+    int i;
+    char **smpl = hts_readlist(args->bg_samples_str, 0, &args->nbg_smpl);
+    args->bg_smpl = (int*) malloc(sizeof(int)*args->nbg_smpl);
+    for (i=0; i<args->nbg_smpl; i++)
+    {
+        args->bg_smpl[i] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, smpl[i]);
+        if ( args->bg_smpl[i]<0 ) error("The sample not present in the VCF: \"%s\"\n", smpl[i]);
+        free(smpl[i]);
+    }
+    free(smpl);
+
+    smpl = hts_readlist(args->novel_samples_str, 0, &args->nnovel_smpl);
+    args->novel_smpl = (int*) malloc(sizeof(int)*args->nnovel_smpl);
+    for (i=0; i<args->nnovel_smpl; i++)
+    {
+        args->novel_smpl[i] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, smpl[i]);
+        if ( args->novel_smpl[i]<0 ) error("The sample not present in the VCF: \"%s\"\n", smpl[i]);
+        free(smpl[i]);
+    }
+    free(smpl);
+
+    args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
+    if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
+    bcf_hdr_write(args->out_fh, args->hdr_out);
+}
+static void destroy_data(args_t *args)
+{
+    bcf_hdr_destroy(args->hdr_out);
+    hts_close(args->out_fh);
+    free(args->novel_als_smpl.s);
+    free(args->novel_gts_smpl.s);
+    free(args->gts);
+    free(args->bg_gts);
+    free(args->bg_smpl);
+    free(args->novel_smpl);
+    if ( args->filter ) filter_destroy(args->filter);
+    bcf_sr_destroy(args->sr);
+    free(args);
+}
+static inline int binary_search(uint32_t val, uint32_t *dat, int ndat)
+{
+    int i = -1, imin = 0, imax = ndat - 1;
+    while ( imin<=imax )
+    {
+        i = (imin+imax)/2;
+        if ( dat[i] < val ) imin = i + 1;
+        else if ( dat[i] > val ) imax = i - 1;
+        else return 1;
+    }
+    return 0;
+}
+static inline void binary_insert(uint32_t val, uint32_t **dat, int *ndat, int *mdat)
+{
+    int i = -1, imin = 0, imax = *ndat - 1;
+    while ( imin<=imax )
+    {
+        i = (imin+imax)/2;
+        if ( (*dat)[i] < val ) imin = i + 1;
+        else if ( (*dat)[i] > val ) imax = i - 1;
+        else return;
+    }
+    while ( i>=0 && (*dat)[i]>val ) i--;
+
+    (*ndat)++;
+    hts_expand(uint32_t, (*ndat), (*mdat), (*dat));
+
+    if ( *ndat > 1 )
+        memmove(*dat + i + 1, *dat + i, sizeof(uint32_t)*(*ndat - i - 1));
+
+    (*dat)[i+1] = val;
+}
+static int process_record(args_t *args, bcf1_t *rec)
+{
+    args->ntotal++;
+
+    static int warned = 0;
+    int ngts = bcf_get_genotypes(args->hdr, rec, &args->gts, &args->mgts);
+    ngts /= rec->n_sample;
+    if ( ngts>2 ) error("todo: ploidy=%d\n", ngts);
+
+    args->nbg_gts = 0;
+    uint32_t bg_als = 0;
+    int i,j;
+    for (i=0; i<args->nbg_smpl; i++)
+    {
+        uint32_t gt  = 0;
+        int32_t *ptr = args->gts + args->bg_smpl[i]*ngts;
+        for (j=0; j<ngts; j++)
+        {
+            if ( ptr[j]==bcf_int32_vector_end ) break;
+            if ( bcf_gt_is_missing(ptr[j]) ) continue; 
+            int ial = bcf_gt_allele(ptr[j]);
+            if ( ial > 31 )
+            {
+                if ( !warned )
+                {
+                    fprintf(stderr,"Too many alleles (>32) at %s:%d, skipping. (todo?)\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+                    warned = 1;
+                }
+                args->nskipped++;
+                return -1;
+            }
+            bg_als |= 1<<ial;
+            gt |= 1<<ial;
+        }
+        binary_insert(gt, &args->bg_gts, &args->nbg_gts, &args->mbg_gts);
+    }
+    if ( !bg_als )
+    {
+        // all are missing
+        args->nskipped++;
+        return -1;
+    }
+
+    args->novel_als_smpl.l = 0;
+    args->novel_gts_smpl.l = 0;
+
+    int has_gt = 0;
+    for (i=0; i<args->nnovel_smpl; i++)
+    {
+        int novel_al = 0;
+        uint32_t gt  = 0;
+        int32_t *ptr = args->gts + args->novel_smpl[i]*ngts;
+        for (j=0; j<ngts; j++)
+        {
+            if ( ptr[j]==bcf_int32_vector_end ) break;
+            if ( bcf_gt_is_missing(ptr[j]) ) continue; 
+            int ial = bcf_gt_allele(ptr[j]);
+            if ( ial > 31 )
+            {
+                if ( !warned )
+                {
+                    fprintf(stderr,"Too many alleles (>32) at %s:%d, skipping. (todo?)\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+                    warned = 1;
+                }
+                args->nskipped++;
+                return -1;
+            }
+            if ( !(bg_als & (1<<ial)) ) novel_al = 1; 
+            gt |= 1<<ial;
+        }
+        if ( !gt ) continue;
+        has_gt = 1;
+
+        char *smpl = args->hdr->samples[ args->novel_smpl[i] ];
+        if ( novel_al )
+        {
+            if ( args->novel_als_smpl.l ) kputc(',', &args->novel_als_smpl);
+            kputs(smpl, &args->novel_als_smpl);
+        }
+        else if ( !binary_search(gt, args->bg_gts, args->nbg_gts) )
+        {
+            if ( args->novel_gts_smpl.l ) kputc(',', &args->novel_gts_smpl);
+            kputs(smpl, &args->novel_gts_smpl);
+        }
+    }
+    if ( !has_gt )
+    {
+        // all are missing
+        args->nskipped++;
+        return -1;
+    }
+    if ( args->novel_als_smpl.l ) 
+    {
+        bcf_update_info_string(args->hdr_out, rec, "NOVELAL", args->novel_als_smpl.s);
+        args->nnovel_al++;
+    }
+    if ( args->novel_gts_smpl.l ) 
+    {
+        bcf_update_info_string(args->hdr_out, rec, "NOVELGT", args->novel_gts_smpl.s);
+        args->nnovel_gt++;
+    }
+    args->ntested++;
+    return 0;
+}
+
+int run(int argc, char **argv)
+{
+    args_t *args = (args_t*) calloc(1,sizeof(args_t));
+    args->argc   = argc; args->argv = argv;
+    args->output_fname = "-";
+    static struct option loptions[] =
+    {
+        {"bg-samples",required_argument,0,'0'},
+        {"novel-samples",required_argument,0,'1'},
+        {"include",required_argument,0,'i'},
+        {"exclude",required_argument,0,'e'},
+        {"output",required_argument,NULL,'o'},
+        {"output-type",required_argument,NULL,'O'},
+        {"regions",1,0,'r'},
+        {"regions-file",1,0,'R'},
+        {"targets",1,0,'t'},
+        {"targets-file",1,0,'T'},
+        {NULL,0,NULL,0}
+    };
+    int c;
+    while ((c = getopt_long(argc, argv, "O:o:i:e:r:R:t:T:0:1:",loptions,NULL)) >= 0)
+    {
+        switch (c) 
+        {
+            case '0': args->bg_samples_str = optarg; break;
+            case '1': args->novel_samples_str = optarg; break;
+            case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+            case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+            case 't': args->targets = optarg; break;
+            case 'T': args->targets = optarg; args->targets_is_file = 1; break;
+            case 'r': args->regions = optarg; break;
+            case 'R': args->regions = optarg; args->regions_is_file = 1; break;
+            case 'o': args->output_fname = optarg; break;
+            case 'O':
+                      switch (optarg[0]) {
+                          case 'b': args->output_type = FT_BCF_GZ; break;
+                          case 'u': args->output_type = FT_BCF; break;
+                          case 'z': args->output_type = FT_VCF_GZ; break;
+                          case 'v': args->output_type = FT_VCF; break;
+                          default: error("The output type \"%s\" not recognised\n", optarg);
+                      };
+                      break;
+            case 'h':
+            case '?':
+            default: error("%s", usage_text()); break;
+        }
+    }
+    if ( optind==argc )
+    {
+        if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-";  // reading from stdin
+        else { error(usage_text()); }
+    }
+    else if ( optind+1!=argc ) error(usage_text());
+    else args->fname = argv[optind];
+
+    init_data(args);
+
+    while ( bcf_sr_next_line(args->sr) )
+    {
+        bcf1_t *rec = bcf_sr_get_line(args->sr,0);
+        if ( args->filter )
+        {
+            int pass = filter_test(args->filter, rec, NULL);
+            if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+            if ( !pass ) continue;
+        }
+        process_record(args, rec);
+        bcf_write(args->out_fh, args->hdr_out, rec);
+    }
+
+    fprintf(stderr,"Total/processed/skipped/novel_allele/novel_gt:\t%d\t%d\t%d\t%d\t%d\n", args->ntotal, args->ntested, args->nskipped, args->nnovel_al, args->nnovel_gt);
+    destroy_data(args);
+
+    return 0;
+}
diff --git a/test/contrast.out b/test/contrast.out
new file mode 100644
index 000000000..7707bc004
--- /dev/null
+++ b/test/contrast.out
@@ -0,0 +1,11 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##contig=<ID=1>
+##INFO=<ID=NOVELAL,Number=.,Type=String,Description="List of samples with novel alleles">
+##INFO=<ID=NOVELGT,Number=.,Type=String,Description="List of samples with novel genotypes. Note that only samples w/o a novel allele are listed.">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	a	b	c
+1	100	.	A	G	.	.	.	GT	0/0	0/0	0/0
+1	101	.	A	G	.	.	NOVELAL=c	GT	0/0	0/0	0/1
+1	102	.	A	G	.	.	NOVELGT=c	GT	0/0	0/1	1/1
+1	103	.	A	G	.	.	NOVELAL=c	GT	1/1	1/1	0/1
diff --git a/test/contrast.vcf b/test/contrast.vcf
new file mode 100644
index 000000000..59941d274
--- /dev/null
+++ b/test/contrast.vcf
@@ -0,0 +1,8 @@
+##fileformat=VCFv4.2
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##contig=<ID=1>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	a	b	c
+1	100	.	A	G	.	.	.	GT	0/0	0/0	0/0
+1	101	.	A	G	.	.	.	GT	0/0	0/0	0/1
+1	102	.	A	G	.	.	.	GT	0/0	0/1	1/1
+1	103	.	A	G	.	.	.	GT	1/1	1/1	0/1
diff --git a/test/test.pl b/test/test.pl
index e6a862b04..f2d801176 100755
--- a/test/test.pl
+++ b/test/test.pl
@@ -249,6 +249,7 @@
 test_vcf_annotate($opts,in=>'annotate2',vcf=>'annots2',out=>'annotate12.out',args=>'-c AAA:=IINT,FMT/BBB:=FMT/FINT');
 test_vcf_annotate($opts,in=>'annotate2',vcf=>'annots2',out=>'annotate13.out',args=>'-x INFO -c INFO/IINT');
 test_vcf_annotate($opts,in=>'annotate2',vcf=>'annots2',out=>'annotate14.out',args=>q[-x INFO -c INFO/IINT -e'POS=3000001' -k]);
+test_vcf_annotate($opts,in=>'annotate11',vcf=>'annots11',out=>'annotate15.out',args=>q[-c FMT]);
 test_vcf_plugin($opts,in=>'plugin1',out=>'missing2ref.out',cmd=>'+missing2ref --no-version');
 test_vcf_plugin($opts,in=>'plugin1',out=>'missing2ref.out',cmd=>'+setGT --no-version',args=>'-- -t . -n 0');
 test_vcf_plugin($opts,in=>'setGT',out=>'setGT.1.out',cmd=>'+setGT --no-version',args=>'-- -t q -n 0 -i \'GT~"." && FMT/DP=30 && GQ=150\'');
@@ -285,6 +286,7 @@
 test_vcf_plugin($opts,in=>'mendelian',out=>'mendelian.1.out',cmd=>'+mendelian',args=>'-t mom1,dad1,child1 -d');
 test_vcf_plugin($opts,in=>'mendelian',out=>'mendelian.2.out',cmd=>'+mendelian',args=>'-t mom1,dad1,child1 -l+');
 test_vcf_plugin($opts,in=>'mendelian',out=>'mendelian.3.out',cmd=>'+mendelian',args=>'-t mom1,dad1,child1 -lx');
+test_vcf_plugin($opts,in=>'contrast',out=>'contrast.out',cmd=>'+contrast',args=>'-0 a,b -1 c');
 test_vcf_concat($opts,in=>['concat.1.a','concat.1.b'],out=>'concat.1.vcf.out',do_bcf=>0,args=>'');
 test_vcf_concat($opts,in=>['concat.1.a','concat.1.b'],out=>'concat.1.bcf.out',do_bcf=>1,args=>'');
 test_vcf_concat($opts,in=>['concat.2.a','concat.2.b'],out=>'concat.2.vcf.out',do_bcf=>0,args=>'-a');

From c320cd89fe93b3c2180c49e9ed5e6415ddae4f10 Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Thu, 15 Mar 2018 15:11:38 +0000
Subject: [PATCH 04/27] Print informative message if no arguments are given

---
 vcfnorm.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vcfnorm.c b/vcfnorm.c
index ead1e8951..ac0321954 100644
--- a/vcfnorm.c
+++ b/vcfnorm.c
@@ -1944,9 +1944,7 @@ int main_vcfnorm(int argc, char *argv[])
             default: error("Unknown argument: %s\n", optarg);
         }
     }
-    if ( argc>optind+1 ) usage();
-    if ( !args->ref_fname && !args->mrows_op && !args->rmdup ) usage();
-    if ( !args->ref_fname && args->check_ref&CHECK_REF_FIX ) error("Expected --fasta-ref with --check-ref s\n");
+
     char *fname = NULL;
     if ( optind>=argc )
     {
@@ -1955,6 +1953,9 @@ int main_vcfnorm(int argc, char *argv[])
     }
     else fname = argv[optind];
 
+    if ( !args->ref_fname && !args->mrows_op && !args->rmdup ) error("Expected -f, -m, -D or -d option\n");
+    if ( !args->ref_fname && args->check_ref&CHECK_REF_FIX ) error("Expected --fasta-ref with --check-ref s\n");
+
     if ( args->region )
     {
         if ( bcf_sr_set_regions(args->files, args->region,region_is_file)<0 )

From 76babbc42c03e81b132cf3eaabcbb215df464e57 Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Thu, 15 Mar 2018 15:19:37 +0000
Subject: [PATCH 05/27] Fix the warning string, `-D` is in fact an alias of `-d
 none`

---
 vcfnorm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vcfnorm.c b/vcfnorm.c
index ac0321954..fb2aadb42 100644
--- a/vcfnorm.c
+++ b/vcfnorm.c
@@ -1924,7 +1924,7 @@ int main_vcfnorm(int argc, char *argv[])
                 break;
             case 'o': args->output_fname = optarg; break;
             case 'D':
-                fprintf(stderr,"Warning: `-D` is functional but deprecated, replaced by `-d both`.\n"); 
+                fprintf(stderr,"Warning: `-D` is functional but deprecated, replaced by and alias of `-d none`.\n"); 
                 args->rmdup = BCF_SR_PAIR_EXACT;
                 break;
             case 's': args->strict_filter = 1; break;

From 4dca79a0d64035b7ae0a1b54a00469a7903ef54b Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Thu, 15 Mar 2018 15:48:47 +0000
Subject: [PATCH 06/27] Sort records with the same chr:pos lexicographically by
 ref,alt

---
 test/sort.out | 32 +++++++++++++++++---------------
 test/sort.vcf |  4 +++-
 test/test.pl  |  4 ++--
 vcfsort.c     | 20 +++++++++++++++++---
 4 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/test/sort.out b/test/sort.out
index fa723cda8..1c31ceb7b 100644
--- a/test/sort.out
+++ b/test/sort.out
@@ -1,15 +1,17 @@
-1	101
-1	102
-1	103
-1	104
-1	105
-2	101
-2	102
-2	103
-2	104
-2	105
-3	101
-3	102
-3	103
-3	104
-3	105
+1	101	T,C,A
+1	101	T,CC,A
+1	101	T,TC
+1	102	T,C
+1	103	T,C
+1	104	T,C
+1	105	T,C
+2	101	T,C
+2	102	T,C
+2	103	T,C
+2	104	T,C
+2	105	T,C
+3	101	T,C
+3	102	T,C
+3	103	T,C
+3	104	T,C
+3	105	T,C
diff --git a/test/sort.vcf b/test/sort.vcf
index 649bd01e8..b86c97bdb 100644
--- a/test/sort.vcf
+++ b/test/sort.vcf
@@ -18,4 +18,6 @@
 1	104	.	T	C	.	.	.
 1	103	.	T	C	.	.	.
 1	102	.	T	C	.	.	.
-1	101	.	T	C	.	.	.
+1	101	.	T	TC	.	.	.
+1	101	.	T	C,A	.	.	.
+1	101	.	T	CC,A	.	.	.
diff --git a/test/test.pl b/test/test.pl
index f2d801176..6c7e54b15 100755
--- a/test/test.pl
+++ b/test/test.pl
@@ -231,8 +231,8 @@
 test_vcf_filter($opts,in=>'filter.5',out=>'filter.24.out',args=>q[-i'AD[0:0]=="."'],fmt=>'%POS[\\t%AD]\\n');
 test_vcf_filter($opts,in=>'filter.5',out=>'filter.25.out',args=>q[-i'AD[0:0]!="."'],fmt=>'%POS[\\t%AD]\\n');
 test_vcf_filter($opts,in=>'filter.5',out=>'filter.26.out',args=>q[-i'QUAL=="."'],fmt=>'%POS\\t%QUAL\\n');
-test_vcf_sort($opts,in=>'sort',out=>'sort.out',args=>q[-m 0],fmt=>'%CHROM\\t%POS\\n');
-test_vcf_sort($opts,in=>'sort',out=>'sort.out',args=>q[-m 1000],fmt=>'%CHROM\\t%POS\\n');
+test_vcf_sort($opts,in=>'sort',out=>'sort.out',args=>q[-m 0],fmt=>'%CHROM\\t%POS\\t%REF,%ALT\\n');
+test_vcf_sort($opts,in=>'sort',out=>'sort.out',args=>q[-m 1000],fmt=>'%CHROM\\t%POS\\t%REF,%ALT\\n');
 test_vcf_regions($opts,in=>'regions');
 test_vcf_annotate($opts,in=>'annotate',tab=>'annotate',out=>'annotate.out',args=>'-c CHROM,POS,REF,ALT,ID,QUAL,INFO/T_INT,INFO/T_FLOAT,INDEL');
 test_vcf_annotate($opts,in=>'annotate',tab=>'annotate2',out=>'annotate2.out',args=>'-c CHROM,FROM,TO,T_STR');
diff --git a/vcfsort.c b/vcfsort.c
index e41b628c9..78f7adbe2 100644
--- a/vcfsort.c
+++ b/vcfsort.c
@@ -67,6 +67,21 @@ int cmp_bcf_pos(const void *aptr, const void *bptr)
     if ( a->rid > b->rid ) return 1;
     if ( a->pos < b->pos ) return -1;
     if ( a->pos > b->pos ) return 1;
+
+    // Sort the same chr:pos records lexicographically by ref,alt.
+    // This will be called rarely so should not slow the sorting down
+    // noticeably.
+
+    if ( !a->unpacked ) bcf_unpack(a, BCF_UN_STR);
+    if ( !b->unpacked ) bcf_unpack(b, BCF_UN_STR);
+    int i, ret;
+    for (i=0; i<a->n_allele; i++)
+    { 
+        if ( i >= b->n_allele ) return 1;
+        int ret = strcasecmp(a->d.allele[i],b->d.allele[i]);
+        if ( ret ) return ret;
+    }
+    if ( a->n_allele < b->n_allele ) return -1;
     return 0;
 }
 
@@ -138,9 +153,8 @@ static inline int blk_is_smaller(blk_t **aptr, blk_t **bptr)
 {
     blk_t *a = *aptr;
     blk_t *b = *bptr;
-    if ( a->rec->rid < b->rec->rid ) return 1;
-    if ( a->rec->rid > b->rec->rid ) return 0;
-    if ( a->rec->pos < b->rec->pos ) return 1;
+    int ret = cmp_bcf_pos(&a->rec, &b->rec);
+    if ( ret < 0 ) return 1;
     return 0;
 }
 KHEAP_INIT(blk, blk_t*, blk_is_smaller)

From 347a508c54b1de7a0ac005830ecad81b5f7fe2cd Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Thu, 15 Mar 2018 15:50:19 +0000
Subject: [PATCH 07/27] Updated documentation

---
 doc/bcftools.1    | 65 ++++++++++++++++++++++++++++++-----------------
 doc/bcftools.html | 59 +++++++++++++++++++++++++-----------------
 doc/bcftools.txt  | 24 +++++++++--------
 3 files changed, 91 insertions(+), 57 deletions(-)

diff --git a/doc/bcftools.1 b/doc/bcftools.1
index c0e70b8c2..dd40d8d75 100644
--- a/doc/bcftools.1
+++ b/doc/bcftools.1
@@ -2,12 +2,12 @@
 .\"     Title: bcftools
 .\"    Author: [see the "AUTHORS" section]
 .\" Generator: DocBook XSL Stylesheets v1.76.1 <http://docbook.sf.net/>
-.\"      Date: 2018-02-12
+.\"      Date: 2018-03-15 15:49 GMT
 .\"    Manual: \ \&
 .\"    Source: \ \&
 .\"  Language: English
 .\"
-.TH "BCFTOOLS" "1" "2018\-02\-12" "\ \&" "\ \&"
+.TH "BCFTOOLS" "1" "2018\-03\-15 15:49 GMT" "\ \&" "\ \&"
 .\" -----------------------------------------------------------------
 .\" * Define some portability stuff
 .\" -----------------------------------------------------------------
@@ -41,7 +41,7 @@ Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatica
 BCFtools is designed to work on a stream\&. It regards an input file "\-" as the standard input (stdin) and outputs to the standard output (stdout)\&. Several commands can thus be combined with Unix pipes\&.
 .SS "VERSION"
 .sp
-This manual page was last updated \fB2018\-02\-12\fR and refers to bcftools git version \fB1\&.7\fR\&.
+This manual page was last updated \fB2018\-03\-15 15:49 GMT\fR and refers to bcftools git version \fB1\&.7\-7\-g4dca79a+\fR\&.
 .SS "BCF1"
 .sp
 The BCF1 format output by versions of samtools <= 0\&.1\&.19 is \fBnot\fR compatible with this version of bcftools\&. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0\&.1\&.19 to convert to VCF, which can then be read by this version of bcftools\&.
@@ -2096,7 +2096,7 @@ see
 .RE
 .SS "bcftools gtcheck [\fIOPTIONS\fR] [\-g \fIgenotypes\&.vcf\&.gz\fR] \fIquery\&.vcf\&.gz\fR"
 .sp
-Checks sample identity or, without \fB\-g\fR, multi\-sample cross\-check is performed\&.
+Checks sample identity\&. The program can operate in two modes\&. If the \fB\-g\fR option is given, the identity of the \fB\-s\fR sample from \fIquery\&.vcf\&.gz\fR is checked against the samples in the \fB\-g\fR file\&. Without the \fB\-g\fR option, multi\-sample cross\-check of samples in \fIquery\&.vcf\&.gz\fR is performed\&.
 .PP
 \fB\-a, \-\-all\-sites\fR
 .RS 4
@@ -2227,21 +2227,14 @@ is used for the unseen genotypes\&. With
 can be used instead; the discordance value then gives exactly the number of differing genotypes\&.
 .RE
 .PP
-SM, Average Discordance
+ERR, error rate
 .RS 4
-Average discordance between sample
-\fIa\fR
-and all other samples\&.
-.RE
-.PP
-SM, Average Depth
-.RS 4
-Average depth at evaluated sites, or 1 if FORMAT/DP field is not present\&.
+Pairwise error rate calculated as number of differences divided by the total number of comparisons\&.
 .RE
 .PP
-SM, Average Number of sites
+CLUSTER, TH, DOT
 .RS 4
-The average number of sites used to calculate the discordance\&. In other words, the average number of non\-missing PLs/genotypes seen both samples\&.
+In presence of multiple samples, related samples and outliers can be identified by clustering samples by error rate\&. A simple hierarchical clustering based on minimization of standard deviation is used\&. This is useful to detect sample swaps, for example in situations where one sample has been sequenced in multiple runs\&.
 .RE
 .RE
 .SS "bcftools index [\fIOPTIONS\fR] \fIin\&.bcf\fR|\fIin\&.vcf\&.gz\fR"
@@ -2986,7 +2979,7 @@ and
 can swap alleles and will update genotypes (GT) and AC counts, but will not attempt to fix PL or other fields\&.
 .RE
 .PP
-\fB\-d, \-\-rm\-dup\fR \fIsnps\fR|\fIindels\fR|\fIboth\fR|\fIany\fR
+\fB\-d, \-\-rm\-dup\fR \fIsnps\fR|\fIindels\fR|\fIboth\fR|\fIall\fR|\fInone\fR
 .RS 4
 If a record is present multiple times, output only the first instance, see
 \fB\-\-collapse\fR
@@ -2997,8 +2990,7 @@ in
 \fB\-D, \-\-remove\-duplicates\fR
 .RS 4
 If a record is present in multiple files, output only the first instance\&. Alias for
-\fB\-d none\fR\&. Requires
-\fB\-a, \-\-allow\-overlaps\fR\&.
+\fB\-d none\fR, deprecated\&.
 .RE
 .PP
 \fB\-f, \-\-fasta\-ref\fR \fIFILE\fR
@@ -4680,15 +4672,17 @@ TYPE!~"snp"
 .sp -1
 .IP \(bu 2.3
 .\}
-array subscripts (0\-based), "*" for any field, "\-" to indicate a range\&. Note that for querying FORMAT vectors, the colon ":" can be used to select a sample and a subfield
+array subscripts (0\-based), "*" for any element, "\-" to indicate a range\&. Note that for querying FORMAT vectors, the colon ":" can be used to select a sample and an element of the vector, as shown in the examples below
 .sp
 .if n \{\
 .RS 4
 .\}
 .nf
-(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0\&.3
-DP4[*] == 0
-CSQ[*] ~ "missense_variant\&.*deleterious"
+INFO/AF[0] > 0\&.3             \&.\&. first AF value bigger than 0\&.3
+FORMAT/AD[0:0] > 30          \&.\&. first AD value of the first sample bigger than 30
+FORMAT/AD[0:1]               \&.\&. first sample, second AD value
+FORMAT/AD[1:0]               \&.\&. second sample, first AD value
+DP4[*] == 0                  \&.\&. any DP4 value
 FORMAT/DP[0]   > 30         \&.\&. DP of the first sample bigger than 30
 FORMAT/DP[1\-3] > 10         \&.\&. samples 2\-4
 FORMAT/DP[1\-]  < 7          \&.\&. all samples but the first
@@ -4696,6 +4690,8 @@ FORMAT/DP[0,2\-4] > 20       \&.\&. samples 1, 3\-5
 FORMAT/AD[0:1]              \&.\&. first sample, second AD field
 FORMAT/AD[0:*], AD[0:] or AD[0] \&.\&. first sample, any AD field
 FORMAT/AD[*:1] or AD[:1]        \&.\&. any sample, second AD field
+(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0\&.3
+CSQ[*] ~ "missense_variant\&.*deleterious"
 .fi
 .if n \{\
 .RE
@@ -4743,6 +4739,29 @@ N_ALT, N_SAMPLES, AC, MAC, AF, MAF, AN, N_MISSING, F_MISSING
 .RE
 .\}
 .RE
+.sp
+.RS 4
+.ie n \{\
+\h'-04'\(bu\h'+03'\c
+.\}
+.el \{\
+.sp -1
+.IP \(bu 2.3
+.\}
+custom perl filtering\&. Note that this command is not compiled in by default, see the section
+\fBOptional Compilation with Perl\fR
+in the INSTALL file for help and misc/demo\-flt\&.pl for a working example\&. The demo defined the perl subroutine "severity" which can be invoked from the command line as follows:
+.sp
+.if n \{\
+.RS 4
+.\}
+.nf
+perl:path/to/script\&.pl; perl\&.severity(INFO/CSQ) > 3
+.fi
+.if n \{\
+.RE
+.\}
+.RE
 .PP
 \fBNotes:\fR
 .sp
@@ -4776,7 +4795,7 @@ Variables and function names are case\-insensitive, but not tag names\&. For exa
 .sp -1
 .IP \(bu 2.3
 .\}
-When querying multiple subfields, all subfields are tested and the OR logic is used on the result\&. For example, when querying "TAG=1,2,3,4", it will be evaluated as follows:
+When querying multiple values, all elements are tested and the OR logic is used on the result\&. For example, when querying "TAG=1,2,3,4", it will be evaluated as follows:
 .sp
 .if n \{\
 .RS 4
diff --git a/doc/bcftools.html b/doc/bcftools.html
index b01c7c336..58331e5dd 100644
--- a/doc/bcftools.html
+++ b/doc/bcftools.html
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp144864"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipulating VCFs and BCFs.</p></div><div class="refsynopsisdiv" title="Synopsis"><a id="_synopsis"></a><h2>Synopsis</h2><p><span class="strong"><strong>bcftools</strong></span> [--version|--version-only] [--help] [<span class="emphasis"><em>COMMAND</em></span>] [<span class="emphasis"><em>OPTIONS</em></span>]</p></div><div class="refsect1" title="DESCRIPTION"><a id="_description"></a><h2>DESCRIPTION</h2><p>BCFtools  is  a set of utilities that manipulate variant calls in the Variant
+<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp25164528"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipulating VCFs and BCFs.</p></div><div class="refsynopsisdiv" title="Synopsis"><a id="_synopsis"></a><h2>Synopsis</h2><p><span class="strong"><strong>bcftools</strong></span> [--version|--version-only] [--help] [<span class="emphasis"><em>COMMAND</em></span>] [<span class="emphasis"><em>OPTIONS</em></span>]</p></div><div class="refsect1" title="DESCRIPTION"><a id="_description"></a><h2>DESCRIPTION</h2><p>BCFtools  is  a set of utilities that manipulate variant calls in the Variant
 Call Format (VCF) and its binary counterpart BCF. All commands work
 transparently with both VCFs and BCFs, both uncompressed and BGZF-compressed.</p><p>Most commands accept VCF, bgzipped VCF and BCF with filetype detected
 automatically even when streaming from a pipe. Indexed VCF and BCF
@@ -8,7 +8,7 @@
 work in most, but not all situations. In general, whenever multiple VCFs are
 read simultaneously, they must be indexed and therefore also compressed.</p><p>BCFtools is designed to work on a stream. It regards an input file "-" as the
 standard input (stdin) and outputs to the standard output (stdout). Several
-commands can thus be  combined  with  Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2018-02-12</strong></span> and refers to bcftools git version <span class="strong"><strong>1.7</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools &lt;= 0.1.19 is <span class="strong"><strong>not</strong></span>
+commands can thus be  combined  with  Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2018-03-15 15:49 GMT</strong></span> and refers to bcftools git version <span class="strong"><strong>1.7-7-g4dca79a+</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools &lt;= 0.1.19 is <span class="strong"><strong>not</strong></span>
 compatible with this version of bcftools. To read BCF1 files one can use
 the view command from old versions of bcftools packaged with samtools
 versions &lt;= 0.1.19 to convert to VCF, which can then be read by
@@ -1232,7 +1232,10 @@
 <span class="strong"><strong>--threads</strong></span> <span class="emphasis"><em>INT</em></span>
 </span></dt><dd>
     see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
-</dd></dl></div></div><div class="refsect2" title="bcftools gtcheck [OPTIONS] [-g genotypes.vcf.gz] query.vcf.gz"><a id="gtcheck"></a><h3>bcftools gtcheck [<span class="emphasis"><em>OPTIONS</em></span>] [<span class="strong"><strong>-g</strong></span> <span class="emphasis"><em>genotypes.vcf.gz</em></span>] <span class="emphasis"><em>query.vcf.gz</em></span></h3><p>Checks sample identity or, without <span class="strong"><strong>-g</strong></span>, multi-sample cross-check is performed.</p><div class="variablelist"><dl><dt><span class="term">
+</dd></dl></div></div><div class="refsect2" title="bcftools gtcheck [OPTIONS] [-g genotypes.vcf.gz] query.vcf.gz"><a id="gtcheck"></a><h3>bcftools gtcheck [<span class="emphasis"><em>OPTIONS</em></span>] [<span class="strong"><strong>-g</strong></span> <span class="emphasis"><em>genotypes.vcf.gz</em></span>] <span class="emphasis"><em>query.vcf.gz</em></span></h3><p>Checks sample identity. The program can operate in two modes. If the <span class="strong"><strong>-g</strong></span>
+option is given, the identity of the <span class="strong"><strong>-s</strong></span> sample from <span class="emphasis"><em>query.vcf.gz</em></span>
+is checked against the samples in the <span class="strong"><strong>-g</strong></span> file.
+Without the <span class="strong"><strong>-g</strong></span> option, multi-sample cross-check of samples in <span class="emphasis"><em>query.vcf.gz</em></span> is performed.</p><div class="variablelist"><dl><dt><span class="term">
 <span class="strong"><strong>-a, --all-sites</strong></span>
 </span></dt><dd>
     output for all sites
@@ -1304,20 +1307,18 @@
         <span class="strong"><strong>-G</strong></span>, the value <span class="emphasis"><em>1</em></span> can be used instead; the discordance value then
         gives exactly the number of differing genotypes.
 </dd><dt><span class="term">
-SM, Average Discordance
+ERR, error rate
 </span></dt><dd>
-        Average discordance between sample <span class="emphasis"><em>a</em></span> and all other samples.
+        Pairwise error rate calculated as number of differences divided
+        by the total number of comparisons.
 </dd><dt><span class="term">
-SM, Average Depth
+CLUSTER, TH, DOT
 </span></dt><dd>
-        Average depth at evaluated sites, or 1 if FORMAT/DP field is not
-        present.
-</dd><dt><span class="term">
-SM, Average Number of sites
-</span></dt><dd>
-        The average number of sites used to calculate the discordance.  In
-        other words, the average number of non-missing PLs/genotypes seen
-        both samples.
+        In presence of multiple samples, related samples and outliers can be
+        identified by clustering samples by error rate. A simple hierarchical
+        clustering based on minimization of standard deviation is used. This is
+        useful to detect sample swaps, for example in situations where one
+        sample has been sequenced in multiple runs.
 </dd></dl></div></div></div><div class="refsect2" title="bcftools index [OPTIONS] in.bcf|in.vcf.gz"><a id="index"></a><h3>bcftools index [<span class="emphasis"><em>OPTIONS</em></span>]  <span class="emphasis"><em>in.bcf</em></span>|<span class="emphasis"><em>in.vcf.gz</em></span></h3><p>Creates index for bgzip compressed VCF/BCF files for random access. CSI
 (coordinate-sorted index) is created by default. The CSI format
 supports indexing of chromosomes up to length 2^31. TBI (tabix index)
@@ -1778,7 +1779,7 @@
     can swap alleles and will update genotypes (GT) and AC counts,
     but will not attempt to fix PL or other fields.
 </dd><dt><span class="term">
-<span class="strong"><strong>-d, --rm-dup</strong></span> <span class="emphasis"><em>snps</em></span>|<span class="emphasis"><em>indels</em></span>|<span class="emphasis"><em>both</em></span>|<span class="emphasis"><em>any</em></span>
+<span class="strong"><strong>-d, --rm-dup</strong></span> <span class="emphasis"><em>snps</em></span>|<span class="emphasis"><em>indels</em></span>|<span class="emphasis"><em>both</em></span>|<span class="emphasis"><em>all</em></span>|<span class="emphasis"><em>none</em></span>
 </span></dt><dd>
     If a record is present multiple times, output only the first instance,
     see <span class="strong"><strong>--collapse</strong></span> in <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>.
@@ -1786,7 +1787,7 @@
 <span class="strong"><strong>-D, --remove-duplicates</strong></span>
 </span></dt><dd>
     If a record is present in multiple files, output only the first instance.
-    Alias for <span class="strong"><strong>-d none</strong></span>.  Requires <span class="strong"><strong>-a, --allow-overlaps</strong></span>.
+    Alias for <span class="strong"><strong>-d none</strong></span>, deprecated.
 </dd><dt><span class="term">
 <span class="strong"><strong>-f, --fasta-ref</strong></span> <span class="emphasis"><em>FILE</em></span><a id="fasta_ref"></a>
 </span></dt><dd>
@@ -2707,18 +2708,23 @@
 TYPE~"snp"
 TYPE!="snp"
 TYPE!~"snp"</pre></li><li class="listitem"><p class="simpara">
-array subscripts (0-based), "*" for any field, "-" to indicate a range. Note that
-for querying FORMAT vectors, the colon ":" can be used to select a sample and a subfield
-</p><pre class="literallayout">(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) &gt; 0.3
-DP4[*] == 0
-CSQ[*] ~ "missense_variant.*deleterious"
+array subscripts (0-based), "*" for any element, "-" to indicate a range. Note that
+for querying FORMAT vectors, the colon ":" can be used to select a sample and an
+element of the vector, as shown in the examples below
+</p><pre class="literallayout">INFO/AF[0] &gt; 0.3             .. first AF value bigger than 0.3
+FORMAT/AD[0:0] &gt; 30          .. first AD value of the first sample bigger than 30
+FORMAT/AD[0:1]               .. first sample, second AD value
+FORMAT/AD[1:0]               .. second sample, first AD value
+DP4[*] == 0                  .. any DP4 value
 FORMAT/DP[0]   &gt; 30         .. DP of the first sample bigger than 30
 FORMAT/DP[1-3] &gt; 10         .. samples 2-4
 FORMAT/DP[1-]  &lt; 7          .. all samples but the first
 FORMAT/DP[0,2-4] &gt; 20       .. samples 1, 3-5
 FORMAT/AD[0:1]              .. first sample, second AD field
 FORMAT/AD[0:*], AD[0:] or AD[0] .. first sample, any AD field
-FORMAT/AD[*:1] or AD[:1]        .. any sample, second AD field</pre></li><li class="listitem"><p class="simpara">
+FORMAT/AD[*:1] or AD[:1]        .. any sample, second AD field
+(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) &gt; 0.3
+CSQ[*] ~ "missense_variant.*deleterious"</pre></li><li class="listitem"><p class="simpara">
 function on FORMAT tags (over samples) and INFO tags (over vector fields)
 </p><pre class="literallayout">MAX, MIN, AVG, SUM, STRLEN, ABS, COUNT</pre></li><li class="listitem"><p class="simpara">
 variables calculated on the fly if not present: number of alternate alleles;
@@ -2726,14 +2732,19 @@
 AC but is always smaller than 0.5); frequency of alternate alleles (AF=AC/AN);
 frequency of minor alleles (MAF=MAC/AN); number of alleles in called genotypes;
 number of samples with missing genotype; fraction of samples with missing genotype
-</p><pre class="literallayout">N_ALT, N_SAMPLES, AC, MAC, AF, MAF, AN, N_MISSING, F_MISSING</pre></li></ul></div><div class="itemizedlist" title="Notes:"><p class="title"><strong>Notes:</strong></p><ul class="itemizedlist" type="disc"><li class="listitem">
+</p><pre class="literallayout">N_ALT, N_SAMPLES, AC, MAC, AF, MAF, AN, N_MISSING, F_MISSING</pre></li><li class="listitem"><p class="simpara">
+custom perl filtering. Note that this command is not compiled in by default, see
+the section <span class="strong"><strong>Optional Compilation with Perl</strong></span> in the INSTALL file for help
+and misc/demo-flt.pl for a working example. The demo defined the perl subroutine
+"severity" which can be invoked from the command line as follows:
+</p><pre class="literallayout">perl:path/to/script.pl; perl.severity(INFO/CSQ) &gt; 3</pre></li></ul></div><div class="itemizedlist" title="Notes:"><p class="title"><strong>Notes:</strong></p><ul class="itemizedlist" type="disc"><li class="listitem">
 String comparisons and regular expressions are case-insensitive
 </li><li class="listitem">
 Variables and function names are case-insensitive, but not tag names. For example,
 "qual" can be used instead of "QUAL", "strlen()" instead of "STRLEN()" , but
 not "dp" instead of "DP".
 </li><li class="listitem"><p class="simpara">
-When querying multiple subfields, all subfields are tested and the OR logic is
+When querying multiple values, all elements are tested and the OR logic is
 used on the result. For example, when querying "TAG=1,2,3,4", it will be evaluated as follows:
 </p><pre class="literallayout">-i 'TAG[*]=1'   .. true, the record will be printed
 -i 'TAG[*]!=1'  .. true
diff --git a/doc/bcftools.txt b/doc/bcftools.txt
index deebfd1c4..a5633be8b 100644
--- a/doc/bcftools.txt
+++ b/doc/bcftools.txt
@@ -1809,13 +1809,13 @@ the *<<fasta_ref,--fasta-ref>>* option is supplied.
     can swap alleles and will update genotypes (GT) and AC counts,
     but will not attempt to fix PL or other fields.
 
-*-d, --rm-dup* 'snps'|'indels'|'both'|'any'::
+*-d, --rm-dup* 'snps'|'indels'|'both'|'all'|'none'::
     If a record is present multiple times, output only the first instance,
     see *--collapse* in *<<common_options,Common Options>>*.
 
 *-D, --remove-duplicates*::
     If a record is present in multiple files, output only the first instance.
-    Alias for *-d none*.  Requires *-a, --allow-overlaps*.
+    Alias for *-d none*, deprecated.
 
 *-f, --fasta-ref* 'FILE'[[fasta_ref]]::
     reference sequence. Supplying this option will turn on left-alignment
@@ -2745,12 +2745,15 @@ to require that all alleles are of the given type. Compare
         TYPE!="snp"
         TYPE!~"snp"
 
-* array subscripts (0-based), "*" for any field, "-" to indicate a range. Note that
-for querying FORMAT vectors, the colon ":" can be used to select a sample and a subfield
-
-        (DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0.3
-        DP4[*] == 0
-        CSQ[*] ~ "missense_variant.*deleterious"
+* array subscripts (0-based), "*" for any element, "-" to indicate a range. Note that
+for querying FORMAT vectors, the colon ":" can be used to select a sample and an
+element of the vector, as shown in the examples below
+    
+        INFO/AF[0] > 0.3             .. first AF value bigger than 0.3
+        FORMAT/AD[0:0] > 30          .. first AD value of the first sample bigger than 30
+        FORMAT/AD[0:1]               .. first sample, second AD value
+        FORMAT/AD[1:0]               .. second sample, first AD value
+        DP4[*] == 0                  .. any DP4 value
         FORMAT/DP[0]   > 30         .. DP of the first sample bigger than 30
         FORMAT/DP[1-3] > 10         .. samples 2-4
         FORMAT/DP[1-]  < 7          .. all samples but the first
@@ -2758,6 +2761,8 @@ for querying FORMAT vectors, the colon ":" can be used to select a sample and a
         FORMAT/AD[0:1]              .. first sample, second AD field
         FORMAT/AD[0:*], AD[0:] or AD[0] .. first sample, any AD field
         FORMAT/AD[*:1] or AD[:1]        .. any sample, second AD field
+        (DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0.3
+        CSQ[*] ~ "missense_variant.*deleterious"
 
 * function on FORMAT tags (over samples) and INFO tags (over vector fields)
 
@@ -2779,14 +2784,13 @@ and misc/demo-flt.pl for a working example. The demo defined the perl subroutine
         perl:path/to/script.pl; perl.severity(INFO/CSQ) > 3
 
 
-
 .Notes:
 
 * String comparisons and regular expressions are case-insensitive
 * Variables and function names are case-insensitive, but not tag names. For example,
 "qual" can be used instead of "QUAL", "strlen()" instead of "STRLEN()" , but
 not "dp" instead of "DP".
-* When querying multiple subfields, all subfields are tested and the OR logic is
+* When querying multiple values, all elements are tested and the OR logic is
 used on the result. For example, when querying "TAG=1,2,3,4", it will be evaluated as follows:
 
     -i 'TAG[*]=1'   .. true, the record will be printed

From 5c6d98ed9167696cfb3f591843a4e0f71332880a Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Thu, 15 Mar 2018 16:14:23 +0000
Subject: [PATCH 08/27] Fix compilation

---
 filter.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/filter.c b/filter.c
index e3a9cb759..0b4053841 100644
--- a/filter.c
+++ b/filter.c
@@ -27,6 +27,8 @@ THE SOFTWARE.  */
 #include <strings.h>
 #include <errno.h>
 #include <math.h>
+#include <sys/types.h>
+#include <pwd.h>
 #include <regex.h>
 #include <htslib/khash_str2int.h>
 #include <htslib/hts_defs.h>

From 96b9067d1d028295e8cbf0560235c4b215f6f71f Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Sat, 17 Mar 2018 09:35:23 +0000
Subject: [PATCH 09/27] Support for -i/-e filtering in roh and updated scripts

---
 misc/plot-roh.py | 96 ++++++++++++++++++++++++++++--------------------
 misc/run-roh.pl  | 21 +++++++++--
 vcfroh.c         | 61 ++++++++++++++++++++++++------
 3 files changed, 125 insertions(+), 53 deletions(-)

diff --git a/misc/plot-roh.py b/misc/plot-roh.py
index 546099047..41d9a212a 100755
--- a/misc/plot-roh.py
+++ b/misc/plot-roh.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #  
 #  The MIT License
 #  
@@ -41,36 +41,39 @@ def usage(msg=None):
         print '   -s, --samples <file>                  List of samples to show, rename or group: "name[\\tnew_name[\\tgroup]]"'
         print '   -h, --help                            This usage text'
         print 'Matplotlib options:'
-        print '   +adj, --adjust <str>     Set plot adjust [bottom=0.18,left=0.07,right=0.98]'
-        print '   +dpi, --dpi <num>        Set bitmap DPI [150]'
-        print '   +sxt, --show-xticks      Show x-ticks (genomic coordinate)'
-        print '   +xlb, --xlabel <str>     Set x-label'
-        print '   +xli, --xlimit <num>     Extend x-range by this fraction [0.05]'
+        print '   +adj, --adjust <str>          Set plot adjust [bottom=0.18,left=0.07,right=0.98]'
+        print '   +dpi, --dpi <num>             Set bitmap DPI [150]'
+        print '   +sxt, --show-xticks           Show x-ticks (genomic coordinate)'
+        print '   +twh, --track-wh <num,num>    Set track width and height [20,1]'
+        print '   +xlb, --xlabel <str>          Set x-label'
+        print '   +xli, --xlimit <num>          Extend x-range by this fraction [0.05]'
     else:
         print msg
     sys.exit(1)
 
-dir         = None
-regs        = None
-min_length  = 0
-min_markers = 0
-min_qual    = 0
-interactive = False
-sample_file = None
-highlight   = None
-outfile     = None
-adjust      = 'bottom=0.18,left=0.07,right=0.98'
-dpi         = 150
-xlim        = 0.05
-show_xticks = False
-xlabel      = None
+dir          = None
+plot_regions = None
+min_length   = 0
+min_markers  = 0
+min_qual     = 0
+interactive  = False
+sample_file  = None
+highlight    = None
+outfile      = None
+adjust       = 'bottom=0.18,left=0.07,right=0.98'
+dpi          = 150
+xlim         = 0.05
+show_xticks  = False
+xlabel       = None
+track_width  = 20
+track_height = 1
 
 if len(sys.argv) < 2: usage()
 args = sys.argv[1:]
 while len(args):
     if args[0]=='-r' or args[0]=='--region': 
         args = args[1:]
-        regs = args[0]
+        plot_regions = args[0]
     elif args[0]=='-i' or args[0]=='--interactive': 
         interactive = True
     elif args[0]=='-l' or args[0]=='--min-length': 
@@ -99,11 +102,16 @@ def usage(msg=None):
     elif args[0]=='+dpi' or args[0]=='--dpi': 
         args = args[1:]
         dpi = float(args[0])
+    elif args[0]=='+sxt' or args[0]=='--show-xticks': 
+        show_xticks = True
+    elif args[0]=='+twh' or args[0]=='--track-wh': 
+        args = args[1:]
+        (track_width,track_height) = args[0].split(',')
+        track_height = -float(track_height)    # will be used as if negative, no auto-magic
+        track_width  = float(track_width)
     elif args[0]=='+xlb' or args[0]=='--xlabel': 
         args = args[1:]
         xlabel = args[0]
-    elif args[0]=='+sxt' or args[0]=='--show-xticks': 
-        show_xticks = True
     elif args[0]=='+xli' or args[0]=='--xlimit': 
         args = args[1:]
         xlim = float(args[0])
@@ -119,17 +127,21 @@ def wrap_hash(**args): return args
 
 
 import matplotlib as mpl
-for gui in ['TKAgg','GTKAgg','Qt4Agg','WXAgg','MacOSX']:
-    try:
-        mpl.use(gui,warn=False, force=True)
-        import matplotlib.pyplot as plt
-        import matplotlib.patches as patches
-        break
-    except:
-        continue
+if interactive==False:
+    mpl.use('Agg')
+    import matplotlib.pyplot as plt
+    import matplotlib.patches as patches
+else:
+    for gui in ['TKAgg','GTKAgg','Qt4Agg','WXAgg','MacOSX']:
+        try:
+            mpl.use(gui,warn=False, force=True)
+            import matplotlib.pyplot as plt
+            import matplotlib.patches as patches
+            break
+        except:
+            continue
 
 cols = [ '#337ab7', '#5cb85c', '#5bc0de', '#f0ad4e', '#d9534f', 'grey', 'black' ]
-mpl.rcParams['axes.color_cycle'] = cols
 
 globstr = os.path.join(dir, '*.txt.gz')
 fnames = glob.glob(globstr)
@@ -272,7 +284,7 @@ def parse_samples(fname,highlight):
     if highlight==None: groups = None
     return (samples,groups,smpl2y)
 
-regs = parse_regions(regs)
+plot_regions = parse_regions(plot_regions)
 (samples,groups,smpl2y) = parse_samples(sample_file,highlight)
 
 dat_gt = {}
@@ -285,7 +297,7 @@ def parse_samples(fname,highlight):
         if row[0]=='GT':
             chr  = row[1]
             pos  = int(row[2])
-            reg  = region_overlap(regs,chr,pos,pos)
+            reg  = region_overlap(plot_regions,chr,pos,pos)
             if reg==None: continue
             for i in range(3,len(row),2):
                 smpl = row[i]
@@ -317,7 +329,8 @@ def parse_samples(fname,highlight):
             if length < min_length: continue
             if nmark < min_markers : continue
             if qual < min_qual : continue
-            reg = region_overlap(regs,chr,beg,end)
+            reg = region_overlap(plot_regions,chr,beg,end)
+            if reg==None: continue
             if chr not in dat_rg: dat_rg[chr] = {}
             if smpl not in dat_rg[chr]: dat_rg[chr][smpl] = []
             if reg!=None:
@@ -352,9 +365,11 @@ def parse_samples(fname,highlight):
     off += max_pos + off_sep
     off_list.append(off)
 
-height = len(smpl2y)
-if len(smpl2y)>5: heigth = 5
-wh = 20,height
+wh = track_width,len(smpl2y)
+if track_height < 0:
+    wh = track_width,-track_height*len(smpl2y)
+elif len(smpl2y)>5: 
+    wh = track_width,5
 
 def bignum(num):
     s = str(num); out = ''; slen = len(s)
@@ -379,6 +394,7 @@ def format_coord(x, y):
 xtick_lbl = []
 xtick_pos = []
 max_x = 0
+min_x = -1
 for chr in dat_gt:
     off  = off_hash[chr]
     icol = 0
@@ -394,6 +410,7 @@ def format_coord(x, y):
                 rect = patches.Rectangle((rg[0]+off,3*y+0.5), rg[1]-rg[0]+1, 2, color='#d9534f')
                 ax1.add_patch(rect)
         ax1.plot([x[0]+off for x in dat_gt[chr][smpl]],[x[1]+3*y for x in dat_gt[chr][smpl]],'.',color=cols[icol])
+        if min_x==-1 or min_x > dat_gt[chr][smpl][0][0]+off: min_x = dat_gt[chr][smpl][0][0]+off
         if max_x < dat_gt[chr][smpl][-1][0]+off: max_x = dat_gt[chr][smpl][-1][0]+off
         if max < dat_gt[chr][smpl][-1][0]: max = dat_gt[chr][smpl][-1][0]
         icol += 1
@@ -408,7 +425,8 @@ def format_coord(x, y):
         ytick_pos.append(3*smpl2y[smpl]+1)
     break
 if xlim!=0:
-    ax1.set_xlim(0,max_x+xlim*max_x)
+    if min_x==-1: min_x = 0
+    ax1.set_xlim(min_x,max_x+xlim*max_x)
 lbl_pos = 3*(len(smpl2y)-1)
 ax1.annotate('   HomAlt ',xy=(max_x,lbl_pos-1),xycoords='data',va='center')
 ax1.annotate('   Het',xy=(max_x,lbl_pos-2),xycoords='data',va='center')
diff --git a/misc/run-roh.pl b/misc/run-roh.pl
index 8c76bf842..a70ea9639 100755
--- a/misc/run-roh.pl
+++ b/misc/run-roh.pl
@@ -51,6 +51,8 @@ sub error
         "Options:\n",
         "   -a, --af-annots <file>      Allele frequency annotations [1000GP-AFs/AFs.tab.gz]\n",
         "   -i, --indir <dir>           Input directory with VCF files\n",
+        "       --include <expr>        Select sites for which the expression is true\n",
+        "       --exclude <expr>        Exclude sites for which the epxression is true\n",
         "   -l, --min-length <num>      Filter input regions shorter than this [1e6]\n",
         "   -m, --genmap <dir>          Directory with genetic map in IMPUTE2 format (optional)\n",
         "   -M, --rec-rate <float>      constant recombination rate per bp (optional)\n",
@@ -74,6 +76,8 @@ sub parse_params
     };
     while (defined(my $arg=shift(@ARGV)))
     {
+        if (                 $arg eq '--include' ) { $$opts{include_expr}=shift(@ARGV); next }
+        if (                 $arg eq '--exclude' ) { $$opts{exclude_expr}=shift(@ARGV); next }
         if ( $arg eq '-q' || $arg eq '--min-qual' ) { $$opts{min_qual}=shift(@ARGV); next }
         if ( $arg eq '-l' || $arg eq '--min-length' ) { $$opts{min_length}=shift(@ARGV); next }
         if ( $arg eq '-n' || $arg eq '--min-markers' ) { $$opts{min_markers}=shift(@ARGV); next }
@@ -92,6 +96,8 @@ sub parse_params
     if ( ! -e "$$opts{af_annots}.tbi" ) { error("The annotation file is not indexed: $$opts{af_annots}.tbi\n"); }
     if ( ! -e "$$opts{af_annots}.hdr" ) { error("The annotation file has no header: $$opts{af_annots}.hdr\n"); }
     if ( exists($$opts{genmap}) && ! -d "$$opts{genmap}" ) { error("The directory with genetic maps does not exist: $$opts{genmap}\n"); }
+    if ( exists($$opts{include_expr}) ) { $$opts{include_expr} =~ s/\'/\'\\\'\'/g; $$opts{inc_exc} .= qq[ -i '$$opts{include_expr}']; }
+    if ( exists($$opts{exclude_expr}) ) { $$opts{exclude_expr} =~ s/\'/\'\\\'\'/g; $$opts{inc_exc} .= qq[ -e '$$opts{exclude_expr}']; }
     return $opts;
 }
 
@@ -196,10 +202,19 @@ sub run_roh
         my $outfile = "$$opts{outdir}/$`.bcf";
         push @files,$outfile;
         if ( -e $outfile ) { next; }
-        cmd(
+
+        my $cmd = 
             "bcftools annotate --rename-chrs $chr_fname '$$opts{indir}/$file' -Ou | " .
-            "bcftools annotate -c CHROM,POS,REF,ALT,AF1KG -h $$opts{af_annots}.hdr -a $$opts{af_annots} -Ob -o $outfile.part && " .
-            "mv $outfile.part $outfile",%$opts);
+            "bcftools annotate -c CHROM,POS,REF,ALT,AF1KG -h $$opts{af_annots}.hdr -a $$opts{af_annots} ";
+
+        if ( exists($$opts{inc_exc}) )
+        {
+            $cmd .= " -Ou | bcftools view $$opts{inc_exc} ";
+        }
+        $cmd .= "-Ob -o $outfile.part && ";
+        $cmd .= "mv $outfile.part $outfile";
+
+        cmd($cmd, %$opts);
     }
     closedir($dh) or error("close failed: $$opts{indir}");
 
diff --git a/vcfroh.c b/vcfroh.c
index 626f97538..a96a968b2 100644
--- a/vcfroh.c
+++ b/vcfroh.c
@@ -35,6 +35,7 @@ THE SOFTWARE.  */
 #include "bcftools.h"
 #include "HMM.h"
 #include "smpl_ilist.h"
+#include "filter.h"
 
 #define STATE_HW 0        // normal state, follows Hardy-Weinberg allele frequencies
 #define STATE_AZ 1        // autozygous state
@@ -43,6 +44,11 @@ THE SOFTWARE.  */
 #define OUTPUT_RG (1<<2)
 #define OUTPUT_GZ (1<<3)
 
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+
 /** Genetic map */
 typedef struct
 {
@@ -94,6 +100,9 @@ typedef struct _args_t
     int32_t skip_rid, prev_rid, prev_pos;
 
     int ntot;                   // some stats to detect if things didn't go wrong
+    int nno_af;                 // number of sites rejected because AF could not be determined
+    int nfiltered;              // .. because of filters
+    int nnot_biallelic, ndup;
     smpl_t *smpl;               // HMM data for each sample
     smpl_ilist_t *af_smpl;      // list of samples to estimate AF from (--estimate-AF)
     smpl_ilist_t *roh_smpl;     // list of samples to analyze (--samples, --samples-file)
@@ -103,6 +112,10 @@ typedef struct _args_t
     int argc, fake_PLs, snps_only, vi_training, samples_is_file, output_type, skip_homref, n_threads;
     BGZF *out;
     kstring_t str;
+
+    int filter_logic;
+    filter_t *filter;
+    char *filter_str;
 }
 args_t;
 
@@ -125,6 +138,9 @@ static void init_data(args_t *args)
 
     if ( !bcf_hdr_nsamples(args->hdr) ) error("No samples in the VCF?\n");
 
+    if ( args->filter_str )
+        args->filter = filter_init(args->hdr, args->filter_str);
+
     if ( !args->fake_PLs )
     {
         args->pl_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL");
@@ -318,6 +334,7 @@ static void init_data(args_t *args)
 
 static void destroy_data(args_t *args)
 {
+    if ( args->filter ) filter_destroy(args->filter);
     if ( bgzf_close(args->out)!=0 ) error("Error: close failed .. %s\n", args->output_fname);
     int i;
     for (i=0; i<args->roh_smpl->n; i++)
@@ -854,8 +871,9 @@ int process_line(args_t *args, bcf1_t *line, int ial)
             alt_freq = (double) AC/AN;
     }
 
-    if ( ret<0 ) return ret;
-    if ( alt_freq==0.0 ) return -1;
+    if ( args->dflt_AF>0 && (ret<0 || alt_freq==0.0) ) alt_freq = args->dflt_AF;
+    else if ( ret<0 ) { args->nno_af++; return ret; }
+    else if ( alt_freq==0.0 ) { args->nno_af++; return -1; }
 
     int irr = bcf_alleles2gt(0,0), ira = bcf_alleles2gt(0,ial), iaa = bcf_alleles2gt(ial,ial);
     if ( args->fake_PLs )
@@ -970,12 +988,11 @@ static void vcfroh(args_t *args, bcf1_t *line)
         for (i=0; i<args->roh_smpl->n; i++) flush_viterbi(args, i);
         return; 
     }
-    args->ntot++;
 
     // Skip unwanted lines, for simplicity we consider only biallelic sites 
     if ( line->rid == args->skip_rid ) return;
-    if ( line->n_allele==1 ) return;    // no ALT allele
-    if ( line->n_allele > 3 ) return;   // cannot be bi-allelic, even with <*>
+    if ( line->n_allele==1 ) { args->nnot_biallelic++; return; }   // no ALT allele
+    if ( line->n_allele > 3 ) { args->nnot_biallelic++; return; }   // cannot be bi-allelic, even with <*>
 
     // This can be raw callable VCF with the symbolic unseen allele <*>
     int ial = 0;
@@ -983,7 +1000,7 @@ static void vcfroh(args_t *args, bcf1_t *line)
         if ( !strcmp("<*>",line->d.allele[i]) ) { ial = i; break; }
     if ( ial==0 )    // normal VCF, the symbolic allele is not present
     {
-        if ( line->n_allele!=2 ) return;    // not biallelic
+        if ( line->n_allele!=2 ) { args->nnot_biallelic++; return; }   // not biallelic
         ial = 1;
     }
     else
@@ -1017,7 +1034,11 @@ static void vcfroh(args_t *args, bcf1_t *line)
         args->prev_pos = line->pos;
         skip_rid = load_genmap(args, bcf_seqname(args->hdr,line));
     }
-    else if ( args->prev_pos == line->pos ) return;     // skip duplicate positions
+    else if ( args->prev_pos == line->pos ) 
+    {
+        args->ndup++;
+        return;     // skip duplicate positions
+    }
 
     if ( skip_rid )
     {
@@ -1044,14 +1065,16 @@ static void usage(args_t *args)
     fprintf(stderr, "General Options:\n");
     fprintf(stderr, "        --AF-dflt <float>              if AF is not known, use this allele frequency [skip]\n");
     fprintf(stderr, "        --AF-tag <TAG>                 use TAG for allele frequency\n");
-    fprintf(stderr, "        --AF-file <file>               read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
+    fprintf(stderr, "        --AF-file <file>               read allele frequencies from file (CHR\\tPOS\\tREF\\tALT\\tAF)\n");
     fprintf(stderr, "    -b  --buffer-size <int[,int]>      buffer size and the number of overlapping sites, 0 for unlimited [0]\n");
     fprintf(stderr, "                                           If the first number is negative, it is interpreted as the maximum memory to\n");
     fprintf(stderr, "                                           use, in MB. The default overlap is set to roughly 1%% of the buffer size.\n");
     fprintf(stderr, "    -e, --estimate-AF [TAG],<file>     estimate AF from FORMAT/TAG (GT or PL) of all samples (\"-\") or samples listed\n");
     fprintf(stderr, "                                            in <file>. If TAG is not given, the frequency is estimated from GT by default\n");
+    fprintf(stderr, "        --exclude <expr>               exclude sites for which the expression is true\n");
     fprintf(stderr, "    -G, --GTs-only <float>             use GTs and ignore PLs, instead using <float> for PL of the two least likely genotypes.\n");
     fprintf(stderr, "                                           Safe value to use is 30 to account for GT errors.\n");
+    fprintf(stderr, "        --include <expr>               select sites for which the expression is true\n");
     fprintf(stderr, "    -i, --ignore-homref                skip hom-ref genotypes (0/0)\n");
     fprintf(stderr, "    -I, --skip-indels                  skip indels as their genotypes are enriched for errors\n");
     fprintf(stderr, "    -m, --genetic-map <file>           genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\"\n");
@@ -1091,6 +1114,8 @@ int main_vcfroh(int argc, char *argv[])
         {"AF-tag",1,0,0},
         {"AF-file",1,0,1},
         {"AF-dflt",1,0,2},
+        {"include",1,0,3},
+        {"exclude",1,0,4},
         {"buffer-size",1,0,'b'},
         {"ignore-homref",0,0,'i'},
         {"estimate-AF",1,0,'e'},
@@ -1123,6 +1148,8 @@ int main_vcfroh(int argc, char *argv[])
                 args->dflt_AF = strtod(optarg,&tmp);
                 if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg);
                 break;
+            case 3: args->filter_str = optarg; args->filter_logic = FLT_INCLUDE; break;
+            case 4: args->filter_str = optarg; args->filter_logic = FLT_EXCLUDE; break;
             case 'o': args->output_fname = optarg; break;
             case 'O': 
                 if ( strchr(optarg,'s') || strchr(optarg,'S') ) args->output_type |= OUTPUT_ST;
@@ -1206,16 +1233,28 @@ int main_vcfroh(int argc, char *argv[])
     init_data(args);
     while ( bcf_sr_next_line(args->files) )
     {
-        vcfroh(args, args->files->readers[0].buffer[0]);
+        args->ntot++;
+        bcf1_t *line = bcf_sr_get_line(args->files,0);
+        if ( args->filter )
+        {
+            int pass = filter_test(args->filter, line, NULL);
+            if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
+            if ( !pass ) { args->nfiltered++; continue; }
+        }
+        vcfroh(args, line);
     }
     vcfroh(args, NULL);
     int i, nmin = 0;
     for (i=0; i<args->roh_smpl->n; i++)
         if ( !i || args->smpl[i].nused < nmin ) nmin = args->smpl[i].nused;
-    fprintf(stderr,"Number of lines total/processed: %d/%d\n", args->ntot,nmin);
+    if ( args->af_fname )
+        fprintf(stderr,"Number of lines overlapping with --AF-file/processed: %d/%d\n", args->ntot,nmin);
+    else
+        fprintf(stderr,"Number of lines total/processed: %d/%d\n", args->ntot,nmin);
+    fprintf(stderr,"Number of lines filtered/no AF/not biallelic/dup: %d/%d/%d/%d\n", args->nfiltered,args->nno_af,args->nnot_biallelic,args->ndup);
     if ( nmin==0 )
     {
-        fprintf(stderr,"No usable sites were found.");
+        fprintf(stderr,"No usable sites were found.\n", args->nno_af);
         if ( !naf_opts && !args->dflt_AF ) fprintf(stderr, " Consider using one of the AF options.\n");
     }
     destroy_data(args);

From 9224bf1b35da00ccd65e6ae9b6b0b56acbe7a2dd Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Sat, 17 Mar 2018 09:37:04 +0000
Subject: [PATCH 10/27] Update documentation

---
 doc/bcftools.txt | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/doc/bcftools.txt b/doc/bcftools.txt
index a5633be8b..44a15373f 100644
--- a/doc/bcftools.txt
+++ b/doc/bcftools.txt
@@ -2312,11 +2312,19 @@ Transition probabilities:
     If neither *-e* nor the other *--AF-...* options are given, the allele frequency is
     estimated from AC and AN counts which are already present in the INFO field.
 
+*--exclude* 'EXPRESSION'::
+    exclude sites for which 'EXPRESSION' is true. For valid expressions see
+    *<<expressions,EXPRESSIONS>>*.
+
 *-G, --GTs-only* 'FLOAT'::
     use genotypes (FORMAT/GT fields) ignoring genotype likelihoods (FORMAT/PL),
     setting PL of unseen genotypes to 'FLOAT'. Safe value to use is 30 to
     account for GT errors.
 
+*--include* 'EXPRESSION'::
+    include only sites for which 'EXPRESSION' is true. For valid expressions see
+    *<<expressions,EXPRESSIONS>>*.
+
 *-I, --skip-indels*::
     skip indels as their genotypes are usually enriched for errors
 

From 32122e95e789bcf65278aa3ca99bdee9dcbde5e6 Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Mon, 19 Mar 2018 12:52:31 +0000
Subject: [PATCH 11/27] Fix a small memory leak

---
 consensus.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/consensus.c b/consensus.c
index d67a05219..4d555c88c 100644
--- a/consensus.c
+++ b/consensus.c
@@ -280,6 +280,7 @@ static void init_region(args_t *args, char *line)
             else to--;
         }
     }
+    free(args->chr);
     args->chr = strdup(line);
     args->rid = bcf_hdr_name2id(args->hdr,line);
     if ( args->rid<0 ) fprintf(stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname);

From 6400857948e0d8beef598651489ce5181252c423 Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Mon, 19 Mar 2018 12:53:16 +0000
Subject: [PATCH 12/27] Support for vcf,vcf.gz regidx parser. Fixes #745

---
 regidx.c | 13 ++++++++++++-
 regidx.h |  1 +
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/regidx.c b/regidx.c
index 9b2c66d71..d7d8a812a 100644
--- a/regidx.c
+++ b/regidx.c
@@ -232,6 +232,10 @@ regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f fr
                 parser = regidx_parse_bed;
             else if ( len>=4 && !strcasecmp(".bed",fname+len-4) )
                 parser = regidx_parse_bed;
+            else if ( len>=4 && !strcasecmp(".vcf",fname+len-4) )
+                parser = regidx_parse_vcf;
+            else if ( len>=4 && !strcasecmp(".vcf.gz",fname+len-7) )
+                parser = regidx_parse_vcf;
             else
                 parser = regidx_parse_tab;
         }
@@ -488,13 +492,20 @@ int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, uint32_t
     {
         ss = se+1;
         *end = strtod(ss, &se);
-        if ( ss==se ) *end = *beg;
+        if ( ss==se || (*se && !isspace(*se)) ) *end = *beg;
         else if ( *end==0 ) { fprintf(stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; }
         else (*end)--;
     }
     return 0;
 }
 
+int regidx_parse_vcf(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
+{
+    int ret = regidx_parse_tab(line, chr_beg, chr_end, beg, end, payload, usr);
+    if ( !ret ) *end = *beg;
+    return ret;
+}
+
 int regidx_parse_reg(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
 {
     char *ss = (char*) line;
diff --git a/regidx.h b/regidx.h
index fe0a897e4..eec978b6c 100644
--- a/regidx.h
+++ b/regidx.h
@@ -106,6 +106,7 @@ typedef void (*regidx_free_f)(void *payload);
 int regidx_parse_bed(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*);   // CHROM or whitespace-sepatated CHROM,FROM,TO (0-based,right-open)
 int regidx_parse_tab(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*);   // CHROM or whitespace-separated CHROM,POS (1-based, inclusive)
 int regidx_parse_reg(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*);   // CHROM, CHROM:POS, CHROM:FROM-TO, CHROM:FROM- (1-based, inclusive)
+int regidx_parse_vcf(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*);
 
 /*
  *  regidx_init() - creates new index

From fb94d291e30b2237f0645051119a55d737d81355 Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Mon, 19 Mar 2018 13:08:30 +0000
Subject: [PATCH 13/27] Fix a copy-and-paste error

---
 regidx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/regidx.c b/regidx.c
index d7d8a812a..3e4620ff1 100644
--- a/regidx.c
+++ b/regidx.c
@@ -234,7 +234,7 @@ regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f fr
                 parser = regidx_parse_bed;
             else if ( len>=4 && !strcasecmp(".vcf",fname+len-4) )
                 parser = regidx_parse_vcf;
-            else if ( len>=4 && !strcasecmp(".vcf.gz",fname+len-7) )
+            else if ( len>=7 && !strcasecmp(".vcf.gz",fname+len-7) )
                 parser = regidx_parse_vcf;
             else
                 parser = regidx_parse_tab;

From d78e524c53aa50cfb52765eb4197f1028b5621c9 Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Mon, 19 Mar 2018 13:31:04 +0000
Subject: [PATCH 14/27] Fix --haplegendsample on purely haploid genotypes.
 Resolves #730

---
 convert.c                     | 23 ++++++++++++++++++++++-
 test/convert.hap-missing.haps |  5 +++++
 test/convert.hap-missing.vcf  |  9 +++++++++
 test/test.pl                  |  1 +
 4 files changed, 37 insertions(+), 1 deletion(-)
 create mode 100644 test/convert.hap-missing.haps
 create mode 100644 test/convert.hap-missing.vcf

diff --git a/convert.c b/convert.c
index d7ed0abc1..dce4529fd 100644
--- a/convert.c
+++ b/convert.c
@@ -760,12 +760,33 @@ static void process_gt_to_hap(convert_t *convert, bcf1_t *line, fmt_t *fmt, int
 
     if ( fmt_gt->type!=BCF_BT_INT8 )    // todo: use BRANCH_INT if the VCF is valid
         error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
+    if ( fmt_gt->n!=1 && fmt_gt->n!=2 )
+        error("Uh, ploidy of %d not supported, see %s:%d\n", fmt_gt->n, bcf_seqname(convert->header, line), line->pos+1);
 
     int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n;
     for (i=0; i<convert->nsamples; i++)
     {
         ptr += fmt_gt->n;
-        if ( ptr[0]==2 )
+        if ( fmt_gt->n==1 ) // haploid genotypes
+        {
+            if ( ptr[0]==2 ) /* 0 */
+            {
+                str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
+            }
+            else if ( ptr[0]==bcf_int32_missing )   /* . */
+            {
+                str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
+            }
+            else if ( ptr[0]==4 ) /* 1 */
+            {
+                str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
+            }
+            else
+            {
+                kputw(bcf_gt_allele(ptr[0]),str); str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
+            }
+        }
+        else if ( ptr[0]==2 )
         {
             if ( ptr[1]==3 ) /* 0|0 */
             {
diff --git a/test/convert.hap-missing.haps b/test/convert.hap-missing.haps
new file mode 100644
index 000000000..952e13c5e
--- /dev/null
+++ b/test/convert.hap-missing.haps
@@ -0,0 +1,5 @@
+0 0 0 0 1 0
+0 - 0 - 1 -
+? ? ? ? ? ?
+? ? ? ? ? ?
+0 0 0 0 1 0
diff --git a/test/convert.hap-missing.vcf b/test/convert.hap-missing.vcf
new file mode 100644
index 000000000..eac4d155c
--- /dev/null
+++ b/test/convert.hap-missing.vcf
@@ -0,0 +1,9 @@
+##fileformat=VCFv4.1
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##contig=<ID=X,assembly=b37,length=155270560>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA00001	NA00002	NA00003
+X	2699450	.	A	C	.	.	.	GT	0|0	0|0	1|0
+X	2699451	.	A	C	.	.	.	GT	0	0	1
+X	2699968	.	A	G	.	.	.	GT	.|.	0|.	1|.
+X	2699970	.	T	C	.	.	.	GT	.|0	0|.	1|.
+X	2699990	.	C	<INS:ME:LINE1>	.	.	.	GT	0|0	0|0	1|0
diff --git a/test/test.pl b/test/test.pl
index 6c7e54b15..7ce31e3bf 100755
--- a/test/test.pl
+++ b/test/test.pl
@@ -320,6 +320,7 @@
 test_vcf_convert_hs2vcf($opts,h=>'convert.hs.gt.hap',s=>'convert.hs.gt.samples',out=>'convert.gt.noHead.vcf',args=>'--hapsample2vcf');
 test_vcf_convert($opts,in=>'convert',out=>'convert.hs.hap',args=>'--hapsample -,.');
 test_vcf_convert($opts,in=>'convert',out=>'convert.hs.sample',args=>'--hapsample .,-');
+test_vcf_convert($opts,in=>'convert.hap-missing',out=>'convert.hap-missing.haps',args=>'--haplegendsample -,.,.');
 test_vcf_convert_gvcf($opts,in=>'convert.gvcf',out=>'convert.gvcf.out',fa=>'gvcf.fa',args=>'--gvcf2vcf');
 test_vcf_convert_tsv2vcf($opts,in=>'convert.23andme',out=>'convert.23andme.vcf',args=>'-c ID,CHROM,POS,AA -s SAMPLE1',fai=>'23andme');
 test_vcf_consensus($opts,in=>'consensus',out=>'consensus.1.out',fa=>'consensus.fa',mask=>'consensus.tab',args=>'');

From ba084390383af977cf8e9d07b0fd224b24c9f594 Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Mon, 19 Mar 2018 13:55:15 +0000
Subject: [PATCH 15/27] Add threading support for BCF reheader. Resolves #722

---
 reheader.c | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/reheader.c b/reheader.c
index 277606900..863c4ff46 100644
--- a/reheader.c
+++ b/reheader.c
@@ -1,6 +1,6 @@
 /*  reheader.c -- reheader subcommand.
 
-    Copyright (C) 2014-2017 Genome Research Ltd.
+    Copyright (C) 2014-2018 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -36,6 +36,7 @@ THE SOFTWARE.  */
 #include <htslib/bgzf.h>
 #include <htslib/tbx.h> // for hts_get_bgzfp()
 #include <htslib/kseq.h>
+#include <htslib/thread_pool.h>
 #include "bcftools.h"
 #include "khash_str2str.h"
 
@@ -44,7 +45,8 @@ typedef struct _args_t
     char **argv, *fname, *samples_fname, *header_fname, *output_fname;
     htsFile *fp;
     htsFormat type;
-    int argc;
+    htsThreadPool *threads;
+    int argc, n_threads;
 }
 args_t;
 
@@ -369,6 +371,16 @@ static bcf_hdr_t *strip_header(bcf_hdr_t *src, bcf_hdr_t *dst)
 static void reheader_bcf(args_t *args, int is_compressed)
 {
     htsFile *fp = args->fp;
+
+    if ( args->n_threads > 0 )
+    {
+        args->threads = calloc(1, sizeof(*args->threads));
+        if ( !args->threads ) error("Could not allocate memory\n");
+        if ( !(args->threads->pool = hts_tpool_init(args->n_threads)) ) error("Could not initialize threading\n");
+        BGZF *bgzf = hts_get_bgzfp(fp);
+        if ( bgzf ) bgzf_thread_pool(bgzf, args->threads->pool, args->threads->qsize);
+    }
+
     bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to read the header: %s\n", args->fname);
     kstring_t htxt = {0,0,0};
     bcf_hdr_format(hdr, 1, &htxt);
@@ -396,6 +408,11 @@ static void reheader_bcf(args_t *args, int is_compressed)
     // write the header and the body
     htsFile *fp_out = hts_open(args->output_fname ? args->output_fname : "-",is_compressed ? "wb" : "wbu");
     if ( !fp_out ) error("%s: %s\n", args->output_fname ? args->output_fname : "-", strerror(errno));
+    if ( args->threads )
+    {
+        BGZF *bgzf = hts_get_bgzfp(fp_out);
+        if ( bgzf ) bgzf_thread_pool(bgzf, args->threads->pool, args->threads->qsize);
+    }
     bcf_hdr_write(fp_out, hdr_out);
 
     bcf1_t *rec = bcf_init();
@@ -450,6 +467,11 @@ static void reheader_bcf(args_t *args, int is_compressed)
     hts_close(fp);
     bcf_hdr_destroy(hdr_out);
     bcf_hdr_destroy(hdr);
+    if ( args->threads )
+    {
+        hts_tpool_destroy(args->threads->pool);
+        free(args->threads);
+    }
 }
 
 
@@ -463,6 +485,7 @@ static void usage(args_t *args)
     fprintf(stderr, "    -h, --header <file>     new header\n");
     fprintf(stderr, "    -o, --output <file>     write output to a file [standard output]\n");
     fprintf(stderr, "    -s, --samples <file>    new sample names\n");
+    fprintf(stderr, "        --threads <int>     number of extra compression threads (BCF only) [0]\n");
     fprintf(stderr, "\n");
     exit(1);
 }
@@ -472,18 +495,20 @@ int main_reheader(int argc, char *argv[])
     int c;
     args_t *args  = (args_t*) calloc(1,sizeof(args_t));
     args->argc    = argc; args->argv = argv;
-
+    
     static struct option loptions[] =
     {
         {"output",1,0,'o'},
         {"header",1,0,'h'},
         {"samples",1,0,'s'},
+        {"threads",1,NULL,1},
         {0,0,0,0}
     };
     while ((c = getopt_long(argc, argv, "s:h:o:",loptions,NULL)) >= 0)
     {
         switch (c)
         {
+            case  1 : args->n_threads = strtol(optarg, 0, 0); break;
             case 'o': args->output_fname = optarg; break;
             case 's': args->samples_fname = optarg; break;
             case 'h': args->header_fname = optarg; break;

From 799c73979db70e90b7f59d6c22b0d9846963f69c Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Mon, 19 Mar 2018 14:37:41 +0000
Subject: [PATCH 16/27] Exit nicely on non-existent FMT/TAGs. Resolves #696

---
 vcfannotate.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vcfannotate.c b/vcfannotate.c
index 3de9eb5cf..06f56c6b3 100644
--- a/vcfannotate.c
+++ b/vcfannotate.c
@@ -1720,6 +1720,7 @@ static void init_columns(args_t *args)
             if ( args->tgts_is_vcf )
             {
                 bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL);
+                if ( !hrec ) error("No such annotation \"%s\" in %s\n", key_src,args->targets_fname);
                 tmp.l = 0;
                 bcf_hrec_format_rename(hrec, key_dst, &tmp);
                 bcf_hdr_append(args->hdr_out, tmp.s);

From 2c6162eb565f318c9dfe77becea6298d900aaab9 Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Tue, 20 Mar 2018 14:59:51 +0000
Subject: [PATCH 17/27] Annotate with 1-based R2 coordinate, not 0-based.
 Resolves #760

---
 plugins/prune.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/plugins/prune.c b/plugins/prune.c
index 5c4f506d6..8b1ee52b1 100644
--- a/plugins/prune.c
+++ b/plugins/prune.c
@@ -182,8 +182,9 @@ static void process(args_t *args)
         if ( ld_rec && args->info_r2 )
         {
             float tmp = ld_val;
+            int32_t tmp_pos = ld_rec->pos + 1;
             bcf_update_info_float(args->hdr, rec, args->info_r2, &tmp, 1);
-            bcf_update_info_int32(args->hdr, rec, args->info_pos, &ld_rec->pos, 1);
+            bcf_update_info_int32(args->hdr, rec, args->info_pos, &tmp_pos, 1);
         }
     }
     sr->buffer[0] = vcfbuf_push(args->vcfbuf, rec, 1);

From 0d1ee0baa1ce180893a733cb9e904e12c9f8a96c Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Thu, 22 Mar 2018 17:24:56 +0000
Subject: [PATCH 18/27] The indexed file must be BGZF-compressed. Resolves #761

---
 vcfindex.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vcfindex.c b/vcfindex.c
index 807fedd5f..c19587dc5 100644
--- a/vcfindex.c
+++ b/vcfindex.c
@@ -213,6 +213,7 @@ int main_vcfindex(int argc, char *argv[])
         // check for truncated files, allow only with -f
         BGZF *fp = bgzf_open(fname, "r");
         if ( !fp ) error("index: failed to open %s\n", fname);
+        if ( bgzf_compression(fp)!=2 ) error("index: the file is not BGZF compressed, cannot index: %s\n", fname);
         if ( bgzf_check_EOF(fp)!=1 ) error("index: the input is probably truncated, use -f to index anyway: %s\n", fname);
         if ( bgzf_close(fp)!=0 ) error("index: close failed: %s\n", fname);
     }

From 9d94f7bfded90baf04a9dc9c0942c69120f53116 Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Thu, 22 Mar 2018 17:31:19 +0000
Subject: [PATCH 19/27] Add options for simpler ploidy usage

---
 plugins/fixploidy.c | 58 ++++++++++++++++++++++++++++++---------------
 1 file changed, 39 insertions(+), 19 deletions(-)

diff --git a/plugins/fixploidy.c b/plugins/fixploidy.c
index 04cc07677..039d0f48b 100644
--- a/plugins/fixploidy.c
+++ b/plugins/fixploidy.c
@@ -38,6 +38,7 @@ static int *sample2sex = NULL;
 static int n_sample = 0, nsex = 0, *sex2ploidy = NULL;
 static int32_t ngt_arr = 0, *gt_arr = NULL, *gt_arr2 = NULL, ngt_arr2 = 0;
 static ploidy_t *ploidy = NULL;
+static int force_ploidy = -1;
 
 const char *about(void)
 {
@@ -54,9 +55,11 @@ const char *usage(void)
         "   run \"bcftools plugin\" for a list of common options\n"
         "\n"
         "Plugin options:\n"
-        "   -p, --ploidy <file>   space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n"
-        "   -s, --sex <file>      list of samples, \"NAME SEX\"\n"
-        "   -t, --tags <list>     VCF tags to fix [GT]\n"
+        "   -d, --default-ploidy <int>  default ploidy for regions unlisted in -p [2]\n"
+        "   -f, --force-ploidy <int>    ignore -p, set the same ploidy for all genotypes\n"
+        "   -p, --ploidy <file>         space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n"
+        "   -s, --sex <file>            list of samples, \"NAME SEX\"\n"
+        "   -t, --tags <list>           VCF tags to fix [GT]\n"
         "\n"
         "Example:\n"
         "   # Default ploidy, if -p not given. Unlisted regions have ploidy 2\n"
@@ -109,21 +112,32 @@ void set_samples(char *fname, bcf_hdr_t *hdr, ploidy_t *ploidy, int *sample2sex)
 
 int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
 {
-    int c;
+    int c, default_ploidy = 2;
     char *tags_str = "GT";
     char *ploidy_fname = NULL, *sex_fname = NULL;
 
     static struct option loptions[] =
     {
+        {"default-ploidy",1,0,'d'},
+        {"force-ploidy",1,0,'f'},
         {"ploidy",1,0,'p'},
         {"sex",1,0,'s'},
         {"tags",1,0,'t'},
         {0,0,0,0}
     };
-    while ((c = getopt_long(argc, argv, "?ht:s:p:",loptions,NULL)) >= 0)
+    char *tmp;
+    while ((c = getopt_long(argc, argv, "?ht:s:p:d:f:",loptions,NULL)) >= 0)
     {
         switch (c) 
         {
+            case 'd': 
+                default_ploidy = strtod(optarg,&tmp);
+                if ( *tmp ) error("Could not parse: -d %s\n", optarg);
+                break;
+            case 'f': 
+                force_ploidy = strtod(optarg,&tmp);
+                if ( *tmp ) error("Could not parse: -f %s\n", optarg);
+                break;
             case 'p': ploidy_fname = optarg; break;
             case 's': sex_fname = optarg; break;
             case 't': tags_str = optarg; break;
@@ -140,8 +154,8 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
     out_hdr    = out;
 
     if ( ploidy_fname )
-        ploidy = ploidy_init(ploidy_fname, 2);
-    else
+        ploidy = ploidy_init(ploidy_fname, default_ploidy);
+    else if ( force_ploidy==-1 )
     {
         ploidy = ploidy_init_string(
                 "X 1 60000 M 1\n"
@@ -151,14 +165,17 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
                 "MT 1 16569 M 1\n"
                 "MT 1 16569 F 1\n", 2);
     }
-    if ( !ploidy ) return -1;
-
-    // add default sex in case it was not included
-    int i, dflt_sex_id = ploidy_add_sex(ploidy, "F");
-    for (i=0; i<n_sample; i++) sample2sex[i] = dflt_sex_id; // by default all are F
-    if ( sex_fname ) set_samples(sex_fname, in, ploidy, sample2sex);
-    nsex = ploidy_nsex(ploidy);
-    sex2ploidy = (int*) malloc(sizeof(int)*nsex);
+    if ( force_ploidy==-1 )
+    {
+        if ( !ploidy ) return -1;
+
+        // add default sex in case it was not included
+        int i, dflt_sex_id = ploidy_add_sex(ploidy, "F");
+        for (i=0; i<n_sample; i++) sample2sex[i] = dflt_sex_id; // by default all are F
+        if ( sex_fname ) set_samples(sex_fname, in, ploidy, sample2sex);
+        nsex = ploidy_nsex(ploidy);
+        sex2ploidy = (int*) malloc(sizeof(int)*nsex);
+    }
 
     return 0;
 }
@@ -175,7 +192,10 @@ bcf1_t *process(bcf1_t *rec)
     if ( ngts % n_sample )
         error("Error at %s:%d: wrong number of GT fields\n",bcf_seqname(in_hdr,rec),rec->pos+1);
 
-    ploidy_query(ploidy, (char*)bcf_seqname(in_hdr,rec), rec->pos, sex2ploidy,NULL,&max_ploidy);
+    if ( force_ploidy==-1 )
+        ploidy_query(ploidy, (char*)bcf_seqname(in_hdr,rec), rec->pos, sex2ploidy,NULL,&max_ploidy);
+    else
+        max_ploidy = force_ploidy;
 
     ngts /= n_sample;
     if ( ngts < max_ploidy )
@@ -183,7 +203,7 @@ bcf1_t *process(bcf1_t *rec)
         hts_expand(int32_t,max_ploidy*n_sample,ngt_arr2,gt_arr2);
         for (i=0; i<n_sample; i++)
         {
-            int ploidy = sex2ploidy[ sample2sex[i] ];
+            int ploidy = force_ploidy!=-1 ? force_ploidy : sex2ploidy[ sample2sex[i] ];
             int32_t *src = &gt_arr[i*ngts];
             int32_t *dst = &gt_arr2[i*max_ploidy];
             j = 0;
@@ -201,7 +221,7 @@ bcf1_t *process(bcf1_t *rec)
     {
         for (i=0; i<n_sample; i++)
         {
-            int ploidy = sex2ploidy[ sample2sex[i] ];
+            int ploidy = force_ploidy!=-1 ? force_ploidy : sex2ploidy[ sample2sex[i] ];
             int32_t *gts = &gt_arr[i*ngts];
             j = 0;
             if ( !ploidy ) { gts[j] = bcf_gt_missing; j++; }
@@ -224,7 +244,7 @@ void destroy(void)
     free(gt_arr2);
     free(sample2sex);
     free(sex2ploidy);
-    ploidy_destroy(ploidy);
+    if ( ploidy ) ploidy_destroy(ploidy);
 }
 
 

From 2b272cf9568a950e0263f0fdbce41a14a5733e7b Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Thu, 22 Mar 2018 17:34:13 +0000
Subject: [PATCH 20/27] Do not rely on FMT/GT size in BCF, instead check the
 maximum ploidy. Fixes #705

---
 vcfmerge.c | 40 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/vcfmerge.c b/vcfmerge.c
index 31f5dad51..a69e2cb06 100644
--- a/vcfmerge.c
+++ b/vcfmerge.c
@@ -1281,6 +1281,37 @@ void update_AN_AC(bcf_hdr_t *hdr, bcf1_t *line)
     free(tmp);
 }
 
+static inline int max_used_gt_ploidy(bcf_fmt_t *fmt, int nsmpl)
+{
+    int i,j, max_ploidy = 0;
+
+    #define BRANCH(type_t, vector_end) { \
+        type_t *ptr  = (type_t*) fmt->p; \
+        for (i=0; i<nsmpl; i++) \
+        { \
+            for (j=0; j<fmt->n; j++) \
+                if ( ptr[j]==vector_end ) break; \
+            if ( j==fmt->n ) \
+            { \
+                /* all fields were used */ \
+                if ( max_ploidy < j ) max_ploidy = j; \
+                break; \
+            } \
+            if ( max_ploidy < j ) max_ploidy = j; \
+            ptr += fmt->n; \
+        } \
+    }
+    switch (fmt->type)
+    {
+        case BCF_BT_INT8:  BRANCH(int8_t,   bcf_int8_vector_end); break;
+        case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
+        case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
+        default: error("Unexpected case: %d\n", fmt->type);
+    }
+    #undef BRANCH
+    return max_ploidy;
+}
+
 void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
 {
     bcf_srs_t *files = args->files;
@@ -1291,9 +1322,12 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
     int nsize = 0, msize = sizeof(int32_t);
     for (i=0; i<files->nreaders; i++)
     {
-        if ( !fmt_map[i] ) continue;
-        if ( fmt_map[i]->n > nsize ) nsize = fmt_map[i]->n;
+        bcf_fmt_t *fmt = fmt_map[i];
+        if ( !fmt ) continue;
+        int pld = max_used_gt_ploidy(fmt_map[i], bcf_hdr_nsamples(bcf_sr_get_header(args->files,i)));
+        if ( nsize < pld ) nsize = pld;
     }
+    if ( nsize==0 ) nsize = 1;
 
     if ( ma->ntmp_arr < nsamples*nsize*msize )
     {
@@ -1311,7 +1345,7 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
         int32_t *tmp  = (int32_t *) ma->tmp_arr + ismpl*nsize;
         int irec = ma->buf[i].cur;
 
-        int j, k;
+        int j,k;
         if ( !fmt_ori )
         {
             // missing values: assume maximum ploidy

From f4928e69f24a9c100413fa2df727feadd95cc057 Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Thu, 22 Mar 2018 17:35:18 +0000
Subject: [PATCH 21/27] Added a new option for passing parameters to bcftools
 roh

---
 misc/run-roh.pl | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/misc/run-roh.pl b/misc/run-roh.pl
index a70ea9639..c980c9b44 100755
--- a/misc/run-roh.pl
+++ b/misc/run-roh.pl
@@ -59,6 +59,7 @@ sub error
         "   -n, --min-markers <num>     Filter input regions with fewer marker than this [100]\n",
         "   -o, --outdir <dir>          Output directory\n",
         "   -q, --min-qual <num>        Filter input regions with quality smaller than this [10]\n",
+        "       --roh-args <string>     Extra arguments to pass to bcftools roh\n",
         "   -s, --silent                Quiet output, do not print commands\n",
         "   -h, -?, --help              This help message\n",
         "\n";
@@ -73,9 +74,11 @@ sub parse_params
         min_length  => 1e6,
         min_markers => 100,
         min_qual    => 10,
+        roh_args    => '',
     };
     while (defined(my $arg=shift(@ARGV)))
     {
+        if (                 $arg eq '--roh-args' ) { $$opts{roh_args}=shift(@ARGV); next }
         if (                 $arg eq '--include' ) { $$opts{include_expr}=shift(@ARGV); next }
         if (                 $arg eq '--exclude' ) { $$opts{exclude_expr}=shift(@ARGV); next }
         if ( $arg eq '-q' || $arg eq '--min-qual' ) { $$opts{min_qual}=shift(@ARGV); next }
@@ -224,7 +227,7 @@ sub run_roh
     for my $file (@files)
     {
         if ( -e "$file.txt.gz" ) { next; }
-        my @out = cmd("bcftools roh --AF-tag AF1KG $genmap $file -Orz -o $file.txt.gz.part 2>&1 | tee -a $file.log",%$opts);
+        my @out = cmd("bcftools roh $$opts{roh_args} --AF-tag AF1KG $genmap $file -Orz -o $file.txt.gz.part 2>&1 | tee -a $file.log",%$opts);
         for my $line (@out)
         {
             if ( !($line=~m{total/processed:\s+(\d+)/(\d+)}) ) { next; }

From ed286e95783a19fbd67b1e9258dbbc4644fcd9ce Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Thu, 29 Mar 2018 17:00:40 +0100
Subject: [PATCH 22/27] Set all genotypes to phased with +setGT -t a -n p

---
 plugins/setGT.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/plugins/setGT.c b/plugins/setGT.c
index efa2bcb1e..15976622a 100644
--- a/plugins/setGT.c
+++ b/plugins/setGT.c
@@ -195,7 +195,7 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
         {
             case 'i': args->filter_str = optarg; args->filter_logic = FLT_INCLUDE; break;
             case 'e': args->filter_str = optarg; args->filter_logic = FLT_EXCLUDE; break;
-            case 'n': args->new_mask = bcf_gt_phased(0); 
+            case 'n': args->new_mask = 0;
                 if ( strchr(optarg,'.') ) args->new_mask |= GT_MISSING;
                 if ( strchr(optarg,'0') ) args->new_mask |= GT_REF;
                 if ( strchr(optarg,'M') ) args->new_mask |= GT_MAJOR;
@@ -232,6 +232,19 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
     return 0;
 }
 
+static inline int phase_gt(int32_t *ptr, int ngts)
+{
+    int j, changed = 0;
+    for (j=0; j<ngts; j++)
+    {
+        if ( ptr[j]==bcf_int32_vector_end ) break;
+        if ( bcf_gt_is_phased(ptr[j]) ) continue;
+        ptr[j] = bcf_gt_phased(bcf_gt_allele(ptr[j]));    // add phasing; this may need a fix, I think the flag should be set for one allele only?
+        changed++;
+    }
+    return changed;
+}
+
 static inline int unphase_gt(int32_t *ptr, int ngts)
 {
     int j, changed = 0;
@@ -405,6 +418,8 @@ bcf1_t *process(bcf1_t *rec)
 
             if ( args->new_mask&GT_UNPHASED )
                 changed += unphase_gt(ptr, ngts);
+            else if ( args->new_mask==GT_PHASED )
+                changed += phase_gt(ptr, ngts);
             else
                 changed += set_gt(ptr, ngts, args->new_gt);
         }

From fd12d5127794883df930ae1c4eb734c9ff1df6a2 Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Thu, 29 Mar 2018 17:10:13 +0100
Subject: [PATCH 23/27] Prevent dropping AC,AN INFO fields on -i/-e MAF,AF
 queries

The query command asks htslib to skip the expensive parsing of INFO and FORMAT
fields when it is not needed in the formatting and the filtering expression.
This backfired for fields calculated on the fly, commands like

    bcftools query -e 'MAF[0]<0.05' -f '%POS\n'

would not bee able to calculate MAF because INFO/AC, INFO/AN is not accessible.

This is now fixed, but with a caveat: if the header contains INFO/AC,AN line
but the annotations are not filled in the data lines, the program will not
be able to determine AC, AN from FORMAT/GT. In such case, fill-tags can
be used to update the INFO annotations or the tags can be stripped from the
header using `annotate` or `reheader`.

Resolves #762.
---
 filter.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/filter.c b/filter.c
index 0b4053841..fe17634f3 100644
--- a/filter.c
+++ b/filter.c
@@ -1748,6 +1748,18 @@ static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx,
         for (i=0; i<tok->nidxs; i++) if ( tok->idxs[i] ) tok->nuidxs++;
     }
 }
+static int max_ac_an_unpack(bcf_hdr_t *hdr)
+{
+    int hdr_id = bcf_hdr_id2int(hdr,BCF_DT_ID,"AC");
+    if ( hdr_id<0 ) return BCF_UN_FMT;
+    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,hdr_id) ) return BCF_UN_FMT;
+
+    hdr_id = bcf_hdr_id2int(hdr,BCF_DT_ID,"AN");
+    if ( hdr_id<0 ) return BCF_UN_FMT;
+    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,hdr_id) ) return BCF_UN_FMT;
+
+    return BCF_UN_INFO;
+}
 static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
 {
     tok->tok_type  = TOK_VAL;
@@ -1980,6 +1992,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
     }
     else if ( !strcasecmp(tmp.s,"AN") )
     {
+        filter->max_unpack |= BCF_UN_FMT;
         tok->setter = &filters_set_an;
         tok->tag = strdup("AN");
         free(tmp.s);
@@ -1987,6 +2000,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
     }
     else if ( !strcasecmp(tmp.s,"AC") )
     {
+        filter->max_unpack |= BCF_UN_FMT;
         tok->setter = &filters_set_ac;
         tok->tag = strdup("AC");
         free(tmp.s);
@@ -1994,6 +2008,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
     }
     else if ( !strcasecmp(tmp.s,"MAC") )
     {
+        filter->max_unpack |= max_ac_an_unpack(filter->hdr);
         tok->setter = &filters_set_mac;
         tok->tag = strdup("MAC");
         free(tmp.s);
@@ -2001,6 +2016,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
     }
     else if ( !strcasecmp(tmp.s,"AF") )
     {
+        filter->max_unpack |= max_ac_an_unpack(filter->hdr);
         tok->setter = &filters_set_af;
         tok->tag = strdup("AF");
         free(tmp.s);
@@ -2008,6 +2024,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
     }
     else if ( !strcasecmp(tmp.s,"MAF") )
     {
+        filter->max_unpack |= max_ac_an_unpack(filter->hdr);
         tok->setter = &filters_set_maf;
         tok->tag = strdup("MAF");
         free(tmp.s);

From e12f43d709feca601d15ab0f93f9f16142c388b6 Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Thu, 29 Mar 2018 17:28:08 +0100
Subject: [PATCH 24/27] Apply ed286e9 in all the other branches

---
 plugins/setGT.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/plugins/setGT.c b/plugins/setGT.c
index 15976622a..80943ff08 100644
--- a/plugins/setGT.c
+++ b/plugins/setGT.c
@@ -361,6 +361,8 @@ bcf1_t *process(bcf1_t *rec)
 
             if ( args->new_mask&GT_UNPHASED )
                 changed += unphase_gt(ptr, ngts);
+            else if ( args->new_mask==GT_PHASED )
+                changed += phase_gt(ptr, ngts);
             else
                 changed += set_gt(ptr, ngts, args->new_gt);
         }
@@ -392,6 +394,8 @@ bcf1_t *process(bcf1_t *rec)
             if ( !args->smpl_pass[i] ) continue;
             if ( args->new_mask&GT_UNPHASED )
                 changed += unphase_gt(args->gts + i*ngts, ngts);
+            else if ( args->new_mask==GT_PHASED )
+                changed += phase_gt(args->gts + i*ngts, ngts);
             else
                 changed += set_gt(args->gts + i*ngts, ngts, args->new_gt);
         }

From 4479210a0c0182db2e35330eab865cd30b60cceb Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Sat, 31 Mar 2018 09:25:12 +0100
Subject: [PATCH 25/27] Fix bug in indexed ALT access and let STRLEN(".")
 evaluate to 0

- a spurious line was reset the index to 0 after first access
  so that querying ALT[9] resulted in querying ALT[0] after
  the first line

- strlen(ALT[9])==1 was always true because missing value is
  filled. Arguably, the missing string has zero length.

Fixes #767
---
 filter.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/filter.c b/filter.c
index fe17634f3..1c7881ee4 100644
--- a/filter.c
+++ b/filter.c
@@ -922,7 +922,6 @@ static void filters_set_alt_string(filter_t *flt, bcf1_t *line, token_t *tok)
             kputs(line->d.allele[tok->idx + 1], &tok->str_value);
         else
             kputc('.', &tok->str_value);
-        tok->idx = 0;
     }
     else if ( tok->idx==-2 )
     {
@@ -1184,7 +1183,10 @@ static int func_strlen(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **sta
     }
     else
     {
-        rtok->values[0] = strlen(tok->str_value.s);
+        if ( !tok->str_value.s[1] && tok->str_value.s[0]=='.' )
+            rtok->values[0] = 0;
+        else
+            rtok->values[0] = strlen(tok->str_value.s);
         rtok->nvalues = 1;
     }
     return 1;

From 591bf02a9463b69ca8a07bec097fceebfd704f2f Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Sat, 31 Mar 2018 09:37:03 +0100
Subject: [PATCH 26/27] Expand docs to include the csq --gff-annot file format
 description

---
 csq.c             |  2 +-
 doc/bcftools.1    | 61 +++++++++++++++++++++++++++++++++++++++++++----
 doc/bcftools.html | 46 +++++++++++++++++++++++++++++++----
 doc/bcftools.txt  | 33 ++++++++++++++++++++++++-
 4 files changed, 132 insertions(+), 10 deletions(-)

diff --git a/csq.c b/csq.c
index b36ebc32d..848d1eaa8 100644
--- a/csq.c
+++ b/csq.c
@@ -71,7 +71,7 @@
             A .. gene line with a supported biotype
                     A.ID=~/^gene:/
 
-            B .. transcript line referencing A
+            B .. transcript line referencing A with supported biotype
                     B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/
 
             C .. corresponding CDS, exon, and UTR lines:
diff --git a/doc/bcftools.1 b/doc/bcftools.1
index dd40d8d75..92c4e10f8 100644
--- a/doc/bcftools.1
+++ b/doc/bcftools.1
@@ -2,12 +2,12 @@
 .\"     Title: bcftools
 .\"    Author: [see the "AUTHORS" section]
 .\" Generator: DocBook XSL Stylesheets v1.76.1 <http://docbook.sf.net/>
-.\"      Date: 2018-03-15 15:49 GMT
+.\"      Date: 2018-03-30 14:15 BST
 .\"    Manual: \ \&
 .\"    Source: \ \&
 .\"  Language: English
 .\"
-.TH "BCFTOOLS" "1" "2018\-03\-15 15:49 GMT" "\ \&" "\ \&"
+.TH "BCFTOOLS" "1" "2018\-03\-30 14:15 BST" "\ \&" "\ \&"
 .\" -----------------------------------------------------------------
 .\" * Define some portability stuff
 .\" -----------------------------------------------------------------
@@ -41,7 +41,7 @@ Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatica
 BCFtools is designed to work on a stream\&. It regards an input file "\-" as the standard input (stdin) and outputs to the standard output (stdout)\&. Several commands can thus be combined with Unix pipes\&.
 .SS "VERSION"
 .sp
-This manual page was last updated \fB2018\-03\-15 15:49 GMT\fR and refers to bcftools git version \fB1\&.7\-7\-g4dca79a+\fR\&.
+This manual page was last updated \fB2018\-03\-30 14:15 BST\fR and refers to bcftools git version \fB1\&.7\-25\-ge12f43d+\fR\&.
 .SS "BCF1"
 .sp
 The BCF1 format output by versions of samtools <= 0\&.1\&.19 is \fBnot\fR compatible with this version of bcftools\&. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0\&.1\&.19 to convert to VCF, which can then be read by this version of bcftools\&.
@@ -1796,8 +1796,45 @@ reference sequence in fasta format (required)
 \fB\-g, \-\-gff\-annot\fR \fIFILE\fR
 .RS 4
 GFF3 annotation file (required), such as
-ftp://ftp\&.ensembl\&.org/pub/current_gff3/homo_sapiens/
+ftp://ftp\&.ensembl\&.org/pub/current_gff3/homo_sapiens\&. An example of a minimal working GFF file:
 .RE
+.sp
+.if n \{\
+.RS 4
+.\}
+.nf
+    # The program looks for "CDS", "exon", "three_prime_UTR" and "five_prime_UTR" lines,
+    # looks up their parent transcript (determined from the "Parent=transcript:" attribute),
+    # the gene (determined from the transcript\*(Aqs "Parent=gene:" attribute), and the biotype
+    # (the most interesting is "protein_coding")\&.
+    #
+    # Attributes required for
+    #   gene lines:
+    #   \- ID=gene:<gene_id>
+    #   \- biotype=<biotype>
+    #   \- Name=<gene_name>      [optional]
+    #
+    #   transcript lines:
+    #   \- ID=transcript:<transcript_id>
+    #   \- Parent=gene:<gene_id>
+    #   \- biotype=<biotype>
+    #
+    #   other lines (CDS, exon, five_prime_UTR, three_prime_UTR):
+    #   \- Parent=transcript:<transcript_id>
+    #
+    # Supported biotypes:
+    #   \- see the function gff_parse_biotype() in bcftools/csq\&.c
+
+    1   ignored_field  gene            21  2148  \&.   \-   \&.   ID=gene:GeneId;biotype=protein_coding;Name=GeneName
+    1   ignored_field  transcript      21  2148  \&.   \-   \&.   ID=transcript:TranscriptId;Parent=gene:GeneId;biotype=protein_coding
+    1   ignored_field  three_prime_UTR 21  2054  \&.   \-   \&.   Parent=transcript:TranscriptId
+    1   ignored_field  exon            21  2148  \&.   \-   \&.   Parent=transcript:TranscriptId
+    1   ignored_field  CDS             21  2148  \&.   \-   1   Parent=transcript:TranscriptId
+    1   ignored_field  five_prime_UTR  210 2148  \&.   \-   \&.   Parent=transcript:TranscriptId
+.fi
+.if n \{\
+.RE
+.\}
 .PP
 \fB\-i, \-\-include\fR \fIEXPRESSION\fR
 .RS 4
@@ -3856,12 +3893,28 @@ nor the other
 options are given, the allele frequency is estimated from AC and AN counts which are already present in the INFO field\&.
 .RE
 .PP
+\fB\-\-exclude\fR \fIEXPRESSION\fR
+.RS 4
+exclude sites for which
+\fIEXPRESSION\fR
+is true\&. For valid expressions see
+\fBEXPRESSIONS\fR\&.
+.RE
+.PP
 \fB\-G, \-\-GTs\-only\fR \fIFLOAT\fR
 .RS 4
 use genotypes (FORMAT/GT fields) ignoring genotype likelihoods (FORMAT/PL), setting PL of unseen genotypes to
 \fIFLOAT\fR\&. Safe value to use is 30 to account for GT errors\&.
 .RE
 .PP
+\fB\-\-include\fR \fIEXPRESSION\fR
+.RS 4
+include only sites for which
+\fIEXPRESSION\fR
+is true\&. For valid expressions see
+\fBEXPRESSIONS\fR\&.
+.RE
+.PP
 \fB\-I, \-\-skip\-indels\fR
 .RS 4
 skip indels as their genotypes are usually enriched for errors
diff --git a/doc/bcftools.html b/doc/bcftools.html
index 58331e5dd..b722cb27a 100644
--- a/doc/bcftools.html
+++ b/doc/bcftools.html
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp25164528"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipulating VCFs and BCFs.</p></div><div class="refsynopsisdiv" title="Synopsis"><a id="_synopsis"></a><h2>Synopsis</h2><p><span class="strong"><strong>bcftools</strong></span> [--version|--version-only] [--help] [<span class="emphasis"><em>COMMAND</em></span>] [<span class="emphasis"><em>OPTIONS</em></span>]</p></div><div class="refsect1" title="DESCRIPTION"><a id="_description"></a><h2>DESCRIPTION</h2><p>BCFtools  is  a set of utilities that manipulate variant calls in the Variant
+<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp147792"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipulating VCFs and BCFs.</p></div><div class="refsynopsisdiv" title="Synopsis"><a id="_synopsis"></a><h2>Synopsis</h2><p><span class="strong"><strong>bcftools</strong></span> [--version|--version-only] [--help] [<span class="emphasis"><em>COMMAND</em></span>] [<span class="emphasis"><em>OPTIONS</em></span>]</p></div><div class="refsect1" title="DESCRIPTION"><a id="_description"></a><h2>DESCRIPTION</h2><p>BCFtools  is  a set of utilities that manipulate variant calls in the Variant
 Call Format (VCF) and its binary counterpart BCF. All commands work
 transparently with both VCFs and BCFs, both uncompressed and BGZF-compressed.</p><p>Most commands accept VCF, bgzipped VCF and BCF with filetype detected
 automatically even when streaming from a pipe. Indexed VCF and BCF
@@ -8,7 +8,7 @@
 work in most, but not all situations. In general, whenever multiple VCFs are
 read simultaneously, they must be indexed and therefore also compressed.</p><p>BCFtools is designed to work on a stream. It regards an input file "-" as the
 standard input (stdin) and outputs to the standard output (stdout). Several
-commands can thus be  combined  with  Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2018-03-15 15:49 GMT</strong></span> and refers to bcftools git version <span class="strong"><strong>1.7-7-g4dca79a+</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools &lt;= 0.1.19 is <span class="strong"><strong>not</strong></span>
+commands can thus be  combined  with  Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2018-03-30 14:15 BST</strong></span> and refers to bcftools git version <span class="strong"><strong>1.7-25-ge12f43d+</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools &lt;= 0.1.19 is <span class="strong"><strong>not</strong></span>
 compatible with this version of bcftools. To read BCF1 files one can use
 the view command from old versions of bcftools packaged with samtools
 versions &lt;= 0.1.19 to convert to VCF, which can then be read by
@@ -1036,8 +1036,36 @@
 </dd><dt><span class="term">
 <span class="strong"><strong>-g, --gff-annot</strong></span> <span class="emphasis"><em>FILE</em></span>
 </span></dt><dd>
-    GFF3 annotation file (required), such as <a class="ulink" href="ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/" target="_top">ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/</a>
-</dd><dt><span class="term">
+    GFF3 annotation file (required), such as <a class="ulink" href="ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens" target="_top">ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens</a>.
+    An example of a minimal working GFF file:
+</dd></dl></div><pre class="screen">    # The program looks for "CDS", "exon", "three_prime_UTR" and "five_prime_UTR" lines,
+    # looks up their parent transcript (determined from the "Parent=transcript:" attribute),
+    # the gene (determined from the transcript's "Parent=gene:" attribute), and the biotype
+    # (the most interesting is "protein_coding").
+    #
+    # Attributes required for
+    #   gene lines:
+    #   - ID=gene:&lt;gene_id&gt;
+    #   - biotype=&lt;biotype&gt;
+    #   - Name=&lt;gene_name&gt;      [optional]
+    #
+    #   transcript lines:
+    #   - ID=transcript:&lt;transcript_id&gt;
+    #   - Parent=gene:&lt;gene_id&gt;
+    #   - biotype=&lt;biotype&gt;
+    #
+    #   other lines (CDS, exon, five_prime_UTR, three_prime_UTR):
+    #   - Parent=transcript:&lt;transcript_id&gt;
+    #
+    # Supported biotypes:
+    #   - see the function gff_parse_biotype() in bcftools/csq.c
+
+    1   ignored_field  gene            21  2148  .   -   .   ID=gene:GeneId;biotype=protein_coding;Name=GeneName
+    1   ignored_field  transcript      21  2148  .   -   .   ID=transcript:TranscriptId;Parent=gene:GeneId;biotype=protein_coding
+    1   ignored_field  three_prime_UTR 21  2054  .   -   .   Parent=transcript:TranscriptId
+    1   ignored_field  exon            21  2148  .   -   .   Parent=transcript:TranscriptId
+    1   ignored_field  CDS             21  2148  .   -   1   Parent=transcript:TranscriptId
+    1   ignored_field  five_prime_UTR  210 2148  .   -   .   Parent=transcript:TranscriptId</pre><div class="variablelist"><dl><dt><span class="term">
 <span class="strong"><strong>-i, --include</strong></span> <span class="emphasis"><em>EXPRESSION</em></span>
 </span></dt><dd>
     include only sites for which <span class="emphasis"><em>EXPRESSION</em></span> is true. For valid expressions see
@@ -2279,12 +2307,22 @@
     If neither <span class="strong"><strong>-e</strong></span> nor the other <span class="strong"><strong>--AF-…</strong></span> options are given, the allele frequency is
     estimated from AC and AN counts which are already present in the INFO field.
 </dd><dt><span class="term">
+<span class="strong"><strong>--exclude</strong></span> <span class="emphasis"><em>EXPRESSION</em></span>
+</span></dt><dd>
+    exclude sites for which <span class="emphasis"><em>EXPRESSION</em></span> is true. For valid expressions see
+    <span class="strong"><strong><a class="link" href="#expressions" title="EXPRESSIONS">EXPRESSIONS</a></strong></span>.
+</dd><dt><span class="term">
 <span class="strong"><strong>-G, --GTs-only</strong></span> <span class="emphasis"><em>FLOAT</em></span>
 </span></dt><dd>
     use genotypes (FORMAT/GT fields) ignoring genotype likelihoods (FORMAT/PL),
     setting PL of unseen genotypes to <span class="emphasis"><em>FLOAT</em></span>. Safe value to use is 30 to
     account for GT errors.
 </dd><dt><span class="term">
+<span class="strong"><strong>--include</strong></span> <span class="emphasis"><em>EXPRESSION</em></span>
+</span></dt><dd>
+    include only sites for which <span class="emphasis"><em>EXPRESSION</em></span> is true. For valid expressions see
+    <span class="strong"><strong><a class="link" href="#expressions" title="EXPRESSIONS">EXPRESSIONS</a></strong></span>.
+</dd><dt><span class="term">
 <span class="strong"><strong>-I, --skip-indels</strong></span>
 </span></dt><dd>
     skip indels as their genotypes are usually enriched for errors
diff --git a/doc/bcftools.txt b/doc/bcftools.txt
index 44a15373f..7d4578fe3 100644
--- a/doc/bcftools.txt
+++ b/doc/bcftools.txt
@@ -1064,7 +1064,38 @@ output VCF and are ignored for the prediction analysis.
     reference sequence in fasta format (required)
 
 *-g, --gff-annot* 'FILE'::
-    GFF3 annotation file (required), such as ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/
+    GFF3 annotation file (required), such as ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens.
+    An example of a minimal working GFF file:
+----
+    # The program looks for "CDS", "exon", "three_prime_UTR" and "five_prime_UTR" lines,
+    # looks up their parent transcript (determined from the "Parent=transcript:" attribute),
+    # the gene (determined from the transcript's "Parent=gene:" attribute), and the biotype
+    # (the most interesting is "protein_coding").
+    #
+    # Attributes required for
+    #   gene lines:
+    #   - ID=gene:<gene_id> 
+    #   - biotype=<biotype>
+    #   - Name=<gene_name>      [optional]
+    #
+    #   transcript lines:
+    #   - ID=transcript:<transcript_id>
+    #   - Parent=gene:<gene_id>
+    #   - biotype=<biotype>
+    #
+    #   other lines (CDS, exon, five_prime_UTR, three_prime_UTR):
+    #   - Parent=transcript:<transcript_id>
+    #
+    # Supported biotypes:
+    #   - see the function gff_parse_biotype() in bcftools/csq.c
+
+    1   ignored_field  gene            21  2148  .   -   .   ID=gene:GeneId;biotype=protein_coding;Name=GeneName
+    1   ignored_field  transcript      21  2148  .   -   .   ID=transcript:TranscriptId;Parent=gene:GeneId;biotype=protein_coding
+    1   ignored_field  three_prime_UTR 21  2054  .   -   .   Parent=transcript:TranscriptId
+    1   ignored_field  exon            21  2148  .   -   .   Parent=transcript:TranscriptId
+    1   ignored_field  CDS             21  2148  .   -   1   Parent=transcript:TranscriptId
+    1   ignored_field  five_prime_UTR  210 2148  .   -   .   Parent=transcript:TranscriptId
+----
 
 *-i, --include* 'EXPRESSION'::
     include only sites for which 'EXPRESSION' is true. For valid expressions see

From ad7d585f09d83dc58c5c80c3dd6d0a318abfa259 Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Sat, 31 Mar 2018 09:49:17 +0100
Subject: [PATCH 27/27] Update NEWS

---
 NEWS | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/NEWS b/NEWS
index ced627f20..454960de5 100644
--- a/NEWS
+++ b/NEWS
@@ -1,5 +1,20 @@
 ## Release a.b
 
+## Release 1.8 (April 2018)
+
+* `-i, -e` filtering: Support for custom perl scripts
+
+* `+contrast`: New plugin to annotate genotype differences between groups of samples
+
+* `+fixploidy`: New options for simpler ploidy usage
+
+* `+setGT`: Target genotypes can be set to phased by giving `--new-gt p`
+
+* `run-roh.pl`: Allow to pass options directly to `bcftools roh`
+
+* Number of bug fixes
+
+
 ## Release 1.7 (February 2018)
 
 * `-i, -e` filtering: Major revamp, improved filtering by FORMAT fields