Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
baffe47
set htslib=1.2.1, pbwt=3.1 and link against bsd
mcshane Apr 22, 2015
6662992
change allele dosage annotation from AD to ADS to avoid collision wit…
mcshane Apr 22, 2015
f490a8e
Merge pull request #1 from mcshane/feature/read_write_samples
mcshane May 12, 2015
8b1a95d
Merge pull request #2 from mcshane/feature/log
mcshane May 12, 2015
a0ad99b
update referenceImpute test files so that the test actually does some…
mcshane Feb 10, 2016
f80da84
first pass at adding support for chrX and chrY
mcshane Feb 10, 2016
257a60d
makefile addition to allow linking against htslib-with plugins
mcshane Feb 12, 2016
85a0a90
Merge branch 'master' of github.com:richarddurbin/pbwt into feature/p…
mcshane Mar 11, 2016
42200ae
further ploidy updates
mcshane May 4, 2016
41afc51
subsetting fixes
mcshane May 11, 2016
bd9a8d3
revert the recent `imputeMissing` addition to `referenceImpute`
mcshane May 11, 2016
1d940e4
include `removeSamples` in the list of options
mcshane May 11, 2016
f65287d
referenceImpute now records sites which were typed
mcshane May 11, 2016
c0c6ae7
reverse logic of `Site->typed` to `Site->isImputed`
mcshane May 13, 2016
61971e5
code cleanup
mcshane May 13, 2016
a880d7e
clean up sample handling
mcshane May 13, 2016
ecd9fbc
remove need for global `isX` and `isY` variables
mcshane May 16, 2016
23b7f58
remove some code duplication
mcshane May 19, 2016
7664577
in progress: changes to metadata handling
mcshane May 19, 2016
f11ff51
progress commit
mcshane Dec 7, 2016
1e62967
Fix in -X -loadSamples functionality
pd3 Dec 20, 2016
9d20e91
Prevent segfault if empty file was given
pd3 Jan 5, 2017
968380f
Merge pull request #1 from pd3/feature/ploidy
mcshane Jan 5, 2017
9e6f79a
Merge remote-tracking branch 'feature/ploidy' into production
pd3 Jan 11, 2017
c5cf324
Clean memory leaks
pd3 Jan 14, 2017
8ce3e4d
Tentative fix for dosages of typed genotypes, force them to {0,1}
pd3 Jan 14, 2017
f682390
The samples array is padded with an empty name
pd3 Jan 20, 2017
0bf5255
Prevent segfault if no arguments were given
pd3 Jan 20, 2017
e74f522
Fix a typo: iterate over all ALT alleles, do not get stuck on the first
pd3 Jan 20, 2017
1071aff
Important bug fixed: incorrect overlapping segments were used with -r…
pd3 Jan 20, 2017
832335e
Merge remote-tracking branch 'pd3/feature/ploidy/feature/ploidy' into…
pd3 Jan 30, 2017
56e7f61
Sanity check for {0,1} dosages at typed markers
pd3 Jan 30, 2017
4705410
Bump minor version to 3.1
pd3 Jan 30, 2017
017fdc9
Similarly to f682390, the samples array is padded with an empty name
pd3 Feb 2, 2017
bf6ebe2
Fix the -X -loadSamples functionality
pd3 Feb 2, 2017
00e0b52
in pbwtImpute3() disable sparse, correct stop point for match countin…
richarddurbin Nov 13, 2017
ebb9d13
comment changes plus framework for imputing missing to major allele
richarddurbin Nov 13, 2017
dca08a0
Fill missing target genotypes intersecting with reference panel with …
pd3 Nov 15, 2017
9a7a038
Updated tests
pd3 Nov 15, 2017
f09141f
Remove forgotten debugging messages
pd3 Nov 15, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ CFLAGS= -g -O3
CPPFLAGS=-I$(HTSDIR)
HTSDIR=../htslib
HTSLIB=$(HTSDIR)/libhts.a
LDLIBS=-lpthread -lz -lm $(HTSLIB)
LDLIBS=-lpthread -lz -lm $(HTSLIB) -ldl
LDFLAGS=-rdynamic

all: pbwt

Expand Down
36 changes: 23 additions & 13 deletions pbwt.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

#include "utils.h"

const static int pbwtMajorVersion = 3, pbwtMinorVersion = 0 ;
const static int pbwtMajorVersion = 3, pbwtMinorVersion = 1 ;

const char *pbwtCommitHash(void);
const char *pbwtHtslibVersionString(void);
Expand All @@ -34,7 +34,7 @@ typedef unsigned char uchar ;

typedef struct PBWTstruct {
int N ; /* number of sites */
int M ; /* number of samples */
int M ; /* number of haplotypes */
char* chrom ; /* chromosome name */
Array sites ; /* array of Site */
Array samples ; /* array of int index into global samples */
Expand All @@ -50,6 +50,8 @@ typedef struct PBWTstruct {
Array dosageOffset ; /* of long, site index into zDosage, 0 if no dosage data */
BOOL isRefFreq ; /* some flags for the whole VCF */
BOOL isUnphased ;
BOOL isX; /* chromosome is nonPAR region of chrX, treat male samples as haploid, female samples as diploid */
BOOL isY; /* chromosome is chrY, treat male samples as haploid and ignore female samples */
} PBWT ;

/* philosophy is to be lazy about PBWT - only fill items for which we have info */
Expand All @@ -60,15 +62,18 @@ typedef struct SiteStruct {
double freq ; /* frequency */
double refFreq ; /* frequency from reference used for last phasing or imputation */
double imputeInfo ; /* estimated r^2 from imputation */
BOOL isImputed ; /* TRUE if site was imputed */
} Site ;

typedef struct SampleStruct {
int nameD ; /* index in sampleDict */
int father ; /* index into samples */
int mother ; /* index into samples */
int father ; /* index in sampleDict */
int mother ; /* index in sampleDict */
int family ; /* index in familyDict */
int popD ; /* index in populationDict */
BOOL isMale ; /* treat X chromosome as haploid */
BOOL isFemale ; /* treat X chromosome as diploid and ignore Y */
BOOL isMale ; /* treat X chromosome as haploid, Y chromosome as haploid */
BOOL isFemale ; /* treat X chromosome as diploid and ignore for Y */
Array metaData ; // sample metadata
} Sample ;

typedef struct { /* data structure for moving forwards - doesn't know PBWT */
Expand All @@ -89,7 +94,7 @@ typedef struct { /* data structure for moving forwards - doesn't know PBWT */
/* pbwtMain.c */

extern char *commandLine ; /* a copy of the command line */
extern FILE *logFile ; /* log file pointer */
extern FILE *logFile ; /* log file pointer */

/* pbwtCore.c */

Expand Down Expand Up @@ -152,13 +157,18 @@ int extendPackedBackwards (uchar *yzp, int M, int *f, int c, uchar *zp) ; /* mov

void sampleInit (void) ;
void sampleDestroy (void) ;
Sample *sample (PBWT *p, int i) ; /* give back Sample structure for sample i from p */
int sampleAdd (char* name, char *father, char *mother, char *pop) ;
int sampleAdd (char *name) ;
int sampleCount (void);
Sample *sample (int i) ; /* give back Sample structure for sample i, where i is an index into sampleDict */
#define pbwtSample(p,i) sample(arr(p->samples,i,int)) /* give back Sample structure for sample i, where i the index into p->samples array */
int pbwtSamplePloidy(PBWT *p, int i) ;
char* sampleName (Sample *s) ;
char* popName (Sample *s) ; /* give back population name for sample i */
char* popName (Sample *s, char *name) ; /* give back population name for Sample s */
char* familyName (Sample *s, char *name) ; /* give back family name for Sample s */
PBWT *pbwtSubSample (PBWT *pOld, Array select) ;
PBWT *pbwtSubSampleInterval (PBWT *pOld, int start, int Mnew) ;
PBWT *pbwtSelectSamples (PBWT *pOld, FILE *fp) ;
PBWT *pbwtRemoveSamples (PBWT *pOld, FILE *fp) ;

/* pbwtIO.c */

Expand All @@ -177,8 +187,8 @@ PBWT *pbwtRead (FILE *fp) ;
Array pbwtReadSitesFile (FILE *fp, char **chrom) ;
void pbwtReadSites (PBWT *p, FILE *fp) ;
void pbwtReadRefFreq (PBWT *p, FILE *fp) ;
Array pbwtReadSamplesFile (FILE *fp) ;
void pbwtReadSamples (PBWT *p, FILE *fp) ;
Array pbwtReadSamplesFile (FILE *fp) ; /* read samples and sample metadata from a file */
void pbwtReadSamples (PBWT *p, FILE *fp) ; /* read samples and sample metadata after pbwt read from file */
void pbwtReadMissing (PBWT *p, FILE *fp) ;
void pbwtReadDosage (PBWT *p, FILE *fp) ;
void pbwtReadReverse (PBWT *p, FILE *fp) ;
Expand All @@ -197,7 +207,7 @@ void pbwtCheckPoint (PbwtCursor *u, PBWT *p) ; /* need cursor to write end index

/* pbwtHtslib.c */
/* all these functions also read and write samples and sites */
PBWT *pbwtReadVcfGT (char *filename) ; /* read GTs from vcf/bcf using htslib */
PBWT *pbwtReadVcfGT (char *filename, int isXY) ; /* read GTs from vcf/bcf using htslib; isX: isXY == 2; isY: isXY == 1 */
PBWT *pbwtReadVcfPL (char *filename) ; /* read PLs from vcf/bcf using htslib */
// mode: wb=compressed BCF; wbu=uncompressed BCF; wz=compressed VCF; w=uncompressed VCF
void pbwtWriteVcf (PBWT *p, char *filename, char *reference_fname, char *mode) ; /* write vcf/bcf using htslib */
Expand Down
100 changes: 73 additions & 27 deletions pbwtCore.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ void pbwtInit (void)
variationDict = dictCreate (32) ;
pack3init () ;
sampleInit () ;
metaDataInit () ;
}

PBWT *pbwtCreate (int M, int N)
Expand All @@ -44,7 +45,6 @@ PBWT *pbwtCreate (int M, int N)

p->M = M ; p->N = N ;
p->aFstart = myalloc (M, int) ; int i ; for (i = 0 ; i < M ; ++i) p->aFstart[i] = i ;

return p ;
}

Expand Down Expand Up @@ -104,6 +104,7 @@ PBWT *pbwtSubSites (PBWT *pOld, double fmin, double frac)
pNew->samples = pOld->samples ; pOld->samples = 0 ;
pNew->missingOffset = pOld->missingOffset ; pOld->missingOffset = 0 ;
pNew->zMissing = pOld->zMissing ; pOld->zMissing = 0 ;
pNew->isX = pOld->isX ; pNew->isY = pOld->isY ;
pbwtDestroy (pOld) ; pbwtCursorDestroy (uOld) ; pbwtCursorDestroy (uNew) ;
free(x) ;
return pNew ;
Expand Down Expand Up @@ -141,6 +142,7 @@ PBWT *pbwtSubRange (PBWT *pOld, int start, int end)
pNew->samples = pOld->samples ; pOld->samples = 0 ;
pNew->missingOffset = pOld->missingOffset ; pOld->missingOffset = 0 ;
pNew->zMissing = pOld->zMissing ; pOld->zMissing = 0 ;
pNew->isX = pOld->isX ; pNew->isY = pOld->isY ;
pbwtDestroy (pOld) ; pbwtCursorDestroy (uOld) ; pbwtCursorDestroy (uNew) ;
free(x) ;
return pNew ;
Expand Down Expand Up @@ -618,7 +620,6 @@ void pbwtCursorForwardsAPacked (PbwtCursor *u)
memcpy (u->a+u->c, u->b, (u->M-u->c)*sizeof(int)) ;
}


/***************************************************/

PBWT *pbwtSelectSites (PBWT *pOld, Array sites, BOOL isKeepOld)
Expand All @@ -630,6 +631,10 @@ PBWT *pbwtSelectSites (PBWT *pOld, Array sites, BOOL isKeepOld)
PbwtCursor *uNew = pbwtCursorCreate (pNew, TRUE, TRUE) ;
uchar *x = myalloc (pNew->M, uchar) ;

int nMissingSites = 0 ;
uchar *xMissing = myalloc (pNew->M+1, uchar) ;
xMissing[pNew->M] = Y_SENTINEL ; /* needed for efficient packing */

pNew->sites = arrayCreate (arrayMax(sites), Site) ;
while (ip < pOld->N && ia < arrayMax(sites))
{ if (sp->x < sa->x)
Expand All @@ -648,19 +653,42 @@ PBWT *pbwtSelectSites (PBWT *pOld, Array sites, BOOL isKeepOld)
}
else if (!noAlt && sp->varD > sa->varD) { ++ia ; ++sa ; }
else
{ array(pNew->sites,pNew->N++,Site) = *sp ;
++ip ; ++sp ; ++ia ; ++sa ;
{ array(pNew->sites,pNew->N,Site) = *sp ;
for (j = 0 ; j < pOld->M ; ++j) x[uOld->a[j]] = uOld->y[j] ;
pbwtCursorForwardsRead (uOld) ;
for (j = 0 ; j < pNew->M ; ++j) uNew->y[j] = x[uNew->a[j]] ;
pbwtCursorWriteForwards (uNew) ;

if (pOld->missingOffset)
{ if (!pNew->missingOffset)
{ pNew->zMissing = arrayCreate (10000, uchar) ;
array(pNew->zMissing, 0, uchar) = 0 ; /* needed so missing[] has offset > 0 */
pNew->missingOffset = arrayCreate (1024, long) ;
}
if (!arr(pOld->missingOffset,ip,long)) bzero (xMissing, pNew->M) ;
else unpack3 (arrp(pOld->zMissing, arr(pOld->missingOffset,ip,long), uchar), pNew->M, xMissing, 0) ;
if (arr(pOld->missingOffset,ip,long))
{
array(pNew->missingOffset,pNew->N,long) = arrayMax(pNew->zMissing) ;
pack3arrayAdd (xMissing,pNew->M,pNew->zMissing) ; /* NB original order, not pbwt sort */
nMissingSites++ ;
}
else
array(pNew->missingOffset,pNew->N,long) = 0 ;
}
++ip ; ++sp ; ++ia ; ++sa ; pNew->N++ ;
}
}
}
pbwtCursorToAFend (uNew, pNew) ;
free(xMissing) ;

// TODO: subset dosage

fprintf (logFile, "%d sites selected from %d, pbwt size for %d haplotypes is %ld\n",
pNew->N, pOld->N, pNew->M, arrayMax(pNew->yz)) ;
fprintf (logFile, "%d sites selected from %d, %d missing sites, pbwt size for %d haplotypes is %ld\n",
pNew->N, pOld->N, nMissingSites, pNew->M, arrayMax(pNew->yz)) ;

pNew->isX = pOld->isX ; pNew->isY = pOld->isY ;

if (isKeepOld)
{ if (pOld->samples) pNew->samples = arrayCopy (pOld->samples) ;
Expand All @@ -674,6 +702,10 @@ PBWT *pbwtSelectSites (PBWT *pOld, Array sites, BOOL isKeepOld)
else
{ pNew->chrom = pOld->chrom ; pOld->chrom = 0 ;
pNew->samples = pOld->samples ; pOld->samples = 0 ;
if (pOld->missingOffset) arrayDestroy (pOld->missingOffset) ;
pOld->missingOffset = 0 ;
if (pOld->zMissing) arrayDestroy (pOld->zMissing) ;
pOld->zMissing = 0 ;
pbwtDestroy (pOld) ;
}

Expand All @@ -692,35 +724,48 @@ PBWT *pbwtRemoveSites (PBWT *pOld, Array sites, BOOL isKeepOld)
PbwtCursor *uNew = pbwtCursorCreate (pNew, TRUE, TRUE) ;
uchar *x = myalloc (pNew->M, uchar) ;

int nMissingSites = 0 ;
uchar *xMissing = myalloc (pNew->M+1, uchar) ;
xMissing[pNew->M] = Y_SENTINEL ; /* needed for efficient packing */

pNew->sites = arrayCreate (arrayMax(sites), Site) ;
while (ip < pOld->N && ia < arrayMax(sites))
{ if (sp->x < sa->x)
{ array(pNew->sites,pNew->N++,Site) = *sp ;
++ip ; ++sp ;
for (j = 0 ; j < pOld->M ; ++j) x[uOld->a[j]] = uOld->y[j] ;
pbwtCursorForwardsRead (uOld) ;
for (j = 0 ; j < pNew->M ; ++j) uNew->y[j] = x[uNew->a[j]] ;
pbwtCursorWriteForwards (uNew) ;
}
else if (sp->x > sa->x) { ++ia ; ++sa ; }
else if (sp->varD < sa->varD)
{ array(pNew->sites,pNew->N++,Site) = *sp ;
++ip ; ++sp ;
for (j = 0 ; j < pOld->M ; ++j) x[uOld->a[j]] = uOld->y[j] ;
pbwtCursorForwardsRead (uOld) ;
for (j = 0 ; j < pNew->M ; ++j) uNew->y[j] = x[uNew->a[j]] ;
pbwtCursorWriteForwards (uNew) ;
}
else if (sp->varD > sa->varD) { ++ia ; ++sa ; }
while (ip < pOld->N)
{ if (ia>=arrayMax(sites) || sp->x < sa->x || (sp->varD < sa->varD && sp->x <= sa->x))
{ array(pNew->sites,pNew->N,Site) = *sp ;
for (j = 0 ; j < pOld->M ; ++j) x[uOld->a[j]] = uOld->y[j] ;
pbwtCursorForwardsRead (uOld) ;
for (j = 0 ; j < pNew->M ; ++j) uNew->y[j] = x[uNew->a[j]] ;
pbwtCursorWriteForwards (uNew) ;

if (pOld->missingOffset)
{ if (!pNew->missingOffset)
{ pNew->zMissing = arrayCreate (10000, uchar) ;
array(pNew->zMissing, 0, uchar) = 0 ; /* needed so missing[] has offset > 0 */
pNew->missingOffset = arrayCreate (1024, long) ;
}
if (!arr(pOld->missingOffset,ip,long)) bzero (xMissing, pNew->M) ;
else unpack3 (arrp(pOld->zMissing, arr(pOld->missingOffset,ip,long), uchar), pNew->M, xMissing, 0) ;
if (arr(pOld->missingOffset,ip,long))
{
array(pNew->missingOffset,pNew->N,long) = arrayMax(pNew->zMissing) ;
pack3arrayAdd (xMissing,pNew->M,pNew->zMissing) ; /* NB original order, not pbwt sort */
nMissingSites++ ;
}
else
array(pNew->missingOffset,pNew->N,long) = 0 ;
}
++ip ; ++sp ; pNew->N++ ;
}
else if (sp->x > sa->x || sp->varD > sa->varD) { ++ia ; ++sa ; }
else
{ ++ip ; ++sp ; ++ia ; ++sa ;
pbwtCursorForwardsRead (uOld) ;
}
}
pbwtCursorToAFend (uNew, pNew) ;

fprintf (logFile, "%d sites selected from %d, pbwt size for %d haplotypes is %ld\n",
pNew->N, pOld->N, pNew->M, arrayMax(pNew->yz)) ;
fprintf (logFile, "%d sites selected from %d, %d missing sites, pbwt size for %d haplotypes is %ld\n",
pNew->N, pOld->N, nMissingSites, pNew->M, arrayMax(pNew->yz)) ;

if (isKeepOld)
{ if (pOld->samples) pNew->samples = arrayCopy (pOld->samples) ;
Expand All @@ -736,6 +781,7 @@ PBWT *pbwtRemoveSites (PBWT *pOld, Array sites, BOOL isKeepOld)
pNew->samples = pOld->samples ; pOld->samples = 0 ;
pbwtDestroy (pOld) ;
}
pNew->isX = pOld->isX ; pNew->isY = pOld->isY ;

free(x) ; pbwtCursorDestroy (uOld) ; pbwtCursorDestroy (uNew) ;
return pNew ;
Expand Down
Loading