tests.stu

# Transient target (the leading @ means no file of this name is produced).
# It depends on the imputations.txt result of every enabled regression test,
# so building it runs the whole suite.  Entries that are commented out are
# tests that are currently disabled; trailing comments give rough run times.
@all.tests
:
    # tests/sub1KG-chr19-18M/dense.tags.reimputeall/imputations.txt # 4 minutes
    # tests/sub1KG-chr19-18M/dense.tags/imputations.txt # 4 minutes

    tests/missingness.partchr22/none/imputations.txt # 90 seconds ?

    tests/missingness1/none/imputations.txt # 1 minute ?
    tests/missingness1/dep/imputations.txt # 1 minute ?
    tests/missingness1/ind/imputations.txt # 1 minute ?

    tests/sub1KG-tiny/EUR.via.filter.all22.reimputeall/imputations.txt
    tests/sub1KG-tiny/EUR.via.filter.all22/imputations.txt

    tests/errors.and.logging/log/imputations.txt

    tests/lambda/0.001/imputations.txt
    tests/lambda/0.0891756/imputations.txt
    tests/lambda/2sqrtn/imputations.txt
    tests/lambda/0.999/imputations.txt
    tests/lambda/default/imputations.txt

    tests/lambda1.0/imputations.txt
    tests/test.whatever/imputations.txt
    tests/3.chromosomes/UKB.every100/imputations.txt
    tests/impute.range/chr1_1000000-chr1_2000000/imputations.txt
    tests/impute.range/chr2-3/imputations.txt
    tests/impute.range/chr2/imputations.txt
    tests/impute.maf/0.05/imputations.txt
    tests/tags.maf/tags.maf.0.01/imputations.txt

    tests/tags.range/chr1_1000000-chr1_2000000/imputations.txt
    tests/tags.range/chr2-3/imputations.txt
    tests/tags.range/chr2/imputations.txt

    tests/sub1KG-tiny/EUR.via.filter.all22..impute.snps/imputations.txt
    tests/sub1KG-tiny/EUR.via.filter.all22..tag.snps/imputations.txt
    tests/sub1KG-tiny/EUR.via.filter.all22..impute.maf/imputations.txt
    tests/sub1KG-tiny/EUR.via.filter.all22..tag.maf/imputations.txt
    tests/sub1KG-tiny/EUR.via.filter.all22..impute.range/imputations.txt
    tests/sub1KG-tiny/EUR.via.filter.all22..tag.range/imputations.txt


    tests/swapping/swap.just.the.allele.headers/imputations.txt
    tests/swapping/swap.some.alleles/imputations.txt
    tests/swapping/swap.some.alleles.and.their.zs/imputations.txt

    tests/test.wood/imputations.txt

    # tests/test0/imputations.txt # reference build undetectable. I think those five SNPs aren't in my database. Must check this again

    tests/sub1KG-tiny/EUR.via.filter.chr4.5/imputations.txt
;

    # Run a single regression test.  The test lives in tests/<testname>/ and is
    # driven by the bash script tests/<testname>/command, which is re-run
    # whenever that script or the bin/ssimp binary changes.
    # NOTE(review): imputations.txt itself is presumably written by the command
    # script through OUT_IMPUTATIONS - confirm against a command file.
    tests/${testname}/imputations.txt
:
    bin/ssimp
    tests/${testname}/command
{
    # Exported so that the command script started below can read them.
    export SSIMP=bin/ssimp
    export REF=ref
    export GWAS=gwas
    export COMMANDDIR="tests/${testname}"
    export PROF_CHANGE_DIR_AT_THE_LAST_MINUTE="${COMMANDDIR}"
    export OUTPUT="${COMMANDDIR}"/output.actual.tmp
    export OUT_IMPUTATIONS="${COMMANDDIR}"/imputations.txt
    export FORCE_THOUSANDS_SEPARATOR="'"

    # Profiling data file that a profiled binary may leave in the test directory.
    gmon_out="${PROF_CHANGE_DIR_AT_THE_LAST_MINUTE}"/gmon.out

    # Discard profiling data from an earlier run; it is fine if there is none.
    rm "$gmon_out" 2>/dev/null || true

    bash "${COMMANDDIR}"/command

    # Delete the version banner line, i.e. a whole line of the form [ssimp-...]
    sed -i '/^\[ssimp-.*\]$/d' "$OUTPUT"

    # If this run produced profiling data, turn it into a readable report and
    # remove the raw file.
    if [ -e "$gmon_out" ]; then
        gprof "${SSIMP}" "$gmon_out" > "${PROF_CHANGE_DIR_AT_THE_LAST_MINUTE}"/gmon.txt
        rm "$gmon_out"
    fi

    # Publish the captured output under its final name only after everything
    # above has succeeded.
    mv "$OUTPUT" "${COMMANDDIR}"/output.actual
}

# Comparison report for test number ${testname}: runs tests.compare.R and,
# through the > redirection, stores whatever the script prints in the target.
# Inputs are the two result files tests/test<N>/imputations.txt and
# tests/test<N>.R/imputations.txt.
# tests.compare.R is listed as a dependency so that editing the script
# rebuilds the report.  The directory is a persistent (-p) dependency: it only
# has to exist, its timestamp is ignored.
> tests.compare/COMP${testname}.txt :
    tests/test${testname}/imputations.txt
    tests/test${testname}.R/imputations.txt
    tests.compare.R
    -p tests.compare
{
    mkdir -p tests.compare
    # load up both, and print the correlations between the results
    Rscript tests.compare.R
}


# Small reference panel for chromosome ${chrm}: the header line plus all
# records with position (column 2) from 18000000 up to, but not including,
# 18500000, taken from the full 1000 Genomes phase 3 VCF.  The single command
# produces both the bgzip-compressed VCF and its tabix index.
ref/sub1KG-18M/chr${chrm}.vcf.gz
ref/sub1KG-18M/chr${chrm}.vcf.gz.tbi
: -t @mkdirfile..ref/sub1KG-18M/chr${chrm}.vcf.gz
{
    # Fail early when the source VCF is not available.
    test -f ref/link.to.1kg.data/ALL.chr${chrm}.phase3_shapeit2_mvncall_integrated_v*.20130502.genotypes.vcf.gz

    # Stage the output next to the final target instead of under a fixed name
    # in /tmp: the other panel rules in this file used the same /tmp name, so
    # parallel builds could overwrite each other, and a rename within one
    # directory never has to copy across filesystems.
    staging="ref/sub1KG-18M/chr${chrm}.vcf.gz.tmp"

    zcat ref/link.to.1kg.data/ALL.chr${chrm}.phase3_shapeit2_mvncall_integrated_v*.20130502.genotypes.vcf.gz |
        grep -E -v '^##' | {
            # The first remaining line is the column header: pass it through
            # unchanged, then let awk select the position window and stop
            # reading as soon as the window has been passed.
            IFS='' read -r header_line
            printf "%s\n" "$header_line"
            awk '$2 >= 18500000{exit} ; $2>=18000000'
        } | bgzip > "$staging"

    # The staging name does not end in .vcf.gz, so the format is given explicitly.
    tabix -p vcf "$staging"

    # Data first, index second.
    mv "$staging"     ref/sub1KG-18M/chr${chrm}.vcf.gz
    mv "$staging".tbi ref/sub1KG-18M/chr${chrm}.vcf.gz.tbi
}
# Tiny reference panel for chromosome ${chrm}: a thinned copy of the
# sub1KG-18M panel that gives preference to tag SNPs, i.e. markers listed in
# GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt.gz.
# NOTE(review): the command reads ref/sub1KG-18M/chr<N>.vcf.gz but that file is
# not declared as a dependency, so it must already exist - consider adding it.
ref/sub1KG-tiny/chr${chrm}.vcf.gz
ref/sub1KG-tiny/chr${chrm}.vcf.gz.tbi
: -t @mkdirfile..ref/sub1KG-tiny/chr${chrm}.vcf.gz
{
    # Stage the output next to the final target instead of under a fixed name
    # in /tmp, which the other panel rules in this file used as well.
    staging="ref/sub1KG-tiny/chr${chrm}.vcf.gz.tmp"

    # The script receives the uncompressed panel as its first argument and the
    # marker names as its second.  Nothing is piped into stdin any more: the
    # script never read it, so the panel was being decompressed twice.
    python <(cat<<"EOF"
# python script starts here
# Written to run unchanged under Python 2 and Python 3.

import sys

def main():
    # argv[2]: one marker name per line, with the MarkerName header skipped
    tags = set( l.strip() for l in open(sys.argv[2]) if l != 'MarkerName\n' )
    # argv[1]: the VCF text, whose first line is the column header
    vcf = open(sys.argv[1])
    sys.stdout.write(next(vcf))
    for v in vcf:
        v_rs = v.split('\t')[2]   # third column holds the marker id
        if v_rs in tags:
            if v_rs[-1:] == '1':  # id ends in 1: roughly 10% of the tags
                sys.stdout.write(v)
        else:
            if v_rs[-2:] == '00': # id ends in 00: roughly 1% of the non-tags
                sys.stdout.write(v)


main()
EOF
) <(zcat ref/sub1KG-18M/chr${chrm}.vcf.gz) <(zcat gwas/GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt.gz | cut -f1) |
        bgzip > "$staging"

    # The staging name does not end in .vcf.gz, so the format is given explicitly.
    tabix -p vcf "$staging"

    # Data first, index second.
    mv "$staging"     ref/sub1KG-tiny/chr${chrm}.vcf.gz
    mv "$staging".tbi ref/sub1KG-tiny/chr${chrm}.vcf.gz.tbi
}
# Reference panel for chromosome ${chrm} restricted to positions 15000000 up
# to, but not including, 19000000.  All tag SNPs, i.e. markers listed in
# GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt.gz, are kept,
# together with a thinned subset of the remaining markers.
# NOTE(review): unlike the other panel rules this one reads the source VCF
# from an absolute path and the GWAS file from gwas/too.big/ - confirm that
# this is intended.
ref/sub1KG-chr19/chr${chrm}.vcf.gz
ref/sub1KG-chr19/chr${chrm}.vcf.gz.tbi
: -t @mkdirfile..ref/sub1KG-chr19/chr${chrm}.vcf.gz
{
    # Stage the output next to the final target instead of under a fixed name
    # in /tmp, which the other panel rules in this file used as well.
    staging="ref/sub1KG-chr19/chr${chrm}.vcf.gz.tmp"

    zcat /data/sgg/aaron/shared/ref_panels/1kg/ALL.chr${chrm}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz |
        grep -E -v '^##' | {
            # Pass the column header through, then select the position window
            # and stop reading once it has been passed.
            IFS='' read -r header_line
            printf "%s\n" "$header_line"
            awk '$2 >= 19000000{exit} ; $2>=15000000'
        } | python <(cat<<"EOF"
# python script starts here
# Written to run unchanged under Python 2 and Python 3.

import sys
def main():
    # argv[1]: one marker name per line, with the MarkerName header skipped
    tags = set( l.strip() for l in open(sys.argv[1]) if l != 'MarkerName\n' )
    # The VCF text arrives on stdin; its first line is the column header.
    vcf = sys.stdin
    sys.stdout.write(next(vcf))
    for v in vcf:
        v_rs = v.split('\t')[2]   # third column holds the marker id
        if v_rs in tags:
            sys.stdout.write(v)   # all tags
        else:
            if v_rs[-1:] in ['1','2']: # id ends in 1 or 2: roughly 20% of the non-tags
                sys.stdout.write(v)


main()
EOF
) <(zcat gwas/too.big/GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt.gz | cut -f1) |
        bgzip > "$staging"

    # The staging name does not end in .vcf.gz, so the format is given explicitly.
    tabix -p vcf "$staging"

    # Data first, index second.
    mv "$staging"     ref/sub1KG-chr19/chr${chrm}.vcf.gz
    mv "$staging".tbi ref/sub1KG-chr19/chr${chrm}.vcf.gz.tbi
}

# Reference panel restricted to one population and one position window of
# chromosome ${chrm}, extracted from the full 1000 Genomes phase 3 VCF.
# ${pop} is matched against the lines of the sample panel file; ${posf} and
# ${post} are the lower and upper position bounds, in which a K is expanded
# to 000.  The command also writes the matching .tbi index, which is not
# declared as a target.
# Example target: ref/1kg.just.EUR/chr22.17750K.19250K.vcf.gz
                ref/1kg.just.${pop}/chr${chrm}.${posf}.${post}.vcf.gz
{
# Final output path; all helper files below are created next to it.
export  target="ref/1kg.just.${pop}/chr${chrm}.${posf}.${post}.vcf.gz"
    # Show the full-chromosome source VCF in the log.
    ls -l ref/link.to.1kg.data/ALL.chr${chrm}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz

    mkdir -p "$(dirname "$target")"

    # Sample ids: first column of every panel line that matches the population.
    egrep ${pop} ref/link.to.1kg.data/integrated_call_samples_v3.20130502.ALL.panel | cut -f1 > "$target".sample_ids
    wc -l "$target".sample_ids
    # Column numbers of those samples: the #CHROM header line is split into one
    # field per line, and grep -n reports the line number of each matching field.
    zcat ref/link.to.1kg.data/ALL.chr${chrm}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz |
        egrep '^#CHROM' -m1 | tr '\t' '\n' |
        egrep -n -f "$target".sample_ids   |
        cut -d: -f1   >  "$target".sample_cols
    # Generated helper 1: a cut command keeping columns 1-9 plus the sample columns.
    { echo -n 'cut -f 1-9,'; < "$target".sample_cols tr '\n' , | sed -re 's/,$//' ; }  >  "$target".sample_cut_script
    # Generated helper 2: an awk command that stops at the first line after the
    # header whose column 2 reaches the upper bound, and prints the header line
    # and every line whose column 2 is at or above the lower bound.
    {
        echo -n "awk '(NR>1 && \$2>="
        echo $post | sed -re 's/K/000/' | tr -d '\n'
        echo -n "){exit};  (NR==1 || \$2 >= "
        echo $posf | sed -re 's/K/000/' | tr -d '\n'
        echo -n ")'"
    } > "$target".sample_awk_script
    # Print both generated commands for the log.
    echo
    cat "$target".sample_cut_script
    echo
    cat "$target".sample_awk_script
    echo
    echo --
    # Main pipeline: drop the ## meta lines, apply the position window, apply the
    # column selection, set column 8 to a dot on every line after the first, and
    # discard lines matching the all-0|0 or all-1|1 genotype pattern.
    # NOTE(review): tawk and 1head are not standard tools and are not defined in
    # this file; presumably tawk is awk with tab separators and 1head applies the
    # given filter to everything except the first line - confirm.
    zcat ref/link.to.1kg.data/ALL.chr${chrm}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz |
        egrep -v '^##' |
        bash "$target".sample_awk_script |
        bash "$target".sample_cut_script |
        tawk 'NR>1 {$8="."} ; 1'         |
        1head egrep -v 'GT((.0\|0)+|(.1\|1)+)$' |
        bgzip > "$target".tmp

    # Remove the helper files.
    rm "$target".sample_awk_script
    rm "$target".sample_cut_script
    rm "$target".sample_cols
    rm "$target".sample_ids

    # Index the staged file, then rename data and index to their final names.
    tabix -p vcf    "$target".tmp
    mv              "$target".tmp       "$target"
    mv              "$target".tmp.tbi   "$target".tbi
    ls -l           "$target"*
}

# Directory that receives the comparison reports.
tests.compare
{
    mkdir -p tests.compare
}
# Transient helper: creates the directory that will contain the file ${path},
# including any missing parent directories.
@mkdirfile..${path}
{
    parent_dir="$(dirname "${path}")"
    mkdir -p "$parent_dir"
}

                # Generate the command script of one full1KG.EUR.Wood test: two
                # assignments, CHRM=... and MISS=..., followed by the shared template.
                # Regenerated whenever this build file or the template changes.
                tests/full1KG.EUR.Wood/chr${CHRM}.MISS${MISS}/command
:
    tests.stu
    tests/full1KG.EUR.Wood/command.template
{
    test_dir="tests/full1KG.EUR.Wood/chr${CHRM}.MISS${MISS}"
    mkdir -p "$test_dir"

    # NOTE(review): presumably this pause makes the generated file strictly
    # newer than its dependencies - confirm.
    sleep 1

    {
        echo CHRM="${CHRM}"
        echo MISS="${MISS}"
        cat tests/full1KG.EUR.Wood/command.template
    } > "$test_dir"/command
}