From 95e1f44eb5ee781761ac8eabe60c78daa11c04e1 Mon Sep 17 00:00:00 2001
From: Animesh Sharma
Date: Wed, 2 Aug 2023 16:07:23 +0200
Subject: [PATCH 1/2] Script make_grch38_tran.sh update to download sequence for
 the GRCh38 release 110 version of H. sapiens (human) from Ensembl and index
 --ss genome.ss --exon genome.exon

---
 scripts/make_grch38_tran.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/make_grch38_tran.sh b/scripts/make_grch38_tran.sh
index 783aba47..ee7431f3 100755
--- a/scripts/make_grch38_tran.sh
+++ b/scripts/make_grch38_tran.sh
@@ -15,7 +15,7 @@
 # variable below.
 #
 
-ENSEMBL_RELEASE=84
+ENSEMBL_RELEASE=110
 ENSEMBL_GRCh38_BASE=ftp://ftp.ensembl.org/pub/release-${ENSEMBL_RELEASE}/fasta/homo_sapiens/dna
 ENSEMBL_GRCh38_GTF_BASE=ftp://ftp.ensembl.org/pub/release-${ENSEMBL_RELEASE}/gtf/homo_sapiens
 GTF_FILE=Homo_sapiens.GRCh38.${ENSEMBL_RELEASE}.gtf

From 4bb782a8c97b8968b67e6993c5e3c1f2ee64af51 Mon Sep 17 00:00:00 2001
From: Animesh Sharma
Date: Sat, 5 Aug 2023 11:33:56 +0200
Subject: [PATCH 2/2] adding script make_T2T_tran.sh to download sequence for
 the telomere-to-telomere CHM13 release 2 version of H. sapiens (human) genome
 from NCBI and index --ss genome.ss --exon genome.exon from corresponding GTF,
 annotation details at
 https://www.ncbi.nlm.nih.gov/genome/annotation_euk/Homo_sapiens/GCF_009914755.1-RS_2023_03

---
 Reviewer note: the script body below was revised during review, so the blob
 id on its "index" line no longer matches the content.

 scripts/make_T2T_tran.sh | 76 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100755 scripts/make_T2T_tran.sh

diff --git a/scripts/make_T2T_tran.sh b/scripts/make_T2T_tran.sh
new file mode 100755
index 00000000..a0bb67b5
--- /dev/null
+++ b/scripts/make_T2T_tran.sh
@@ -0,0 +1,76 @@
+#!/bin/sh
+
+#
+# Downloads the T2T-CHM13v2.0 assembly of H. sapiens (human) and its GTF
+# annotation from NCBI, extracts splice sites and exons from the GTF, and
+# builds a HISAT2 index named genome_tran from them.
+#
+# ANNOTATION REPORT:
+# https://www.ncbi.nlm.nih.gov/genome/annotation_euk/Homo_sapiens/GCF_009914755.1-RS_2023_03
+#
+# A genome or GTF file already present in the current directory is reused
+# instead of being downloaded again.
+#
+
+GENOME_RELEASE=GCF_009914755.1_T2T-CHM13v2.0_genomic
+# No trailing slash: file names are appended as "${GENOME_BASE}/<file>".
+GENOME_BASE=https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/914/755/GCF_009914755.1_T2T-CHM13v2.0
+GTF_FILE=$GENOME_RELEASE.gtf
+
+# Print the arguments as an error message on stderr and abort the script.
+die() {
+    echo "$*" >&2
+    exit 1
+}
+
+# Download URL $1 into the current directory under its remote base name.
+# Prefers wget and falls back to curl; returns the downloader's exit status.
+get() {
+    if wget --version >/dev/null 2>&1 ; then
+        wget "$1"
+    elif curl --version >/dev/null 2>&1 ; then
+        # -f: report HTTP errors as a failure instead of saving the error page.
+        curl -f -o "$(basename "$1")" "$1"
+    else
+        die "Please install wget or curl somewhere in your PATH"
+    fi
+}
+
+# Print the path of tool $1: ./$1 when it is executable, otherwise the copy
+# found in PATH. Returns non-zero, with a message on stderr, when neither exists.
+find_tool() {
+    if [ -x "./$1" ] ; then
+        echo "./$1"
+    elif command -v "$1" >/dev/null 2>&1 ; then
+        command -v "$1"
+    else
+        echo "Could not find $1 in current directory or in PATH" >&2
+        return 1
+    fi
+}
+
+# find_tool runs inside $(...), a subshell, so the script must exit here itself.
+HISAT2_BUILD_EXE=$(find_tool hisat2-build) || exit 1
+HISAT2_SS_SCRIPT=$(find_tool hisat2_extract_splice_sites.py) || exit 1
+HISAT2_EXON_SCRIPT=$(find_tool hisat2_extract_exons.py) || exit 1
+
+# Fetch and unpack the genome only when no local copy exists, then always
+# (re)create genome.fa from it.
+F=$GENOME_RELEASE.fna
+if [ ! -f "$F" ] ; then
+    get "${GENOME_BASE}/$F.gz" || die "Error getting $F"
+    gunzip "$F.gz" || die "Error unzipping $F"
+fi
+rm -f genome.fa
+cp "$F" genome.fa || die "Error copying $F to genome.fa"
+
+# Same for the annotation: download if missing, then always extract the
+# splice sites and exons that hisat2-build needs.
+if [ ! -f "$GTF_FILE" ] ; then
+    get "${GENOME_BASE}/${GTF_FILE}.gz" || die "Error getting ${GTF_FILE}"
+    gunzip "${GTF_FILE}.gz" || die "Error unzipping ${GTF_FILE}"
+fi
+"$HISAT2_SS_SCRIPT" "$GTF_FILE" > genome.ss || die "Error extracting splice sites from ${GTF_FILE}"
+"$HISAT2_EXON_SCRIPT" "$GTF_FILE" > genome.exon || die "Error extracting exons from ${GTF_FILE}"
+
+"$HISAT2_BUILD_EXE" -p 4 genome.fa --ss genome.ss --exon genome.exon genome_tran