Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 30 additions & 13 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ KMER_LENGTH = config['kmer_length'] if 'kmer_length' in config else 31
DIFF_METHOD = config['diff_method'] if 'diff_method' in config else 'DESeq2'
OUTPUT_DIR = config['output_dir']
FASTQ_DIR = config['fastq_dir']
ANNOTATION_TYPE = config['annotation_type']

# DIRECTORIES
BIN_DIR = "bin"
Expand Down Expand Up @@ -179,7 +180,7 @@ rule download_kallisto:
# Download the gencode transcripts in fasta format (if no input transcriptome)
rule gencode_download:
output: REF_TRANSCRIPT_FASTA
shell: "wget ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_24/gencode.v24.transcripts.fa.gz -O {output}"
shell: "wget ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_24/gencode.v24.transcripts.fa.gz -O {output}"


###############################################################################
Expand Down Expand Up @@ -278,12 +279,23 @@ rule transcript_to_gene_mapping:
input: REF_TRANSCRIPT_FASTA
output: TRANSCRIPT_TO_GENE_MAPPING
run:
mapping = open(output[0], 'w')
with gzip.open(input[0], 'rt') as f:
for line in f:
if line[0] == ">":
fields = line[1:].split("|",2)
mapping.write("\t".join([fields[0],fields[1]]) + "\n")
#print( ANNOTATION_TYPE )
if ANNOTATION_TYPE == "gencode":
mapping = open(output[0], 'w')
#print( "gencode annotation" )
with gzip.open(input[0], 'rt') as f:
for line in f:
if line[0] == ">":
fields = line[1:].split("|",2)
mapping.write("\t".join([fields[0],fields[1]]) + "\n")
elif ANNOTATION_TYPE == "ensembl":
mapping = open(output[0], 'w')
#print( "ensembl annotation" )
with gzip.open(input[0], 'rt') as f:
for line in f:
if line[0] == ">":
fields = line[1:].split(' ')
mapping.write("\t".join([fields[0],fields[3].replace("gene:","").replace("\n","").split('.')[0], ]) + "\n")

# 1.6 Convert transcript counts to gene counts
rule gene_counts:
Expand All @@ -306,13 +318,18 @@ rule gene_counts:
header = f.readline().rstrip()
for line in f:
counts = line.split()
transcript_id, trail = counts[0].split("|",1)
gene_id = conversion_hash[transcript_id]
counts[1:] = [ float(i) for i in counts[1:] ]
if gene_id in gene_counts:
gene_counts[gene_id] = [ sum(i) for i in zip(gene_counts[gene_id], counts[1:]) ]
if ANNOTATION_TYPE == "gencode":
transcript_id, trail = counts[0].split("|",1)
else:
gene_counts[gene_id] = counts[1:]
if ANNOTATION_TYPE == "ensembl":
transcript_id = counts[0]

gene_id = conversion_hash[transcript_id]
counts[1:] = [ float(i) for i in counts[1:] ]
if gene_id in gene_counts:
gene_counts[gene_id] = [ sum(i) for i in zip(gene_counts[gene_id], counts[1:]) ]
else:
gene_counts[gene_id] = counts[1:]
# print Gene counts
with gzip.open(output[0], 'wb') as f:
f.write(bytes(header + "\n",'UTF-8'))
Expand Down
1 change: 1 addition & 0 deletions config.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{
"annotation_type": "gencode",
"fastq_dir": "data",
"nb_threads": 8,
"kmer_length": 31,
Expand Down