diff --git a/Snakefile b/Snakefile index 103f13c..41de045 100644 --- a/Snakefile +++ b/Snakefile @@ -60,6 +60,7 @@ KMER_LENGTH = config['kmer_length'] if 'kmer_length' in config else 31 DIFF_METHOD = config['diff_method'] if 'diff_method' in config else 'DESeq2' OUTPUT_DIR = config['output_dir'] FASTQ_DIR = config['fastq_dir'] +ANNOTATION_TYPE = config['annotation_type'] # DIRECTORIES BIN_DIR = "bin" @@ -179,7 +180,7 @@ rule download_kallisto: # Download the gencode transcripts in fasta format (if no input transcriptome) rule gencode_download: output: REF_TRANSCRIPT_FASTA - shell: "wget ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_24/gencode.v24.transcripts.fa.gz -O {output}" + shell: "wget ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_24/gencode.v24.transcripts.fa.gz -O {output}" ############################################################################### @@ -278,12 +279,23 @@ rule transcript_to_gene_mapping: input: REF_TRANSCRIPT_FASTA output: TRANSCRIPT_TO_GENE_MAPPING run: - mapping = open(output[0], 'w') - with gzip.open(input[0], 'rt') as f: - for line in f: - if line[0] == ">": - fields = line[1:].split("|",2) - mapping.write("\t".join([fields[0],fields[1]]) + "\n") + #print( ANNOTATION_TYPE ) + if ANNOTATION_TYPE == "gencode": + mapping = open(output[0], 'w') + #print( "gencode annotation" ) + with gzip.open(input[0], 'rt') as f: + for line in f: + if line[0] == ">": + fields = line[1:].split("|",2) + mapping.write("\t".join([fields[0],fields[1]]) + "\n") + elif ANNOTATION_TYPE == "ensembl": + mapping = open(output[0], 'w') + #print( "ensembl annotation" ) + with gzip.open(input[0], 'rt') as f: + for line in f: + if line[0] == ">": + fields = line[1:].split(' ') + mapping.write("\t".join([fields[0],fields[3].replace("gene:","").replace("\n","").split('.')[0], ]) + "\n") # 1.6 Convert transcript counts to gene counts rule gene_counts: @@ -306,13 +318,18 @@ rule gene_counts: header = f.readline().rstrip() for line in f: counts = line.split() - transcript_id, trail = counts[0].split("|",1) - gene_id = conversion_hash[transcript_id] - counts[1:] = [ float(i) for i in counts[1:] ] - if gene_id in gene_counts: - gene_counts[gene_id] = [ sum(i) for i in zip(gene_counts[gene_id], counts[1:]) ] + if ANNOTATION_TYPE == "gencode": + transcript_id, trail = counts[0].split("|",1) else: - gene_counts[gene_id] = counts[1:] + if ANNOTATION_TYPE == "ensembl": + transcript_id = counts[0] + + gene_id = conversion_hash[transcript_id] + counts[1:] = [ float(i) for i in counts[1:] ] + if gene_id in gene_counts: + gene_counts[gene_id] = [ sum(i) for i in zip(gene_counts[gene_id], counts[1:]) ] + else: + gene_counts[gene_id] = counts[1:] # print Gene counts with gzip.open(output[0], 'wb') as f: f.write(bytes(header + "\n",'UTF-8')) diff --git a/config.json b/config.json index ada2d4a..b310d45 100644 --- a/config.json +++ b/config.json @@ -1,4 +1,5 @@ { + "annotation_type": "gencode", "fastq_dir": "data", "nb_threads": 8, "kmer_length": 31,