First commit of all code and descriptions

ComputationalProteomicsUnit · Jul 18, 2018 · 8e5b5c7 · 8e5b5c7
1 parent ddb5784
commit 8e5b5c7
Show file tree

Hide file tree

Showing 20 changed files with 3,473 additions and 2 deletions.
diff --git a/.Rhistory b/.Rhistory
@@ -0,0 +1,2 @@
+rmarkdown::render('biopep.pub.Rmd',output_file='biopep.pub.html')
+q
diff --git a/Get_IDRs-DM-v2.py b/Get_IDRs-DM-v2.py
@@ -0,0 +1,183 @@
+#!/usr/bin/python
+# Code for annotating Nucleolar RNABP with Intrinsically disordered regions
+
+# Import packages and modules
+import argparse
+import csv
+import d2p2
+import protinfo
+import numpy as np
+from time import gmtime, strftime
+import sys
+import re
+
+import collections
+import imp
+imp.reload(protinfo)
+imp.reload(d2p2)
+
+
+# Parsing user input
+args = argparse.ArgumentParser(description='This program adds IDRs from certain callers to a uniprot protein list',
+epilog="All IDRs should be in the output file")
+
+args.add_argument('-i',dest="infile",type=str,
+                   help='Input file containing protein IDs',required=True)
+args.add_argument('-o', dest="outfile", type=str,
+                   help='Name for output file')
+args.add_argument('-c', dest = "consensus",type=int,default=3,
+                   help='The minimum caller consensus you require for an IDR to qualify. Default = 3')
+args.add_argument('-l',dest = "minlen",type=int,default=20,
+                   help='The minimum length of IDR called by all consensus callers for an IDR to qualify. Default = 20')
+args.add_argument('-w',dest = "whitelist",nargs='+',default=None,
+                   help='The callers you want to include for IDR calling. Default is None which means all callers are included')
+args.add_argument('-b',dest = "blacklist",nargs='+',default=None,
+                   help='The callers you want to include for IDR calling. Default is None')  
+
+#args.print_help()
+argp = args.parse_args()
+
+
+# We start by reading in the file with Uniprot IDs for Mark and David's protein list
+f = csv.reader(open(argp.infile,'rU'),delimiter="\t")
+col_names = f.next()
+#print(col_names)
+
+# Extracting unipto id column index and storing information with uniprot.id as key 
+idcol = col_names.index('uniprot.id')
+uniprot_ids = []
+metadata = {}
+
+# Save Uniprot IDs as well as metadata 
+for x in f:
+    uniprot_ids.append(x[idcol])
+    metadata[x[idcol]] = x
+
+# Are there any tools in D2P2 that you want or do not want reported
+# eg : VL-XT, VSL2b, PrDOS, PV2, Espritz and IUPred
+
+tools_whitelist = argp.whitelist
+tools_blacklist = argp.blacklist
+consensus_req = argp.consensus # Minimum number of predictors that need to agree on a base being called disordered
+min_block_size = argp.minlen # Minimum length of disordered region all of which should be called by at least 'consensus_req' number of disorder predictors
+print tools_whitelist, tools_blacklist, consensus_req, min_block_size
+
+# Declare outfile
+temp = argp.infile.split("_")[3]
+
+if tools_whitelist:
+    outfile = temp+"_c"+str(consensus_req)+"_len"+str(min_block_size)+"_"+'.'.join(tools_whitelist)+"_outfile.txt"
+else:
+    outfile = temp+"_c"+str(consensus_req)+"_len"+str(min_block_size)+"_all-callers_outfile.txt"
+
+outfile = "Python-output/"+outfile
+print "Infile : "+argp.infile
+print "Outfile : "+outfile
+
+# If user inputs a specific name, use that
+if argp.outfile:
+    outfile = argp.outfile
+
+outf = open(outfile,'w')
+outf.write('\t'.join(col_names)+"\t"+"\t".join(map(str, ("idrs", "consensus","overall.consensus","protein_length","total_idr_length", "fraction_idr", "number.of.idrs","protter.url"))) + "\n")
+
+# Creating IDR data containers to hold the information for proteins that do and do not have IDR annotations
+idr_data_available = set()
+idr_data_unavailable = set()
+sequence_length_idr = {}
+
+blocks = collections.defaultdict(list) # Blocks of consensus calls on disorder
+med_consensus = collections.defaultdict(list) # Mean consensus for the blocks of consensus calls
+n = 0 # number of ids that we have iterated through
+
+# Looping through the set of uniprot ids provided above and adding disorder annotations
+# d2p2_entry is an item of class d2p2 that has the characteristics 
+
+for d2p2_entry in d2p2.iterator(uniprot_ids):
+
+    # Build the predictor list that will be used to call consensus on IDR sequence
+    d2p2_entry.rebuildConsensus(
+        tools_whitelist=tools_whitelist, tools_blacklist=tools_blacklist)
+
+    # Set minimum peptide length and consensus support for IDRs to be called
+    d2p2_entry.setIDRs(consensus_req, min_block_size)
+
+    # Add the IDRs to a list
+    #print(d2p2_entry.idrs)
+    blocks[d2p2_entry.name] = d2p2_entry.idrs
+
+    ku_con = np.array(d2p2_entry.consensus)
+    g = d2p2_entry.idrs
+    med = []
+
+    # Calculate median consensus for each IDR in block and save it in med_consensus
+    #for rang in g:
+    #    calc = np.median(ku_con[int(rang[0]):int(rang[1])+1])
+    #    med.append(float("%.2f" % round(calc,2)))
+    #        
+    #med_consensus[d2p2_entry.name] = med 
+
+    for rang in g:
+        #if tools_whitelist is not None:
+        calc = np.median(ku_con[int(rang[0]):int(rang[1])+1])
+        #print calc
+        med.append(float("%.2f" % round(calc,2)))    
+        #else:
+        #    med.append(1)
+        med_consensus[d2p2_entry.name] = med
+
+    # Add successful entries to data available
+    idr_data_available.add(d2p2_entry.name)
+    sequence_length_idr[d2p2_entry.name] = d2p2_entry.length
+
+    # Checking how long it takes to process requests
+    n += 1
+    if n%500 == 0:
+        print('proteins done: %i %s' % (
+                n, strftime("%Y-%m-%d %H:%M:%S", gmtime())))
+
+
+# Add all uniprot ids without IDRs to idr_data_unavailable
+idr_data_unavailable = set(uniprot_ids).difference(idr_data_available)
+
+# Check output 
+print "IDR data available for "+str(len(idr_data_available))+" proteins." # ID available
+print "IDR data missing for "+str(len(idr_data_unavailable))+" proteins." # ID unavailable
+
+
+# Prints results to outfile
+for protein in idr_data_available:
+    total_idr = 0
+    protein_length=0
+    fraction_idr = 0
+    for block in blocks[protein]:
+        total_idr += (block[1] - block[0])
+        protein_length = sequence_length_idr[protein]
+        fraction_idr = total_idr/float(protein_length)
+    idr_ranges = ','.join(map(str, ('-'.join(str(e)for e in x) for x in blocks[protein])))
+    medcon = ",".join(str(e) for e in med_consensus[protein])
+
+    # Trying to reconstruct the url for Protter using idr, biotin and PTMs
+    tw = tools_whitelist
+    if tw is None:
+    	tw = ['VLXT','VSL2b','PrDOS','PV2','IUPred-S','IUPred-L','Espritz-N','Espritz-X','Espritz-D']
+    gene_name = metadata[protein][col_names.index('uniprot.id')]+" : "+metadata[protein][col_names.index('gene.names')]+" : "+metadata[protein][col_names.index('protein.names')]+" : " + str(",".join(tw))
+    #print(gene_name+"\n\n")
+    protter_url = "http://wlab.ethz.ch/protter/create?up="+protein+"&tm=auto&mc=lightsalmon&lc=blue&tml=numcount&numbers&legend&title="+gene_name+"&n:IDRs,fc:red,bc:peachpuff="+idr_ranges+"&n:Phosphorylation,s:box,cc:aqua,fc:midnightblue,bc:cornflowerblue="+metadata[protein][col_names.index('Phosphorylation')]+"&n:Ubiqutination,s:diamond,cc:black,fc:olive,bc:greenyellow="+metadata[protein][col_names.index('Ubiqutination')]+"&n:Acetylation,s:box,cc:lightpink,fc:lightpink,bc:deeppink="+metadata[protein][col_names.index('Acetylation')]+"&n:Biotins,s:circ,cc:white,fc:forestgreen,bc:forestgreen="+metadata[protein][col_names.index('loc.hits')]+"&format=pdf"
+    protter_url = re.sub("=NA&n:","=&n:", protter_url)   
+    #print(protter_url)
+
+
+    # Write results to outfile
+    outf.write(("\t".join(metadata[protein])+"\t"+"\t".join(map(str, (idr_ranges,medcon,np.median(med_consensus[protein]),protein_length, total_idr, fraction_idr, len(blocks[protein]),protter_url)))) + "\n")
+
+outf.close()
+
+
+# Run a test protein using d2p2 database
+protein = idr_data_available.pop()
+bl = blocks[protein]
+print("IDRs for " + protein+ " are : "+','.join(map(str, ('-'.join(str(e)for e in x) for x in bl))))
+print("Length of " +protein+" : "+str(sequence_length_idr[protein]))
+
+# Takes 3:30 mins for ~3000 proteins
diff --git a/Input/Ab-APEX-with-biotins-ptms-by-gene.rds b/Input/Ab-APEX-with-biotins-ptms-by-gene.rds
diff --git a/Input/BioSITe-with-biotins-ptms-by-gene.rds b/Input/BioSITe-with-biotins-ptms-by-gene.rds
diff --git a/Input/Biotin-PTM-IDR-data-for-all-studies-and-callers.rds b/Input/Biotin-PTM-IDR-data-for-all-studies-and-callers.rds
diff --git a/Input/DiDBIT-with-biotins-ptms-by-gene.rds b/Input/DiDBIT-with-biotins-ptms-by-gene.rds
diff --git a/Input/HEK293-GO-annotation-with-ancestors.rds b/Input/HEK293-GO-annotation-with-ancestors.rds
diff --git a/Input/HEK293-annotated-list-of-proteins.rds b/Input/HEK293-annotated-list-of-proteins.rds
diff --git a/Input/HEK293-with-IDR-VSL2b-and-PTMs.rds b/Input/HEK293-with-IDR-VSL2b-and-PTMs.rds
diff --git a/Input/PTMs-by-gene-from-Phosphositeplus.rds b/Input/PTMs-by-gene-from-Phosphositeplus.rds
diff --git a/Input/SpotBioID-with-biotins-ptms-by-gene.rds b/Input/SpotBioID-with-biotins-ptms-by-gene.rds
diff --git a/Input/go-pro-kegg.rda b/Input/go-pro-kegg.rda
diff --git a/README.md b/README.md
@@ -1,2 +1,76 @@
-# biotinIDR
-Contains code and input files for Biotin IDR study by David Paul Minde and Manasa Ramakrishna
+# biotinIDR : Contains code and input files for Biotin IDR study by David Paul Minde and Manasa Ramakrishna
+
+#### Main code  : R and Python code for analysing data
+
+Note: Before you run this code, you will need to download the following scripts
+a. R installed on your computer  
+b. Python installed on your computer 
+c. GO.R from https://github.com/TomSmithCGAT/CamProt_R/blob/master/GO.R (if running from scratch you'll need to modify the code to be able to read in the column "to.id" instead of "GO.ID". Alternatively, you can change the coloumn names in your input dataframe to be "GO.ID"
+d. 'd2p2.py' and 'protinfo.py' from https://github.com/TomSmithCGAT/CamProt/tree/master/camprot/proteomics (needs to be complied)
+
+ File | Description | Language/Format
+--------------------|--------------------------- |-----  
+biopepFunctions.R        |       Functions required for the R code to run    | R  
+biopep.pub.Rmd          |       R markdown file that contains the main code used in the data analysis   | R  
+nods.pub.html      |       HTML rendering of R markdown file biopep.pub.Rmd  | Html
+Get_IDRs-DM-v2.py          | Adds IDR info for all proteins in each of the biotin studies.   | Python  
+runpy.sh      | Calls Get_IDRs-DM-v2.py using various inputs and parameters     | Shell
+printUrl.py      | Uses Protter URLs generated by Get_IDRs-DM-v2.py and retrieved PDF files into specific folders      | Python  
+runUrl.sh      |Calls printUrl.py on a file of interest      | Shell  
+GO.R      | Tom's code to add ancestor terms to existing GO terms   | R    
+d2p2.py                         | Tom's code to use the d2p2 API to use 9 different callers to identify intrinsically disordered regions | Python     
+protinfo.py                     | Tom's code to query uniprot/swissprot database on Ensembl | Python  
+
+#### Input : Folder containing all input files needed for code to run
+
+Ab-APEX-with-biotins-ptms-by-gene.rds	| Biotinylation sites from Ab-APEX study summarised by gene with IDR and PTM information added in  
+BioSITe-with-biotins-ptms-by-gene.rds   | Biotinylation sites from BioSITE study summarised by gene with IDR and PTM information added in  
+DiDBIT-with-biotins-ptms-by-gene.rds   | Biotinylation sites from DiDBIT study summarised by gene with IDR and PTM information added in  
+SpotBioID-with-biotins-ptms-by-gene.rds   | Biotinylation sites from Ab-APEX study summarised by gene with IDR and PTM information added in  
+Biotin-PTM-IDR-data-for-all-studies-and-callers.rds   | Biotin,IDR and PTM information for all studies and caller combinations  
+HEK293-annotated-list-of-proteins.rds	| HEK293 list of proteins from Geiger et al  
+HEK293-GO-annotation-with-ancestors.rds   | GO annotations with ancestor terms for the HEK293 background protein list from Geiger et al  
+HEK293-with-IDR-VSL2b-and-PTMs.rds	| HEK293 peoteins from Geiger et al with PTMs and IDRs based on the algorithm VSL2b
+PTMs-by-gene-from-Phosphositeplus.rds	| PTM data from PhosphositePlus summarised by gene and for phosphorylation, ubiquitination, sumoylation, acetylation  
+go-pro-kegg.rda	| An object that contains all GO, Interpro and KEGG annotations for proteins in each of the 4 studies  
+
+# RDS files can be read using the command readRDS as shown below
+readRDS("Input/Biotin-PTM-IDR-data-for-all-studies-and-callers.rds")
+
+# RDA files can be loaded using the command "load" as shown below
+load("Input/go-pro-kegg.rda")
+
+#### Output : A description of all files generated by running nods_final.Rmd.  
+##### Date_Output : Date-tagged output folder This folder should contain the following files
+Most of these files are related to the Figures in the main manuscript with some additional information
+
+File| Contents   
+----------|-------------  
+Figure3A_Number.biotins.vs.FractionIDR-VSL2b.pdf	|	Violin plots comparing fraction of IDR and biotin number in each protein  
+Figure3B_Distribution-of-Biotins-in.out.of.IDRs-by-caller-and-study.pdf	|	Bar plot showing expected and observed rates of biotin for all studies and callers  
+Figure3B_Binomial-test-stats.txt	|	Bionomial test values and parameters for all studies and callers looking at obs biotins in IDRs  
+Figure3C_Distribution-of-FPU-by-caller-parameters-and-study.pdf	|	Number of proteins in FPU categories in each study and by caller  
+Figure3D_VSL2b_Biotins-in-IDR.vs.IDR-classes.violinPlots.pdf	|	Violin plots showing biotins in IDRs separated by FPU for caller VSL2b  
+Figure4A_Num.idrs.vs.num.ptms_HEK293-vs-biotin.pdf	|	Correlation plot between number of PTMs and Biotins for HEK293 proteome (Geiger) and HEK203 biotinome  
+Figure4B_Distribution-of-PTM-types-in-HEK-vs-in-Biotin-studies.pdf	|	Boxplots showing PTM counts by type in HEK293 proteome and biotinome  
+Figure4C_Binomial-test-stats.txt	|	Bionomial test values and parameters for all studies and callers looking at obs PTMs in IDRs 
+Figure4C_Distribution-of-PTMs-in.out.of.IDRs-by-study-VSL2b.pdf	|	Bar plot showing expected and observed rates of all types of PTMs for all studies  
+Figure4D_VSL2b_PTMs-in-IDR.vs.IDR-classes.violinPlots.pdf	|	Violin plots showing all PTMs in IDRs separated by FPU for caller VSL2b  
+Figure4D_VSL2b_Acytelation.vs.IDR-classes.violinPlots.pdf       |       Violin plots showing acetylation marks in IDRs separated by FPU for caller VSL2b  
+Figure4D_VSL2b_Phosphorylation.vs.IDR-classes.violinPlots.pdf	       |       Violin plots showing phosphorylation marks in IDRs separated by FPU for caller VSL2b  
+Figure4D_VSL2b_Sumoylation.vs.IDR-classes.violinPlots.pdf       |       Violin plots showing sumoylation marks in IDRs separated by FPU for caller VSL2b  
+Figure4D_VSL2b_Ubiquitination.vs.IDR-classes.violinPlots.pdf       |       Violin plots showing ubiquitination marks in IDRs separated by FPU for caller VSL2b  
+FigureS1A_VennDiagrams-for-biotinylation-data.pdf	|	Venn diagram showing overlap of proteins in biotinylation data  
+FigureS1C_Biotin-GO-CC-enrichment-with-anc-terms_8_enricher-dotplot.pdf	|	Bubble plots showing top 8 GO enrichment terms (including ancestors) for "cellular component" for each of the 4 studies  
+Biotin-GO-enrichment-with-anc-terms_8_enricher-dotplot.pdf	|	Bubble plots showing top 8 GO enrichment terms (including ancestors) for each of the 4 studies  
+Biotin-GO-enrichment_8_enricher-dotplot.pdf	|	Bubble plots showing top 8 GO enrichment terms for each of the 4 studies. No ancestor terms used.  
+Biotin-GO-CC-enrichment_8_enricher-dotplot.pdf	|	Bubble plots showing top 8 GO enrichment terms for "cellular component" for each of the 4 studies. No ancestor terms used.  
+GO-KEGG-enrichment-with-whole-human-genome-bg.pdf	|	GO enrichment plotting using clusterProfiler's inbuilt code and whole human genome as background  
+Effect-size-for-pairwise-comparison-of-PTMs-in-IDRs-VSL2b.pdf	|	Effect size plot based on Tukey's HSD for PTMs in IDRs  
+Effect-size-for-pairwise-comparison-of-biotin-in-IDRs-VSL2b.pdf   |       Effect size plot based on Tukey's HSD for PTMs in IDRs  
+Date_Time_Effect-size-for-pairwise-comparison-of-IDR-classes-and-parameters-VSL2b.pdf	   |       Effect size plot based on Tukey's HSD for multiple parameters and IDR classes using VSL2b calls for IDR
+Date_Time_VSL2b_Pairwise-t-test-pvals.txt
+Date_Time_VSL2b_Tukey-HSD-ci-pvals.txt
+
+
+### SupplTables.xlsx : Supplementary tables referred to in the manuscript. Contain data that can be queried/used. Contains 5 datasheets and an "Index" sheet describing each of the data sheets.  
diff --git a/SupplTables.xlsx b/SupplTables.xlsx
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		rmarkdown::render('biopep.pub.Rmd',output_file='biopep.pub.html')
		q