Skip to content
This repository has been archived by the owner on May 5, 2020. It is now read-only.

Commit

Permalink
First commit of all code and descriptions
Browse files Browse the repository at this point in the history
  • Loading branch information
emm13 committed Jul 18, 2018
1 parent ddb5784 commit 8e5b5c7
Show file tree
Hide file tree
Showing 20 changed files with 3,473 additions and 2 deletions.
2 changes: 2 additions & 0 deletions .Rhistory
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
rmarkdown::render('biopep.pub.Rmd',output_file='biopep.pub.html')
q
183 changes: 183 additions & 0 deletions Get_IDRs-DM-v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
#!/usr/bin/python
# Code for annotating Nucleolar RNABP with Intrinsically disordered regions

# Import packages and modules
import argparse
import csv
import d2p2
import protinfo
import numpy as np
from time import gmtime, strftime
import sys
import re

import collections
import imp
imp.reload(protinfo)
imp.reload(d2p2)


# Parsing user input
args = argparse.ArgumentParser(description='This program adds IDRs from certain callers to a uniprot protein list',
epilog="All IDRs should be in the output file")

args.add_argument('-i',dest="infile",type=str,
help='Input file containing protein IDs',required=True)
args.add_argument('-o', dest="outfile", type=str,
help='Name for output file')
args.add_argument('-c', dest = "consensus",type=int,default=3,
help='The minimum caller consensus you require for an IDR to qualify. Default = 3')
args.add_argument('-l',dest = "minlen",type=int,default=20,
help='The minimum length of IDR called by all consensus callers for an IDR to qualify. Default = 20')
args.add_argument('-w',dest = "whitelist",nargs='+',default=None,
help='The callers you want to include for IDR calling. Default is None which means all callers are included')
args.add_argument('-b',dest = "blacklist",nargs='+',default=None,
help='The callers you want to include for IDR calling. Default is None')

#args.print_help()
argp = args.parse_args()


# We start by reading in the file with Uniprot IDs for Mark and David's protein list
f = csv.reader(open(argp.infile,'rU'),delimiter="\t")
col_names = f.next()
#print(col_names)

# Extracting unipto id column index and storing information with uniprot.id as key
idcol = col_names.index('uniprot.id')
uniprot_ids = []
metadata = {}

# Save Uniprot IDs as well as metadata
for x in f:
uniprot_ids.append(x[idcol])
metadata[x[idcol]] = x

# Are there any tools in D2P2 that you want or do not want reported
# eg : VL-XT, VSL2b, PrDOS, PV2, Espritz and IUPred

tools_whitelist = argp.whitelist
tools_blacklist = argp.blacklist
consensus_req = argp.consensus # Minimum number of predictors that need to agree on a base being called disordered
min_block_size = argp.minlen # Minimum length of disordered region all of which should be called by at least 'consensus_req' number of disorder predictors
print tools_whitelist, tools_blacklist, consensus_req, min_block_size

# Declare outfile
temp = argp.infile.split("_")[3]

if tools_whitelist:
outfile = temp+"_c"+str(consensus_req)+"_len"+str(min_block_size)+"_"+'.'.join(tools_whitelist)+"_outfile.txt"
else:
outfile = temp+"_c"+str(consensus_req)+"_len"+str(min_block_size)+"_all-callers_outfile.txt"

outfile = "Python-output/"+outfile
print "Infile : "+argp.infile
print "Outfile : "+outfile

# If user inputs a specific name, use that
if argp.outfile:
outfile = argp.outfile

outf = open(outfile,'w')
outf.write('\t'.join(col_names)+"\t"+"\t".join(map(str, ("idrs", "consensus","overall.consensus","protein_length","total_idr_length", "fraction_idr", "number.of.idrs","protter.url"))) + "\n")

# Creating IDR data containers to hold the information for proteins that do and do not have IDR annotations
idr_data_available = set()
idr_data_unavailable = set()
sequence_length_idr = {}

blocks = collections.defaultdict(list) # Blocks of consensus calls on disorder
med_consensus = collections.defaultdict(list) # Mean consensus for the blocks of consensus calls
n = 0 # number of ids that we have iterated through

# Looping through the set of uniprot ids provided above and adding disorder annotations
# d2p2_entry is an item of class d2p2 that has the characteristics

for d2p2_entry in d2p2.iterator(uniprot_ids):

# Build the predictor list that will be used to call consensus on IDR sequence
d2p2_entry.rebuildConsensus(
tools_whitelist=tools_whitelist, tools_blacklist=tools_blacklist)

# Set minimum peptide length and consensus support for IDRs to be called
d2p2_entry.setIDRs(consensus_req, min_block_size)

# Add the IDRs to a list
#print(d2p2_entry.idrs)
blocks[d2p2_entry.name] = d2p2_entry.idrs

ku_con = np.array(d2p2_entry.consensus)
g = d2p2_entry.idrs
med = []

# Calculate median consensus for each IDR in block and save it in med_consensus
#for rang in g:
# calc = np.median(ku_con[int(rang[0]):int(rang[1])+1])
# med.append(float("%.2f" % round(calc,2)))
#
#med_consensus[d2p2_entry.name] = med

for rang in g:
#if tools_whitelist is not None:
calc = np.median(ku_con[int(rang[0]):int(rang[1])+1])
#print calc
med.append(float("%.2f" % round(calc,2)))
#else:
# med.append(1)
med_consensus[d2p2_entry.name] = med

# Add successful entries to data available
idr_data_available.add(d2p2_entry.name)
sequence_length_idr[d2p2_entry.name] = d2p2_entry.length

# Checking how long it takes to process requests
n += 1
if n%500 == 0:
print('proteins done: %i %s' % (
n, strftime("%Y-%m-%d %H:%M:%S", gmtime())))


# Add all uniprot ids without IDRs to idr_data_unavailable
idr_data_unavailable = set(uniprot_ids).difference(idr_data_available)

# Check output
print "IDR data available for "+str(len(idr_data_available))+" proteins." # ID available
print "IDR data missing for "+str(len(idr_data_unavailable))+" proteins." # ID unavailable


# Prints results to outfile
for protein in idr_data_available:
total_idr = 0
protein_length=0
fraction_idr = 0
for block in blocks[protein]:
total_idr += (block[1] - block[0])
protein_length = sequence_length_idr[protein]
fraction_idr = total_idr/float(protein_length)
idr_ranges = ','.join(map(str, ('-'.join(str(e)for e in x) for x in blocks[protein])))
medcon = ",".join(str(e) for e in med_consensus[protein])

# Trying to reconstruct the url for Protter using idr, biotin and PTMs
tw = tools_whitelist
if tw is None:
tw = ['VLXT','VSL2b','PrDOS','PV2','IUPred-S','IUPred-L','Espritz-N','Espritz-X','Espritz-D']
gene_name = metadata[protein][col_names.index('uniprot.id')]+" : "+metadata[protein][col_names.index('gene.names')]+" : "+metadata[protein][col_names.index('protein.names')]+" : " + str(",".join(tw))
#print(gene_name+"\n\n")
protter_url = "http://wlab.ethz.ch/protter/create?up="+protein+"&tm=auto&mc=lightsalmon&lc=blue&tml=numcount&numbers&legend&title="+gene_name+"&n:IDRs,fc:red,bc:peachpuff="+idr_ranges+"&n:Phosphorylation,s:box,cc:aqua,fc:midnightblue,bc:cornflowerblue="+metadata[protein][col_names.index('Phosphorylation')]+"&n:Ubiqutination,s:diamond,cc:black,fc:olive,bc:greenyellow="+metadata[protein][col_names.index('Ubiqutination')]+"&n:Acetylation,s:box,cc:lightpink,fc:lightpink,bc:deeppink="+metadata[protein][col_names.index('Acetylation')]+"&n:Biotins,s:circ,cc:white,fc:forestgreen,bc:forestgreen="+metadata[protein][col_names.index('loc.hits')]+"&format=pdf"
protter_url = re.sub("=NA&n:","=&n:", protter_url)
#print(protter_url)


# Write results to outfile
outf.write(("\t".join(metadata[protein])+"\t"+"\t".join(map(str, (idr_ranges,medcon,np.median(med_consensus[protein]),protein_length, total_idr, fraction_idr, len(blocks[protein]),protter_url)))) + "\n")

outf.close()


# Run a test protein using d2p2 database
protein = idr_data_available.pop()
bl = blocks[protein]
print("IDRs for " + protein+ " are : "+','.join(map(str, ('-'.join(str(e)for e in x) for x in bl))))
print("Length of " +protein+" : "+str(sequence_length_idr[protein]))

# Takes 3:30 mins for ~3000 proteins
Binary file added Input/Ab-APEX-with-biotins-ptms-by-gene.rds
Binary file not shown.
Binary file added Input/BioSITe-with-biotins-ptms-by-gene.rds
Binary file not shown.
Binary file not shown.
Binary file added Input/DiDBIT-with-biotins-ptms-by-gene.rds
Binary file not shown.
Binary file added Input/HEK293-GO-annotation-with-ancestors.rds
Binary file not shown.
Binary file added Input/HEK293-annotated-list-of-proteins.rds
Binary file not shown.
Binary file added Input/HEK293-with-IDR-VSL2b-and-PTMs.rds
Binary file not shown.
Binary file added Input/PTMs-by-gene-from-Phosphositeplus.rds
Binary file not shown.
Binary file added Input/SpotBioID-with-biotins-ptms-by-gene.rds
Binary file not shown.
Binary file added Input/go-pro-kegg.rda
Binary file not shown.
78 changes: 76 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,76 @@
# biotinIDR
Contains code and input files for Biotin IDR study by David Paul Minde and Manasa Ramakrishna
# biotinIDR : Contains code and input files for Biotin IDR study by David Paul Minde and Manasa Ramakrishna

#### Main code : R and Python code for analysing data

Note: Before you run this code, you will need to download the following scripts
a. R installed on your computer
b. Python installed on your computer
c. GO.R from https://github.com/TomSmithCGAT/CamProt_R/blob/master/GO.R (if running from scratch you'll need to modify the code to be able to read in the column "to.id" instead of "GO.ID". Alternatively, you can change the coloumn names in your input dataframe to be "GO.ID"
d. 'd2p2.py' and 'protinfo.py' from https://github.com/TomSmithCGAT/CamProt/tree/master/camprot/proteomics (needs to be complied)

File | Description | Language/Format
--------------------|--------------------------- |-----
biopepFunctions.R | Functions required for the R code to run | R
biopep.pub.Rmd | R markdown file that contains the main code used in the data analysis | R
nods.pub.html | HTML rendering of R markdown file biopep.pub.Rmd | Html
Get_IDRs-DM-v2.py | Adds IDR info for all proteins in each of the biotin studies. | Python
runpy.sh | Calls Get_IDRs-DM-v2.py using various inputs and parameters | Shell
printUrl.py | Uses Protter URLs generated by Get_IDRs-DM-v2.py and retrieved PDF files into specific folders | Python
runUrl.sh |Calls printUrl.py on a file of interest | Shell
GO.R | Tom's code to add ancestor terms to existing GO terms | R
d2p2.py | Tom's code to use the d2p2 API to use 9 different callers to identify intrinsically disordered regions | Python
protinfo.py | Tom's code to query uniprot/swissprot database on Ensembl | Python

#### Input : Folder containing all input files needed for code to run

Ab-APEX-with-biotins-ptms-by-gene.rds | Biotinylation sites from Ab-APEX study summarised by gene with IDR and PTM information added in
BioSITe-with-biotins-ptms-by-gene.rds | Biotinylation sites from BioSITE study summarised by gene with IDR and PTM information added in
DiDBIT-with-biotins-ptms-by-gene.rds | Biotinylation sites from DiDBIT study summarised by gene with IDR and PTM information added in
SpotBioID-with-biotins-ptms-by-gene.rds | Biotinylation sites from Ab-APEX study summarised by gene with IDR and PTM information added in
Biotin-PTM-IDR-data-for-all-studies-and-callers.rds | Biotin,IDR and PTM information for all studies and caller combinations
HEK293-annotated-list-of-proteins.rds | HEK293 list of proteins from Geiger et al
HEK293-GO-annotation-with-ancestors.rds | GO annotations with ancestor terms for the HEK293 background protein list from Geiger et al
HEK293-with-IDR-VSL2b-and-PTMs.rds | HEK293 peoteins from Geiger et al with PTMs and IDRs based on the algorithm VSL2b
PTMs-by-gene-from-Phosphositeplus.rds | PTM data from PhosphositePlus summarised by gene and for phosphorylation, ubiquitination, sumoylation, acetylation
go-pro-kegg.rda | An object that contains all GO, Interpro and KEGG annotations for proteins in each of the 4 studies

# RDS files can be read using the command readRDS as shown below
readRDS("Input/Biotin-PTM-IDR-data-for-all-studies-and-callers.rds")

# RDA files can be loaded using the command "load" as shown below
load("Input/go-pro-kegg.rda")

#### Output : A description of all files generated by running nods_final.Rmd.
##### Date_Output : Date-tagged output folder This folder should contain the following files
Most of these files are related to the Figures in the main manuscript with some additional information

File| Contents
----------|-------------
Figure3A_Number.biotins.vs.FractionIDR-VSL2b.pdf | Violin plots comparing fraction of IDR and biotin number in each protein
Figure3B_Distribution-of-Biotins-in.out.of.IDRs-by-caller-and-study.pdf | Bar plot showing expected and observed rates of biotin for all studies and callers
Figure3B_Binomial-test-stats.txt | Bionomial test values and parameters for all studies and callers looking at obs biotins in IDRs
Figure3C_Distribution-of-FPU-by-caller-parameters-and-study.pdf | Number of proteins in FPU categories in each study and by caller
Figure3D_VSL2b_Biotins-in-IDR.vs.IDR-classes.violinPlots.pdf | Violin plots showing biotins in IDRs separated by FPU for caller VSL2b
Figure4A_Num.idrs.vs.num.ptms_HEK293-vs-biotin.pdf | Correlation plot between number of PTMs and Biotins for HEK293 proteome (Geiger) and HEK203 biotinome
Figure4B_Distribution-of-PTM-types-in-HEK-vs-in-Biotin-studies.pdf | Boxplots showing PTM counts by type in HEK293 proteome and biotinome
Figure4C_Binomial-test-stats.txt | Bionomial test values and parameters for all studies and callers looking at obs PTMs in IDRs
Figure4C_Distribution-of-PTMs-in.out.of.IDRs-by-study-VSL2b.pdf | Bar plot showing expected and observed rates of all types of PTMs for all studies
Figure4D_VSL2b_PTMs-in-IDR.vs.IDR-classes.violinPlots.pdf | Violin plots showing all PTMs in IDRs separated by FPU for caller VSL2b
Figure4D_VSL2b_Acytelation.vs.IDR-classes.violinPlots.pdf | Violin plots showing acetylation marks in IDRs separated by FPU for caller VSL2b
Figure4D_VSL2b_Phosphorylation.vs.IDR-classes.violinPlots.pdf | Violin plots showing phosphorylation marks in IDRs separated by FPU for caller VSL2b
Figure4D_VSL2b_Sumoylation.vs.IDR-classes.violinPlots.pdf | Violin plots showing sumoylation marks in IDRs separated by FPU for caller VSL2b
Figure4D_VSL2b_Ubiquitination.vs.IDR-classes.violinPlots.pdf | Violin plots showing ubiquitination marks in IDRs separated by FPU for caller VSL2b
FigureS1A_VennDiagrams-for-biotinylation-data.pdf | Venn diagram showing overlap of proteins in biotinylation data
FigureS1C_Biotin-GO-CC-enrichment-with-anc-terms_8_enricher-dotplot.pdf | Bubble plots showing top 8 GO enrichment terms (including ancestors) for "cellular component" for each of the 4 studies
Biotin-GO-enrichment-with-anc-terms_8_enricher-dotplot.pdf | Bubble plots showing top 8 GO enrichment terms (including ancestors) for each of the 4 studies
Biotin-GO-enrichment_8_enricher-dotplot.pdf | Bubble plots showing top 8 GO enrichment terms for each of the 4 studies. No ancestor terms used.
Biotin-GO-CC-enrichment_8_enricher-dotplot.pdf | Bubble plots showing top 8 GO enrichment terms for "cellular component" for each of the 4 studies. No ancestor terms used.
GO-KEGG-enrichment-with-whole-human-genome-bg.pdf | GO enrichment plotting using clusterProfiler's inbuilt code and whole human genome as background
Effect-size-for-pairwise-comparison-of-PTMs-in-IDRs-VSL2b.pdf | Effect size plot based on Tukey's HSD for PTMs in IDRs
Effect-size-for-pairwise-comparison-of-biotin-in-IDRs-VSL2b.pdf | Effect size plot based on Tukey's HSD for PTMs in IDRs
Date_Time_Effect-size-for-pairwise-comparison-of-IDR-classes-and-parameters-VSL2b.pdf | Effect size plot based on Tukey's HSD for multiple parameters and IDR classes using VSL2b calls for IDR
Date_Time_VSL2b_Pairwise-t-test-pvals.txt
Date_Time_VSL2b_Tukey-HSD-ci-pvals.txt


### SupplTables.xlsx : Supplementary tables referred to in the manuscript. Contain data that can be queried/used. Contains 5 datasheets and an "Index" sheet describing each of the data sheets.
Binary file added SupplTables.xlsx
Binary file not shown.
Loading

0 comments on commit 8e5b5c7

Please sign in to comment.