-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
-Created FimoEvaluator that inserts documents into Mongodb instance.
-It is perfect and amazing in everyway -Several points we need to refactor (including clarifying the data structure in which it is stored.
- Loading branch information
WU Jia
committed
Aug 11, 2014
1 parent
eb822de
commit 46097fa
Showing
12 changed files
with
24,312 additions
and
26 deletions.
There are no files selected for viewing
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,186 @@ | ||
#!/usr/bin/python | ||
MOTIF_FAMILY = ["SMAD3","SMAD4"] | ||
#i'm thinking about creating an iterator to store my stuff instead... | ||
#add_entry adds a dict to the iterator | ||
import scipy.stats as sci | ||
import pdb | ||
from pymongo import MongoClient | ||
|
||
class CandidateSequences: | ||
|
||
def __init__(self): | ||
self.entry_number = 0 | ||
self.sequence_dict = {} | ||
self.senspec_scores = {} | ||
self.sensitivity_scores = {} | ||
self.specificity_scores = {} | ||
self.mongo_client = "" | ||
|
||
def add_entry(self, line): | ||
#split the line into a dict | ||
try: | ||
entry_dict = self.parse_fimo(line) | ||
except IndexError: | ||
return False | ||
|
||
#check if the entry is valid | ||
if self.is_valid(entry_dict): | ||
#add the key if it does not exist | ||
key = entry_dict['sequence_name'] | ||
if key not in self.sequence_dict: | ||
self.sequence_dict[key] = [] | ||
#append the entry to the key-value pair, where the value is a list | ||
self.sequence_dict[key].append(entry_dict) | ||
return True | ||
else: | ||
return False | ||
|
||
|
||
def calculate_senspec(self): | ||
for key in self.sequence_dict: | ||
entry_list = self.sequence_dict[key] | ||
|
||
# to calculate sensitivity, get the zscore of the family and divide it by | ||
# the nonoverlapping number of motifs | ||
|
||
# to calculate specificity, get the zscore of the family and divide it by | ||
# the nubmer of motifs that bind that are not part of the family | ||
|
||
# to calculate the senspec score, multiply the sensitivity score with the | ||
# specificity score | ||
|
||
ontarget_list = self.get_ontarget_motifs(entry_list) | ||
#print(list(ontarget_list)) | ||
nonoverlapping_ontarget_list = self.get_nonoverlapping_motifs(ontarget_list) | ||
|
||
n_fam = len(nonoverlapping_ontarget_list) | ||
#calculate combined zscore | ||
z_fam = self.get_combined_zscore(nonoverlapping_ontarget_list) | ||
|
||
if n_fam == 0 : | ||
sensitivity_score = 0 | ||
z_fam = 0 | ||
else: | ||
sensitivity_score = z_fam/n_fam #n_fam contains non-overlapping motifs | ||
|
||
offtarget_list = self.get_offtarget_motifs(entry_list) | ||
offtarget_list = list(offtarget_list) | ||
n_other = len(offtarget_list) | ||
|
||
if n_other == 0: | ||
specificity_score = z_fam/1 | ||
else: | ||
specificity_score = z_fam/n_other #n_other contains motifs including overlapping | ||
|
||
senspec_score = specificity_score * sensitivity_score | ||
|
||
#write to object's collection of dicts | ||
|
||
self.senspec_scores[key]=senspec_score | ||
self.sensitivity_scores[key] = sensitivity_score | ||
self.specificity_scores[key] = specificity_score | ||
|
||
def insert_contents(self): | ||
#i need to abstract this later on, and create a settings file | ||
mongo_client = MongoClient('hera.chem-eng.northwestern.edu',27017) | ||
db = mongo_client['SeqGen_Database'] | ||
scores_collection = db['Sequence_Scores'] | ||
entries_collection = db['Sequence_Entries'] | ||
|
||
#bulk insert all entries | ||
#sequence_dict is a dict with the key as the name and the entries as the | ||
#entries | ||
for key in self.sequence_dict: | ||
entries_collection.insert(self.sequence_dict[key]) | ||
|
||
score_dict_list = [] | ||
#insert scores | ||
for key in self.senspec_scores: | ||
score_dict = {} | ||
score_dict['name'] = key | ||
score_dict['senspec'] = self.senspec_scores[key] | ||
score_dict['specificity'] = self.specificity_scores[key] | ||
score_dict['sensitivity'] = self.sensitivity_scores[key] | ||
score_dict_list.append(score_dict) | ||
|
||
scores_collection.insert(score_dict_list) | ||
mongo_client.close() | ||
return True | ||
|
||
#helper methods | ||
def get_combined_zscore(self, list): | ||
#if len(list) > 1: | ||
#else: | ||
#combined_zscore= sci.norm.ppf(list[0]['p_value']) | ||
#[sci.norm.ppf(item['p_value']) for item in list] | ||
combined_zscore = sum(( (-1) * sci.norm.ppf(float(item['p_value'])) ) for item in list) | ||
return combined_zscore | ||
|
||
def parse_fimo(self,line): | ||
entry_dict = {} | ||
entry = line.rstrip("\n").split("\t") | ||
entry_dict['motif_name'] = entry[0] | ||
entry_dict['sequence_name'] = entry[1] | ||
entry_dict['start'] = entry[2] | ||
entry_dict['stop'] = entry[3] | ||
entry_dict['strand'] = entry[4] | ||
entry_dict['p_value'] = entry[6] | ||
entry_dict['matched_sequence'] = entry[8] | ||
return entry_dict | ||
|
||
def is_valid(self,entry_dict): | ||
#check if the strand is positive | ||
positive = '+' | ||
if entry_dict['strand'] == positive: | ||
return True | ||
else: | ||
return False | ||
|
||
def get_dict(self): | ||
return self.sequence_dict | ||
|
||
def get_nonoverlapping_motifs(self,entry_list): | ||
non_overlap_list = [] | ||
filtered_list = [] | ||
#first sort list | ||
entry_list = sorted(entry_list, key=lambda k: float(k['p_value'])) | ||
if len(entry_list) > 0: | ||
#pop the first value out, add it to the non_overlap. | ||
non_overlap_list.append(entry_list.pop(0)) | ||
|
||
filtered_list = entry_list | ||
|
||
while(len(filtered_list) > 0): | ||
|
||
current_dict = non_overlap_list[-1] | ||
#[item for item in iterable if function(item)] | ||
filtered_list = [dict for dict in filtered_list if self.is_not_overlapping(dict, current_dict)] | ||
if len(filtered_list) > 0: | ||
non_overlap_list.append(filtered_list.pop(0)) | ||
|
||
return non_overlap_list | ||
|
||
def is_not_overlapping(self, dicta, dictb): | ||
a=[dicta['start'], dicta['stop']] | ||
b=[dictb['start'], dictb['stop']] | ||
#a and b are coordinates | ||
a = list(map(int,a)) | ||
b = list(map(int,b)) | ||
overlap = min(a[1],b[1]) - max(a[0],b[0]) | ||
if (overlap < 0): | ||
return True | ||
else: | ||
return False | ||
|
||
def get_ontarget_motifs(self,entry_list): | ||
ontargets = (item for item in entry_list if any(motif in item['motif_name'] for motif in MOTIF_FAMILY)) | ||
return ontargets | ||
# search list of dicts, motif_name section. | ||
# get all dicts that have a certain motif name | ||
|
||
def get_offtarget_motifs(self,entry_list): | ||
offtargets = (item for item in entry_list if not any(motif in item['motif_name'] for motif in MOTIF_FAMILY)) | ||
return offtargets | ||
# search list of dicts, motif_name section. | ||
# get all dicts that have a certain motif name | ||
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
#!/usr/bin/python | ||
|
||
import pdb | ||
import sys | ||
from CandidateSequences import CandidateSequences | ||
|
||
#main runtime, passes in the FIMO result pipe from shell script | ||
#overall idea is to process the entries and sort them as them come in. | ||
if __name__ == "__main__": | ||
result_list = [] | ||
#instead of a pure dict, instantiate a object that holds a dict and contains a | ||
#number of methods | ||
sequence_container = CandidateSequences() | ||
for place,line in enumerate(sys.stdin): | ||
#skip the first line | ||
if place: | ||
#create a dict, store entries in the dict | ||
#pass in old dict into function and line, | ||
#pdb.set_trace() | ||
is_valid = sequence_container.add_entry(line) | ||
#function parses line, and appends entry to the key | ||
result_list.append(line) | ||
|
||
#after the results are aggregated, perform calculations | ||
sequence_container.calculate_senspec() | ||
#insert to model | ||
#sequence_container.insert_contents(Model) | ||
sequence_container.insert_contents() | ||
|
||
big_dict = sequence_container.get_dict() | ||
with open('testing.txt','w') as testfile: | ||
#for key in big_dict: | ||
#testfile.write(key + '\n') | ||
testfile.write("".join(result_list)) | ||
|
||
#write parseline helper function that belongs to dict object | ||
|
||
#write parse line exceptions function that removes '+' strand | ||
#write/create dict object that stores keys | ||
#insert entries and return summary count | ||
|
||
|
||
#when all insertion is done: | ||
|
||
#summarize results from FIMO file. get sequence score | ||
#get on target and off target motifs | ||
#calculate sensitivity and specificity, return that information or store it? | ||
#these methods should be stored in the FIMOData object | ||
#access functions, getScores, getBlah | ||
|
||
#finally, insert database into mongo | ||
|
||
#should I write a model wrapper? nah its okay for now | ||
#use pymongo as a model wrapper | ||
#insert summarized batch into mongodb database | ||
|
||
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#!/bin/bash | ||
|
||
#this will be a test script to make sure the outputs are correctly piped from | ||
#to the python output | ||
|
||
sequence_file="/projects/p20519/jia_output/FIMO/SMAD3_02_2_10000000.0488" | ||
output_file="/projects/p20519/jia_output/FIMO/P53_01/SMAD3_test.txt" | ||
|
||
mkfifo ${output_file} | ||
fimo --max-stored-scores 500000000 --thresh 0.0001 --max-seq-length 250000000 --text /projects/p20519/jia_output/TRANSFAC2FIMO_3242014.txt ${sequence_file} >> ${output_file} & | ||
|
||
cat ${output_file} | python FimoEvaluator.py | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
#!/bin/sh | ||
|
||
MOTIFS="SMAD3_02" | ||
|
||
COUNTER="1" | ||
|
||
for run_counter in {1..10} | ||
do | ||
COUNTER="$run_counter" | ||
for motif_name in $MOTIFS | ||
do | ||
cat <<EOS | msub - | ||
#!/bin/bash | ||
#MSUB -A p20519 | ||
#MSUB -M [email protected] | ||
#MSUB -e seqGen_error_file.err | ||
#MSUB -o seqGen_log_file.log | ||
#MSUB -l walltime=24:00:00 | ||
#MSUB -l nodes=1:ppn=1 | ||
#MSUB -j oe | ||
#MSUB -m bae | ||
#MSUB -V | ||
#MSUB -N ${motif_name}_${COUNTER} | ||
module load boost | ||
module load gcc | ||
module load utilities | ||
/home/jjw036/SequenceGenerator/bin/seqGen /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_10000000.txt /home/jjw036/SequenceGenerator/TRANSFAC2FIMO.txt $motif_name 10000000 50 | ||
7za a -t7z /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}.7z /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_10000000.txt | ||
rm /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_10000000.txt | ||
/home/jjw036/SequenceGenerator/bin/seqGen /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_1000000.txt /home/jjw036/SequenceGenerator/TRANSFAC2FIMO.txt $motif_name 1000000 50 | ||
7za a -t7z /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}.7z /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_1000000.txt | ||
rm /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_1000000.txt | ||
/home/jjw036/SequenceGenerator/bin/seqGen /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_100000.txt /home/jjw036/SequenceGenerator/TRANSFAC2FIMO.txt $motif_name 100000 50 | ||
7za a -t7z /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}.7z /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_100000.txt | ||
rm /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_100000.txt | ||
/home/jjw036/SequenceGenerator/bin/seqGen | ||
/projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_10000.txt /home/jjw036/SequenceGenerator/TRANSFAC2FIMO.txt $motif_name 10000 50 | ||
7za a -t7z /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}.7z /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_10000.txt | ||
rm /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_10000.txt | ||
/home/jjw036/SequenceGenerator/bin/seqGen /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_1000.txt /home/jjw036/SequenceGenerator/TRANSFAC2FIMO.txt $motif_name 1000 50 | ||
7za a -t7z /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}.7z /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_1000.txt | ||
rm /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_1000.txt | ||
EOS | ||
done | ||
jobs_running=true | ||
done |
Oops, something went wrong.