Skip to content

Commit

Permalink
-Created FimoEvaluator that inserts documents into Mongodb instance.
Browse files Browse the repository at this point in the history
-It is perfect and amazing in everyway
-Several points we need to refactor (including clarifying the data structure in
which it is stored.
  • Loading branch information
WU Jia committed Aug 11, 2014
1 parent eb822de commit 46097fa
Show file tree
Hide file tree
Showing 12 changed files with 24,312 additions and 26 deletions.
Binary file added scripts/.FimoEvaluator.py.swo
Binary file not shown.
Binary file added scripts/.FimoEvaluator.py.swp
Binary file not shown.
186 changes: 186 additions & 0 deletions scripts/CandidateSequences.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
#!/usr/bin/python
MOTIF_FAMILY = ["SMAD3","SMAD4"]
#i'm thinking about creating an iterator to store my stuff instead...
#add_entry adds a dict to the iterator
import scipy.stats as sci
import pdb
from pymongo import MongoClient

class CandidateSequences:

def __init__(self):
self.entry_number = 0
self.sequence_dict = {}
self.senspec_scores = {}
self.sensitivity_scores = {}
self.specificity_scores = {}
self.mongo_client = ""

def add_entry(self, line):
#split the line into a dict
try:
entry_dict = self.parse_fimo(line)
except IndexError:
return False

#check if the entry is valid
if self.is_valid(entry_dict):
#add the key if it does not exist
key = entry_dict['sequence_name']
if key not in self.sequence_dict:
self.sequence_dict[key] = []
#append the entry to the key-value pair, where the value is a list
self.sequence_dict[key].append(entry_dict)
return True
else:
return False


def calculate_senspec(self):
for key in self.sequence_dict:
entry_list = self.sequence_dict[key]

# to calculate sensitivity, get the zscore of the family and divide it by
# the nonoverlapping number of motifs

# to calculate specificity, get the zscore of the family and divide it by
# the nubmer of motifs that bind that are not part of the family

# to calculate the senspec score, multiply the sensitivity score with the
# specificity score

ontarget_list = self.get_ontarget_motifs(entry_list)
#print(list(ontarget_list))
nonoverlapping_ontarget_list = self.get_nonoverlapping_motifs(ontarget_list)

n_fam = len(nonoverlapping_ontarget_list)
#calculate combined zscore
z_fam = self.get_combined_zscore(nonoverlapping_ontarget_list)

if n_fam == 0 :
sensitivity_score = 0
z_fam = 0
else:
sensitivity_score = z_fam/n_fam #n_fam contains non-overlapping motifs

offtarget_list = self.get_offtarget_motifs(entry_list)
offtarget_list = list(offtarget_list)
n_other = len(offtarget_list)

if n_other == 0:
specificity_score = z_fam/1
else:
specificity_score = z_fam/n_other #n_other contains motifs including overlapping

senspec_score = specificity_score * sensitivity_score

#write to object's collection of dicts

self.senspec_scores[key]=senspec_score
self.sensitivity_scores[key] = sensitivity_score
self.specificity_scores[key] = specificity_score

def insert_contents(self):
#i need to abstract this later on, and create a settings file
mongo_client = MongoClient('hera.chem-eng.northwestern.edu',27017)
db = mongo_client['SeqGen_Database']
scores_collection = db['Sequence_Scores']
entries_collection = db['Sequence_Entries']

#bulk insert all entries
#sequence_dict is a dict with the key as the name and the entries as the
#entries
for key in self.sequence_dict:
entries_collection.insert(self.sequence_dict[key])

score_dict_list = []
#insert scores
for key in self.senspec_scores:
score_dict = {}
score_dict['name'] = key
score_dict['senspec'] = self.senspec_scores[key]
score_dict['specificity'] = self.specificity_scores[key]
score_dict['sensitivity'] = self.sensitivity_scores[key]
score_dict_list.append(score_dict)

scores_collection.insert(score_dict_list)
mongo_client.close()
return True

#helper methods
def get_combined_zscore(self, list):
#if len(list) > 1:
#else:
#combined_zscore= sci.norm.ppf(list[0]['p_value'])
#[sci.norm.ppf(item['p_value']) for item in list]
combined_zscore = sum(( (-1) * sci.norm.ppf(float(item['p_value'])) ) for item in list)
return combined_zscore

def parse_fimo(self,line):
entry_dict = {}
entry = line.rstrip("\n").split("\t")
entry_dict['motif_name'] = entry[0]
entry_dict['sequence_name'] = entry[1]
entry_dict['start'] = entry[2]
entry_dict['stop'] = entry[3]
entry_dict['strand'] = entry[4]
entry_dict['p_value'] = entry[6]
entry_dict['matched_sequence'] = entry[8]
return entry_dict

def is_valid(self,entry_dict):
#check if the strand is positive
positive = '+'
if entry_dict['strand'] == positive:
return True
else:
return False

def get_dict(self):
return self.sequence_dict

def get_nonoverlapping_motifs(self,entry_list):
non_overlap_list = []
filtered_list = []
#first sort list
entry_list = sorted(entry_list, key=lambda k: float(k['p_value']))
if len(entry_list) > 0:
#pop the first value out, add it to the non_overlap.
non_overlap_list.append(entry_list.pop(0))

filtered_list = entry_list

while(len(filtered_list) > 0):

current_dict = non_overlap_list[-1]
#[item for item in iterable if function(item)]
filtered_list = [dict for dict in filtered_list if self.is_not_overlapping(dict, current_dict)]
if len(filtered_list) > 0:
non_overlap_list.append(filtered_list.pop(0))

return non_overlap_list

def is_not_overlapping(self, dicta, dictb):
a=[dicta['start'], dicta['stop']]
b=[dictb['start'], dictb['stop']]
#a and b are coordinates
a = list(map(int,a))
b = list(map(int,b))
overlap = min(a[1],b[1]) - max(a[0],b[0])
if (overlap < 0):
return True
else:
return False

def get_ontarget_motifs(self,entry_list):
ontargets = (item for item in entry_list if any(motif in item['motif_name'] for motif in MOTIF_FAMILY))
return ontargets
# search list of dicts, motif_name section.
# get all dicts that have a certain motif name

def get_offtarget_motifs(self,entry_list):
offtargets = (item for item in entry_list if not any(motif in item['motif_name'] for motif in MOTIF_FAMILY))
return offtargets
# search list of dicts, motif_name section.
# get all dicts that have a certain motif name

Binary file added scripts/CandidateSequences.pyc
Binary file not shown.
57 changes: 57 additions & 0 deletions scripts/FimoEvaluator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/python

import pdb
import sys
from CandidateSequences import CandidateSequences

#main runtime, passes in the FIMO result pipe from shell script
#overall idea is to process the entries and sort them as them come in.
if __name__ == "__main__":
result_list = []
#instead of a pure dict, instantiate a object that holds a dict and contains a
#number of methods
sequence_container = CandidateSequences()
for place,line in enumerate(sys.stdin):
#skip the first line
if place:
#create a dict, store entries in the dict
#pass in old dict into function and line,
#pdb.set_trace()
is_valid = sequence_container.add_entry(line)
#function parses line, and appends entry to the key
result_list.append(line)

#after the results are aggregated, perform calculations
sequence_container.calculate_senspec()
#insert to model
#sequence_container.insert_contents(Model)
sequence_container.insert_contents()

big_dict = sequence_container.get_dict()
with open('testing.txt','w') as testfile:
#for key in big_dict:
#testfile.write(key + '\n')
testfile.write("".join(result_list))

#write parseline helper function that belongs to dict object

#write parse line exceptions function that removes '+' strand
#write/create dict object that stores keys
#insert entries and return summary count


#when all insertion is done:

#summarize results from FIMO file. get sequence score
#get on target and off target motifs
#calculate sensitivity and specificity, return that information or store it?
#these methods should be stored in the FIMOData object
#access functions, getScores, getBlah

#finally, insert database into mongo

#should I write a model wrapper? nah its okay for now
#use pymongo as a model wrapper
#insert summarized batch into mongodb database


Binary file not shown.
13 changes: 13 additions & 0 deletions scripts/fimo_eval_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash

#this will be a test script to make sure the outputs are correctly piped from
#to the python output

sequence_file="/projects/p20519/jia_output/FIMO/SMAD3_02_2_10000000.0488"
output_file="/projects/p20519/jia_output/FIMO/P53_01/SMAD3_test.txt"

mkfifo ${output_file}
fimo --max-stored-scores 500000000 --thresh 0.0001 --max-seq-length 250000000 --text /projects/p20519/jia_output/TRANSFAC2FIMO_3242014.txt ${sequence_file} >> ${output_file} &

cat ${output_file} | python FimoEvaluator.py

2 changes: 1 addition & 1 deletion scripts/rename_file_lines.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
for filename in $(find /projects/p20519/jia_output/FIMO/P53_01_*.txt -type f); do
for filename in $(find /projects/p20519/jia_output/FIMO/*.txt -type f); do
#decompose filename
base_filename=$(basename "$filename")
extension="${base_filename##*.}"
Expand Down
27 changes: 2 additions & 25 deletions scripts/run_script3.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,32 +1,9 @@
#!/bin/sh

MOTIFS="GATA1_01
GATA1_02
GATA1_03
GATA1_04
GATA1_05
GATA1_06
GATA1_Q6
P53_01
P53_02
P53_03
P53_04
P53_05
TRP53_01
TRP53_02
TRP53_03
SMAD_Q6
SMAD_Q6_01
SMAD1_01
SMAD2_Q6
SMAD3_03
SMAD3_Q6
SMAD3_Q6_01
SMAD4_04
SMAD5_Q5"
MOTIFS="SMAD3_02"
COUNTER="1"

for run_counter in {11..20}
for run_counter in {1..10}
do
COUNTER="$run_counter"
for motif_name in $MOTIFS
Expand Down
55 changes: 55 additions & 0 deletions scripts/run_script_library.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/bin/sh

MOTIFS="SMAD3_02"

COUNTER="1"

for run_counter in {1..10}
do
COUNTER="$run_counter"
for motif_name in $MOTIFS
do
cat <<EOS | msub -
#!/bin/bash
#MSUB -A p20519
#MSUB -M [email protected]
#MSUB -e seqGen_error_file.err
#MSUB -o seqGen_log_file.log
#MSUB -l walltime=24:00:00
#MSUB -l nodes=1:ppn=1
#MSUB -j oe
#MSUB -m bae
#MSUB -V
#MSUB -N ${motif_name}_${COUNTER}
module load boost
module load gcc
module load utilities
/home/jjw036/SequenceGenerator/bin/seqGen /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_10000000.txt /home/jjw036/SequenceGenerator/TRANSFAC2FIMO.txt $motif_name 10000000 50
7za a -t7z /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}.7z /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_10000000.txt
rm /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_10000000.txt
/home/jjw036/SequenceGenerator/bin/seqGen /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_1000000.txt /home/jjw036/SequenceGenerator/TRANSFAC2FIMO.txt $motif_name 1000000 50
7za a -t7z /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}.7z /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_1000000.txt
rm /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_1000000.txt
/home/jjw036/SequenceGenerator/bin/seqGen /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_100000.txt /home/jjw036/SequenceGenerator/TRANSFAC2FIMO.txt $motif_name 100000 50
7za a -t7z /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}.7z /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_100000.txt
rm /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_100000.txt
/home/jjw036/SequenceGenerator/bin/seqGen
/projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_10000.txt /home/jjw036/SequenceGenerator/TRANSFAC2FIMO.txt $motif_name 10000 50
7za a -t7z /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}.7z /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_10000.txt
rm /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_10000.txt
/home/jjw036/SequenceGenerator/bin/seqGen /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_1000.txt /home/jjw036/SequenceGenerator/TRANSFAC2FIMO.txt $motif_name 1000 50
7za a -t7z /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}.7z /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_1000.txt
rm /projects/p20519/jia_output/SequenceGenerator/${motif_name}_${COUNTER}_1000.txt
EOS
done
jobs_running=true
done
Loading

0 comments on commit 46097fa

Please sign in to comment.