Skip to content

Commit 7cee304

Browse files
author
nmih
committed
Resolve circular import for blast_pdb function
1 parent 0771500 commit 7cee304

4 files changed

Lines changed: 124 additions & 116 deletions

File tree

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@
55

66
setup(
77
name='ssbio',
8-
version='0.9.9.8',
8+
version='0.9.9.8a',
99
author='Nathan Mih',
1010
author_email='nmih@ucsd.edu',
1111
license='MIT',
1212
url='http://github.com/SBRG/ssbio',
13-
download_url = 'https://github.com/SBRG/ssbio/archive/v0.9.9.8.tar.gz',
13+
download_url = 'https://github.com/SBRG/ssbio/archive/v0.9.9.8a.tar.gz',
1414
description='Tools to enable structural systems biology',
1515
packages=find_packages(),
1616
package_dir={'ssbio': 'ssbio'},

ssbio/databases/pdb.py

Lines changed: 1 addition & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -433,111 +433,6 @@ def best_structures(uniprot_id, outname=None, outdir=None, seq_ident_cutoff=0.0,
433433
return data
434434

435435

436-
def blast_pdb(seq, outfile='', outdir='', evalue=0.0001, seq_ident_cutoff=0.0, link=False, force_rerun=False):
    """Return a list of BLAST hits of a sequence to available structures in the PDB.

    The BLAST XML is fetched from the RCSB REST service, or re-read from ``outfile`` when
    ``ssbio.utils.force_rerun`` reports that a new request is not needed.

    Args:
        seq (str): Your sequence, in string format
        outfile (str): Name of output file
        outdir (str, optional): Path to output directory. Default is the current directory.
        evalue (float, optional): Cutoff for the E-value - filters for significant hits. 0.001 is liberal, 0.0001 is stringent (default).
        seq_ident_cutoff (float, optional): Minimum fraction of identical residues (identities divided by the
            length of ``seq``, in decimal form) a hit needs in order to be kept
        link (bool, optional): Set to True if a link to the HTML results should be displayed
        force_rerun (bool, optional): If existing BLAST results should not be used, set to True. Default is False

    Returns:
        list: One dictionary per retained hit, in the order the hits appear in the BLAST XML. Possible keys are
        ``hit_pdb``, ``hit_pdb_chains``, ``hit_num_ident``, ``hit_percent_ident``, ``hit_num_similar``,
        ``hit_percent_similar``, ``hit_num_gaps``, ``hit_percent_gaps``, ``hit_evalue`` and ``hit_score``;
        a key is only present when the matching XML element exists. An empty list is returned when the
        REST request does not answer with HTTP status 200.

    Raises:
        ValueError: If ``seq`` is shorter than 12 residues.

    """

    if len(seq) < 12:
        raise ValueError('Sequence must be at least 12 residues long.')

    # Single URL template shared by the human-readable link and the XML request
    blast_url = ('http://www.rcsb.org/pdb/rest/getBlastPDB1?sequence={}&eCutOff={}'
                 '&maskLowComplexity=yes&matrix=BLOSUM62&outputFormat={}')

    if link:
        print('PDB results page: ' + blast_url.format(seq, evalue, 'HTML'))

    parser = etree.XMLParser(ns_clean=True)

    outfile = op.join(outdir, outfile)
    # NOTE(review): presumably force_rerun() is True when force_rerun is set or outfile does not exist yet -
    # confirm against ssbio.utils.force_rerun
    if ssbio.utils.force_rerun(force_rerun, outfile):
        req = requests.get(blast_url.format(seq, evalue, 'XML'))
        if req.status_code != 200:
            # Any non-200 answer ends up here, not only timeouts, so report the real status
            log.error('BLAST request failed with HTTP status {}'.format(req.status_code))
            return []

        # Work with the raw bytes: lxml refuses unicode strings that carry an XML encoding declaration,
        # and writing bytes keeps the saved file identical to what the server sent
        response = req.content

        # Save the XML file
        if outfile:
            with open(outfile, 'wb') as f:
                f.write(response)

        # Parse the XML payload
        tree = etree.ElementTree(etree.fromstring(response, parser))
        log.debug('Loaded BLAST results from REST server')
    else:
        tree = etree.parse(outfile, parser)
        log.debug('{}: Loaded existing BLAST XML results'.format(outfile))

    # Length of the original sequence - all percentages are relative to it
    len_orig = float(len(seq))

    hit_list = []

    for hit in tree.getroot().findall('BlastOutput_iterations/Iteration/Iteration_hits/Hit'):
        info = {}
        pdb_label = 'unknown hit'

        hitdef = hit.find('Hit_def')
        if hitdef is not None and hitdef.text:
            # Only the part before the first '|' is used; it is split on ':' into ID and chain fields
            fields = hitdef.text.split('|')[0].split(':')
            pdb_label = fields[0]
            info['hit_pdb'] = pdb_label.lower()
            info['hit_pdb_chains'] = fields[2].split(',') if len(fields) > 2 else []

        # One PDB can align to different parts of the sequence
        # Will just choose the top hit for this single PDB
        hsp = hit.find('Hit_hsps/Hsp')
        if hsp is None:
            log.debug('{}: no HSP reported, skipping hit'.format(pdb_label))
            continue

        # Number of identical residues
        hspi = hsp.find('Hsp_identity')
        if hspi is not None:
            num_ident = int(hspi.text)
            percent_ident = num_ident / len_orig

            if percent_ident < seq_ident_cutoff:
                log.debug('{}: does not meet sequence identity cutoff'.format(pdb_label))
                continue

            info['hit_num_ident'] = num_ident
            info['hit_percent_ident'] = percent_ident

        # Number of similar residues (positive hits) and total number of gaps
        # (unable to align in either query or subject)
        for tag, label in (('Hsp_positive', 'similar'), ('Hsp_gaps', 'gaps')):
            elem = hsp.find(tag)
            if elem is not None:
                count = int(elem.text)
                info['hit_num_' + label] = count
                info['hit_percent_' + label] = count / len_orig

        # E-value and score of BLAST
        for tag, key in (('Hsp_evalue', 'hit_evalue'), ('Hsp_score', 'hit_score')):
            elem = hsp.find(tag)
            if elem is not None:
                info[key] = float(elem.text)

        hit_list.append(info)

    log.debug("{}: Number of BLAST hits".format(len(hit_list)))
    return hit_list
539-
540-
541436
def blast_pdb_df(blast_results):
542437
"""Make a dataframe of BLAST results"""
543438
cols = ['hit_pdb', 'hit_pdb_chains', 'hit_evalue', 'hit_score', 'hit_num_ident', 'hit_percent_ident',
@@ -738,7 +633,7 @@ def get_bioassembly_info(pdb_id, biomol_num, cache=False, outdir=None, force_rer
738633
def download_biomol(pdb_id, biomol_num, outdir, file_type='pdb', force_rerun=False):
739634
import zlib
740635
from six.moves.urllib_error import URLError
741-
from six.moves.urllib.request import urlopen, urlretrieve
636+
from six.moves.urllib.request import urlopen
742637
import contextlib
743638

744639
ssbio.utils.make_dir(outdir)

ssbio/databases/pdb_seq.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
from os import path as op
2+
3+
import requests
4+
from lxml import etree
5+
6+
import ssbio.utils
7+
import logging
8+
log = logging.getLogger(__name__)
9+
10+
def blast_pdb(seq, outfile='', outdir='', evalue=0.0001, seq_ident_cutoff=0.0, link=False, force_rerun=False):
    """Return a list of BLAST hits of a sequence to available structures in the PDB.

    The BLAST XML is fetched from the RCSB REST service, or re-read from ``outfile`` when
    ``ssbio.utils.force_rerun`` reports that a new request is not needed.

    Args:
        seq (str): Your sequence, in string format
        outfile (str): Name of output file
        outdir (str, optional): Path to output directory. Default is the current directory.
        evalue (float, optional): Cutoff for the E-value - filters for significant hits. 0.001 is liberal, 0.0001 is stringent (default).
        seq_ident_cutoff (float, optional): Minimum fraction of identical residues (identities divided by the
            length of ``seq``, in decimal form) a hit needs in order to be kept
        link (bool, optional): Set to True if a link to the HTML results should be displayed
        force_rerun (bool, optional): If existing BLAST results should not be used, set to True. Default is False

    Returns:
        list: One dictionary per retained hit, in the order the hits appear in the BLAST XML. Possible keys are
        ``hit_pdb``, ``hit_pdb_chains``, ``hit_num_ident``, ``hit_percent_ident``, ``hit_num_similar``,
        ``hit_percent_similar``, ``hit_num_gaps``, ``hit_percent_gaps``, ``hit_evalue`` and ``hit_score``;
        a key is only present when the matching XML element exists. An empty list is returned when the
        REST request does not answer with HTTP status 200.

    Raises:
        ValueError: If ``seq`` is shorter than 12 residues.

    """

    if len(seq) < 12:
        raise ValueError('Sequence must be at least 12 residues long.')

    # Single URL template shared by the human-readable link and the XML request
    blast_url = ('http://www.rcsb.org/pdb/rest/getBlastPDB1?sequence={}&eCutOff={}'
                 '&maskLowComplexity=yes&matrix=BLOSUM62&outputFormat={}')

    if link:
        print('PDB results page: ' + blast_url.format(seq, evalue, 'HTML'))

    parser = etree.XMLParser(ns_clean=True)

    outfile = op.join(outdir, outfile)
    # NOTE(review): presumably force_rerun() is True when force_rerun is set or outfile does not exist yet -
    # confirm against ssbio.utils.force_rerun
    if ssbio.utils.force_rerun(force_rerun, outfile):
        req = requests.get(blast_url.format(seq, evalue, 'XML'))
        if req.status_code != 200:
            # Any non-200 answer ends up here, not only timeouts, so report the real status
            log.error('BLAST request failed with HTTP status {}'.format(req.status_code))
            return []

        # Work with the raw bytes: lxml refuses unicode strings that carry an XML encoding declaration,
        # and writing bytes keeps the saved file identical to what the server sent
        response = req.content

        # Save the XML file
        if outfile:
            with open(outfile, 'wb') as f:
                f.write(response)

        # Parse the XML payload
        tree = etree.ElementTree(etree.fromstring(response, parser))
        log.debug('Loaded BLAST results from REST server')
    else:
        tree = etree.parse(outfile, parser)
        log.debug('{}: Loaded existing BLAST XML results'.format(outfile))

    # Length of the original sequence - all percentages are relative to it
    len_orig = float(len(seq))

    hit_list = []

    for hit in tree.getroot().findall('BlastOutput_iterations/Iteration/Iteration_hits/Hit'):
        info = {}
        pdb_label = 'unknown hit'

        hitdef = hit.find('Hit_def')
        if hitdef is not None and hitdef.text:
            # Only the part before the first '|' is used; it is split on ':' into ID and chain fields
            fields = hitdef.text.split('|')[0].split(':')
            pdb_label = fields[0]
            info['hit_pdb'] = pdb_label.lower()
            info['hit_pdb_chains'] = fields[2].split(',') if len(fields) > 2 else []

        # One PDB can align to different parts of the sequence
        # Will just choose the top hit for this single PDB
        hsp = hit.find('Hit_hsps/Hsp')
        if hsp is None:
            log.debug('{}: no HSP reported, skipping hit'.format(pdb_label))
            continue

        # Number of identical residues
        hspi = hsp.find('Hsp_identity')
        if hspi is not None:
            num_ident = int(hspi.text)
            percent_ident = num_ident / len_orig

            if percent_ident < seq_ident_cutoff:
                log.debug('{}: does not meet sequence identity cutoff'.format(pdb_label))
                continue

            info['hit_num_ident'] = num_ident
            info['hit_percent_ident'] = percent_ident

        # Number of similar residues (positive hits) and total number of gaps
        # (unable to align in either query or subject)
        for tag, label in (('Hsp_positive', 'similar'), ('Hsp_gaps', 'gaps')):
            elem = hsp.find(tag)
            if elem is not None:
                count = int(elem.text)
                info['hit_num_' + label] = count
                info['hit_percent_' + label] = count / len_orig

        # E-value and score of BLAST
        for tag, key in (('Hsp_evalue', 'hit_evalue'), ('Hsp_score', 'hit_score')):
            elem = hsp.find(tag)
            if elem is not None:
                info[key] = float(elem.text)

        hit_list.append(info)

    log.debug("{}: Number of BLAST hits".format(len(hit_list)))
    return hit_list

ssbio/protein/sequence/seqprop.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,10 @@
1818
from Bio.SeqRecord import SeqRecord
1919
from Bio.SeqFeature import SeqFeature, FeatureLocation, ExactPosition, CompoundLocation
2020
from more_itertools import locate
21+
22+
import ssbio.databases.pdb_seq
2123
from ssbio.core.object import Object
2224
import ssbio.utils
23-
import ssbio.databases.pdb
2425
import ssbio.protein.sequence.utils
2526
import ssbio.protein.sequence.utils.fasta
2627
import ssbio.protein.sequence.properties.residues
@@ -731,13 +732,13 @@ def blast_pdb(self, seq_ident_cutoff=0, evalue=0.0001, display_link=False,
731732
return None
732733

733734
try:
734-
blast_results = ssbio.databases.pdb.blast_pdb(self.seq_str,
735-
outfile='{}_blast_pdb.xml'.format(custom_slugify(self.id)),
736-
outdir=outdir,
737-
force_rerun=force_rerun,
738-
evalue=evalue,
739-
seq_ident_cutoff=seq_ident_cutoff,
740-
link=display_link)
735+
blast_results = ssbio.databases.pdb_seq.blast_pdb(self.seq_str,
736+
outfile='{}_blast_pdb.xml'.format(custom_slugify(self.id)),
737+
outdir=outdir,
738+
force_rerun=force_rerun,
739+
evalue=evalue,
740+
seq_ident_cutoff=seq_ident_cutoff,
741+
link=display_link)
741742
except requests.ConnectionError as e:
742743
log.error('{}: BLAST request timed out'.format(self.id))
743744
print(e)

0 commit comments

Comments
 (0)