BiG-SCAPE 1.1.5
- Added reference MIBiG 3.1 BGCs
- Added rules for 'prodigiosin' and 'PpyS-KS' (PKS Other); 'CDPS' and 'mycosporine-like' (NRPS); 'crocagin' (RiPP)
- Officially dropped Python 2 support
- Code Cleanup
jorgecnavarrom committed Nov 14, 2022
1 parent c2e87c8 commit 8552350
Showing 6 changed files with 1,030 additions and 249 deletions.
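The headline addition is the bundled MIBiG 3.1 reference set, which the CMD_parser hunk below wires to the --mibig flag. A minimal invocation using it, with the illustrative paths taken from the script's own usage example:

    python bigscape.py -c 8 --pfam_dir ./ -i ./inputfiles -o ./results --mibig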
Binary file added Annotated_MIBiG_reference/MIBiG_3.1_final.zip
Binary file not shown.
51 changes: 0 additions & 51 deletions ArrowerSVG.py
@@ -8,15 +8,8 @@
 # heavily modified by Jorge Navarro 2016 #
 ######################################################################
 
-# Makes sure the script can be used with Python 2 as well as Python 3.
-from __future__ import print_function, division
-from sys import version_info
-if version_info[0]==2:
-    range = xrange
-
 import os
 import sys
-import argparse
 from Bio import SeqIO
 from random import uniform
 from colorsys import hsv_to_rgb
@@ -37,30 +30,6 @@
 domains_color_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "domains_color_file.tsv")
 
 
-# read various color data
-def read_color_genes_file():
-    # Try to read already-generated colors for genes
-    color_genes = {}
-
-    if os.path.isfile(gene_color_file):
-        print(" Found file with gene colors")
-        with open(gene_color_file, "r") as color_genes_handle:
-            for line in color_genes_handle:
-                # handle comments and empty lines
-                if line[0] != "#" and line.strip():
-                    row = line.strip().split("\t")
-                    name = row[0]
-                    rgb = row[1].split(",")
-                    color_genes[name] = [int(rgb[x]) for x in range(3)]
-    else:
-        print(" Gene color file was not found. A new file will be created")
-        with open(gene_color_file, "w") as color_genes_handle:
-            color_genes_handle.write("NoName\t255,255,255\n")
-        color_genes = {"NoName":[255, 255, 255]}
-
-    return color_genes
-
-
 def read_color_domains_file():
     # Try to read colors for domains
     color_domains = {}
@@ -80,26 +49,6 @@ def read_color_domains_file():
         color_domains_handle = open(domains_color_file, "a+")
 
     return color_domains
-
-
-# Try to read categories:
-def read_pfam_domain_categories():
-    pfam_category = {}
-
-    if os.path.isfile(pfam_domain_categories):
-        print(" Found file with Pfam domain categories")
-        with open(pfam_domain_categories, "r") as cat_handle:
-            for line in cat_handle:
-                # handle comments and empty lines
-                if line[0] != "#" and line.strip():
-                    row = line.strip().split("\t")
-                    domain = row[1]
-                    category = row[0]
-                    pfam_category[domain] = category
-    else:
-        print(" File pfam_domain_categories was NOT found")
-
-    return pfam_category
 
 
 # --- Draw arrow for gene
193 changes: 84 additions & 109 deletions bigscape.py
@@ -1,45 +1,35 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 
 """
 BiG-SCAPE
 PI: Marnix Medema [email protected]
-Maintainers/developers:
-Jorge Navarro [email protected]
-Satria Kautsar [email protected]
+Maintainers:
+Jorge Navarro [email protected]
-Developer:
+Developers:
+Jorge Navarro [email protected]
+Satria Kautsar [email protected]
 Emmanuel (Emzo) de los Santos [email protected]
 Usage: Please see `python bigscape.py -h`
 Example: python bigscape.py -c 8 --pfam_dir ./ -i ./inputfiles -o ./results
 Status: beta
 Official repository:
-https://git.wageningenur.nl/medema-group/BiG-SCAPE
+https://github.com/medema-group/BiG-SCAPE
 # License: GNU Affero General Public License v3 or later
-# A copy of GNU AGPL v3 should have been included in this software package in LICENSE.txt.
+# A copy of GNU AGPL v3 should have been included in this software package in
+# LICENSE.txt.
 """
 
-# Makes sure the script can be used with Python 2 as well as Python 3.
-from __future__ import print_function
-from __future__ import division
-
-from sys import version_info
-if version_info[0]==2:
-    range = xrange
-    import cPickle as pickle # for storing and retrieving dictionaries
-elif version_info[0]==3:
-    import pickle # for storing and retrieving dictionaries
-
 from math import exp, log
+import pickle # for storing and retrieving dictionaries
 import os
 import subprocess
 import sys
@@ -55,18 +45,17 @@
 import zipfile
 
 from Bio import SeqIO
-from Bio.SeqFeature import BeforePosition, AfterPosition
-from Bio import AlignIO
-from Bio import pairwise2
-from Bio.SubsMat.MatrixInfo import pam250 as scoring_matrix
+# from Bio.SeqFeature import BeforePosition, AfterPosition
+# from Bio import AlignIO
+# from Bio import pairwise2
+# from Bio.SubsMat.MatrixInfo import pam250 as scoring_matrix
 from Bio import Phylo
 
 from functions import *
 from ArrowerSVG import *
 
 import numpy as np
 from array import array
-from scipy.sparse import lil_matrix
 from scipy.optimize import linear_sum_assignment
 import json
 import shutil
@@ -86,7 +75,6 @@ def process_gbk_files(gbk, min_bgc_size, bgc_info, files_no_proteins, files_no_b
 
     biosynthetic_genes = set()
     product_list_per_record = []
-    fasta_data = []
     save_fasta = True
    adding_sequence = False
    contig_edge = False
@@ -1050,7 +1038,6 @@ def cluster_distance_lcs(A, B, A_domlist, B_domlist, dcg_A, dcg_b, core_pos_A, c
     S_anchor = domain_difference_anchor
 
     # Cases 2 and 3 (now merged)
-    missing_aligned_domain_files = []
     for shared_domain in intersect:
         specific_domain_list_A = BGCs[A][shared_domain]
         specific_domain_list_B = BGCs[B][shared_domain]
@@ -1073,37 +1060,8 @@ def cluster_distance_lcs(A, B, A_domlist, B_domlist, dcg_A, dcg_b, core_pos_A, c
                 matches = 0
                 gaps = 0
 
-                try:
-                    aligned_seqA = AlignedDomainSequences[sequence_tag_a]
-                    aligned_seqB = AlignedDomainSequences[sequence_tag_b]
-
-                except KeyError:
-                    # For some reason we don't have the multiple alignment files.
-                    # Try manual alignment
-                    if shared_domain not in missing_aligned_domain_files and verbose:
-                        # this will print everytime an unfound <domain>.algn is not found for every
-                        # distance calculation (but at least, not for every domain pair!)
-                        print(" Warning: {}.algn not found. Trying pairwise alignment...".format(shared_domain))
-                        missing_aligned_domain_files.append(shared_domain)
-
-                    try:
-                        unaligned_seqA = temp_domain_fastas[sequence_tag_a]
-                        unaligned_seqB = temp_domain_fastas[sequence_tag_b]
-                    except KeyError:
-                        # parse the file for the first time and load all the sequences
-                        with open(os.path.join(domains_folder, shared_domain + ".fasta"),"r") as domain_fasta_handle:
-                            temp_domain_fastas = fasta_parser(domain_fasta_handle)
-
-                        unaligned_seqA = temp_domain_fastas[sequence_tag_a]
-                        unaligned_seqB = temp_domain_fastas[sequence_tag_b]
-
-                    # gap_open = -15
-                    # gap_extend = -6.67. These parameters were set up by Emzo
-                    alignScore = pairwise2.align.globalds(unaligned_seqA, unaligned_seqB, scoring_matrix, -15, -6.67, one_alignment_only=True)
-                    bestAlignment = alignScore[0]
-                    aligned_seqA = bestAlignment[0]
-                    aligned_seqB = bestAlignment[1]
-
+                aligned_seqA = AlignedDomainSequences[sequence_tag_a]
+                aligned_seqB = AlignedDomainSequences[sequence_tag_b]
 
                 # - Calculate aligned domain sequences similarity -
                 # Sequences *should* be of the same length unless something went
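With the pairwise2 fallback removed, both sequences must now come pre-aligned from AlignedDomainSequences. For readers following the truncated comment above, a small sketch of the match/gap tally this section performs (my paraphrase of the technique, not the verbatim continuation of the hunk):

    # Sketch: column-wise comparison of two equal-length aligned sequences.
    # Columns where both sequences share a residue count as matches; columns
    # where both are gaps are excluded from the denominator. Assumes the
    # sequences are non-empty and not entirely gaps.
    def aligned_similarity(aligned_seqA, aligned_seqB):
        matches = gaps = 0
        for a, b in zip(aligned_seqA, aligned_seqB):
            if a == b:
                if a == "-":
                    gaps += 1
                else:
                    matches += 1
        return matches / (len(aligned_seqA) - gaps)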
@@ -1406,38 +1364,40 @@ def clusterJsonBatch(bgcs, pathBase, className, matrix, pos_alignments, cutoffs=
         # We cannot get all the info exclusively from the pfd because that only
         # contains ORFs with predicted domains (and we need to draw empty genes
         # as well)
-        for line in open(fastaFile):
-            if line[0] == ">":
-                header = line.strip()[1:].split(':')
-                orf = header[0]
-                if header[2]:
-                    orfDict[orf]["id"] = header[2]
-                elif header[4]:
-                    orfDict[orf]["id"] = header[4]
-                else:
-                    orfDict[orf]["id"] = orf
-
-                ## broken gene goes into cluster, need this so js doesn't throw an error
-                if int(header[6]) <= 1:
-                    orfDict[orf]["start"] = 1
-                else:
-                    orfDict[orf]["start"] = int(header[6])
-
-                orfDict[orf]["end"] = int(header[7])
-
-                if header[-1] == '+':
-                    orfDict[orf]["strand"] = 1
-                else:
-                    orfDict[orf]["strand"] = -1
-
-                orfDict[orf]["domains"] = []
+        with open(fastaFile) as fastaFile_handle:
+            for line in fastaFile_handle:
+                if line[0] == ">":
+                    header = line.strip()[1:].split(':')
+                    orf = header[0]
+                    if header[2]:
+                        orfDict[orf]["id"] = header[2]
+                    elif header[4]:
+                        orfDict[orf]["id"] = header[4]
+                    else:
+                        orfDict[orf]["id"] = orf
+
+                    ## broken gene goes into cluster, need this so js doesn't throw an error
+                    if int(header[6]) <= 1:
+                        orfDict[orf]["start"] = 1
+                    else:
+                        orfDict[orf]["start"] = int(header[6])
+
+                    orfDict[orf]["end"] = int(header[7])
+
+                    if header[-1] == '+':
+                        orfDict[orf]["strand"] = 1
+                    else:
+                        orfDict[orf]["strand"] = -1
+
+                    orfDict[orf]["domains"] = []
 
         ## now read pfd file to add the domains to each of the orfs
-        for line in open(pfdFile):
-            entry = line.split('\t')
-            orf = entry[-1].strip().split(':')[0]
-            pfamID = entry[5].split('.')[0]
-            orfDict[orf]["domains"].append({'code': pfamID, 'start': int(entry[3]), 'end': int(entry[4]), 'bitscore': float(entry[1])})
+        with open(pfdFile) as pfdFile_handle:
+            for line in pfdFile_handle:
+                entry = line.split('\t')
+                orf = entry[-1].strip().split(':')[0]
+                pfamID = entry[5].split('.')[0]
+                orfDict[orf]["domains"].append({'code': pfamID, 'start': int(entry[3]), 'end': int(entry[4]), 'bitscore': float(entry[1])})
         # order of ORFs is important here because I use it to get a translation
         # between the "list of ORFs with domains" to "list of all ORFs" later on
         bgcJsonDict[bgcName]['orfs'] = sorted(orfDict.values(), key=itemgetter("start"))
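To make the header layout above easier to follow: each fasta record id is a ':'-separated tag where field 2 (or 4, as fallback) carries the gene id, fields 6 and 7 the coordinates, and the last field the strand. The same parsing as a standalone sketch (the function name is illustrative, not part of the diff):

    # Sketch of the per-header parsing used in the loop above.
    def parse_orf_header(line):
        header = line.strip()[1:].split(':')
        orf = header[0]
        entry = {"id": header[2] or header[4] or orf,
                 "start": max(1, int(header[6])),  # clamp broken genes so the js viewer gets a valid coordinate
                 "end": int(header[7]),
                 "strand": 1 if header[-1] == '+' else -1,
                 "domains": []}
        return orf, entry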
@@ -1815,7 +1775,7 @@ def clusterJsonBatch(bgcs, pathBase, className, matrix, pos_alignments, cutoffs=
                 domainGenes2allGenes[bgc][has_domains] = orf
                 has_domains += 1
 
-        assert (len(members) > 0), "Error: bs_families[{}] have no members, something went wrong?".format(fam_idx)
+        assert len(members) > 0, f"Error: bs_families[{family}] have no members, something went wrong?"
 
         ref_genes_ = set()
         aln = []
@@ -2082,12 +2042,15 @@ def CMD_parser():
                         sequences. Use if alignments have been generated in a \
                         previous run.")
 
-    parser.add_argument("--mibig", dest="mibig21", default=False, action="store_true",
-                        help="Use included BGCs from then MIBiG database. Only \
-                        relevant (i.e. those with distance < max(cutoffs) against\
-                        the input set) will be used. Currently uses version 2.1\
-                        of MIBiG. See https://mibig.secondarymetabolites.org/")
+    parser.add_argument("--mibig", dest="mibig31", default=False,
+                        action="store_true", help="Include MIBiG 3.1 BGCs as \
+                        reference (https://mibig.secondarymetabolites.org/). \
+                        These BGCs will only be kept if they are connected to \
+                        a region in the input set (distance < max(cutoffs)).")
+
+    parser.add_argument("--mibig21", dest="mibig21", default=False, action="store_true",
+                        help="Include BGCs from version 2.1 of MIBiG")
 
     parser.add_argument("--mibig14", dest="mibig14", default=False, action="store_true",
                         help="Include BGCs from version 1.4 of MIBiG")
 
@@ -2104,7 +2067,8 @@ def CMD_parser():
                        domain_includelist.txt file", default=False,
                        action="store_true")
 
-    parser.add_argument("--version", action="version", version="%(prog)s 1.1.4 (2022-04-14)")
+    parser.add_argument("--version", action="version", version="%(prog)s 1.1.5 \
+                        (2022-11-14)")
 
     return parser.parse_args()
 
@@ -2230,6 +2194,7 @@ def __init__(self, accession_id, description, product, records, max_width, bgc_s
     verbose = options.verbose
 
     selected_mibig = 0
+    if options.mibig31: selected_mibig += 1
     if options.mibig21: selected_mibig += 1
     if options.mibig14: selected_mibig += 1
     if options.mibig13: selected_mibig += 1
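Presumably these counters still feed a mutual-exclusion check, since only one reference archive is unpacked in the hunk that follows; a sketch of that kind of guard (the exact message and variable names here are assumptions, the check itself is not shown in this hunk):

    # Assumed continuation: allow at most one MIBiG version per run.
    if selected_mibig > 1:
        sys.exit("Error: choose at most one MIBiG version as reference")
    use_relevant_mibig = (selected_mibig == 1)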
@@ -2360,7 +2325,9 @@ def __init__(self, accession_id, description, product, records, max_width, bgc_s
     # (file, final folder, number of bgcs)
     mibig_set = set()
     if use_relevant_mibig:
-        if options.mibig21:
+        if options.mibig31:
+            mibig_zipfile_numbgcs = ("MIBiG_3.1_final.zip", "MIBiG_3.1_final", 2502)
+        elif options.mibig21:
             mibig_zipfile_numbgcs = ("MIBiG_2.1_final.zip", "MIBiG_2.1_final", 1923)
         elif options.mibig14:
             mibig_zipfile_numbgcs = ("MIBiG_1.4_final.zip", "MIBiG_1.4_final", 1808)
@@ -2628,7 +2595,9 @@ def __init__(self, accession_id, description, product, records, max_width, bgc_s
         print(" Processing: " + outputbase)
 
         pfdFile = os.path.join(pfd_folder, outputbase + ".pfd")
-        filtered_matrix = [[part.strip() for part in line.split('\t')] for line in open(pfdFile)]
+        with open(pfdFile) as pfd_handle:
+            filtered_matrix = [[part.strip() for part in line.split('\t')]
+                               for line in pfd_handle]
 
         # save each domain sequence from a single BGC in its corresponding file
         fasta_file = os.path.join(bgc_fasta_folder, outputbase + ".fasta")
@@ -2714,21 +2683,27 @@ def __init__(self, accession_id, description, product, records, max_width, bgc_s
     # read hmm file. We'll need that info anyway for final visualization
     print(" Parsing hmm file for domain information")
     pfam_info = {}
+    name = ""
+    acc = "" # compulsory line
+    desc = ""
     with open(os.path.join(pfam_dir, "Pfam-A.hmm"), "r") as pfam:
-        putindict = False
-        # assuming that the order of the information never changes
         for line in pfam:
-            if line[:4] == "NAME":
-                name = line.strip()[6:]
-            if line[:3] == "ACC":
-                acc = line.strip()[6:].split(".")[0]
-            if line[:4] == "DESC":
-                desc = line.strip()[6:]
-                putindict = True
-
-            if putindict:
-                putindict = False
-                pfam_info[acc] = (name, desc)
+            if line.strip() == "//":
+                # found new record
+                if acc: pfam_info[acc] = (name, desc)
+
+                # clear data in any case
+                name = ""
+                acc = "" # compulsory line
+                desc = ""
+
+                continue
+
+            if line[:4] not in {"NAME", "ACC ", "DESC"}: continue
+
+            if line[:4] == "NAME": name = line.strip()[6:]
+            elif line[:3] == "ACC": acc = line.strip()[6:].split(".")[0]
+            elif line[:4] == "DESC": desc = line.strip()[6:]
     print(" Done")
 
 # verify if there are figures already generated
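The rewritten parser is record-oriented: NAME, ACC and DESC accumulate until the "//" separator, which commits the current record and resets the state, instead of the old putindict flag that assumed a fixed field order. The same logic as a standalone sketch (the function name and path argument are placeholders):

    # Sketch of the record-based Pfam-A.hmm scan introduced above.
    def read_pfam_info(hmm_path):
        pfam_info = {}
        name = acc = desc = ""
        with open(hmm_path) as pfam:
            for line in pfam:
                if line.strip() == "//":            # end of record: commit fields, reset state
                    if acc:
                        pfam_info[acc] = (name, desc)
                    name = acc = desc = ""
                elif line[:4] == "NAME":
                    name = line.strip()[6:]
                elif line[:3] == "ACC":
                    acc = line.strip()[6:].split(".")[0]   # drop version suffix, e.g. PF00001.24 -> PF00001
                elif line[:4] == "DESC":
                    desc = line.strip()[6:]
        return pfam_info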