enasequence · froggleston · Aug 31, 2017 · Oct 6, 2017 · Oct 6, 2017 · Nov 19, 2017
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,7 @@
 ### Mac ###
 .DS_Store
 **/.DS_Store
+
+### compiled python ###
+python/*.pyc
+python3/__pycache__
diff --git a/README.md b/README.md
@@ -17,6 +17,55 @@ Both Python 2 and Python 3 scripts are available.  The Python 2 scripts can be f
 
 To run these scripts you will need to have Python installed.  You can download either Python 2 or Python 3 from [here](https://www.python.org/downloads/). If you already have Python installed, you can find out which version when you start the python interpreter.  If using Python 2, we suggest you upgrade to the latest version if you don't already have it: 2.7.
 
+Note that EBI now uses HTTPS servers. This can create a problem when using Python 3 on a Mac due to an oft-missed
+installation step. Please run the "Install Certificates.command" command to ensure your Python 3 installation on
+the Mac can correctly authenticate against the servers. To do this, run the following from a terminal window, updating
+the Python version with the correct version of Python 3 that you have installed:
+`open "/Applications/Python 3.6/Install Certificates.command"`
+
+We have had a report from a user that when Python 3 was installed using Homebrew, the following code needed to be run instead:
+```
+# install_certifi.py
+#
+# sample script to install or update a set of default Root Certificates
+# for the ssl module.  Uses the certificates provided by the certifi package:
+#       https://pypi.python.org/pypi/certifi
+
+import os
+import os.path
+import ssl
+import stat
+import subprocess
+import sys
+
+STAT_0o775 = ( stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR
+             | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP
+             | stat.S_IROTH |                stat.S_IXOTH )
+
+openssl_dir, openssl_cafile = os.path.split(
+    ssl.get_default_verify_paths().openssl_cafile)
+
+print(" -- pip install --upgrade certifi")
+subprocess.check_call([sys.executable,
+    "-E", "-s", "-m", "pip", "install", "--upgrade", "certifi"])
+
+import certifi
+
+# change working directory to the default SSL directory
+os.chdir(openssl_dir)
+relpath_to_certifi_cafile = os.path.relpath(certifi.where())
+print(" -- removing any existing file or link")
+try:
+    os.remove(openssl_cafile)
+except FileNotFoundError:
+    pass
+print(" -- creating symlink to certifi certificate bundle")
+os.symlink(relpath_to_certifi_cafile, openssl_cafile)
+print(" -- setting permissions")
+os.chmod(openssl_cafile, STAT_0o775)
+print(" -- update complete")
+```
+
 ## Installing and running the scripts
 
 Download the [latest release](https://github.com/enasequence/enaBrowserTools/releases/latest) and extract it to the preferred location on your computer. You will now have the enaBrowserTools folder, containing both the python 2 and 3 option scripts.  If you are using a Unix/Linux or Mac computer, we suggest you add the following aliases to your .bashrc or .bash_profile file. Where INSTALLATION_DIR is the location where you have saved the enaBrowserTools to and PYTHON_CHOICE will depend on whether you are using the Python 2 or Python 3 scripts.
@@ -107,12 +156,16 @@ optional arguments:
                         File format required. Format requested must be
                         permitted for data type selected. sequence, assembly
                         and wgs accessions: embl(default) and fasta formats.
-                        read group: submitted, fastq and sra
-                        formats. analysis group: submitted only.
+                        read group: submitted, fastq and sra formats. analysis
+                        group: submitted only.
   -d DEST, --dest DEST  Destination directory (default is current running
                         directory)
   -w, --wgs             Download WGS set for each assembly if available
                         (default is false)
+  -e, --extract-wgs     Extract WGS scaffolds for each assembly if available
+                        (default is false)
+  -exp, --expanded      Expand CON scaffolds when downloading embl format
+                        (default is false)
   -m, --meta            Download read or analysis XML in addition to data
                         files (default is false)
   -i, --index           Download CRAM index files with submitted CRAM files,
@@ -141,7 +194,7 @@ usage: enaGroupGet [-h] [-g {sequence,wgs,assembly,read,analysis}]
                    [-i] [-a] [-as ASPERA_SETTINGS] [-t] [-v]
                    accession
 
-Download data for a given study or sample
+Download data for a given study or sample, or (for sequence and assembly) taxon
 
 positional arguments:
   accession             Study or sample accession or NCBI tax ID to fetch data
@@ -162,6 +215,10 @@ optional arguments:
                         directory)
   -w, --wgs             Download WGS set for each assembly if available
                         (default is false)
+  -e, --extract-wgs     Extract WGS scaffolds for each assembly if available
+                        (default is false)
+  -exp, --expanded      Expand CON scaffolds when downloading embl format
+                        (default is false)
   -m, --meta            Download read or analysis XML in addition to data
                         files (default is false)
   -i, --index           Download CRAM index files with submitted CRAM files,
@@ -180,10 +237,10 @@ optional arguments:
 
 # Tips
 
-From version 1.4, when downloading read data if you use the default format (that is, don't use the format option), the scripts will look for available files in the following priority: submitted, sra, fastq. 
+From version 1.4, when downloading read data if you use the default format (that is, don't use the format option), the scripts will look for available files in the following priority: submitted, sra, fastq.
 
 A word of advice for read formats:
-- submitted: only read data submitted to ENA have files available as submitted by the user. 
+- submitted: only read data submitted to ENA have files available as submitted by the user.
 - sra:  this is the NCBI SRA format, and is the format in which all NCBI/DDBJ data is mirrored to ENA.
 - fastq:  not all submitted format files can be converted to FASTQ
 

diff --git a/python/__init__.py b/python/__init__.py
diff --git a/python/assemblyGet.py b/python/assemblyGet.py
@@ -20,6 +20,7 @@
 import os
 import sys
 import argparse
+import gzip
 import xml.etree.ElementTree as ElementTree
 
 import utils
@@ -31,7 +32,7 @@
 PATCH = 'patch'
 
 def check_format(output_format):
-    if format not in [utils.EMBL_FORMAT, utils.FASTA_FORMAT]:
+    if output_format not in [utils.EMBL_FORMAT, utils.FASTA_FORMAT]:
         sys.stderr.write(
             'ERROR: Invalid format. Please select a valid format for this accession: {0}\n'.format([utils.EMBL_FORMAT, utils.FASTA_FORMAT])
         )
@@ -69,31 +70,72 @@ def parse_sequence_report(local_sequence_report):
     patch_list = [l.split('\t')[0] for l in lines[1:] if PATCH in l.split('\t')[3]]
     return (replicon_list, unlocalised_list, unplaced_list, patch_list)
 
-def download_sequence_set(accession_list, mol_type, assembly_dir, output_format, quiet):
+def extract_wgs_sequences(accession_list):
+    wgs_sequences = [a for a in accession_list if utils.is_wgs_sequence(a)]
+    other_sequences = [a for a in accession_list if not utils.is_wgs_sequence(a)]
+    return wgs_sequences, other_sequences
+
+def download_sequence_set(accession_list, mol_type, assembly_dir, output_format, expanded, quiet):
     failed_accessions = []
-    if len(accession_list) > 0:
+    count = 0
+    sequence_cnt = len(accession_list)
+    divisor = utils.get_divisor(sequence_cnt)
+    if sequence_cnt > 0:
         if not quiet:
-            print 'fetching sequences: ' + mol_type
-        target_file = os.path.join(assembly_dir, utils.get_filename(mol_type, output_format))
+            print 'fetching {0} sequences: {1}'.format(sequence_cnt, mol_type)
+        target_file_path = os.path.join(assembly_dir, utils.get_filename(mol_type, output_format))
+        target_file = open(target_file_path, 'w')
         for accession in accession_list:
-            success = sequenceGet.append_record(target_file, accession, output_format)
+            success = sequenceGet.write_record(target_file, accession, output_format, expanded)
             if not success:
                 failed_accessions.append(accession)
+            else:
+                count += 1
+                if count % divisor == 0 and not quiet:
+                    print 'downloaded {0} of {1} sequences'.format(count, sequence_cnt)
+        if not quiet:
+            print 'downloaded {0} of {1} sequences'.format(count, sequence_cnt)
+        target_file.close()
     elif not quiet:
         print 'no sequences: ' + mol_type
     if len(failed_accessions) > 0:
-        print 'Failed to fetch following ' + mol_type + ', format ' + output_format
-        print failed_accessions.join(',')
+        print 'Failed to fetch following {0}, format {1}'.format(mol_type, output_format)
+        print ','.join(failed_accessions)
 
-def download_sequences(sequence_report, assembly_dir, output_format, quiet):
+def download_sequences(sequence_report, assembly_dir, output_format, expanded, quiet):
     local_sequence_report = os.path.join(assembly_dir, sequence_report)
     replicon_list, unlocalised_list, unplaced_list, patch_list = parse_sequence_report(local_sequence_report)
-    download_sequence_set(replicon_list, REPLICON, assembly_dir, output_format, quiet)
-    download_sequence_set(unlocalised_list, UNLOCALISED, assembly_dir, output_format, quiet)
-    download_sequence_set(unplaced_list, UNPLACED, assembly_dir, output_format, quiet)
-    download_sequence_set(patch_list, PATCH, assembly_dir, output_format, quiet)
+    wgs_scaffolds, other_unlocalised = extract_wgs_sequences(unlocalised_list)
+    wgs_unplaced, other_unplaced = extract_wgs_sequences(unplaced_list)
+    download_sequence_set(replicon_list, REPLICON, assembly_dir, output_format, expanded, quiet)
+    download_sequence_set(other_unlocalised, UNLOCALISED, assembly_dir, output_format, expanded, quiet)
+    download_sequence_set(other_unplaced, UNPLACED, assembly_dir, output_format, expanded, quiet)
+    download_sequence_set(patch_list, PATCH, assembly_dir, output_format, expanded, quiet)
+    wgs_scaffolds.extend(wgs_unplaced)
+    return wgs_scaffolds
 
-def download_assembly(dest_dir, accession, output_format, fetch_wgs, quiet=False):
+def extract_wgs_scaffolds(assembly_dir, wgs_scaffolds, wgs_set, output_format, quiet):
+    if not quiet:
+        print 'extracting {0} WGS scaffolds from WGS set file'.format(len(wgs_scaffolds))
+    accs = [a.split('.')[0] for a in wgs_scaffolds]
+    wgs_file_path = os.path.join(assembly_dir, wgs_set + utils.get_wgs_file_ext(output_format))
+    target_file_path = os.path.join(assembly_dir, utils.get_filename('wgs_scaffolds', output_format))
+    write_line = False
+    target_file = open(target_file_path, 'w')
+    with gzip.open(wgs_file_path, 'rb') as f:
+        for line in f:
+            if utils.record_start_line(line, output_format):
+                if utils.extract_acc_from_line(line, output_format) in accs:
+                    write_line = True
+                else:
+                    write_line = False
+                    target_file.flush()
+            if write_line:
+                target_file.write(line)
+    target_file.flush()
+    target_file.close()
+
+def download_assembly(dest_dir, accession, output_format, fetch_wgs, extract_wgs, expanded, quiet=False, handler=None):
     if output_format is None:
         output_format = utils.EMBL_FORMAT
     assembly_dir = os.path.join(dest_dir, accession)
@@ -106,12 +148,24 @@ def download_assembly(dest_dir, accession, output_format, fetch_wgs, quiet=False
     has_sequence_report = False
     # download sequence report
     if sequence_report is not None:
-        has_sequence_report = utils.get_ftp_file(sequence_report, assembly_dir)
+        has_sequence_report = utils.get_ftp_file(sequence_report, assembly_dir, handler)
+    # parse sequence report and download sequences
+    wgs_scaffolds = []
+    wgs_scaffold_cnt = 0
+    if has_sequence_report:
+        wgs_scaffolds = download_sequences(sequence_report.split('/')[-1], assembly_dir, output_format, expanded, quiet)
+        wgs_scaffold_cnt = len(wgs_scaffolds)
+        if wgs_scaffold_cnt > 0:
+            if not quiet:
+                print 'Assembly contains {} WGS scaffolds, will fetch WGS set'.format(wgs_scaffold_cnt)
+            fetch_wgs = True
+    else:
+        fetch_wgs = True
     # download wgs set if needed
     if wgs_set is not None and fetch_wgs:
         if not quiet:
             print 'fetching wgs set'
-        sequenceGet.download_wgs(assembly_dir, wgs_set, output_format)
-    # parse sequence report and download sequences
-    if has_sequence_report:
-        download_sequences(sequence_report.split('/')[-1], assembly_dir, output_format, quiet)
+        sequenceGet.download_wgs(assembly_dir, wgs_set, output_format, handler)
+        # extract wgs scaffolds from WGS file
+        if wgs_scaffold_cnt > 0 and extract_wgs:
+            extract_wgs_scaffolds(assembly_dir, wgs_scaffolds, wgs_set, output_format, quiet)
diff --git a/python/enaDataGet.py b/python/enaDataGet.py
@@ -25,6 +25,7 @@
 import assemblyGet
 import readGet
 import utils
+import traceback
 
 def set_parser():
     parser = argparse.ArgumentParser(prog='enaDataGet',
@@ -40,6 +41,10 @@ def set_parser():
                         help='Destination directory (default is current running directory)')
     parser.add_argument('-w', '--wgs', action='store_true',
                         help='Download WGS set for each assembly if available (default is false)')
+    parser.add_argument('-e', '--extract-wgs', action='store_true',
+                        help='Extract WGS scaffolds for each assembly if available (default is false)')
+    parser.add_argument('-exp', '--expanded', action='store_true',
+                        help='Expand CON scaffolds when downloading embl format (default is false)')
     parser.add_argument('-m', '--meta', action='store_true',
                         help='Download read or analysis XML in addition to data files (default is false)')
     parser.add_argument('-i', '--index', action='store_true',
@@ -50,7 +55,10 @@ def set_parser():
     parser.add_argument('-as', '--aspera-settings', default=None,
                     help="""Use the provided settings file, will otherwise check
                         for environment variable or default settings file location.""")
-    parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.4.1')
+    parser.add_argument('-r', '--redirect-handler', default=None,
+                        choices=['queue', 'file'],
+                        help="""File download progress handler. Specify an output handler to process the download progress. Default is no handler (output is printed to stdout). 'queue' redirects all output to a queue handler, such as RabbitMQ. 'file' redirects to a file handle (default is [current_file_download.log]).""")
+    parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.5.3')
     return parser
 
 
@@ -62,42 +70,51 @@ def set_parser():
     output_format = args.format
     dest_dir = args.dest
     fetch_wgs = args.wgs
+    extract_wgs = args.extract_wgs
+    expanded = args.expanded
     fetch_meta = args.meta
     fetch_index = args.index
     aspera = args.aspera
     aspera_settings = args.aspera_settings
+    handler = args.redirect_handler
 
     if aspera or aspera_settings is not None:
         aspera = utils.set_aspera(aspera_settings)
 
     try:
         if utils.is_wgs_set(accession):
+            print("Downloading WGS set")
             if output_format is not None:
                 sequenceGet.check_format(output_format)
-            sequenceGet.download_wgs(dest_dir, accession, output_format)
+            sequenceGet.download_wgs(dest_dir, accession, output_format, handler)
         elif not utils.is_available(accession):
             sys.stderr.write('ERROR: Record does not exist or is not available for accession provided\n')
             sys.exit(1)
         elif utils.is_sequence(accession):
+            print("Downloading sequence(s)")
             if output_format is not None:
                 sequenceGet.check_format(output_format)
-            sequenceGet.download_sequence(dest_dir, accession, output_format)
+            sequenceGet.download_sequence(dest_dir, accession, output_format, expanded, handler)
         elif utils.is_analysis(accession):
+            print("Downloading analysis")
             if output_format is not None:
                 readGet.check_read_format(output_format)
-            readGet.download_files(accession, output_format, dest_dir, fetch_index, fetch_meta, aspera)
+            readGet.download_files(accession, output_format, dest_dir, fetch_index, fetch_meta, aspera, handler)
         elif utils.is_run(accession) or utils.is_experiment(accession):
+            print("Downloading reads")
             if output_format is not None:
                 readGet.check_read_format(output_format)
-            readGet.download_files(accession, output_format, dest_dir, fetch_index, fetch_meta, aspera)
+            readGet.download_files(accession, output_format, dest_dir, fetch_index, fetch_meta, aspera, handler)
         elif utils.is_assembly(accession):
+            print("Downloading assembly")
             if output_format is not None:
                 assemblyGet.check_format(output_format)
-            assemblyGet.download_assembly(dest_dir, accession, output_format, fetch_wgs)
+            assemblyGet.download_assembly(dest_dir, accession, output_format, fetch_wgs, extract_wgs, expanded, handler)
         else:
             sys.stderr.write('ERROR: Invalid accession provided\n')
             sys.exit(1)
         print 'Completed'
     except Exception:
+        traceback.print_exc()
         utils.print_error()
         sys.exit(1)