Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
3cfa5ad
delay csvdb load tasks by a random offset to avoid database lock issues
kevinrue Feb 6, 2018
068f8a0
fix import of PipelineExomeAncestry (#397)
BilyanaStoilova Feb 8, 2018
70eea4b
Small change to avoid confusion when running featurecounts (#398)
Feb 9, 2018
496e905
Prevent pipelines from running when DRMAA is not available and --loca…
sebastian-luna-valero Feb 9, 2018
1f2d5e4
pin conda version
sebastian-luna-valero Feb 13, 2018
2a25250
pin sphinx 1.6.7 to work with cgat-report 0.7.6.1
sebastian-luna-valero Feb 14, 2018
112a020
make threads=1 instead of threads=10 for build_report in pipeline.in…
sebastian-luna-valero Feb 14, 2018
2267c0d
fixes #388 (#403)
sebastian-luna-valero Feb 15, 2018
176d110
Sns fix memory pipeline genesets (#402)
snsansom Feb 15, 2018
ea12c41
Have updated PARAMS for template so it now points to the new genesets…
Feb 16, 2018
1d9a195
Ac bamstats report (#404)
Feb 16, 2018
48eba7d
Add back explicit reporting of failed cluster jobs (#406)
snsansom Feb 19, 2018
d0f6d22
Sns fix loading of gtf annotations (#408)
snsansom Feb 20, 2018
702e832
Don't use conda 4.4 (#409)
sebastian-luna-valero Feb 21, 2018
41b0158
increase priority of /ifs/home/sebastian/.cgat ini file (#412)
sebastian-luna-valero Feb 23, 2018
8d21dd0
Typo (#413)
kevinrue Feb 23, 2018
3c94ae4
Bugfix https://github.com/CGATOxford/CGATPipelines/pull/412 (#414)
sebastian-luna-valero Feb 23, 2018
04cdcba
Update PipelineRrbs.py
Feb 26, 2018
1c82403
updates documentation for pipeline_rrbs
Feb 26, 2018
ea904c3
Copy environment to subprocess (#411)
snsansom Mar 2, 2018
deba85c
removed duplicate function (#416)
Mar 9, 2018
50d0fe0
delay csvdb load tasks by a random offset to avoid database lock issues
kevinrue Feb 6, 2018
4d7f447
Control scatter delay by pipeline ini
kevinrue Mar 14, 2018
b5c39e5
merge conflict
kevinrue Mar 14, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion CGATPipelines/Pipeline/Cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
try:
import drmaa
HAS_DRMAA = True
except:
except (ImportError, RuntimeError):
# the following does not work on Travis
#except ImportError or RuntimeError:
HAS_DRMAA = False
Expand Down Expand Up @@ -325,6 +325,18 @@ def collectSingleJobFromCluster(session, job_id,
(retval.exitStatus,
"".join(stderr), statement))

if ((retval.hasExited is False or retval.wasAborted is True) and not
ignore_errors):

raise OSError(
"-------------------------------------------------\n"
"Cluster job was aborted (%s) and/or failed to exit (%s) "
"while running the following statement:\n"
"\n%s\n"
"(Job may have been cancelled by the user or the scheduler)\n"
"----------------------------------------------------------\n" %
(retval.wasAborted, not retval.hasExited, statement))

try:
os.unlink(job_path)
except OSError:
Expand Down
13 changes: 11 additions & 2 deletions CGATPipelines/Pipeline/Control.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
try:
import drmaa
HAS_DRMAA = True
except:
except (ImportError, RuntimeError):
# the following does not work on Travis
#except ImportError or RuntimeError:
HAS_DRMAA = False
Expand Down Expand Up @@ -337,12 +337,15 @@ def peekParameters(workingdir,
return {}

statement = "python %s -f -v 0 dump" % pipeline

os.environ.update({'BASH_ENV': os.path.join(os.environ['HOME'],'.bashrc')})
process = subprocess.Popen(statement,
cwd=workingdir,
shell=True,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stderr=subprocess.PIPE,
env=os.environ.copy())

# process.stdin.close()
stdout, stderr = process.communicate()
Expand Down Expand Up @@ -901,6 +904,12 @@ def main(args=sys.argv):
# create the session proxy
startSession()

elif not options.without_cluster and not HAS_DRMAA:
E.critical("DRMAA API not found so cannot talk to a cluster.")
E.critical("Please use --local to run the pipeline"
" on this host: {}".format(os.uname()[1]))
sys.exit(-1)

#
# make sure we are not logging at the same time in
# different processes
Expand Down
10 changes: 7 additions & 3 deletions CGATPipelines/Pipeline/Execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
try:
import drmaa
HAS_DRMAA = True
except:
except (ImportError, RuntimeError):
# the following does not work on Travis
#except ImportError or RuntimeError:
HAS_DRMAA = False
Expand Down Expand Up @@ -136,12 +136,14 @@ def execute(statement, **kwargs):
if statement.endswith(";"):
statement = statement[:-1]

os.environ.update({'BASH_ENV': os.path.join(os.environ['HOME'],'.bashrc')})
process = subprocess.Popen(statement % kwargs,
cwd=cwd,
shell=True,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stderr=subprocess.PIPE,
env=os.environ.copy())

# process.stdin.close()
stdout, stderr = process.communicate()
Expand Down Expand Up @@ -581,6 +583,7 @@ def _writeJobScript(statement, job_memory, job_name, shellfile):
statement = pipes.quote(statement)
statement = "%s -c %s" % (shell, statement)

os.environ.update({'BASH_ENV': os.path.join(os.environ['HOME'],'.bashrc')})
process = subprocess.Popen(
expandStatement(
statement,
Expand All @@ -589,7 +592,8 @@ def _writeJobScript(statement, job_memory, job_name, shellfile):
shell=True,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stderr=subprocess.PIPE,
env=os.environ.copy())

# process.stdin.close()
stdout, stderr = process.communicate()
Expand Down
18 changes: 11 additions & 7 deletions CGATPipelines/Pipeline/Parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,13 +431,6 @@ def getParameters(filenames=["pipeline.ini", ],
if os.path.exists(fn):
filenames.insert(0, fn)

if user_ini:
# read configuration from a users home directory
fn = os.path.join(os.path.expanduser("~"),
".cgat")
if os.path.exists(fn):
filenames.insert(0, fn)

if default_ini:
# The link between CGATPipelines and Pipeline.py
# needs to severed at one point.
Expand All @@ -448,6 +441,17 @@ def getParameters(filenames=["pipeline.ini", ],
'configuration',
'pipeline.ini'))

if user_ini:
# read configuration from a users home directory
fn = os.path.join(os.path.expanduser("~"),
".cgat")
if os.path.exists(fn):
if 'pipeline.ini' in filenames:
index = filenames.index('pipeline.ini')
filenames.insert(index,fn)
else:
filenames.append(fn)

# IMS: Several legacy scripts call this with a string as input
# rather than a list. Check for this and correct

Expand Down
34 changes: 1 addition & 33 deletions CGATPipelines/PipelineBamStats.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,7 +502,7 @@ def summarizeTagsWithinContext(tagfile,
contextfile,
outfile,
min_overlap=0.5,
job_memory="4G"):
job_memory="15G"):
'''count occurrences of tags in genomic context.

Examines the genomic context to where tags align.
Expand Down Expand Up @@ -1076,38 +1076,6 @@ def loadBAMStats(infiles, outfile):
P.run()


def buildPicardRnaSeqMetrics(infiles, strand, outfile):
'''run picard:RNASeqMetrics
Arguments
---------
infiles : string
Input filename in :term:`BAM` format.
Genome file in refflat format
(http://genome.ucsc.edu/goldenPath/gbdDescriptionsOld.html#RefFlat)
outfile : string
Output filename with picard output.
'''
job_memory = PICARD_MEMORY
picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
job_threads = 20
infile, genome = infiles

if BamTools.getNumReads(infile) == 0:
E.warn("no reads in %s - no metrics" % infile)
P.touch(outfile)
return

statement = '''picard %(picard_opts)s CollectRnaSeqMetrics
REF_FLAT=%(genome)s
INPUT=%(infile)s
ASSUME_SORTED=true
OUTPUT=%(outfile)s
STRAND=%(strand)s
VALIDATION_STRINGENCY=SILENT
'''
P.run()


def loadPicardRnaSeqMetrics(infiles, outfiles):
'''load picard rna stats into database.
Loads tables into the database
Expand Down
12 changes: 6 additions & 6 deletions CGATPipelines/PipelineGO.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@
PARAMS = {}


def createGOFromENSEMBL(infile, outfile):
def createGOFromENSEMBL(infile, outfile,
job_memory="5G"):
"""get GO assignments from ENSEMBL

Download GO assignments from the ENSEMBL database and store in
Expand All @@ -46,7 +47,6 @@ def createGOFromENSEMBL(infile, outfile):

"""

job_memory = "5G"
statement = '''
cgat runGO
--filename-dump=%(outfile)s
Expand Down Expand Up @@ -277,7 +277,8 @@ def getGODescriptions(infile):
table[fields.index("description")])])


def createGOSlimFromENSEMBL(infile, outfile):
def createGOSlimFromENSEMBL(infile, outfile,
job_memory="5G"):
"""build GO SLIM assignments.

This method downloads a GOSlim specification
Expand Down Expand Up @@ -315,15 +316,14 @@ def createGOSlimFromENSEMBL(infile, outfile):
P.run()

E.info("mapping GO to GOSlim")
job_memory = "5G"

statement = '''
map2slim -outmap %(outfile)s.map
%(goslim_fn)s
%(ontology_fn)s
'''
P.run()

job_memory = "5G"
statement = '''
zcat < %(infile)s
| cgat runGO
Expand Down Expand Up @@ -406,7 +406,7 @@ def runGOFromFiles(outfile,

options = " ".join(options)
statement = '''
cgat runGO
cgat runGO
--filename-input=%(go_file)s
--genes-tsv-file=%(fg_file)s
--output-filename-pattern='%(outdir)s/%%(set)s.%%(go)s.%%(section)s'
Expand Down
9 changes: 6 additions & 3 deletions CGATPipelines/PipelineGeneset.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ def mapUCSCToEnsembl(genome):


def annotateGenome(infile, outfile,
only_proteincoding=False):
only_proteincoding=False,
job_memory="4G"):
'''annotate genomic regions with reference gene set.

The method applies the following filters to an ENSEMBL gene set:
Expand Down Expand Up @@ -120,7 +121,8 @@ def annotateGenome(infile, outfile,


def annotateGeneStructure(infile, outfile,
only_proteincoding=False):
only_proteincoding=False,
job_memory="4G"):
"""annotate genomic regions with gene structure.

The method applies the following filters to an ENSEMBL gene set:
Expand Down Expand Up @@ -1439,7 +1441,8 @@ def sortGTF(infile, outfile, order="contig+gene"):
P.run()


def buildGenomicFunctionalAnnotation(gtffile, dbh, outfiles):
def buildGenomicFunctionalAnnotation(gtffile, dbh, outfiles,
job_memory="4G"):
'''output a bed file with functional annotations.

The genomic region a gene covers is taken from the `gtffile`.
Expand Down
12 changes: 7 additions & 5 deletions CGATPipelines/PipelineGtfsubset.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,8 @@ def connectToUCSC(host="genome-mysql.cse.ucsc.edu",
def getRepeatDataFromUCSC(dbhandle,
repclasses,
outfile,
remove_contigs_regex=None):
remove_contigs_regex=None,
job_memory="4G"):
'''download data from UCSC database and write to `outfile` in
:term:`gff` format.

Expand Down Expand Up @@ -202,7 +203,8 @@ def getRepeatDataFromUCSC(dbhandle,
os.unlink(tmpfilename)


def buildGenomicContext(infiles, outfile, distance=10):
def buildGenomicContext(infiles, outfile, distance=10,
job_memory="4G"):

'''build a :term:`bed` formatted file with genomic context.
The output is a bed formatted file, annotating genomic segments
Expand Down Expand Up @@ -295,7 +297,7 @@ def buildGenomicContext(infiles, outfile, distance=10):
os.unlink(x)


def buildFlatGeneSet(infile, outfile):
def buildFlatGeneSet(infile, outfile, job_memory="4G"):
'''build a flattened gene set.
All transcripts in a gene are merged into a single transcript by
combining overlapping exons.
Expand Down Expand Up @@ -337,7 +339,8 @@ def buildFlatGeneSet(infile, outfile):
P.run()


def loadGeneInformation(infile, outfile, only_proteincoding=False):
def loadGeneInformation(infile, outfile, only_proteincoding=False,
job_memory="4G"):
'''load gene-related attributes from :term:`gtf` file into database.
This method takes transcript-associated features from an
:term:`gtf` file and collects the gene-related attributes in the
Expand All @@ -354,7 +357,6 @@ def loadGeneInformation(infile, outfile, only_proteincoding=False):
If True, only consider protein coding genes.
'''

job_memory = "4G"
table = P.toTable(outfile)

if only_proteincoding:
Expand Down
2 changes: 1 addition & 1 deletion CGATPipelines/configuration/pipeline.ini
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ priority=-10
################################################################
[report]
# number of threads to use to build the documentation
threads=10
threads=1

memory=1G

Expand Down
9 changes: 5 additions & 4 deletions CGATPipelines/pipeline_bamstats.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ def intBam(infile, outfile):
If there is no sequence quality then make a softlink. Picard tools
has an issue when quality score information is missing'''

if PARAMS["bam_sequence_stipped"] is True:
if PARAMS["bam_sequence_stripped"] is True:
PipelineBamStats.addPseudoSequenceQuality(infile,
outfile)
else:
Expand Down Expand Up @@ -559,7 +559,7 @@ def buildIntronLevelReadCounts(infiles, outfile):

@active_if(SPLICED_MAPPING)
@transform(intBam,
regex("BamFiles.dir/(.*).bam$"),
regex("BamFiles.dir/(\S+).bam$"),
add_inputs(PARAMS["annotations_interface_geneset_coding_exons_gtf"]),
r"Paired_QC.dir/\1.transcriptprofile.gz")
def buildTranscriptProfiles(infiles, outfile):
Expand Down Expand Up @@ -754,7 +754,8 @@ def loadStrandSpecificity(infiles, outfile):
# These tasks allow ruffus to pipeline tasks together


@follows(loadPicardStats,
@follows(buildTranscriptProfiles,
loadPicardStats,
loadPicardDuplicationStats,
loadBAMStats,
loadContextStats,
Expand Down Expand Up @@ -796,7 +797,7 @@ def renderJupyterReport():
'Jupyter_report'))

statement = ''' cp %(report_path)s/* Jupyter_report.dir/ ; cd Jupyter_report.dir/;
jupyter nbconvert --ExecutePreprocessor.timeout=None --allow-errors --to html --execute *.ipynb;
jupyter nbconvert --ExecutePreprocessor.timeout=None --to html --execute *.ipynb --allow-errors;
mkdir _site;
mv -t _site *.html cgat_logo.jpeg oxford.png'''

Expand Down
2 changes: 1 addition & 1 deletion CGATPipelines/pipeline_bamstats/pipeline.ini
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ paired_end=0

# sometimes a bam has its sequence quality stripped to save space
# if this is the case then specify below:
sequence_stipped=0
sequence_stripped=0

################################################################
## name of the database that you want to generate
Expand Down

Large diffs are not rendered by default.

Loading