-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathbraker.pl
More file actions
executable file
·10205 lines (9580 loc) · 450 KB
/
Copy pathbraker.pl
File metadata and controls
executable file
·10205 lines (9580 loc) · 450 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env perl
####################################################################################################
# #
# braker.pl #
# Pipeline for predicting genes with GeneMark-EX* and AUGUSTUS #
# #
# Authors: Lars Gabriel, Katharina Hoff, Simone Lange, Mario Stanke, Alexandre Lomsadze, #
# Tomas Bruna, Mark Borodovsky #
# #
# Contact: katharina.hoff@uni-greifswald.de #
# #
# #
# This script is under the Artistic Licence #
# (http://www.opensource.org/licenses/artistic-license.php) #
# #
# *EX = ES/ET/EP/ETP, currently distributed as GeneMark-ETP #
####################################################################################################
use Getopt::Long;
use File::Compare;
use File::Copy;
use File::Path qw(make_path rmtree);
use Module::Load::Conditional qw(can_load check_install requires);
use Scalar::Util::Numeric qw(isint);
use POSIX qw(floor);
use List::Util qw[min max];
use Parallel::ForkManager;
use FindBin;
use lib "$FindBin::RealBin/.";
use File::Which; # exports which()
use File::Which qw(which where); # exports which() and where()
use YAML qw(DumpFile);
use Cwd;
use Cwd 'abs_path';
use File::Spec::Functions qw(rel2abs);
use File::Basename qw(dirname basename fileparse);
use File::Copy;
use helpMod_braker
qw( find checkFile formatDetector relToAbs setParInConfig addParToConfig uptodate gtf2fasta clean_abort );
use Term::ANSIColor qw(:constants);
use strict;
use warnings;
my $usage = <<'ENDUSAGE';
DESCRIPTION
braker.pl Pipeline for predicting genes with GeneMark-EX and AUGUSTUS with
RNA-Seq and/or proteins
SYNOPSIS
braker.pl [OPTIONS] --genome=genome.fa {--bam=rnaseq.bam | --prot_seq=prot.fa}
INPUT FILE OPTIONS
--genome=genome.fa fasta file with DNA sequences
--bam=rnaseq.bam bam file with spliced alignments from
RNA-Seq
--prot_seq=prot.fa A protein sequence file in multi-fasta
format used to generate protein hints.
Unless otherwise specified, braker.pl will
run in "EP mode" or "ETP mode which uses
ProtHint to generate protein hints and
GeneMark-EP+ or GeneMark-ETP to
train AUGUSTUS.
--hints=hints.gff Alternatively to calling braker.pl with a
bam or protein fasta file, it is possible to
call it with a .gff file that contains
introns extracted from RNA-Seq and/or
protein hints (most frequently coming
from ProtHint). If you wish to use the
ProtHint hints, use its
"prothint_augustus.gff" output file.
This flag also allows the usage of hints
from additional extrinsic sources for gene
prediction with AUGUSTUS. To consider such
additional extrinsic information, you need
to use the flag --extrinsicCfgFiles to
specify parameters for all sources in the
hints file (including the source "E" for
intron hints from RNA-Seq).
In ETP mode, this option can be used together
with --geneMarkGtf and --traingenes to provide
BRAKER with results of a previous GeneMark-ETP
run, so that the GeneMark-ETP step can be
skipped. In this case, specify the hintsfile of
a previous BRAKER run here, or generate a
hintsfile from the GeneMark-ETP working
directory with the script get_etp_hints.py.
--rnaseq_sets_ids=SRR1111,SRR1115 IDs of RNA-Seq sets that are either in
one of the directories specified with
--rnaseq_sets_dir, or that can be downloaded
from SRA. If you want to use local files, you
can use unaligned reads in FASTQ format
(they have to be named ID.fastq if unpaired or
ID_1.fastq, ID_2.fastq if paired), or aligned reads
as a BAM file (named ID.bam).
--rnaseq_sets_dir=/path/to/rna_dir1 Locations where the local files of RNA-Seq data
reside that were specified with --rnaseq_sets_ids.
FREQUENTLY USED OPTIONS
--species=sname Species name. Existing species will not be
overwritten. Uses Sp_1 etc., if no species
is assigned
--AUGUSTUS_ab_initio output ab initio predictions by AUGUSTUS
in addition to predictions with hints by
AUGUSTUS
--softmasking_off Turn off softmasking option (enables by
default, discouraged to disable!)
--esmode Run GeneMark-ES (genome sequence only) and
train AUGUSTUS on long genes predicted by
GeneMark-ES. Final predictions are ab initio
--gff3 Output in GFF3 format (default is gtf
format)
--threads Specifies the maximum number of threads that
can be used during computation. Be aware:
optimize_augustus.pl will use max. 8
threads; augustus will use max. nContigs in
--genome=file threads.
--workingdir=/path/to/wd/ Set path to working directory. In the
working directory results and temporary
files are stored
--nice Execute all system calls within braker.pl
and its submodules with bash "nice"
(default nice value)
--alternatives-from-evidence=true Output alternative transcripts based on
explicit evidence from hints (default is
true).
--fungus GeneMark-EX option: run algorithm with
branch point model (most useful for fungal
genomes)
--crf Execute CRF training for AUGUSTUS;
resulting parameters are only kept for
final predictions if they show higher
accuracy than HMM parameters.
--keepCrf keep CRF parameters even if they are not
better than HMM parameters
--makehub Create track data hub with make_hub.py
for visualizing BRAKER results with the
UCSC GenomeBrowser
--busco_lineage=lineage If you provide a BUSCO lineage, BRAKER will
run compleasm on genome level to generate hints
from BUSCO to enhance BUSCO discovery in the
protein set. Also, if you provide a BUSCO
lineage, BRAKER will run compleasm to assess
the protein sets that go into TSEBRA combination,
and will determine the TSEBRA mode to maximize
BUSCO. Do not provide a busco_lineage if you
want to determina natural BUSCO sensivity of
BRAKER!
--email E-mail address for creating track data hub
--version Print version number of braker.pl
--help Print this help message
CONFIGURATION OPTIONS (TOOLS CALLED BY BRAKER)
--AUGUSTUS_CONFIG_PATH=/path/ Set path to config directory of AUGUSTUS
(if not specified as environment
variable). BRAKER1 will assume that the
directories ../bin and ../scripts of
AUGUSTUS are located relative to the
AUGUSTUS_CONFIG_PATH. If this is not the
case, please specify AUGUSTUS_BIN_PATH
(and AUGUSTUS_SCRIPTS_PATH if required).
The braker.pl commandline argument
--AUGUSTUS_CONFIG_PATH has higher priority
than the environment variable with the
same name.
--AUGUSTUS_BIN_PATH=/path/ Set path to the AUGUSTUS directory that
contains binaries, i.e. augustus and
etraining. This variable must only be set
if AUGUSTUS_CONFIG_PATH does not have
../bin and ../scripts of AUGUSTUS relative
to its location i.e. for global AUGUSTUS
installations. BRAKER1 will assume that
the directory ../scripts of AUGUSTUS is
located relative to the AUGUSTUS_BIN_PATH.
If this is not the case, please specify
--AUGUSTUS_SCRIPTS_PATH.
--AUGUSTUS_SCRIPTS_PATH=/path/ Set path to AUGUSTUS directory that
contains scripts, i.e. splitMfasta.pl.
This variable must only be set if
AUGUSTUS_CONFIG_PATH or AUGUSTUS_BIN_PATH
do not contains the ../scripts directory
of AUGUSTUS relative to their location,
i.e. for special cases of a global
AUGUSTUS installation.
--BAMTOOLS_PATH=/path/to/ Set path to bamtools (if not specified as
environment BAMTOOLS_PATH variable). Has
higher priority than the environment
variable.
--GENEMARK_PATH=/path/to/ Set path to GeneMark-ET (if not specified
as environment GENEMARK_PATH variable).
Has higher priority than environment
variable.
--SAMTOOLS_PATH=/path/to/ Optionally set path to samtools (if not
specified as environment SAMTOOLS_PATH
variable) to fix BAM files automatically,
if necessary. Has higher priority than
environment variable.
--PROTHINT_PATH=/path/to/ Set path to the directory with prothint.py.
(if not specified as PROTHINT_PATH
environment variable). Has higher priority
than environment variable.
--DIAMOND_PATH=/path/to/diamond Set path to diamond, this is an alternative
to NCIB blast; you only need to specify one
out of DIAMOND_PATH or BLAST_PATH, not both.
DIAMOND is a lot faster that BLAST and yields
highly similar results for BRAKER.
--BLAST_PATH=/path/to/blastall Set path to NCBI blastall and formatdb
executables if not specified as
environment variable. Has higher priority
than environment variable.
--COMPLEASM_PATH=/path/to/compleasm Set path to compleasm (if not specified as
environment variable). Has higher priority
than environment variable.
--PYTHON3_PATH=/path/to Set path to python3 executable (if not
specified as envirnonment variable and if
executable is not in your $PATH).
--JAVA_PATH=/path/to Set path to java executable (if not
specified as environment variable and if
executable is not in your $PATH), only
required with flags --UTR=on and --addUTR=on
--GUSHR_PATH=/path/to Set path to gushr.py exectuable (if not
specified as an environment variable and if
executable is not in your $PATH), only required
with the flags --UTR=on and --addUTR=on
--MAKEHUB_PATH=/path/to Set path to make_hub.py (if option --makehub
is used).
--CDBTOOLS_PATH=/path/to cdbfasta/cdbyank are required for running
fix_in_frame_stop_codon_genes.py. Usage of
that script can be skipped with option
'--skip_fixing_broken_genes'.
EXPERT OPTIONS
--augustus_args="--some_arg=bla" One or several command line arguments to
be passed to AUGUSTUS, if several
arguments are given, separate them by
whitespace, i.e.
"--first_arg=sth --second_arg=sth".
--skipGeneMark-ES Skip GeneMark-ES and use provided
GeneMark-ES output (e.g. provided with
--geneMarkGtf=genemark.gtf)
--skipGeneMark-ET Skip GeneMark-ET and use provided
GeneMark-ET output (e.g. provided with
--geneMarkGtf=genemark.gtf)
--skipGeneMark-EP Skip GeneMark-EP and use provided
GeneMark-EP output (e.g. provided with
--geneMarkGtf=genemark.gtf)
--skipGeneMark-ETP Skip GeneMark-ETP and use provided
GeneMark-ETP output (e.g. provided with
--gmetp_results_dir=GeneMark-ETP/)
--geneMarkGtf=file.gtf If skipGeneMark-ET is used, braker will by
default look in the working directory in
folder GeneMarkET for an already existing
gtf file. Instead, you may provide such a
file from another location. If geneMarkGtf
option is set, skipGeneMark-ES/ET/EP/ETP is
automatically also set. Note that gene and
transcript ids in the final output may not
match the ids in the input genemark.gtf
because BRAKER internally re-assigns these
ids.
In ETP mode, this option hast to be used together
with --traingenes and --hints to provide BRAKER
with results of a previous GeneMark-ETP run.
--gmetp_results_dir Location of results from a previous
GeneMark-ETP run, which will be used to
skip the GeneMark-ETP step. This option
can be used instead of --geneMarkGtf,
--traingenes, and --hints to skip GeneMark.
--rounds The number of optimization rounds used in
optimize_augustus.pl (default 5)
--skipAllTraining Skip GeneMark-EX (training and
prediction), skip AUGUSTUS training, only
runs AUGUSTUS with pre-trained and already
existing parameters (not recommended).
Hints from input are still generated.
This option automatically sets
--useexisting to true.
--useexisting Use the present config and parameter files
if they exist for 'species'; will overwrite
original parameters if BRAKER performs
an AUGUSTUS training.
--filterOutShort It may happen that a "good" training gene,
i.e. one that has intron support from
RNA-Seq in all introns predicted by
GeneMark-EX, is in fact too short. This flag
will discard such genes that have
supported introns and a neighboring
RNA-Seq supported intron upstream of the
start codon within the range of the
maximum CDS size of that gene and with a
multiplicity that is at least as high as
20% of the average intron multiplicity of
that gene.
--skipOptimize Skip optimize parameter step (not
recommended).
--skipIterativePrediction Skip iterative prediction in --epmode (does
not affect other modes, saves a bit of runtime)
--skipGetAnnoFromFasta Skip calling the python3 script
getAnnoFastaFromJoingenes.py from the
AUGUSTUS tool suite. This script requires
python3, biopython and re (regular
expressions) to be installed. It produces
coding sequence and protein FASTA files
from AUGUSTUS gene predictions and provides
information about genes with in-frame stop
codons. If you enable this flag, these files
will not be produced and python3 and
the required modules will not be necessary
for running brkaker.pl.
--skip_fixing_broken_genes If you do not have python3, you can choose
to skip the fixing of stop codon including
genes (not recommended).
--eval=reference.gtf Reference set to evaluate predictions
against (using evaluation scripts from GaTech)
--eval_pseudo=pseudo.gff3 File with pseudogenes that will be excluded
from accuracy evaluation (may be empty file)
--AUGUSTUS_hints_preds=s File with AUGUSTUS hints predictions; will
use this file as basis for UTR training;
only UTR training and prediction is
performed if this option is given.
--flanking_DNA=n Size of flanking region, must only be
specified if --AUGUSTUS_hints_preds is given
(for UTR training in a separate braker.pl
run that builds on top of an existing run)
--verbosity=n 0 -> run braker.pl quiet (no log)
1 -> only log warnings
2 -> also log configuration
3 -> log all major steps
4 -> very verbose, log also small steps
--downsampling_lambda=d The distribution of introns in training
gene structures generated by GeneMark-EX
has a huge weight on single-exon and
few-exon genes. Specifying the lambda
parameter of a poisson distribution will
make braker call a script for downsampling
of training gene structures according to
their number of introns distribution, i.e.
genes with none or few exons will be
downsampled, genes with many exons will be
kept. Default value is 2.
If you want to avoid downsampling, you have
to specify 0.
--checkSoftware Only check whether all required software
is installed, no execution of BRAKER
--nocleanup Skip deletion of all files that are typically not
used in an annotation project after
running braker.pl. (For tracking any
problems with a braker.pl run, you
might want to keep these files, therefore
nocleanup can be activated.)
DEVELOPMENT OPTIONS (PROBABLY STILL DYSFUNCTIONAL)
--splice_sites=patterns list of splice site patterns for UTR
prediction; default: GTAG, extend like this:
--splice_sites=GTAG,ATAC,...
this option only affects UTR training
example generation, not gene prediction
by AUGUSTUS
--overwrite Overwrite existing files (except for
species parameter files) Beware, currently
not implemented properly!
--extrinsicCfgFiles=file1,file2,... Depending on the mode in which braker.pl
is executed, it may require one ore several
extrinsicCfgFiles. Don't use this option
unless you know what you are doing!
--stranded=+,-,+,-,... If UTRs are trained, i.e.~strand-specific
bam-files are supplied and coverage
information is extracted for gene prediction,
create stranded ep hints. The order of
strand specifications must correspond to the
order of bam files. Possible values are
+, -, .
If stranded data is provided, ONLY coverage
data from the stranded data is used to
generate UTR examples! Coverage data from
unstranded data is used in the prediction
step, only.
The stranded label is applied to coverage
data, only. Intron hints are generated
from all libraries treated as "unstranded"
(because splice site filtering eliminates
intron hints from the wrong strand, anyway).
--optCfgFile=ppx.cfg Optional custom config file for AUGUSTUS
for running PPX (currently not
implemented)
--grass Switch this flag on if you are using braker.pl
for predicting genes in grasses with
GeneMark-EX. The flag will enable
GeneMark-EX to handle GC-heterogenicity
within genes more properly.
NOTHING IMPLEMENTED FOR GRASS YET!
--transmasked_fasta=file.fa Transmasked genome FASTA file for GeneMark-EX
(to be used instead of the regular genome
FASTA file).
--min_contig=INT Minimal contig length for GeneMark-EX, could
for example be set to 10000 if transmasked_fasta
option is used because transmasking might
introduce many very short contigs.
--translation_table=INT Change translation table from non-standard
to something else.
DOES NOT WORK YET BECAUSE BRAKER DOESNT
SWITCH TRANSLATION TABLE FOR GENEMARK-EX, YET!
--gc_probability=DECIMAL Probablity for donor splice site pattern GC
for gene prediction with GeneMark-EX,
default value is 0.001
--gm_max_intergenic=INT Adjust maximum allowed size of intergenic
regions in GeneMark-EX. If not used, the value
is automatically determined by GeneMark-EX.
--traingenes=file.gtf Training genes that are used instead of training
genes generated with GeneMark.
In ETP mode, this option can be used together
with --geneMarkGtf and --hints to provide BRAKER
with results of a previous GeneMark-ETP run, so
that the GeneMark-ETP step can be skipped.
In this case, use training.gtf from that run as
argument.
--UTR=on create UTR training examples from RNA-Seq
coverage data; requires options
--bam=rnaseq.bam.
Alternatively, if UTR parameters already
exist, training step will be skipped and
those pre-existing parameters are used.
DO NOT USE IN CONTAINER!
TRY NOT TO USE AT ALL!
--addUTR=on Adds UTRs from RNA-Seq coverage data to
augustus.hints.gtf file. Does not perform
training of AUGUSTUS or gene prediction with
AUGUSTUS and UTR parameters.
DO NOT USE IN CONTAINER!
TRY NOT TO USE AT ALL!
EXAMPLE
To run with RNA-Seq
braker.pl [OPTIONS] --genome=genome.fa --species=speciesname \
--bam=accepted_hits.bam
braker.pl [OPTIONS] --genome=genome.fa --species=speciesname \
--hints=rnaseq.gff
To run with protein sequences
braker.pl [OPTIONS] --genome=genome.fa --species=speciesname \
--prot_seq=proteins.fa
braker.pl [OPTIONS] --genome=genome.fa --species=speciesname \
--hints=prothint_augustus.gff
To run with RNA-Seq and protein sequences
braker.pl [OPTIONS] --genome=genome.fa --species=speciesname \
--prot_seq=proteins.fa --rnaseq_sets_ids=id_rnaseq1,id_rnaseq2 \
--rnaseq_sets_dir=/path/to/local/rnaseq/files
braker.pl [OPTIONS] --genome=genome.fa --species=speciesname \
--prot_seq=proteins.fa --bam=id_rnaseq1.bam,id_rnaseq2.bam
ENDUSAGE
# Declartion of global variables ###############################################
my $v = 4; # determines what is printed to log
my $version = "3.0.8";
my $rootDir;
my $logString = ""; # stores log messages produced before opening log file
$logString .= "\#**********************************************************************************\n";
$logString .= "\# BRAKER CONFIGURATION \n";
$logString .= "\#**********************************************************************************\n";
$logString .= "\# BRAKER CALL: ". $0 . " ". (join " ", @ARGV) . "\n";
$logString .= "\# ". (localtime) . ": braker.pl version $version\n";
my $prtStr;
my $alternatives_from_evidence = "true";
# output alternative transcripts based on explicit evidence
# from hints
my $augpath; # path to augustus
my $augustus_cfg_path; # augustus config path, higher priority than
# $AUGUSTUS_CONFIG_PATH on system
my $augustus_bin_path; # path to augustus folder binaries folder
my $augustus_scripts_path; # path to augustus scripts folder
my $AUGUSTUS_CONFIG_PATH;
my $AUGUSTUS_BIN_PATH;
my $AUGUSTUS_SCRIPTS_PATH;
my $PYTHON3_PATH;
my $MAKEHUB_PATH;
my $CDBTOOLS_PATH;
my $cdbtools_path;
my $makehub_path;
my $email; # for make_hub.py
my @bam; # bam file names
my @rna_sets_dir; # directories with FASTQ or BAM files of RNA-Seq sets
my @stranded; # contains +,-,+,-... corresponding to
# bam files
my $checkOnly = 0;
my $bamtools_path;
my $BAMTOOLS_BIN_PATH;
my $SRATOOLS_PATH;
my $sratools_path;
my $HISAT2_PATH;
my $hisat2_path;
my $bool_species = "true"; # false, if $species contains forbidden words
my $cmdString; # to store shell commands
my $CPU = 1; # number of CPUs that can be used
my $currentDir = cwd(); # working superdirectory where program is called from
my $errorfile; # stores current error file name
my $errorfilesDir; # directory for error files
my @extrinsicCfgFiles; # assigned extrinsic files
my $extrinsicCfgFile; # file to be used for running AUGUSTUS
my $extrinsicCfgFile1; # user supplied file 1
my $extrinsicCfgFile2; # user supplied file 2
my $extrinsicCfgFile3; # user supplied file 3
my $extrinsicCfgFile4; # user supplied file 4
my @files; # contains all files in $rootDir
my $flanking_DNA; # length of flanking DNA, default value is
# min{ave. gene length/2, 10000}
my @forbidden_words; # words/commands that are not allowed in species name
my $fungus = 0; # option for GeneMark-ET
my $gb_good_size; # number of LOCUS entries in 'train.gb'
my $genbank; # genbank file name
my $genemarkDir; # directory for GeneMark-ET output
my $GENEMARK_PATH;
my $GM_path; # GeneMark-ET path
my $PROTHINT_PATH;
my $prothint_path;
my $PROTHINT_REQUIRED = "prothint.py 2.6.0"; # Version of ProtHint required for this BRAKER version
my $TSEBRA_PATH;
my $tsebra_path;
my $genome; # name of sequence file
my %scaffSizes; # length of scaffolds
my $gff3 = 0; # create output file in GFF3 format
my $help; # print usage
my @hints; # input hints file names
my $hintsfile; # hints file (all hints)
my %rnaseq_libs; # rnaseq libs as fastq files (key: libID, value: array of paired or unpaired FASTQ files)
my @rnaseq_sra_ids; # SRA IDs of rnaseq libs
my $prot_hintsfile; # hints file with protein hints
my $genemark_hintsfile; # hinsfile compatible with GeneMark-E*
my $limit = 10000000; # maximum for generic species Sp_$limit
my $logfile; # contains used shell commands
my $optCfgFile; # optinonal extrinsic config file for AUGUSTUS
my $otherfilesDir; # directory for other files besides GeneMark-ET output and
# parameter files
my $annot; # reference annotation to compare predictions to
my $annot_pseudo; # file with pseudogenes to be excluded from accuracy measurements
my %accuracy; # stores accuracy results of gene prediction runs
my $overwrite = 0; # overwrite existing files (except for species parameter files)
my $parameterDir; # directory of parameter files for species
my $perlCmdString; # stores perl commands
my $printVersion = 0; # print version number, if set
my $SAMTOOLS_PATH;
my $SAMTOOLS_PATH_OP; # path to samtools executable
my $scriptPath = dirname($0); # path of directory where this script is located
my $skipGeneMarkET = 0; # skip GeneMark-ET and use provided GeneMark-ET output
my $skipGeneMarkEP = 0; # skip GeneMark-EP and use provided GeneMark-EP output
my $skipGeneMarkETP = 0;
my $skipGeneMarkES = 0;
my $skipoptimize = 0; # skip optimize parameter step
my $skipIterativePrediction;
my $skipAllTraining = 0; # skip all training (including no GeneMark-EX run)
my $skipGetAnnoFromFasta = 0; # requires python3 & biopython
my $species; # species name
my $soft_mask = 1; # soft-masked flag
my $soft_off = 0; # disable softmasking flag
my $standard = 0; # index for standard malus/ bonus value
# (currently 0.1 and 1e1)
my $chunksize = 2500000; # chunksize for running AUGUSTUS in parallel
my $skipTSEBRA = 1; # skip TSEBRA if tsebra.py can't be found
my $stdoutfile; # stores current standard output
my $string; # string for storing script path
my $augustus_args; # string that stores command line arguments to be passed to
# augustus
my $testsize1; # number of genes to test AUGUSTUS accuracy on after trainnig
my $testsize2; # number of genes to test AUGUSTUS with during optimize_augustus.pl
my $useexisting
= 0; # use existing species config and parameter files, no changes
# on those files
my $UTR = "off"; # UTR prediction on/off. currently not available für new
# species
my $addUTR = "off";
my $workDir; # in the working directory results and temporary files are
# stored
my $filterOutShort; # filterOutShort option (see help)
my $augustusHintsPreds; # already existing AUGUSTUS hints prediction without UTR
my $makehub; # flag for creating track data hub
my $etpplus_dir; # directory of old GeneMark-ETP run (used to restart)
# Hint type from input hintsfile will be checked
# a) GeneMark-ET (requires intron hints) and
# b) selection of exrinsic.cfg file is affected by hint types
my @allowedHints = (
"Intron", "intron", "start", "stop",
"ass", "dss", "exonpart", "exon",
"CDSpart", "UTRpart", "nonexonpart", "ep"
);
# REMOVE Intron when GeneMark introns format has been fixed!
my $crf; # flag that determines whether CRF training should be tried
my $keepCrf;
my $nice; # flag that determines whether system calls should be executed
# with bash nice (default nice value)
my ( $target_1, $target_2, $target_3, $target_4, $target_5) = 0;
# variables that store AUGUSTUS accuracy after different
# training steps
my @prot_seq_files; # variable to store protein sequence file name
my $DIAMOND_PATH; # path to diamond, alternative to BLAST
my $diamond_path; # command line argument value for $DIAMOND_PATH
my $COMPLEASM_PATH; # path to compleasm
my $compleasm_path; # command line argument value for $COMPLEASM_PATH
my $busco_lineage;
my $BLAST_PATH; # path to blastall and formatdb ncbi blast executable
my $blast_path; # command line argument value for $BLAST_PATH
my $python3_path; # command line argument value for $PYTHON3_PATH
my $java_path;
my $JAVA_PATH;
my $gushr_path;
my $GUSHR_PATH;
my %hintTypes; # stores hint types occuring over all generated and supplied
# hints for comparison
my $rounds = 5; # rounds used by optimize_augustus.pl
my $geneMarkGtf; # GeneMark output file (for skipGeneMark-ET option if not in
# braker working directory)
my $ESmode = 0; # flag for executing GeneMark-ES with genome sequence only
my $EPmode = 0; # flag for executing GeneMark-EP instead of GeneMark-ET
my $ETPmode = 0; # flag for executing GeneMark-EPT
my $GeneMarkIntronThreshold;
# use this value to screen hintsfile for GeneMark-EX. If few
# hints with multiplicity higher than this value are
# contained, braker will be aborted. Default value is determined by mode of
# GeneMark-EX: currently 10 for ET and 4 for EP/EPT
my $ab_initio; # flag for output of AUGUSTUS ab initio predictions
my $foundRNASeq = 0; # stores whether any external RNA-Seq input was found
my $foundProt = 0; # stores whether any external protein input was found
my $foundProteinHint = 0; # stores whether hintsfile contains src=P
my $lambda = 2; # labmda of poisson distribution for downsampling of training genes
my @splice_cmd_line;
my @splice;
my $AUGUSTUS_hints_preds; # for UTR training only (updating existing runs)
my $cleanup = 1; # enable file and directory cleanup after successful run
# list of forbidden words for species name
my $nocleanup;
my $transmasked_fasta; # transmaked genome file for GeneMark
my $min_contig; # min contig length for GeneMark, e.g. to be used in combination
# with transmasked_fasta
my $grass; # switch on GC treatment for GeneMark-ES/ET
my $ttable = 1; # translation table to be used
my $gc_prob = 0.001;
my $gm_max_intergenic;
my $skip_fixing_broken_genes; # skip execution of fix_in_frame_stop_codon_genes.py
my $traingtf;
my @rna_seq_libs_ids; # IDs of all RNA-seq libraries for ET/ETPmode
my $genome_size; # genome size in bp
#! currently not used
my @gc_range = (0.32, 0.73); # minimum and maximum gc range
@forbidden_words = (
"system", "exec", "passthru", "run", "fork", "qx",
"backticks", "chmod", "chown", "chroot", "unlink", "do",
"eval", "kill", "rm", "mv", "grep", "cd",
"top", "cp", "for", "done", "passwd", "while",
"nice", "ln"
);
if ( @ARGV == 0 ) {
print "$usage\n";
exit(0);
}
GetOptions(
'alternatives-from-evidence=s' => \$alternatives_from_evidence,
'AUGUSTUS_CONFIG_PATH=s' => \$augustus_cfg_path,
'AUGUSTUS_BIN_PATH=s' => \$augustus_bin_path,
'AUGUSTUS_SCRIPTS_PATH=s' => \$augustus_scripts_path,
'DIAMOND_PATH=s' => \$diamond_path,
'BLAST_PATH=s' => \$blast_path,
'PYTHON3_PATH=s' => \$python3_path,
'JAVA_PATH=s' => \$java_path,
'GUSHR_PATH=s' => \$gushr_path,
'CDBTOOLS_PATH=s' => \$cdbtools_path,
'MAKEHUB_PATH=s' => \$makehub_path,
'bam=s' => \@bam,
'BAMTOOLS_PATH=s' => \$bamtools_path,
'SRATOOLS_PATH=s' => \$sratools_path,
'HISAT2_PATH=s' => \$hisat2_path,
'threads=i' => \$CPU,
'fungus!' => \$fungus,
'extrinsicCfgFiles=s' => \@extrinsicCfgFiles,
'GENEMARK_PATH=s' => \$GM_path,
'PROTHINT_PATH=s' => \$prothint_path,
'TSEBRA_PATH=s' => \$tsebra_path,
'AUGUSTUS_hints_preds=s' => \$AUGUSTUS_hints_preds,
'genome=s' => \$genome,
'gff3' => \$gff3,
'hints=s' => \@hints,
'optCfgFile=s' => \$optCfgFile,
'overwrite!' => \$overwrite,
'SAMTOOLS_PATH=s' => \$SAMTOOLS_PATH_OP,
'COMPLEASM_PATH=s' => \$compleasm_path,
'busco_lineage=s' => \$busco_lineage,
'skipGeneMark-ES!' => \$skipGeneMarkES,
'skipGeneMark-ET!' => \$skipGeneMarkET,
'skipGeneMark-EP!' => \$skipGeneMarkEP,
'skipGeneMark-ETP!' => \$skipGeneMarkETP,
'skipOptimize!' => \$skipoptimize,
'skipIterativePrediction!' => \$skipIterativePrediction,
'skipAllTraining!' => \$skipAllTraining,
'skipGetAnnoFromFasta!' => \$skipGetAnnoFromFasta,
'species=s' => \$species,
'softmasking_off!' => \$soft_off,
'useexisting!' => \$useexisting,
'UTR=s' => \$UTR,
'addUTR=s' => \$addUTR,
'workingdir=s' => \$workDir,
'filterOutShort!' => \$filterOutShort,
'crf!' => \$crf,
'keepCrf!' => \$keepCrf,
'nice!' => \$nice,
'help!' => \$help,
'prot_seq=s' => \@prot_seq_files,
'augustus_args=s' => \$augustus_args,
'rounds=s' => \$rounds,
'geneMarkGtf=s' => \$geneMarkGtf,
'esmode!' => \$ESmode,
'AUGUSTUS_ab_initio!' => \$ab_initio,
'eval=s' => \$annot,
'eval_pseudo=s' => \$annot_pseudo,
'verbosity=i' => \$v,
'downsampling_lambda=s' => \$lambda,
'splice_sites=s' => \@splice_cmd_line,
'flanking_DNA=i' => \$flanking_DNA,
'stranded=s' => \@stranded,
'checkSoftware!' => \$checkOnly,
'nocleanup!' => \$nocleanup,
'grass!' => \$grass,
'transmasked_fasta=s' => \$transmasked_fasta,
'min_contig=s' => \$min_contig,
'makehub!' => \$makehub,
'email=s' => \$email,
'version!' => \$printVersion,
'translation_table=s' => \$ttable,
'skip_fixing_broken_genes!' => \$skip_fixing_broken_genes,
'gc_probability=s' => \$gc_prob,
'gm_max_intergenic=s' => \$gm_max_intergenic,
'gmetp_results_dir=s' => \$etpplus_dir,
'traingenes=s' => \$traingtf,
'rnaseq_sets_ids=s' => \@rna_seq_libs_ids,
'rnaseq_sets_dirs=s' => \@rna_sets_dir
) or die("Error in command line arguments\n");
if ($help) {
print $usage;
exit(0);
}
if ($printVersion) {
print "braker.pl version $version\n";
exit(0);
}
if( $soft_off == 1 ) {
$soft_mask = 0;
}
if($nocleanup){
$cleanup = 0;
}
# Define publications to be cited ##############################################
# braker1, braker2, braker-whole, aug-cdna, aug-hmm, diamond, blast1, blast2,
# gm-es, gm-et, gm-ep, gm-etp, gm-fungus, samtools, bamtools, spaln,
# spaln2, makehub, bedtools, sratoolkit, hisat2, stringtie, gffread
my %pubs;
$pubs{'braker1'} = "\nHoff, K. J., Lange, S., Lomsadze, A., Borodovsky, M., & Stanke, M. (2016). BRAKER1: unsupervised RNA-Seq-based genome annotation with GeneMark-ET and AUGUSTUS. Bioinformatics, 32(5), 767-769.\n";
$pubs{'braker2'} = "\nBruna, T., Hoff, K.J., Lomsadze, A., Stanke, M., & Borodovsky, M. (2021). BRAKER2: Automatic Eukaryotic Genome Annotation with GeneMark-EP+ and AUGUSTUS Supported by a Protein Database. NAR Genomics and Bioinformatics 3(1), lqaa108.\n";
$pubs{'braker-whole'} = "\nHoff, K. J., Lomsadze, A., Borodovsky, M., & Stanke, M. (2019). Whole-genome annotation with BRAKER. In Gene Prediction (pp. 65-95). Humana, New York, NY.\n";
$pubs{'aug-cdna'} = "\nStanke, M., Diekhans, M., Baertsch, R., & Haussler, D. (2008). Using native and syntenically mapped cDNA alignments to improve de novo gene finding. Bioinformatics, 24(5), 637-644.\n";
$pubs{'aug-hmm'} = "\nStanke, M., Schöffmann, O., Morgenstern, B., & Waack, S. (2006). Gene prediction in eukaryotes with a generalized hidden Markov model that uses hints from external sources. BMC Bioinformatics, 7(1), 62.\n";
$pubs{'diamond'} = "\nBuchfink, B., Xie, C., & Huson, D. H. (2015). Fast and sensitive protein alignment using DIAMOND. Nature Methods, 12(1), 59.\n";
$pubs{'blast1'} = "\nAltschul, S. F., Gish, W., Miller, W., Myers, E. W., & Lipman, D. J. (1990). Basic local alignment search tool. Journal of Molecular Biology, 215(3), 403-410.\n";
$pubs{'blast2'} = "\nCamacho, C., Coulouris, G., Avagyan, V., Ma, N., Papadopoulos, J., Bealer, K., & Madden, T. L. (2009). BLAST+: architecture and applications. BMC Bioinformatics, 10(1), 421.\n";
$pubs{'gm-es'} = "\nLomsadze, A., Ter-Hovhannisyan, V., Chernoff, Y. O., & Borodovsky, M. (2005). Gene identification in novel eukaryotic genomes by self-training algorithm. Nucleic acids research, 33(20), 6494-6506.\n";
$pubs{'gm-et'} = "\nLomsadze, A., Burns, P. D., & Borodovsky, M. (2014). Integration of mapped RNA-Seq reads into automatic training of eukaryotic gene finding algorithm. Nucleic acids research, 42(15), e119-e119.\n";
$pubs{'gm-ep'} = "\nBruna, T., Lomsadze, A., & Borodovsky, M. (2020). GeneMark-EP+: eukaryotic gene prediction with self-training in the space of genes and proteins. NAR Genomics and Bioinformatics, 2(2), lqaa026.\n";
$pubs{'gm-etp'} = "\nBruna, T., Lomsadze, A., & Borodovsky, M. (2023). GeneMark-ETP: Automatic Gene Finding in Eukaryotic Genomes in Consistence with Extrinsic Data. bioRxiv, https://doi.org/10.1101/2023.01.13.524024.\n";
$pubs{'gm-fungus'} = "\nTer-Hovhannisyan, V., Lomsadze, A., Chernoff, Y. O., & Borodovsky, M. (2008). Gene prediction in novel fungal genomes using an ab initio algorithm with unsupervised training. Genome research, 18(12), 1979-1990.\n";
$pubs{'samtools'} = "\nLi, H., Handsaker, B., Wysoker, A., Fennell, T., Ruan, J., Homer, N., ... & Durbin, R. (2009). The sequence alignment/map format and SAMtools. Bioinformatics, 25(16), 2078-2079.\n";
$pubs{'bamtools'} = "\nBarnett, D. W., Garrison, E. K., Quinlan, A. R., Strömberg, M. P., & Marth, G. T. (2011). BamTools: a C++ API and toolkit for analyzing and managing BAM files. Bioinformatics, 27(12), 1691-1692.\n";
$pubs{'spaln'} = "\nGotoh, O. (2008). A space-efficient and accurate method for mapping and aligning cDNA sequences onto genomic sequence. Nucleic acids research, 36(8), 2630-2638.\n";
$pubs{'spaln2'} = "\nIwata, H., & Gotoh, O. (2012). Benchmarking spliced alignment programs including Spaln2, an extended version of Spaln that incorporates additional species-specific features. Nucleic acids research, 40(20), e161-e161.\n";
$pubs{'gemoma1'} = "\nKeilwagen, J., Hartung, F., Grau, J. (2019) GeMoMa: Homology-based gene prediction utilizing intron position conservation and RNA-seq data. Methods Mol Biol. 1962:161-177, doi: 10.1007/978-1-4939-9173-0_9.\n";
$pubs{'gemoma2'} = "\nKeilwagen, J., Wenk, M., Erickson, J.L., Schattat, M.H., Grau, J., Hartung F. (2016) Using intron position conservation for homology-based gene prediction. Nucleic Acids Research, 44(9):e89.\n";
$pubs{'gemoma3'} = "\nKeilwagen, J., Hartung, F., Paulini, M., Twardziok, S.O., Grau, J. (2018) Combining RNA-seq data and homology-based gene prediction for plants, animals and fungi. BMC Bioinformatics, 19(1):189.\n";
$pubs{'bedtools'} = "\nQuinlan, A. R. (2014). BEDTools: the Swiss‐army tool for genome feature analysis. Current protocols in bioinformatics, 47(1):11-12.\n";
$pubs{'sratoolkit'} = "\nSRA Toolkit Development Team (2020). SRA Toolkit. https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software.\n";
$pubs{'hisat2'} = "\nKim, D., Paggi, J. M., Park, C., Bennett, C., & Salzberg, S. L. (2019). Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype. Nature biotechnology, 37(8):907-915.\n";
$pubs{'stringtie'} = "\nKovaka, S., Zimin, A. V., Pertea, G. M., Razaghi, R., Salzberg, S. L., & Pertea, M. (2019). Transcriptome assembly from long-read RNA-seq alignments with StringTie2. Genome biology, 20(1):1-13.\n";
$pubs{'gffread'} = "\nPertea, G., & Pertea, M. (2020). GFF utilities: GffRead and GffCompare. F1000Research, 9.\n";
$pubs{'tsebra'} = "\nGabriel, L., Hoff, K. J., Bruna, T., Borodovsky, M., & Stanke, M. (2021). TSEBRA: transcript selector for BRAKER. BMC Bioinformatics, 22:566.\n";
$pubs{'braker3'} = "\nGabriel, L., Bruna, T., Hoff, K. J., Ebel, M., Lomsadze, A., Borodovsky, M., & Stanke, M. (2023). BRAKER3: Fully Automated Genome Annotation Using RNA-Seq and Protein Evidence with GeneMark-ETP, AUGUSTUS and TSEBRA. bioRxiv, https://doi.org/10.1101/2023.06.10.544449.\n";
$pubs{'busco'} = "\nSimao, F. A., Waterhouse, R. M., Ioannidis, P., Kriventseva, E. V., & Zdobnov, E. M. (2015). BUSCO: assessing genome assembly and annotation completeness with single-copy orthologs. Bioinformatics, 31(19), 3210-3212.\n";
$pubs{'miniprot'} = "\nLi, H. (2023). Protein-to-genome alignment with miniprot. Bioinformatics, 30(1):btad014.\n";
$pubs{'compleasm'} = "\nHuang, N., & Li, H. (2023). compleasm: a faster and more accurate reimplementation of BUSCO. Bioinformatics 39(10):btad595.\n";
$pubs{'braker-c-i'} = "\nBruna, T., Gabriel, L., & Hoff, K. J. (2024). Navigating Eukaryotic Genome Annotation Pipelines: A Route Map to BRAKER, Galba, and TSEBRA. arXiv preprint at https://doi.org/10.48550/arXiv.2403.19416.\n";
$pubs{'makehub'} = "\nHoff, K. J. (2019). MakeHub: fully automated generation of UCSC genome browser assembly hubs. Genomics, Proteomics and Bioinformatics, 17(5), 546-549.\n";
# Make paths to input files absolute ###########################################
make_paths_absolute();
# Set working directory ########################################################
my $wdGiven;
# if no working directory is set, use current directory
if ( !defined $workDir ) {
$wdGiven = 0;
$workDir = $currentDir;
}else {
$wdGiven = 1;
my $last_char = substr( $workDir, -1 );
if ( $last_char eq "\/" ) {
chop($workDir);
}
my $tmp_dir_name = abs_path($workDir);
$workDir = $tmp_dir_name;
if ( not( -d $workDir ) ) {
$prtStr = "\# " . (localtime) . ": Creating directory $workDir.\n";
$logString .= $prtStr if ( $v > 2 );
mkdir($workDir) or die ("ERROR: in file " . __FILE__ ." at line "
. __LINE__ ."\nFailed to create directory $workDir!\n");
}
}
# check the write permission of $workDir #######################################
if ( !-w $workDir ) {
$prtStr
= "\# "
. (localtime)
. ": ERROR: in file " . __FILE__ ." at line ". __LINE__ ."\n"
. "Do not have write permission for $workDir.\nPlease"
. " use command 'chmod' to reset permissions, or specify another working "
. "directory with option --workingdir=...\n";
$logString .= $prtStr;
print STDERR $logString;
exit(1);
}
# determine in which mode to run braker.pl #####################################
determineRunMode();
# configure which tools BRAKER is going to run #################################
# * use command line argument if given
# * else use environment variable if present
# * else try to guess (might fail)
$prtStr
= "\# "
. (localtime)
. ": Configuring of BRAKER for using external tools...\n";
$logString .= $prtStr if ( $v > 2 );
# check which RNA-Seq sets are given as ID/FASTQ/BAM
if (@rna_seq_libs_ids) {
check_rnaseq_sets();
}
# set AUGUSTUS_CONFIG_PATH
set_AUGUSTUS_CONFIG_PATH();
set_AUGUSTUS_BIN_PATH();
set_AUGUSTUS_SCRIPTS_PATH();
fix_AUGUSTUS_CONFIG_PATH();
set_PYTHON3_PATH();
if (defined($busco_lineage)){
set_COMPLEASM_PATH(); # todo: make sure that compleasm to hints actually passes the path, or make the hints script a hints parser, only
}
if($UTR eq "on" || $addUTR eq "on"){
set_JAVA_PATH();
set_GUSHR_PATH();
}
if (!defined($geneMarkGtf) && !$skipAllTraining &&
!defined($AUGUSTUS_hints_preds) || $ETPmode) {
set_GENEMARK_PATH()
}
if (@rnaseq_sra_ids) {
set_SRATOOLS_PATH();
}
if (%rnaseq_libs || @rnaseq_sra_ids) {
set_HISAT2_PATH();
}
if(@bam || %rnaseq_libs || @rnaseq_sra_ids) {
set_BAMTOOLS_PATH();
set_SAMTOOLS_PATH();
}
if (not ($skipAllTraining)) {
set_BLAST_or_DIAMOND_PATH();
}
if (@prot_seq_files && !$ESmode) {
set_PROTHINT_PATH();
}
set_TSEBRA_PATH();
if ( $makehub ) {
set_MAKEHUB_PATH();
}
if( not($skip_fixing_broken_genes)) {
set_CDBTOOLS_PATH();
}
if($checkOnly){
$prtStr = "\# " . (localtime)
. ": Exiting braker.pl because it had been started with "
. "--softwareCheck option. No training or prediction or file format "
. "check will be performed.\n";
$logString .= $prtStr;
print $logString;
exit(0);
}
my $perl = which 'perl';
# check for known issues that may cause problems with braker.pl ################
check_upfront();
# check whether braker.pl options are set in a compatible way ##################
check_options();
# Starting braker pipeline #####################################################
$logString .= "\#**********************************************************************************\n";
$logString .= "\# CREATING DIRECTORY STRUCTURE \n";
$logString .= "\#**********************************************************************************\n";
# check whether $rootDir already exists
if ( $wdGiven == 1 ) {
$rootDir = $workDir;
}
else {
$rootDir = "$workDir/braker";
}
if ( -d "$rootDir/$species" && !$overwrite && $wdGiven == 0 ) {
$prtStr
= "#*********\n"
. ": WARNING: $rootDir/$species already exists. Braker will use"
. " existing files, if they are newer than the input files. You can "
. "choose another working directory with --workingdir=dir or overwrite "
. "it with --overwrite.\n"
. "#*********\n";
$logString .= $prtStr if ( $v > 0 );
}
# create $rootDir
if ( !-d $rootDir ) {
$prtStr = "\# "
. (localtime)
. ": create working directory $rootDir.\n"
. "mkdir $rootDir\n";
make_path($rootDir) or die("ERROR in file " . __FILE__ ." at line "
. __LINE__ ."\nFailed to create direcotry $rootDir!\n");
$logString .= $prtStr if ( $v > 2 );
}
if (%rnaseq_libs || @rnaseq_sra_ids) {
if (!-d "$rootDir/rnaseq") {
$prtStr = "\# "
. (localtime)
. ": create working directory $rootDir/rnaseq for RNA-Seq libraries.\n"
. "mkdir $rootDir/rnaseq\n";
make_path("$rootDir/rnaseq") or die("ERROR in file " . __FILE__ ." at line "
. __LINE__ ."\nFailed to create direcotry $rootDir/rnaseq!\n");
$logString .= $prtStr if ( $v > 2 );
}
}
my $genemarkesDir;
# set other directories
if ( $EPmode == 0 && $ETPmode == 0 && $ESmode == 0) {
$genemarkDir = "$rootDir/GeneMark-ET";