24
24
my $min_base_ratio = $ARGV [7];
25
25
my $max_trim = $ARGV [8];
26
26
my $verbose = $ARGV [9];
27
- my $minContigLength = $ARGV [11];
27
+ my $minContigLength = $ARGV [11];
28
28
my $libraryfile = $ARGV [12];
29
29
my $gaps = $ARGV [13];
30
30
my $threads = $ARGV [14];
31
-
32
31
33
32
my $log = $base_name . " .logfile.txt" ;
34
33
my $summaryfile = $base_name ." .summaryfile.txt" ;
39
38
my ($bin );
40
39
if ($extending == 1){
41
40
42
- &ExtendContigs($base_name , $filecontig , $filenameOutExt , $Bin );
41
+ &ExtendContigs($base_name , $filecontig , $filenameOutExt );
43
42
print SUMFILE " \n " if ($minContigLength > 0);
44
43
&FormatContigs() if ($minContigLength > 0);
45
44
}else {
48
47
49
48
close SUMFILE;
50
49
close LOG;
51
-
50
+
52
51
mkpath(' process_OK' );
53
52
# --------------------------------------------------
54
53
55
54
# ##EXTEND CONTIGS WITH UNMAPPED READS
56
55
sub ExtendContigs{
57
- my ($base_name , $filecontig , $filenameOutExt , $Bin ) = @_ ;
56
+ my ($base_name , $filecontig , $filenameOutExt ) = @_ ;
58
57
my ($seq );
59
58
# -------------------------------------------------NOW MAP SINGLE READS TO INITIAL CONTIGS FILE.
60
59
my $readfile = " reads/" . $filenameOutExt ;
@@ -67,8 +66,8 @@ sub ExtendContigs{
67
66
# --------------------------------------------ASSEMBLY START
68
67
69
68
ASSEMBLY:
70
- open (IN,$filecontig ) || die " Can't open $filecontig -- fatal\n " ;
71
- my ($exttig_count , $counter , $NCount , $orig_mer , $prevhead ) = (0,0,0,0, ' ' );
69
+ open (IN, $filecontig ) || die " Can't open $filecontig -- fatal\n " ;
70
+ my ($exttig_count , $counter , $NCount , $orig_mer , $prevhead ) = (0, 0, 0, 0, ' ' );
72
71
while (<IN>){
73
72
s /\r\n / \n / ;
74
73
chomp ;
@@ -91,12 +90,12 @@ sub ExtendContigs{
91
90
my $reversetig = reverseComplement($seqrc ); # ## return to sequence, as inputted
92
91
if ($leng > $orig_mer ){ # ## commented out: && $start_sequence ne $seqrc && $start_sequence ne $reversetig
93
92
my $cov = $total_bases / $leng ;
94
- printf TIG " >extcontig%i |size%i |read%i |cov%.2f|seed:$prevhead \n %s \n " , ($counter ,$leng ,$reads_needed ,$cov ,$reversetig ); # print contigs to file
93
+ printf TIG " >extcontig%i |size%i |read%i |cov%.2f|seed:$prevhead \n %s \n " , ($counter , $leng , $reads_needed , $cov , $reversetig ); # print contigs to file
95
94
$exttig_count ++;
96
95
}else {
97
96
my $cov = $reads_needed = 0;
98
97
my $singlet_leng = length ($start_sequence );
99
- printf TIG " >contig%i |size%i |read%i |cov%.2f|seed:$prevhead \n %s \n " , ($counter ,$leng ,$reads_needed ,$cov ,$reversetig ); # print singlets to file
98
+ printf TIG " >contig%i |size%i |read%i |cov%.2f|seed:$prevhead \n %s \n " , ($counter , $leng , $reads_needed , $cov , $reversetig ); # print singlets to file
100
99
}
101
100
}
102
101
CounterPrint(++$counter );
@@ -124,8 +123,8 @@ sub ExtendContigs{
124
123
sub FormatContigs{
125
124
&printMessage(" \n =>" .getDate()." : Storing contigs to format for scaffolding\n " );
126
125
open (TIG, " >$contig " ) || die " Can't write to $contig -- fatal\n " ;
127
- open (IN,$filecontig ) || die " Can't open $filecontig -- fatal\n " ;
128
- my ($counter , $seq , $prevhead , $step ) = (0,' ' ,' ' , 100);
126
+ open (IN, $filecontig ) || die " Can't open $filecontig -- fatal\n " ;
127
+ my ($counter , $seq , $prevhead , $step ) = (0, ' ' , ' ' , 100);
129
128
while (<IN>){
130
129
s /\r\n / \n / ;
131
130
chomp ;
@@ -138,7 +137,7 @@ sub FormatContigs{
138
137
CounterPrint($counter );
139
138
$step = $step + 100;
140
139
}
141
- printf TIG " >contig%i |size%i |read%i |cov%.2f|seed:$prevhead \n %s \n " , ($counter ,$length_seq ,0, 0.00,$seq );
140
+ printf TIG " >contig%i |size%i |read%i |cov%.2f|seed:$prevhead \n %s \n " , ($counter , $length_seq , 0, 0.00, $seq );
142
141
}
143
142
$prevhead = $head ;
144
143
$seq = ' ' ;
@@ -157,15 +156,15 @@ sub doExtension{
157
156
my ($direction , $orig_mer , $seq , $reads_needed , $total_bases , $min_overlap , $base_overlap , $min_base_ratio , $verbose , $tig_count , $max_trim ) = @_ ;
158
157
159
158
my $previous = $seq ;
160
- my ($extended , $trim_ct ) = (1,0);
159
+ my ($extended , $trim_ct ) = (1, 0);
161
160
162
161
if ($orig_mer > $MAX ){$orig_mer =$MAX ;} # ## Deals with special cases where the seed sequences are different from the read set (and possibly very large) - goal here is not to increase sequence coverage of seed, but rather to extend it.
163
162
164
163
TRIM:
165
164
while ($trim_ct <= $max_trim ){
166
165
while ($extended ){
167
166
168
- my ($pos ,$current_reads ,$current_bases ,$span ) = (0,0,0, " " );
167
+ my ($pos , $current_reads , $current_bases , $span ) = (0, 0, 0, " " );
169
168
170
169
# ## Added 19March08
171
170
if (length ($seq ) >= $MAX ){ # $seq is length of contig being extended -- if larger than largest read, make sure the largest read could align and all subsequent rds.
@@ -177,10 +176,10 @@ sub doExtension{
177
176
my $overhang = {};
178
177
my @overlapping_reads = ();
179
178
for (my $x =1;$x <= ($orig_mer * 2);$x ++){
180
- ($overhang -> {$x }{' A' },$overhang -> {$x }{' C' },$overhang -> {$x }{' G' },$overhang -> {$x }{' T' }) = (0,0,0, 0);
179
+ ($overhang -> {$x }{' A' }, $overhang -> {$x }{' C' }, $overhang -> {$x }{' G' }, $overhang -> {$x }{' T' }) = (0, 0, 0, 0);
181
180
}
182
181
183
- # ## COLLECT SEQUENCES
182
+ # ## COLLECT SEQUENCES
184
183
while ($span >= $min_overlap ){ # will slide the subseq, until the user-defined min overlap size
185
184
186
185
$pos = length ($seq ) - $span ;
@@ -200,9 +199,9 @@ sub doExtension{
200
199
201
200
# Collect all overhangs
202
201
push @overlapping_reads , $pass ; # ## all overlapping reads
203
- my @over = split (// ,$dangle );
202
+ my @over = split (// , $dangle );
204
203
my $ct_oh = 0;
205
-
204
+
206
205
foreach my $bz (@over ){
207
206
$ct_oh ++; # ## tracks overhang position passed the seed
208
207
$overhang -> {$ct_oh }{$bz } += $bin -> {$sub }{$pass };
@@ -242,7 +241,7 @@ sub doExtension{
242
241
last CONSENSUS;
243
242
}
244
243
}
245
- $previous_bz = $bz ;
244
+ $previous_bz = $bz ;
246
245
$ct_dna ++;
247
246
}
248
247
}
@@ -252,7 +251,7 @@ sub doExtension{
252
251
if ($consensus ne " " ){
253
252
254
253
print " Will extend $seq \n with: $consensus \n\n " if ($verbose );
255
- my $temp_sequence = $seq . $consensus ; # # this is the contig extension
254
+ my $temp_sequence = $seq . $consensus ; # # this is the contig extension
256
255
my $integral = 0;
257
256
my $position = length ($temp_sequence ) - ($startspan + length ($consensus ));
258
257
my $temp_sequence_end = substr ($temp_sequence , $position );
@@ -290,7 +289,7 @@ sub doExtension{
290
289
$extended = 1;
291
290
print " \n $direction prime EXTENSION ROUND $trim_ct COMPLETE UNTIL $max_trim nt TRIMMED OFF => TRIMMED SEQUENCE:$seq \n\n " if ($verbose );
292
291
}
293
-
292
+
294
293
}# ## while trimming within bounds
295
294
296
295
print " \n *** NOTHING ELSE TO BE DONE IN $direction prime- PERHAPS YOU COULD DECREASE THE MINIMUM OVERLAP -m (currently set to -m $min_overlap ) ***\n\n " if ($verbose );
@@ -302,19 +301,19 @@ sub doExtension{
302
301
# ##DELETE READ DATA IF IT HAS BEEN USED FOR EXTENDING A CONTIG
303
302
sub deleteData {
304
303
my ($sequence ) = @_ ;
305
-
304
+
306
305
my $subnor = substr ($sequence , 0, 10);
307
306
my $comp_seq = reverseComplement($sequence );
308
307
my $subrv = substr ($comp_seq , 0, 10);
309
-
308
+
310
309
# remove k-mer from hash table and prefix tree
311
310
delete $bin -> {$subrv }{$comp_seq };
312
311
delete $bin -> {$subnor }{$sequence };
313
312
}
314
313
315
314
sub getUnmappedReads{
316
315
my ($contigFile , $readfiles ) = @_ ;
317
- my ($library ,$fnames ) = (" start" ," " );
316
+ my ($library , $fnames ) = (" start" , " " );
318
317
319
318
# obtain sequences to map against the contigs
320
319
open (FILELIB, " < $libraryfile " ) || die " Can't open $libraryfile -- fatal\n " ;
@@ -330,9 +329,11 @@ sub getUnmappedReads{
330
329
close FILELIB;
331
330
my $unpaired = " reads/$base_name .singlereads.fasta" ;
332
331
$files -> {$unpaired }++ if (-e $unpaired );
333
- foreach my $f (keys %$files ){ $fnames .=" $f ," ;}
332
+ foreach my $f (keys %$files ){
333
+ $fnames .= " $f ," ;
334
+ }
334
335
chop $fnames ;
335
-
336
+
336
337
# build bowtie index of contigs and map reads to the index
337
338
my $bowtieout = $base_name . " .$library .bowtieIndex" ;
338
339
die " Contig file ($contigFile ) not found. Exiting...\n " if (!(-e $contigFile ));
@@ -344,7 +345,7 @@ sub getUnmappedReads{
344
345
# map reads with bowtie and obtain unmapped reads. Store the unmapped reads into a hash and use them for contig extension
345
346
open (IN, " $procline " ) || die " Can't open bowtie output -- fatal\n " ;
346
347
my ($counter , $step ) = (0, 100000);
347
- my ($orig , $rc , $subrv ,$subnor , $orig_mer );
348
+ my ($orig , $rc , $subrv , $subnor , $orig_mer );
348
349
while (<IN>){
349
350
my @t = split (/ \t / );
350
351
next if ($t [2] ne ' *' );
0 commit comments