-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFastq_ReadExtract_bySeq.pl
102 lines (83 loc) · 3.04 KB
/
Fastq_ReadExtract_bySeq.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/perl
# Name: Fastq_ReadExtract_bySeq.pl
# Author: Tovah Markowitz
# Date: 11/7/2013
# Description: extract specific queries from a fastq file based upon sequence
# Input: name of fastq file
# Output: queries in fastq format
use strict;
use Data::Dumper;
use Getopt::Long;
use Pod::Usage;
my $help=0;
my ($fastqFile, $querySeq, $queryPos, $outputFile);
# set commandline options.
GetOptions ("input=s" => \$fastqFile,
"qSeq=s" => \$querySeq,
"qPos=i" => \$queryPos,
"output=s" => \$outputFile,
"help" => \$help,) or pod2usage(2);
pod2usage(1) if $help;
#############################################
my $sequences=LoadFastq($fastqFile);
# create regular expression pattern based upon query position and query sequence
my $regexp;
if ($queryPos) {
$regexp=join( "", "^[ACTGN]{", $queryPos-1, "}", $querySeq );
} else {
$regexp=$querySeq;
}
open (OUT, ">$outputFile") or die ("Can not open $outputFile for writing.\n");
# for each sequence check if querySeq exists at the noted position
foreach my $name (sort keys ( %{$sequences} )) {
if ($sequences -> {$name} -> {'seq'} =~ m/$regexp/) {
my $at = $name;
my $plus = $name;
$plus =~ s/>/+/;
$at =~ s/>/@/;
print OUT
"$at\n$sequences->{$name}->{'seq'}$plus\n$sequences->{$name}->{'qual'}";
}
}
close OUT;
#############################################
sub LoadFastq {
# read FASTQ file into a hash of hashes
my $fastqFile = shift;
# ensure fastq file exists else die
open (IN, "<$fastqFile") or die ("Can not find your input file $fastqFile: No such file or directory\n");
# set up necessary arrays
my $names;
my %sequences;
my $list;
# for as long as the file has more data:
while (<IN>){
chomp;
# place every line in one of the two hashes,
# using the @ symbol to identify the start of each sequence
if ($_ =~ /@/) {
$names = $_;
$names =~ s/@/>/;
$sequences{$names} -> {'seq'} = <IN>;
$list = <IN>;
$sequences{$names} -> {'quals'} = <IN>;
}
}
close IN;
return \%sequences;
}
###########################################
__END__
=head1 SYNOPSIS
Fastq_ReadExtract_bySeq.pl [Options]
Purpose: To find reads from a FASTQ file with a specific query sequence.
The function will only find reads with an exact match to the entire query.
Calling qPos will search for the query string at a specific position within
the read.
Options:
-help/-h brief help message
-input/-i input fastq file
-qSeq/-qS query sequence
-qPos/-qP (optional) query position
-output/-o output fastq file name
=cut