1
+ # !/usr/bin/perl
2
+
3
+ use warnings;
4
+ use strict;
5
+ use fralib;
6
+ use File::Basename;
7
+ use Getopt::Long;
8
+ use Pod::Usage;
9
+
10
+ =head1 NAME
11
+
12
+ affya2mk
13
+
14
+ =head1 SYNOPSIS
15
+
16
+ affya2mk [options] <affy-annotation-file>
17
+
18
+ -h detailed help message
19
+ -dbsnp specify the dbSNP version of the annotation
20
+ PLEASE note that there exists PERLEGEN SNPs
21
+ that are not found in dbSNP that are in SNP6
22
+ affy-annotation-file affymetrix annotation file.
23
+
24
+ example: affya2mk pscalare.txt
25
+
26
+ Converts Affymetrix annotation (tab delimited) to mk format.
27
+ Output file is named based on<affy-annotation-file> with the extension renamed
28
+ to .mk Please note that this programme does not check for correctness of the
29
+ field values.
30
+
31
+ =head1 DESCRIPTION
32
+
33
+ The extracted fields followed by its new name are:
34
+ 1)Probe Set ID snp-id
35
+ 2)dbSNP RS ID rs-id
36
+ 3)Chromosome chromosome
37
+ 4)Physical Position position
38
+ 5)ChrX pseudo-autosomal pseudo-autosomal
39
+ ChrX pseudo-autosomal region 1 pseudo-autosomal
40
+ ChrX pseudo-autosomal region 2 pseudo-autosomal
41
+ 6)DB SNP Version dbsnp (you will need this to describe the
42
+ snp annotations)
43
+ 7)Allele A alleles
44
+ 8)Allele B alleles
45
+ 9)Flank flanks
46
+
47
+ An additional field - alleles-strand which describes the allele orientation
48
+ with reference to the flanks is added.
49
+
50
+ =cut
51
+
52
+ my $help ;
53
+ my $headerProcessed ;
54
+ my $colNo ;
55
+ my $affyAnnotationFile ;
56
+ my $mkFile ;
57
+ my %label2Column ;
58
+ my $PARIsSeparate = 0;
59
+ my $globalDBSNPVersion ;
60
+ my $annotationFileType = " " ;
61
+
62
+ # initialize options
63
+ Getopt::Long::Configure (' bundling' );
64
+
65
+ if (!GetOptions (' h' => \$help , ' dbsnp=i' => \$globalDBSNPVersion )
66
+ || $help
67
+ || (defined ($globalDBSNPVersion ) && $globalDBSNPVersion < 0)
68
+ || scalar (@ARGV )!=1)
69
+ {
70
+ if ($help )
71
+ {
72
+ pod2usage(-verbose => 2);
73
+ }
74
+ else
75
+ {
76
+ pod2usage(1);
77
+ }
78
+ }
79
+
80
+ $affyAnnotationFile = $ARGV [0];
81
+
82
+ # reads in the GISSNP illumina annotation
83
+ open (AFFY_ANNOTATION, $affyAnnotationFile ) || die " Cannot open $affyAnnotationFile \n " ;
84
+
85
+ if (!defined ($mkFile ))
86
+ {
87
+ my ($name , $dir , $ext ) = fileparse($affyAnnotationFile , ' \..*' );
88
+ $mkFile = " $name .mk" ;
89
+ }
90
+
91
+ $headerProcessed = 0;
92
+ open (MK, " >$mkFile " ) || die " Cannot open $mkFile \n " ;
93
+
94
+ while (<AFFY_ANNOTATION>)
95
+ {
96
+ s /\r ?\n ?$// ;
97
+
98
+ if (!$headerProcessed )
99
+ {
100
+ next if (/ ^#/ );
101
+
102
+ print MK " snp-id\t rs-id\t chromosome\t strand\t position\t pseudo-autosomal\t dbsnp\t alleles\t alleles-strand\t flanks\n " ;
103
+
104
+ $colNo = s /\t / \t / g + 1;
105
+
106
+ my @fields = split (' \t' , $_ , $colNo );
107
+
108
+ my @labels = (' Probe Set ID' , ' dbSNP RS ID' , ' Chromosome' , ' Strand' , ' DB SNP Version' , ' Physical Position' , ' ChrX pseudo-autosomal region' , ' Flank' , ' Allele A' , ' Allele B' );
109
+
110
+ SEARCH_LABEL: for my $label (@labels )
111
+ {
112
+ for my $col (0 .. $#fields )
113
+ {
114
+ if ($fields [$col ] eq $label )
115
+ {
116
+ $label2Column {$label }=$col ;
117
+ next SEARCH_LABEL;
118
+ }
119
+ }
120
+
121
+ if ($label eq ' ChrX pseudo-autosomal region' )
122
+ {
123
+ $PARIsSeparate = 1;
124
+ push ( @labels , ' ChrX pseudo-autosomal region 1' );
125
+ push ( @labels , ' ChrX pseudo-autosomal region 2' );
126
+ }
127
+ elsif ($label eq ' DB SNP Version' && defined ($globalDBSNPVersion ))
128
+ {
129
+ # use user defined global dbsnp version instead
130
+ }
131
+ else
132
+ {
133
+ if ($label eq ' DB SNP Version' )
134
+ {
135
+ die " Cannot find '$label ' in $mkFile , you can circumvent this with the --dbsnp option" ;
136
+ }
137
+ else
138
+ {
139
+ die " Cannot find '$label ' in $mkFile " ;
140
+ }
141
+ }
142
+ }
143
+
144
+ $headerProcessed = 1;
145
+ }
146
+ else
147
+ {
148
+ my @fields = split (' \t' , $_ , -1);
149
+
150
+ my $snpId = trim($fields [$label2Column {' Probe Set ID' }]);
151
+ my $rsId = trim($fields [$label2Column {' dbSNP RS ID' }]);
152
+ my $chromosome = trim($fields [$label2Column {' Chromosome' }]);
153
+ $chromosome = $chromosome eq ' MT' ? ' M' : $chromosome ;
154
+ my $strand = trim($fields [$label2Column {' Strand' }]);
155
+ my $dbsnp = defined ($globalDBSNPVersion ) ? $globalDBSNPVersion : trim($fields [$label2Column {' DB SNP Version' }]);;
156
+ my $position = trim($fields [$label2Column {' Physical Position' }]);
157
+ my $pseudoAutosomal ;
158
+ if ($PARIsSeparate )
159
+ {
160
+ my $par1 = trim($fields [$label2Column {' ChrX pseudo-autosomal region 1' }]);
161
+ my $par2 = trim($fields [$label2Column {' ChrX pseudo-autosomal region 2' }]);
162
+ # print "$snpId\t$par1\t$par2\n";
163
+ $pseudoAutosomal = ($par1 =~ / 0|---/ && $par2 =~ / 0|---/ )? 0 : 1;
164
+ }
165
+ else
166
+ {
167
+ $pseudoAutosomal = trim($fields [$label2Column {' ChrX pseudo-autosomal region' }]);
168
+ }
169
+ my $alleleA = trim($fields [$label2Column {' Allele A' }]);
170
+ my $alleleB = trim($fields [$label2Column {' Allele B' }]);
171
+ my $alleles = " $alleleA /$alleleB " ;
172
+ my $allelesStrand = ' ref' ;
173
+ my $flanks = uc (trim($fields [$label2Column {' Flank' }]));
174
+
175
+ if ($rsId eq ' ---' )
176
+ {
177
+ $rsId = ' n/a' ;
178
+ $dbsnp = ' n/a' ;
179
+ }
180
+
181
+ if ($chromosome eq ' ---' )
182
+ {
183
+ $chromosome = ' n/a' ;
184
+ $position = ' n/a' ;
185
+ $strand = ' n/a' ;
186
+ $pseudoAutosomal = ' n/a' ;
187
+ }
188
+
189
+ # handles only biallelic SNPs
190
+ if ($alleles !~/ [ACGT]\/ [ACGT]/i )
191
+ {
192
+ warn " $snpId : alleles not valid - $alleles " ;
193
+ $alleles = ' n/a' ;
194
+ }
195
+
196
+ if ($flanks !~/ ([ACGTMRWSYKVHDBNX]*)\[ ([ACGT]\/ [ACGT])\] ([ACGTMRWSYKVHDBNX]*)/i
197
+ || (length ($1 )==0 && length ($3 )==0))
198
+ {
199
+ warn " $snpId : flanks not valid - $flanks " ;
200
+ $allelesStrand = ' n/a' ;
201
+ $flanks = ' n/a' ;
202
+ }
203
+
204
+ printf MK " $snpId \t $rsId \t $chromosome \t $strand \t $position \t $pseudoAutosomal \t $dbsnp \t $alleles \t $allelesStrand \t $flanks \n " ;
205
+ }
206
+ }
207
+
208
+ close (AFFY_ANNOTATION);
209
+ close (MK);
0 commit comments