-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAlex.hs
379 lines (277 loc) · 14.1 KB
/
Alex.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
{------------------------------------------------------------------------------
SCANNING TEXT
This module provides the run-time interface to the `Alex' package. The
functions in this module take the raw tables generated by `lx' and constructs
scanners, functions for cutting input text into tokens.
Many scanners need to maintain the location of the tokens in the input text for
diagnostics generation and for parsing layout-sensitive languages like Haskell.
Thus the first section defines `Posn' type for locating tokens in the input
text.
Two scanning packages are given. The first, `scan', generates simple stateless
scanners that generate streams of tokens. The second, `gscan', provides
general scanners with access to the scanner's internal state, hooks for
application-specific state and no restriction on the return type of the
scanner.
`gscan' should be adequate for most application but, if it isn't, the
components used to assemble it are available at the end of the module for
reassembly into a suitable configuration.
Chris Dornan, Aug-95, 10-Jul-96, 29-Sep-97
------------------------------------------------------------------------------}
module Alex where
import Array
{------------------------------------------------------------------------------
Token Positions
------------------------------------------------------------------------------}
-- `Posn' records the location of a token in the input text. It has three
-- fields: the address (number of chacaters preceding the token), line number
-- and column of a token within the file. `start_pos' gives the position of the
-- start of the file and `eof_pos' a standard encoding for the end of file.
-- `move_pos' calculates the new position after traversing a given character,
-- assuming the usual eight character tab stops.
data Posn = Pn !Int !Int !Int
deriving (Eq,Show)
start_pos:: Posn
start_pos = Pn 0 1 1
eof_pos:: Posn
eof_pos = Pn (-1) (-1) (-1)
move_pos:: Posn -> Char -> Posn
move_pos (Pn a l c) '\t' = Pn (a+1) l (((c+7) `div` 8)*8+1)
move_pos (Pn a l c) '\n' = Pn (a+1) (l+1) 1
move_pos (Pn a l c) _ = Pn (a+1) l (c+1)
{------------------------------------------------------------------------------
scan
------------------------------------------------------------------------------}
-- The @Scan@ package generates simple scanners that convert input text to
-- streams of tokens. The scanners are stateless as each token generated is a
-- function of its textual content and location.
--
-- The token actions take the form of an association list associating each
-- token name with an action function that constructs the token from the text
-- matched and its location. The stop action is invoked when no more input can
-- be tokenised; it takes the residual input and its position and generates the
-- remaining stream of tokens, usually the empty list or an end-of-file token
-- if the empty string is passed, an error token otherwise.
type Actions t = ([(String,TokenAction t)], StopAction t)
type TokenAction t = Posn -> String -> t
type StopAction t = Posn -> String -> [t]
-- @load_scan@ combines the actions with the dump generated by \lx\ to produce
-- a @Scan@ structure that can be passed to @scan@. @scan@ takes the scanner
-- and the input text and generates a stream of tokens. It assumes that the
-- text is at the start of the input with the position set to @start_pos@ (see
-- above) and sets the last character read to newline (the last character read
-- is used to resolve leading context specifications); @scan'@ can be used to
-- override these defaults.
load_scan:: Actions t -> DFADump -> Scan t
scan:: Scan t -> String -> [t]
scan':: Scan t -> Posn -> Char -> String -> [t]
-- `Scan' is an straightforward construction on `GScan'.
type Scan t = GScan () [t]
load_scan (al,s_a) dmp = load_gscan (al',s_a') dmp
where
al' = [(nm,mk_act f)|(nm,f)<-al]
mk_act f = \p _ inp len cont sc_s -> f p (take len inp):cont sc_s
s_a' p _ inp _ = s_a p inp
scan scr inp = scan' scr start_pos '\n' inp
scan' scr p c inp = gscan' scr p c inp (0,())
{------------------------------------------------------------------------------
gscan
------------------------------------------------------------------------------}
-- The @gscan@ package generates general-purpose scanners for converting input
-- text into a return type determined by the application. Access to the
-- scanner's internal state, start codes and some application-specific state is
-- provided.
--
-- The token actions take the form of an association list associating each
-- token name with an action function that constructs the result from the
-- length of the token, the scanner's state (including the remaining input from
-- the start of the token) and a continuation function that scans the remaining
-- input.
--
-- More specifically, each token action takes as arguments the position of the
-- token, the last character read before the token (used to resolve leading
-- context), the whole input text from the start of the token, the length of
-- the token, the continuation function and the visible state (as distinct from
-- the scanner's internal state) including the current start code and the
-- application specific state. The stop action is invoked when no more input
-- can be scanned; it takes the same parameters as the token actions, except
-- the token length and the continuation function.
type GScan s r = (DFA (GTokenAction s r), GStopAction s r)
type GActions s r = ([(String, GTokenAction s r)], GStopAction s r)
type GTokenAction s r =
Posn -> Char -> String -> Int ->
((StartCode,s)->r) -> (StartCode,s) -> r
type GStopAction s r = Posn -> Char -> String -> (StartCode,s) -> r
-- @load_gscan@ combines the actions with the dump generated by lx to produce a
-- @GScan@ structure that can be passed to @gscan@. @gscan@ takes the scanner,
-- the application-specific state and the input text as parameters. It assumes
-- that the text is at the start of the input with the position set to
-- @start_pos@ (see above) and sets the last character read to new-line and the
-- start code to 0; @gscan'@ can be used to override these defaults.
load_gscan:: GActions s r -> DFADump -> GScan s r
gscan:: GScan s r -> s -> String -> r
gscan':: GScan s r -> Posn -> Char -> String -> (StartCode,s) -> r
load_gscan (al,s_a) dmp = (load_dfa al df dmp,s_a)
where
df = \_ _ _ _ cont s -> cont s
gscan scr s inp = gscan' scr start_pos '\n' inp (0,s)
gscan' scr@(dfa,s_a) p c inp sc_s =
case scan_token dfa sc_s p c inp of
Nothing -> s_a p c inp sc_s
Just (p',c',inp',len,Acc _ _ t_a _ _ _) ->
t_a p c inp len (gscan' scr p' c' inp') sc_s
{------------------------------------------------------------------------------
SCAN INTERNALS
The internals of the Scan module follow. They shouldn't be required by most
applications.
------------------------------------------------------------------------------}
{------------------------------------------------------------------------------
scan_token
------------------------------------------------------------------------------}
-- `scan_token' picks out the next token from the input. It takes the DFA and
-- the usual parameters and returns the `Accept' structure associated with the
-- highest priority token matching the longest input sequence, nothing if no
-- token matches. Associated with `Accept' in `Sv' is the length of the token
-- as well as the position, previous character and remaining input at the end
-- of accepted token (i.e., the start of the next token).
type Sv t = (Posn,Char,String,Int,Accept t)
scan_token:: DFA f -> (StartCode,s) -> Posn -> Char -> String -> Maybe (Sv f)
scan_token dfa sc_s p c inp =
case dropWhile (check_ctx dfa sc_s c) (scan_tkn dfa p c inp 0 0 []) of
[] -> Nothing
sv:_ -> Just sv
-- This function takes the DFA, scanner state, last character read and an `Sv'
-- structure and determines whether the token has the right context to be
-- accepted. It may have some leading or trailing context or be restricted to
-- certain start codes.
--
-- Note that the trailing context is checked by invoking `scan_tkn' with the
-- given state in the DFA corresponding to the regular expression specifying
-- the trailing context; while this may be inefficient, trailing context is
-- rarely used and it avoids well-known infidelities arrising from the more
-- efficient method used by Lex and Flex.
check_ctx:: DFA f -> (StartCode,s) -> Char -> Sv f -> Bool
check_ctx dfa sc_s c (p',c',inp',_,acc) =
case acc of
Acc _ _ _ [] Nothing Nothing -> False
Acc _ _ _ scs lctx rctx ->
chk_scs sc_s scs || chk_lctx lctx || chk_rctx p' c' inp' rctx
where
chk_scs (sc,_) [] = False
chk_scs (sc,_) scs = sc `notElem` scs
chk_lctx Nothing = False
chk_lctx (Just st) = not(st c)
chk_rctx p' c' inp' Nothing = False
chk_rctx p' c' inp' (Just sn) = null(scan_tkn dfa p' c' inp' 0 sn [])
-- This function performs most of the work of `scan_token'. It pushes the
-- input through the DFA, remembering the accepting states it encounters on a
-- stack. No context is checked here. A space leak could result from a long
-- token with many valid prefixes, leading to a large stack. This space leak
-- is avoided in most cases by discarding the stack if an unconditional state
-- is pushed on (no state below an unconditional state will be needed).
scan_tkn:: DFA f -> Posn -> Char -> String -> Int -> SNum -> [Sv f] -> [Sv f]
scan_tkn dfa p c inp len s stk =
if s>=0
then case inp of
[] -> stk'
c':inp' -> scan_tkn dfa p' c' inp' (len+1) s' stk'
where
p' = move_pos p c'
s' = if inRange (bounds out) c' then out!c' else df
else stk
where
stk' = if clr then svs else svs ++ stk
svs = [(p,c,inp,len,acc)| acc<-accs]
St clr accs df out = dfa!s
{------------------------------------------------------------------------------
DFAs
------------------------------------------------------------------------------}
-- (This section should logically belong to the DFA module but it has been
-- placed here to make this module self-contained.)
--
-- `DFA' provides an alternative to `Scanner' (described in the RExp module);
-- it can be used directly to scan text efficiently. Additionally it has an
-- extra place holder for holding action functions for generating
-- application-specific tokens. When this place holder is not being used, the
-- unit type will be used.
--
-- Each state in the automaton consist of a list of `Accept' values, descending
-- in priority, and an array mapping characters to new states. As the array
-- may only cover a sub-range of the characters, a default state number is
-- given in the third field. By convention, all transitions to the -1 state
-- represent invalid transitions.
--
-- A list of accept states is provided for as the original specification may
-- have been ambiguous, in which case the highest priority token should be
-- taken (the one appearing earliest in the specification); this can not be
-- calculated when the DFA is generated in all cases as some of the tokens may
-- be associated with leading or trailing context or start codes.
--
-- `scan_token' (see above) can deal with unconditional accept states more
-- efficiently than those associated with context; to save it testing each time
-- whether the list of accept states contains an unconditional state, the flag
-- in the first field of `St' is set to true whenever the list contains an
-- unconditional state.
--
-- The `Accept' structure contains the priority of the token being accepted
-- (lower numbers => higher priorities), the name of the token, a place holder
-- that can be used for storing the `action' function for constructing the
-- token from the input text and thge scanner's state, a list of start codes
-- (listing the start codes that the scanner must be in for the token to be
-- accepted; empty => no restriction), the leading and trailing context (both
-- `Nothing' if there is none).
--
-- The leading context consists simply of a character predicate that will
-- return true if the last character read is acceptable. The trailing context
-- consists of an alternative starting state within the DFA; if this `sub-dfa'
-- turns up any accepting state when applied to the residual input then the
-- trailing context is acceptable (see `scan_token' above).
type DFA a = Array SNum (State a)
type SNum = Int
data State a = St Bool [Accept a] SNum (Array Char SNum)
data Accept a = Acc Int String a [StartCode] (Maybe(Char->Bool)) (Maybe SNum)
type StartCode = Int
-- `DFADump' is the format used to encode DFAs by lx. `dump_dfa' will encode
-- the DFA (ignoring any action functions), `recover_dfa' will recover it again
-- and `load_dfa' will additionally combine the action functions specified in
-- an association list.
type DFADump = [(Bool,[AcceptDump],SNum,ArrDump Int)]
type AcceptDump = (Int,String,[StartCode],Maybe(ArrDump Bool),Maybe SNum)
type ArrDump a = ((Char,Char),[(Char,a)])
dump_dfa:: DFA a -> DFADump
dump_dfa dfa = map dp_st (elems dfa)
where
dp_st (St cl accs df out) = (cl,map dp_acc accs,df,dp_out df out)
dp_acc (Acc n nm _ scs lctx rctx) =
(n,nm,scs,dp_lctx lctx,rctx)
dp_lctx Nothing = Nothing
dp_lctx (Just st) =
case as of
[] -> Just (('1','0'),[])
_ -> Just ((fst(head as),fst(last as)),as)
where
as = [(c,True)| c<-dfa_alphabet, st c]
dp_out df ar = (bounds ar,[(c,n)| (c,n)<-assocs ar, n/=df])
load_dfa:: [(String,f)] -> f -> DFADump -> DFA f
load_dfa al df dmp = fmap f (recover_dfa dmp)
where
f (St clr accs dflt ar) = St clr (map g accs) dflt ar
g (Acc n nm _ scs lctx rctx) = Acc n nm t_a scs lctx rctx
where
t_a = case dropWhile (\(nm',_)->nm/=nm') al of
[] -> df
(_,t_a):_ -> t_a
recover_dfa:: DFADump -> DFA ()
recover_dfa l = listArray bds [rc_st cl accs df out| (cl,accs,df,out)<-l]
where
bds = (0,length l-1)
rc_st cl accs df out = St cl (rc_accs accs) df (rc_arr df out)
rc_accs accs = map rc_acc accs
rc_acc (n,nm,scs,lctx,rctx) =
Acc n nm () scs (rc_lctx lctx) rctx
rc_lctx Nothing = Nothing
rc_lctx (Just ad) = Just (tst(rc_arr False ad))
where
tst arr c = if inRange (bounds arr) c then arr!c else False
rc_arr df (bs,as) = listArray bs [df|_<-range bs] // [(c,y)|(c,y)<-as]
dfa_alphabet:: [Char]
dfa_alphabet = ['\0'..'\255']