Alex.hs

{------------------------------------------------------------------------------
				 SCANNING TEXT

This module provides the run-time interface to the `Alex' package.  The
functions in this module take the raw tables generated by `lx' and constructs
scanners, functions for cutting input text into tokens.

Many scanners need to maintain the location of the tokens in the input text for
diagnostics generation and for parsing layout-sensitive languages like Haskell.
Thus the first section defines `Posn' type for locating tokens in the input
text.

Two scanning packages are given.  The first, `scan', generates simple stateless
scanners that generate streams of tokens.  The second, `gscan', provides
general scanners with access to the scanner's internal state, hooks for
application-specific state and no restriction on the return type of the
scanner.

`gscan' should be adequate for most application but, if it isn't, the
components used to assemble it are available at the end of the module for
reassembly into a suitable configuration.

Chris Dornan, Aug-95, 10-Jul-96, 29-Sep-97
------------------------------------------------------------------------------}

module Alex where

import Array


{------------------------------------------------------------------------------
				Token Positions
------------------------------------------------------------------------------}


-- `Posn' records the location of a token in the input text.  It has three
-- fields: the address (number of chacaters preceding the token), line number
-- and column of a token within the file. `start_pos' gives the position of the
-- start of the file and `eof_pos' a standard encoding for the end of file.
-- `move_pos' calculates the new position after traversing a given character,
-- assuming the usual eight character tab stops.

data Posn = Pn !Int !Int !Int
	deriving (Eq,Show)

start_pos:: Posn
start_pos = Pn 0 1 1

eof_pos:: Posn
eof_pos = Pn (-1) (-1) (-1)

move_pos:: Posn -> Char -> Posn
move_pos (Pn a l c) '\t' = Pn (a+1)  l     (((c+7) `div` 8)*8+1)
move_pos (Pn a l c) '\n' = Pn (a+1) (l+1)   1
move_pos (Pn a l c) _    = Pn (a+1)  l     (c+1)


{------------------------------------------------------------------------------
				     scan
------------------------------------------------------------------------------}


-- The @Scan@ package generates simple scanners that convert input text to
-- streams of tokens.  The scanners are stateless as each token generated is a
-- function of its textual content and location.
--  
-- The token actions take the form of an association list associating each
-- token name with an action function that constructs the token from the text
-- matched and its location.  The stop action is invoked when no more input can
-- be tokenised; it takes the residual input and its position and generates the
-- remaining stream of tokens, usually the empty list or an end-of-file token
-- if the empty string is passed, an error token otherwise.

type Actions t = ([(String,TokenAction t)], StopAction t)

type TokenAction t = Posn -> String -> t

type StopAction t = Posn -> String -> [t]


-- @load_scan@ combines the actions with the dump generated by \lx\ to produce
-- a @Scan@ structure that can be passed to @scan@.  @scan@ takes the scanner
-- and the input text and generates a stream of tokens.  It assumes that the
-- text is at the start of the input with the position set to @start_pos@ (see
-- above) and sets the last character read to newline (the last character read
-- is used to resolve leading context specifications); @scan'@ can be used to
-- override these defaults.

load_scan:: Actions t -> DFADump -> Scan t
scan:: Scan t -> String -> [t]
scan':: Scan t -> Posn -> Char -> String -> [t]


-- `Scan' is an straightforward construction on `GScan'.

type Scan t = GScan () [t]

load_scan (al,s_a) dmp = load_gscan (al',s_a') dmp
	where
	al' = [(nm,mk_act f)|(nm,f)<-al]

	mk_act f = \p _ inp len cont sc_s -> f p (take len inp):cont sc_s

	s_a' p _ inp _ = s_a p inp

scan scr inp = scan' scr start_pos '\n' inp

scan' scr p c inp = gscan' scr p c inp (0,())


{------------------------------------------------------------------------------
				    gscan
------------------------------------------------------------------------------}


-- The @gscan@ package generates general-purpose scanners for converting input
-- text into a return type determined by the application.  Access to the
-- scanner's internal state, start codes and some application-specific state is
-- provided.
--  
-- The token actions take the form of an association list associating each
-- token name with an action function that constructs the result from the
-- length of the token, the scanner's state (including the remaining input from
-- the start of the token) and a continuation function that scans the remaining
-- input.
--  
-- More specifically, each token action takes as arguments the position of the
-- token, the last character read before the token (used to resolve leading
-- context), the whole input text from the start of the token, the length of
-- the token, the continuation function and the visible state (as distinct from
-- the scanner's internal state) including the current start code and the
-- application specific state.  The stop action is invoked when no more input
-- can be scanned; it takes the same parameters as the token actions, except
-- the token length and the continuation function.

type GScan s r = (DFA (GTokenAction s r), GStopAction s r)

type GActions s r = ([(String, GTokenAction s r)], GStopAction s r)

type GTokenAction s r = 
	Posn -> Char -> String -> Int ->
		((StartCode,s)->r) -> (StartCode,s) -> r

type GStopAction s r = Posn -> Char -> String -> (StartCode,s) -> r


-- @load_gscan@ combines the actions with the dump generated by lx to produce a
-- @GScan@ structure that can be passed to @gscan@.  @gscan@ takes the scanner,
-- the application-specific state and the input text as parameters.  It assumes
-- that the text is at the start of the input with the position set to
-- @start_pos@ (see above) and sets the last character read to new-line and the
-- start code to 0; @gscan'@ can be used to override these defaults.

load_gscan:: GActions s r -> DFADump -> GScan s r
gscan:: GScan s r -> s -> String -> r
gscan':: GScan s r -> Posn -> Char -> String -> (StartCode,s) -> r

load_gscan (al,s_a) dmp = (load_dfa al df dmp,s_a)
	where
	df = \_ _ _ _ cont s -> cont s

gscan scr s inp = gscan' scr start_pos '\n' inp (0,s)

gscan' scr@(dfa,s_a) p c inp sc_s =
	case scan_token dfa sc_s p c inp of
	  Nothing -> s_a p c inp sc_s
	  Just (p',c',inp',len,Acc _ _ t_a _ _ _) ->
				t_a p c inp len (gscan' scr p' c' inp') sc_s


{------------------------------------------------------------------------------
				SCAN INTERNALS

The internals of the Scan module follow.  They shouldn't be required by most
applications.
------------------------------------------------------------------------------}


{------------------------------------------------------------------------------
				  scan_token
------------------------------------------------------------------------------}


-- `scan_token' picks out the next token from the input.  It takes the DFA and
-- the usual parameters and returns the `Accept' structure associated with the
-- highest priority token matching the longest input sequence, nothing if no
-- token matches.  Associated with `Accept' in `Sv' is the length of the token
-- as well as the position, previous character and remaining input at the end
-- of accepted token (i.e., the start of the next token).

type Sv t = (Posn,Char,String,Int,Accept t)

scan_token:: DFA f -> (StartCode,s) -> Posn -> Char -> String -> Maybe (Sv f)
scan_token dfa sc_s p c inp =
	case dropWhile (check_ctx dfa sc_s c) (scan_tkn dfa p c inp 0 0 []) of
	  [] -> Nothing
	  sv:_ -> Just sv

-- This function takes the DFA, scanner state, last character read and an `Sv'
-- structure and determines whether the token has the right context to be
-- accepted.  It may have some leading or trailing context or be restricted to
-- certain start codes.
--  
-- Note that the trailing context is checked by invoking `scan_tkn' with the
-- given state in the DFA corresponding to the regular expression specifying
-- the trailing context; while this may be inefficient, trailing context is
-- rarely used and it avoids well-known infidelities arrising from the more
-- efficient method used by Lex and Flex.

check_ctx:: DFA f -> (StartCode,s) -> Char -> Sv f -> Bool
check_ctx dfa sc_s c (p',c',inp',_,acc) =
	case acc of
	  Acc _ _ _ [] Nothing Nothing -> False
	  Acc _ _ _ scs lctx rctx ->
		chk_scs sc_s scs || chk_lctx lctx || chk_rctx p' c' inp' rctx
	where
	chk_scs (sc,_) [] = False
	chk_scs (sc,_) scs = sc `notElem` scs

	chk_lctx Nothing = False
	chk_lctx (Just st) = not(st c)

	chk_rctx p' c' inp' Nothing = False
	chk_rctx p' c' inp' (Just sn) = null(scan_tkn dfa p' c' inp' 0 sn [])

-- This function performs most of the work of `scan_token'.  It pushes the
-- input through the DFA, remembering the accepting states it encounters on a
-- stack.  No context is checked here.  A space leak could result from a long
-- token with many valid prefixes, leading to a large stack.  This space leak
-- is avoided in most cases by discarding the stack if an unconditional state
-- is pushed on (no state below an unconditional state will be needed).

scan_tkn:: DFA f -> Posn -> Char -> String -> Int -> SNum -> [Sv f] -> [Sv f]
scan_tkn dfa p c inp len s stk =
	if s>=0
	   then case inp of
		  [] -> stk'
		  c':inp' -> scan_tkn dfa p' c' inp' (len+1) s' stk'
			where
			p' = move_pos p c'
			s' = if inRange (bounds out) c' then out!c' else df
	   else stk
	where
	stk' =	if clr then svs else svs ++ stk
	svs  =	[(p,c,inp,len,acc)| acc<-accs]

	St clr accs df out = dfa!s


{------------------------------------------------------------------------------
				     DFAs
------------------------------------------------------------------------------}


-- (This section should logically belong to the DFA module but it has been
-- placed here to make this module self-contained.)
--  
-- `DFA' provides an alternative to `Scanner' (described in the RExp module);
-- it can be used directly to scan text efficiently.  Additionally it has an
-- extra place holder for holding action functions for generating
-- application-specific tokens.  When this place holder is not being used, the
-- unit type will be used.
--  
-- Each state in the automaton consist of a list of `Accept' values, descending
-- in priority, and an array mapping characters to new states.  As the array
-- may only cover a sub-range of the characters, a default state number is
-- given in the third field.  By convention, all transitions to the -1 state
-- represent invalid transitions.
--  
-- A list of accept states is provided for as the original specification may
-- have been ambiguous, in which case the highest priority token should be
-- taken (the one appearing earliest in the specification); this can not be
-- calculated when the DFA is generated in all cases as some of the tokens may
-- be associated with leading or trailing context or start codes.
--  
-- `scan_token' (see above) can deal with unconditional accept states more
-- efficiently than those associated with context; to save it testing each time
-- whether the list of accept states contains an unconditional state, the flag
-- in the first field of `St' is set to true whenever the list contains an
-- unconditional state.
--  
-- The `Accept' structure contains the priority of the token being accepted
-- (lower numbers => higher priorities), the name of the token, a place holder
-- that can be used for storing the `action' function for constructing the
-- token from the input text and thge scanner's state, a list of start codes
-- (listing the start codes that the scanner must be in for the token to be
-- accepted; empty => no restriction), the leading and trailing context (both
-- `Nothing' if there is none).
--  
-- The leading context consists simply of a character predicate that will
-- return true if the last character read is acceptable.  The trailing context
-- consists of an alternative starting state within the DFA; if this `sub-dfa'
-- turns up any accepting state when applied to the residual input then the
-- trailing context is acceptable (see `scan_token' above).

type DFA a = Array SNum (State a)

type SNum = Int

data State a = St Bool [Accept a] SNum (Array Char SNum)

data Accept a = Acc Int String a [StartCode] (Maybe(Char->Bool)) (Maybe SNum)

type StartCode = Int


-- `DFADump' is the format used to encode DFAs by lx.  `dump_dfa' will encode
-- the DFA (ignoring any action functions), `recover_dfa' will recover it again
-- and `load_dfa' will additionally combine the action functions specified in
-- an association list.

type DFADump = [(Bool,[AcceptDump],SNum,ArrDump Int)]

type AcceptDump  = (Int,String,[StartCode],Maybe(ArrDump Bool),Maybe SNum)

type ArrDump a = ((Char,Char),[(Char,a)])


dump_dfa:: DFA a -> DFADump
dump_dfa dfa = map dp_st (elems dfa)
	where
	dp_st (St cl accs df out) = (cl,map dp_acc accs,df,dp_out df out)

	dp_acc (Acc n nm _ scs lctx rctx) =
				(n,nm,scs,dp_lctx lctx,rctx)

	dp_lctx Nothing = Nothing
	dp_lctx (Just st) =
		case as of
		  [] -> Just (('1','0'),[])
		  _  -> Just ((fst(head as),fst(last as)),as)
		where
		as = [(c,True)| c<-dfa_alphabet, st c]

	dp_out df ar = (bounds ar,[(c,n)| (c,n)<-assocs ar, n/=df])

load_dfa:: [(String,f)] -> f -> DFADump -> DFA f
load_dfa al df dmp = fmap f (recover_dfa dmp)
	where
	f (St clr accs dflt ar) = St clr (map g accs) dflt ar

	g (Acc n nm _ scs lctx rctx) = Acc n nm t_a scs lctx rctx
		where
		t_a =	case dropWhile (\(nm',_)->nm/=nm') al of
			  []        -> df
			  (_,t_a):_ -> t_a

recover_dfa:: DFADump -> DFA ()
recover_dfa l = listArray bds [rc_st cl accs df out| (cl,accs,df,out)<-l]
	where
	bds = (0,length l-1)

	rc_st cl accs df out = St cl (rc_accs accs) df (rc_arr df out)
	
	rc_accs accs = map rc_acc accs

	rc_acc (n,nm,scs,lctx,rctx) =
			Acc n nm () scs (rc_lctx lctx) rctx

	rc_lctx Nothing = Nothing
	rc_lctx (Just ad) = Just (tst(rc_arr False ad))
		where
		tst arr c = if inRange (bounds arr) c then arr!c else False

	rc_arr df (bs,as) = listArray bs [df|_<-range bs] // [(c,y)|(c,y)<-as]

dfa_alphabet:: [Char]
dfa_alphabet = ['\0'..'\255']