Skip to content

Commit

Permalink
introduce string character iterator
Browse files Browse the repository at this point in the history
  • Loading branch information
robertDurst committed Dec 29, 2023
1 parent ee10a94 commit 8bd1d98
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 77 deletions.
3 changes: 1 addition & 2 deletions spec/zodiac/lexer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@
end

it 'lexs operators' do
input = '+ - * / % ** & | ^ << >> && || @@::..== === =~ +@ -@ []'
input = '+ - * / % ** & | ^ << >> && || @@::..== =~ +@ -@ []'
lexer = described_class.new(input)

expected_output = [
Expand All @@ -100,7 +100,6 @@
{ kind: 'SYMBOL', value: '::' },
{ kind: 'SYMBOL', value: '..' },
{ kind: 'SYMBOL', value: '==' },
{ kind: 'SYMBOL', value: '===' },
{ kind: 'SYMBOL', value: '=~' },
{ kind: 'SYMBOL', value: '+@' },
{ kind: 'SYMBOL', value: '-@' },
Expand Down
8 changes: 6 additions & 2 deletions src/zodiac/character_helpers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,19 @@ def string_start?(value)
end

def symbol?(value)
'.:[]{}+-*/%&|^><@~$!?:'.include?(value)
'=.:[]{}+-*/%&|^><@~$!?:'.include?(value)
end

# Tells whether +value+ occurs in the set of characters that may begin an
# operator-assignment token (e.g. the '+' of '+=').
#
# The check is a substring test, so a multi-character +value+ matches only
# when it appears contiguously in the character list below.
#
# @param value [String] character (or short string) to test
# @return [Boolean]
def op_assign_symbol?(value)
  leading_chars = '+-*/%*|^><&|[]'
  leading_chars.include?(value)
end

def double_symbol?(value)
'*<>|&@:.'.include?(value)
'=*<>|&@:.'.include?(value)
end

# Decides whether +value+ followed by +next_value+ forms a two-character
# symbol token.
#
# Two shapes qualify:
# * one of the fixed pairs '+@', '-@', '[]' or '=~';
# * a character accepted by +double_symbol?+ that is immediately repeated.
#
# @param value [String] the current character
# @param next_value [String, nil] the following character, nil at end of input
# @return [Boolean]
def complex_symbol?(value, next_value)
  unless next_value.nil?
    pair = value + next_value
    return true if %w(+@ -@ [] =~).include?(pair)
  end

  double_symbol?(value) && value == next_value
end

def contains_equal_sign?(value)
Expand Down
123 changes: 50 additions & 73 deletions src/zodiac/lexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

require './src/zodiac/character_helpers'
require './src/zodiac/lex_error'
require './src/zodiac/string_character_iterator'

module Zodiac
# Base lexing class for the Zodiac language.
Expand All @@ -11,17 +12,18 @@ module Zodiac
# * HERE_DOC
# * REGEXP
# * '<=>'
# * '==='
class Lexer
include ::Zodiac::CharacterHelpers

def initialize(raw_string)
@raw_string = raw_string
@cur_index = 0
@tokens = []
@word = ''
@input_iterator = StringCharacterIterator.new(raw_string)
end

def lex
lex_next while @cur_index < @raw_string.size
lex_next while @input_iterator.not_finished?

@tokens
end
Expand All @@ -30,118 +32,93 @@ def lex

def lexers
[
{ lexer: 'lex_equals_sign_prefix', condition: proc { @cur == '=' } },
{ lexer: 'lex_comment', condition: proc { @cur == '#' } },
{ lexer: 'lex_op_assign', condition: proc { op_assign? } },
{ lexer: 'lex_symbol', condition: proc { symbol?(@cur) } },
{ lexer: 'lex_identifier', condition: proc { letter?(@cur) || underscore?(@cur) } },
{ lexer: 'lex_string', condition: proc { string_start?(@cur) } },
{ lexer: 'lex_number', condition: proc { number?(@cur) } }
{ token_kind: 'COMMENT', lexer: 'lex_comment', condition: proc { |top| top == '#' } },
{ token_kind: 'OP_ASGN', lexer: 'lex_op_assign', condition: proc { |_top| @input_iterator.op_assign_peek? } },
{ token_kind: 'SYMBOL', lexer: 'lex_symbol', condition: proc { |top| symbol?(top) } },
{ token_kind: 'IDENTIFIER', lexer: 'lex_identifier', condition: proc { |top|
letter?(top) || underscore?(top)
} },
{ token_kind: 'STRING', lexer: 'lex_string', condition: proc { |top| string_start?(top) } },
{ token_kind: 'NUMBER', lexer: 'lex_number', condition: proc { |top| number?(top) } }
]
end

def lex_next
@cur = @raw_string[@cur_index]
@next_cur = @raw_string[@cur_index + 1]
@word = ''
reset_lex_iteration_state

lexers.each do |lexer|
next unless lexer[:condition].call(@input_iterator.peek)

lexers.each do |bar|
if bar[:condition].call
send(bar[:lexer])
return true
end
send(lexer[:lexer])
@tokens << { kind: lexer[:token_kind], value: @word }
return true
end

@cur_index += 1
# if we get here, we didn't lex anything, i.e. unrecognized character pattern
@input_iterator.iterate
end

### lexers ###

def lex_symbol
if !@next_cur.nil? && complex_symbol?
@tokens << { kind: 'SYMBOL', value: @cur + @next_cur }
@cur_index += 2
else
@tokens << { kind: 'SYMBOL', value: @cur }
@cur_index += 1
end
end
@word = @input_iterator.peek
@input_iterator.iterate

def lex_equals_sign_prefix
if !@next_cur.nil? && @next_cur == '~'
@cur_index += 2
@word = '=~'
else
continue_until_stop { @cur == '=' }
end
return unless complex_symbol?(@word, @input_iterator.peek)

@tokens << { kind: 'SYMBOL', value: @word }
@word += @input_iterator.peek
@input_iterator.iterate
end

def lex_op_assign
end_index = @raw_string[@cur_index..].index('=') + @cur_index
@tokens << { kind: 'OP_ASGN', value: @raw_string[@cur_index..end_index] }
@cur_index = end_index + 1
continue_until_stop(after: 1) { @input_iterator.peek != '=' }
end

def lex_string
raise LexError, 'String not terminated' unless @raw_string[@cur_index + 1..].include?(@cur)

append_word_and_iterate
continue_until_stop { !string_start?(@cur) }
append_word_and_iterate
unless @input_iterator.rest_includes?(@input_iterator.peek)
raise LexError,
'String not terminated'
end

@tokens << { kind: 'STRING', value: @word }
continue_until_stop(before: 1, after: 1) { !string_start?(@input_iterator.peek) }
end

def lex_number
continue_until_stop { number?(@cur) }
continue_until_stop { number?(@input_iterator.peek) }

# presence of '.' means it is a decimal
lex_decimal if @cur == '.'

@tokens << { kind: 'NUMBER', value: @word }
end
return unless @input_iterator.peek == '.'

def lex_decimal
append_word_and_iterate
continue_until_stop { number?(@cur) }
continue_until_stop(before: 1) do
number?(@input_iterator.peek)
end
end

def lex_identifier
@tokens << { kind: 'IDENTIFIER', value: continue_until_stop { alpha_num?(@cur) } }
continue_until_stop { alpha_num?(@input_iterator.peek) }
end

def lex_comment
@tokens << { kind: 'COMMENT', value: continue_until_stop { @cur != "\n" } }
continue_until_stop { @input_iterator.peek != "\n" }
end

### Helpers ###

def op_assign?
last_space_index = @raw_string[@cur_index..].index(' ')
end_index = last_space_index.nil? ? @raw_string.size : last_space_index + @cur_index

contains_equal_sign?(@raw_string[@cur_index..end_index]) &&
op_assign_symbol?(@cur) && ((end_index - @cur_index) < 4)
def append_word_and_iterate
@word += @input_iterator.peek
@input_iterator.iterate
end

def complex_symbol?
%w(+@ -@ []).include?(@cur + @next_cur) || (double_symbol?(@next_cur) && @cur == @next_cur)
end
def continue_until_stop(before: 0, after: 0)
before.times { append_word_and_iterate }

def append_word_and_iterate
@word += @cur
@cur_index += 1
@cur = @raw_string[@cur_index]
append_word_and_iterate while @input_iterator.not_finished? && yield

after.times { append_word_and_iterate }

@word
end

def continue_until_stop
append_word_and_iterate while @cur_index < @raw_string.size && yield

@word
def reset_lex_iteration_state
@word = ''
end
end
end
43 changes: 43 additions & 0 deletions src/zodiac/string_character_iterator.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# frozen_string_literal: true

# Raw string character iterator for the Zodiac language compiler.
# Raw string character iterator for the Zodiac language compiler.
#
# Exposes a one-character look-ahead (+peek+) over the source text together
# with a few queries about the text that has not been consumed yet.
#
# Consumption is tracked with an integer cursor instead of re-slicing the
# string on every step, so advancing is O(1) and the string passed to the
# constructor is never copied or mutated. Every query is safe to call once
# the input is exhausted.
class StringCharacterIterator
  include ::Zodiac::CharacterHelpers

  # +op_assign_peek?+ only answers true when the first '=' lies fewer than
  # this many characters after the cursor.
  OP_ASSIGN_EQUALS_WINDOW = 4

  # @param raw_string [String] the text to iterate over
  def initialize(raw_string)
    @raw_string = raw_string
    @position = 0
  end

  # @return [Boolean] true while at least one character is left to consume
  def not_finished?
    @position < @raw_string.size
  end

  # Consumes the current character. Does nothing once the input is
  # exhausted, so later calls to +peek+ / +not_finished?+ stay valid.
  #
  # @return [nil] the return value carries no information
  def iterate
    @position += 1 if not_finished?
    nil
  end

  # @return [String, nil] the current character, or nil when exhausted
  def peek
    @raw_string[@position]
  end

  # Tells whether +value+ occurs anywhere after the current character (the
  # current character itself is excluded from the search).
  #
  # @param value [String] substring to look for
  # @return [Boolean] false when nothing is left to search
  def rest_includes?(value)
    !@raw_string.index(value, @position + 1).nil?
  end

  # Tells whether the current word contains an equals sign. The word runs
  # from the cursor up to and including the next space found after the
  # current character, or to the end of the input when there is no space.
  #
  # NOTE(review): the argument is ignored; the check is delegated to
  # +contains_equal_sign?+ (expected to come from Zodiac::CharacterHelpers,
  # not visible here) regardless of what is passed. Confirm whether the
  # parameter should be honoured.
  #
  # @param _value [String] unused
  # @return [Boolean] whatever +contains_equal_sign?+ returns for the word
  def current_word_includes?(_value)
    word_end = @raw_string.index(' ', @position + 1) || @raw_string.size

    contains_equal_sign?(@raw_string[@position..word_end])
  end

  # Distance from the cursor to the first occurrence of +value+ in the
  # unconsumed text.
  #
  # @param value [String] substring to look for
  # @return [Integer, nil] offset relative to the cursor, nil when absent
  def char_until(value)
    found_at = @raw_string.index(value, @position)
    found_at && (found_at - @position)
  end

  # Tells whether the unconsumed text starts with something that looks like
  # an operator-assignment: the current character passes
  # +op_assign_symbol?+, the current word contains an equals sign, and the
  # first '=' is within OP_ASSIGN_EQUALS_WINDOW characters of the cursor.
  #
  # @return [Boolean] always false once the input is exhausted
  def op_assign_peek?
    # Guard first: op_assign_symbol?(nil) would raise on exhausted input.
    return false unless not_finished? && op_assign_symbol?(peek)
    return false unless current_word_includes?('=')

    char_until('=') < OP_ASSIGN_EQUALS_WINDOW
  end
end

0 comments on commit 8bd1d98

Please sign in to comment.