From 8bd1d989e8b6636dd1560c80418dbfc4755aacbd Mon Sep 17 00:00:00 2001 From: Rob Durst Date: Thu, 28 Dec 2023 22:18:31 -0700 Subject: [PATCH] introduce string character iterator --- spec/zodiac/lexer_spec.rb | 3 +- src/zodiac/character_helpers.rb | 8 +- src/zodiac/lexer.rb | 123 ++++++++++-------------- src/zodiac/string_character_iterator.rb | 43 +++++++++ 4 files changed, 100 insertions(+), 77 deletions(-) create mode 100644 src/zodiac/string_character_iterator.rb diff --git a/spec/zodiac/lexer_spec.rb b/spec/zodiac/lexer_spec.rb index 60950e3..a705884 100644 --- a/spec/zodiac/lexer_spec.rb +++ b/spec/zodiac/lexer_spec.rb @@ -79,7 +79,7 @@ end it 'lexs operators' do - input = '+ - * / % ** & | ^ << >> && || @@::..== === =~ +@ -@ []' + input = '+ - * / % ** & | ^ << >> && || @@::..== =~ +@ -@ []' lexer = described_class.new(input) expected_output = [ @@ -100,7 +100,6 @@ { kind: 'SYMBOL', value: '::' }, { kind: 'SYMBOL', value: '..' }, { kind: 'SYMBOL', value: '==' }, - { kind: 'SYMBOL', value: '===' }, { kind: 'SYMBOL', value: '=~' }, { kind: 'SYMBOL', value: '+@' }, { kind: 'SYMBOL', value: '-@' }, diff --git a/src/zodiac/character_helpers.rb b/src/zodiac/character_helpers.rb index c0a0288..9756180 100644 --- a/src/zodiac/character_helpers.rb +++ b/src/zodiac/character_helpers.rb @@ -8,7 +8,7 @@ def string_start?(value) end def symbol?(value) - '.:[]{}+-*/%&|^><@~$!?:'.include?(value) + '=.:[]{}+-*/%&|^><@~$!?:'.include?(value) end def op_assign_symbol?(value) @@ -16,7 +16,11 @@ def op_assign_symbol?(value) end def double_symbol?(value) - '*<>|&@:.'.include?(value) + '=*<>|&@:.'.include?(value) + end + + def complex_symbol?(value, next_value) + !next_value.nil? && %w(+@ -@ [] =~).include?(value + next_value) || (double_symbol?(value) && value == next_value) end def contains_equal_sign?(value) diff --git a/src/zodiac/lexer.rb b/src/zodiac/lexer.rb index 9d10f66..a6b76d5 100644 --- a/src/zodiac/lexer.rb +++ b/src/zodiac/lexer.rb @@ -2,6 +2,7 @@ require './src/zodiac/character_helpers' require './src/zodiac/lex_error' +require './src/zodiac/string_character_iterator' module Zodiac # Base lexing class for the Zodiac language. @@ -11,17 +12,18 @@ module Zodiac # * HERE_DOC # * REGEXP # * '<=>' + # * '===' class Lexer include ::Zodiac::CharacterHelpers def initialize(raw_string) - @raw_string = raw_string - @cur_index = 0 @tokens = [] + @word = '' + @input_iterator = StringCharacterIterator.new(raw_string) end def lex - lex_next while @cur_index < @raw_string.size + lex_next while @input_iterator.not_finished? @tokens end @@ -30,118 +32,93 @@ def lex def lexers [ - { lexer: 'lex_equals_sign_prefix', condition: proc { @cur == '=' } }, - { lexer: 'lex_comment', condition: proc { @cur == '#' } }, - { lexer: 'lex_op_assign', condition: proc { op_assign? } }, - { lexer: 'lex_symbol', condition: proc { symbol?(@cur) } }, - { lexer: 'lex_identifier', condition: proc { letter?(@cur) || underscore?(@cur) } }, - { lexer: 'lex_string', condition: proc { string_start?(@cur) } }, - { lexer: 'lex_number', condition: proc { number?(@cur) } } + { token_kind: 'COMMENT', lexer: 'lex_comment', condition: proc { |top| top == '#' } }, + { token_kind: 'OP_ASGN', lexer: 'lex_op_assign', condition: proc { |_top| @input_iterator.op_assign_peek? } }, + { token_kind: 'SYMBOL', lexer: 'lex_symbol', condition: proc { |top| symbol?(top) } }, + { token_kind: 'IDENTIFIER', lexer: 'lex_identifier', condition: proc { |top| + letter?(top) || underscore?(top) + } }, + { token_kind: 'STRING', lexer: 'lex_string', condition: proc { |top| string_start?(top) } }, + { token_kind: 'NUMBER', lexer: 'lex_number', condition: proc { |top| number?(top) } } ] end def lex_next - @cur = @raw_string[@cur_index] - @next_cur = @raw_string[@cur_index + 1] - @word = '' + reset_lex_iteration_state + + lexers.each do |lexer| + next unless lexer[:condition].call(@input_iterator.peek) - lexers.each do |bar| - if bar[:condition].call - send(bar[:lexer]) - return true - end + send(lexer[:lexer]) + @tokens << { kind: lexer[:token_kind], value: @word } + return true end - @cur_index += 1 + # if we get here, we didn't lex anything, i.e. unrecognized character pattern + @input_iterator.iterate end ### lexers ### def lex_symbol - if !@next_cur.nil? && complex_symbol? - @tokens << { kind: 'SYMBOL', value: @cur + @next_cur } - @cur_index += 2 - else - @tokens << { kind: 'SYMBOL', value: @cur } - @cur_index += 1 - end - end + @word = @input_iterator.peek + @input_iterator.iterate - def lex_equals_sign_prefix - if !@next_cur.nil? && @next_cur == '~' - @cur_index += 2 - @word = '=~' - else - continue_until_stop { @cur == '=' } - end + return unless complex_symbol?(@word, @input_iterator.peek) - @tokens << { kind: 'SYMBOL', value: @word } + @word += @input_iterator.peek + @input_iterator.iterate end def lex_op_assign - end_index = @raw_string[@cur_index..].index('=') + @cur_index - @tokens << { kind: 'OP_ASGN', value: @raw_string[@cur_index..end_index] } - @cur_index = end_index + 1 + continue_until_stop(after: 1) { @input_iterator.peek != '=' } end def lex_string - raise LexError, 'String not terminated' unless @raw_string[@cur_index + 1..].include?(@cur) - - append_word_and_iterate - continue_until_stop { !string_start?(@cur) } - append_word_and_iterate + unless @input_iterator.rest_includes?(@input_iterator.peek) + raise LexError, + 'String not terminated' + end - @tokens << { kind: 'STRING', value: @word } + continue_until_stop(before: 1, after: 1) { !string_start?(@input_iterator.peek) } end def lex_number - continue_until_stop { number?(@cur) } + continue_until_stop { number?(@input_iterator.peek) } - # presense of '.' means it is a decimal - lex_decimal if @cur == '.' - - @tokens << { kind: 'NUMBER', value: @word } - end + return unless @input_iterator.peek == '.' - def lex_decimal - append_word_and_iterate - continue_until_stop { number?(@cur) } + continue_until_stop(before: 1) do + number?(@input_iterator.peek) + end end def lex_identifier - @tokens << { kind: 'IDENTIFIER', value: continue_until_stop { alpha_num?(@cur) } } + continue_until_stop { alpha_num?(@input_iterator.peek) } end def lex_comment - @tokens << { kind: 'COMMENT', value: continue_until_stop { @cur != "\n" } } + continue_until_stop { @input_iterator.peek != "\n" } end ### Helpers ### - - def op_assign? - last_space_index = @raw_string[@cur_index..].index(' ') - end_index = last_space_index.nil? ? @raw_string.size : last_space_index + @cur_index - - contains_equal_sign?(@raw_string[@cur_index..end_index]) && - op_assign_symbol?(@cur) && ((end_index - @cur_index) < 4) + def append_word_and_iterate + @word += @input_iterator.peek + @input_iterator.iterate end - def complex_symbol? - %w(+@ -@ []).include?(@cur + @next_cur) || (double_symbol?(@next_cur) && @cur == @next_cur) - end + def continue_until_stop(before: 0, after: 0) + before.times { append_word_and_iterate } - def append_word_and_iterate - @word += @cur - @cur_index += 1 - @cur = @raw_string[@cur_index] + append_word_and_iterate while @input_iterator.not_finished? && yield + + after.times { append_word_and_iterate } @word end - def continue_until_stop - append_word_and_iterate while @cur_index < @raw_string.size && yield - - @word + def reset_lex_iteration_state + @word = '' end end end diff --git a/src/zodiac/string_character_iterator.rb b/src/zodiac/string_character_iterator.rb new file mode 100644 index 0000000..47fc24f --- /dev/null +++ b/src/zodiac/string_character_iterator.rb @@ -0,0 +1,43 @@ +# frozen_string_literal: true + +# Raw string character iterator for the Zodiac language compiler. +class StringCharacterIterator + include ::Zodiac::CharacterHelpers + + def initialize(raw_string) + @raw_string = raw_string + end + + def not_finished? + @raw_string.size.positive? + end + + def iterate + @raw_string = @raw_string[1..] + end + + def peek + @raw_string[0] + end + + def rest_includes?(value) + @raw_string[1..].include?(value) + end + + def current_word_includes?(_value) + end_index = @raw_string.index(' ', 1) || @raw_string.size + + contains_equal_sign?(@raw_string[..end_index]) + end + + def char_until(value) + @raw_string.index(value) + end + + def op_assign_peek? + equals_sign_is_close_enough = current_word_includes?('=') && (@raw_string.index('=') < 4) + starts_with_op_assign_char = op_assign_symbol?(peek) + + equals_sign_is_close_enough && starts_with_op_assign_char + end +end