From 90f6ca3fc5045764078cd10ab7d32e636b79ab3f Mon Sep 17 00:00:00 2001
From: Adam Christiansen
Date: Wed, 2 Apr 2025 11:47:35 -0600
Subject: [PATCH 1/3] feat: add indentation to brace tokens

Automatically inserted `{` and `}` track the indentation (i.e., column
number) of the context that they delimit.
---
 fuyu-core/src/parse/lexer.rs |  8 ++++----
 fuyu-core/src/parse/token.rs | 12 ++++++++++--
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/fuyu-core/src/parse/lexer.rs b/fuyu-core/src/parse/lexer.rs
index 2b9865d..e04758f 100644
--- a/fuyu-core/src/parse/lexer.rs
+++ b/fuyu-core/src/parse/lexer.rs
@@ -276,10 +276,10 @@ impl<'a> Lexer<'a> {
             [Some('['), ..] => self.advance_by_and_emit(1, Token::LeftSquare),
             [Some('\\'), ..] => self.advance_by_and_emit(1, Token::BackSlash),
             [Some(']'), ..] => self.advance_by_and_emit(1, Token::RightSquare),
-            [Some('{'), ..] => self.advance_by_and_emit(1, Token::LeftBrace),
+            [Some('{'), ..] => self.advance_by_and_emit(1, Token::LeftBrace(None)),
             [Some('|'), Some('|'), ..] => self.advance_by_and_emit(2, Token::PipePipe),
             [Some('|'), ..] => self.advance_by_and_emit(1, Token::Pipe),
-            [Some('}'), ..] => self.advance_by_and_emit(1, Token::RightBrace),
+            [Some('}'), ..] => self.advance_by_and_emit(1, Token::RightBrace(None)),
             //-------------------------------------------------------------------------------------
             // Numbers.
             //-------------------------------------------------------------------------------------
@@ -720,10 +720,10 @@ mod tests {
 
     #[test]
     fn scan_operators_and_punctuation() {
-        scan!("{", ok: Token::LeftBrace);
+        scan!("{", ok: Token::LeftBrace(None));
         scan!("[", ok: Token::LeftSquare);
         scan!("(", ok: Token::LeftParen);
-        scan!("}", ok: Token::RightBrace);
+        scan!("}", ok: Token::RightBrace(None));
         scan!("]", ok: Token::RightSquare);
         scan!(")", ok: Token::RightParen);
         scan!("&[", ok: Token::AmpLeftSquare);
diff --git a/fuyu-core/src/parse/token.rs b/fuyu-core/src/parse/token.rs
index 6b1e272..cc4a6cd 100644
--- a/fuyu-core/src/parse/token.rs
+++ b/fuyu-core/src/parse/token.rs
@@ -84,13 +84,21 @@ pub enum Token {
     /// The content is not checked for validity.
     RawString,
     /// `{`.
-    LeftBrace,
+    ///
+    /// The associated value is `None` if the brace appears in the source. When the brace is
+    /// inserted by the lexer, the value is `Some` with the 1-indexed indentation of the column
+    /// where the brace is inserted.
+    LeftBrace(Option<usize>),
     /// `[`.
     LeftSquare,
     /// `(`.
     LeftParen,
     /// `}`.
-    RightBrace,
+    ///
+    /// The associated value is `None` if the brace appears in the source. When the brace is
+    /// inserted by the lexer, the value is `Some` with the 1-indexed indentation of the column
+    /// of the matching automatically inserted [`LeftBrace`][Token::LeftBrace].
+    RightBrace(Option<usize>),
     /// `]`.
     RightSquare,
     /// `)`.

From 2acafbd3166394cde5f3a85f1a017495668f1087 Mon Sep 17 00:00:00 2001
From: Adam Christiansen
Date: Thu, 3 Apr 2025 20:14:03 -0600
Subject: [PATCH 2/3] feat: add textchars iterator

A `TextChars` iterator was added for use in the lexer. It does the
following:

- Handles the byte order mark (BOM).
- Rejects invalid control characters (closes #74).
- Tracks the current column position.
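As an illustrative sketch of the intended behavior (not part of this patch;
`Text::from` stands in for whatever constructor `Text` actually provides):

    let text = Text::from("\u{feff}a\r\nb");
    let mut chars = TextChars::new(&text);
    assert_eq!(chars.next(), Some('a'));  // The BOM is skipped.
    assert_eq!(chars.next(), Some('\n')); // `\r\n` is collapsed into `\n`.
    assert_eq!(chars.next(), Some('b'));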
---
 fuyu-core/src/parse/lexer.rs | 319 +++++++++++++++++------------------
 fuyu-core/src/parse/text.rs  | 130 ++++++++++++++
 2 files changed, 282 insertions(+), 167 deletions(-)

diff --git a/fuyu-core/src/parse/lexer.rs b/fuyu-core/src/parse/lexer.rs
index e04758f..c74e398 100644
--- a/fuyu-core/src/parse/lexer.rs
+++ b/fuyu-core/src/parse/lexer.rs
@@ -1,33 +1,27 @@
 // SPDX-License-Identifier: MPL-2.0
 
-use crate::parse::{ByteIdx, Text, Token};
-use std::iter;
+use crate::parse::{ByteIdx, Text, TextChars, Token};
+use std::collections::VecDeque;
+use std::iter::FusedIterator;
 use std::str::Chars;
 
-/// The Unicode byte order mark (BOM).
-const BOM: char = '\u{feff}';
-
-/// The lexer looks ahead by 3 tokens.
-const LOOKAHEAD: usize = 3;
-
 /// The `Lexer` produces an iterator of [`Spanned`] [`Token`]s from a source [`Text`].
 #[derive(Debug)]
 pub struct Lexer<'a> {
     /// The source text.
     text: &'a Text,
     /// An iterator over the characters in the source text.
-    chars: Chars<'a>,
-    /// The lexer works based on a window of [`LOOKAHEAD`] characters.
-    ///
-    /// The first character in the window (i.e., `window[0]`) is considered the next character in the source.
-    window: [Option<char>; LOOKAHEAD],
-    /// The current position of the iterator.
-    idx: ByteIdx,
+    text_chars: TextChars<'a>,
     /// The marked start index (inclusive) that tracks the start of a marked region (e.g., token in
     /// a lexer).
     ///
-    /// It is guaranteed that `idx >= mark_idx`.
+    /// It is guaranteed that `self.text_chars.idx() >= self.mark_idx`.
     mark_idx: ByteIdx,
+    // TODO: Use this to (1) track indentation and (2) emit `{`, `}`, and `;`.
+    /// The indentations tracked by the lexer.
+    indent_stack: Vec<usize>,
+    /// Queued tokens to emit from the iterator the next time it is advanced.
+    queued: VecDeque<Spanned>,
 }
 
 /// The values that are produced by the [`Lexer`].
@@ -95,7 +89,7 @@ macro_rules! scan_int {
             return $lexer.emit_error(LexicalError::Number);
         }
         $lexer.advance_while(pat!($pattern | '_'));
-        if !$lexer.window[0].map_or(false, is_text_like) {
+        if !$lexer.peek().map_or(false, is_text_like) {
             $lexer.emit($kind)
         } else {
             $lexer.advance_while(is_text_like);
@@ -109,40 +103,58 @@ impl<'a> Lexer<'a> {
     pub fn new(text: &'a Text) -> Self {
         let mut lexer = Self {
            text,
-            chars: text.as_str().chars(),
-            window: [None; LOOKAHEAD],
-            idx: 0,
+            text_chars: TextChars::new(text),
             mark_idx: 0,
+            indent_stack: vec![],
+            queued: VecDeque::new(),
         };
-        // Fill the window.
-        for i in 0..LOOKAHEAD {
-            lexer.window[i] = lexer.chars.next();
-        }
-        // The BOM is skipped so nothing else has to deal with it.
-        lexer.advance_if(pat!(BOM));
+        // The index might not start at 0.
        lexer.mark();
+        // Queue up the shebang comment here instead of the main loop since it can appear only at the start.
+        if lexer.peek_pair() == (Some('#'), Some('!')) {
+            lexer.advance_until(pat!('\n'));
+            lexer.emit_queue(Token::ShebangComment);
+        }
         lexer
     }
 
+    /// Shorthand for accessing the index.
+    fn idx(&self) -> usize {
+        self.text_chars.idx()
+    }
+
     /// Set the mark index to the current index.
     fn mark(&mut self) {
-        self.mark_idx = self.idx;
+        self.mark_idx = self.idx();
     }
 
     /// Get the string from the mark index (inclusive) to the current index (exclusive).
     fn as_marked_str(&self) -> &str {
-        &self.text.as_str()[self.mark_idx..self.idx]
+        &self.text.as_str()[self.mark_idx..self.idx()]
+    }
+
+    /// Peek the next character without advancing.
+    fn peek(&mut self) -> Option<char> {
+        self.text_chars.peek()
+    }
+
+    /// Peek the next two characters without advancing.
+    fn peek_pair(&mut self) -> (Option<char>, Option<char>) {
+        (self.text_chars.peek_nth(0), self.text_chars.peek_nth(1))
+    }
+
+    /// Peek the next three characters without advancing.
+    fn peek_triplet(&mut self) -> (Option<char>, Option<char>, Option<char>) {
+        (
+            self.text_chars.peek_nth(0),
+            self.text_chars.peek_nth(1),
+            self.text_chars.peek_nth(2),
+        )
     }
 
     /// Move the lexer ahead by one character.
     fn advance(&mut self) {
-        // Update the position.
-        if let Some(c) = self.window[0] {
-            self.idx += c.len_utf8();
-        }
-        // Shift the next character into the window.
-        self.window.rotate_left(1);
-        self.window[LOOKAHEAD - 1] = self.chars.next();
+        self.text_chars.next();
     }
 
     /// Call [`Self::advance()`] `n` times.
@@ -162,7 +174,7 @@
     where
         P: Fn(char) -> bool,
     {
-        self.window[0]
+        self.peek()
             .is_some_and(pred)
             .then(|| self.advance())
             .is_some()
@@ -196,7 +208,13 @@
 
     /// Create a token that spans from the marked start index to the current index.
     fn emit(&mut self, kind: Token) -> Option<Spanned> {
-        Some(Ok((self.mark_idx, kind, self.idx)))
+        Some(Ok((self.mark_idx, kind, self.idx())))
+    }
+
+    /// Create a token that spans from the marked start index to the current index and push it on
+    /// to the queue.
+    fn emit_queue(&mut self, kind: Token) {
+        self.queued.push_back(Ok((self.mark_idx, kind, self.idx())));
     }
 
     /// Advance the lexer by `n` characters and then [`Self::emit()`] a token of the given `kind`.
@@ -207,18 +225,7 @@
 
     /// Create an error that spans from the marked start index to the current index.
     fn emit_error(&mut self, error: LexicalError) -> Option<Spanned> {
-        Some(Err((self.mark_idx, error, self.idx)))
-    }
-
-    /// Check if a [`Token::ShebangComment`] is allowed at the current location.
-    fn shebang_comment_location(&self) -> bool {
-        // Check if the current location is at the start of the text.
-        if self.idx == 0 {
-            return true;
-        }
-        // Check if the current location is at the second character in the text and the first
-        // character was a BOM.
-        self.idx == BOM.len_utf8() && self.text.as_str().starts_with('\u{feff}')
+        Some(Err((self.mark_idx, error, self.idx())))
     }
 
     /// Produce the next token and advance the lexer.
@@ -226,73 +233,72 @@
     /// This returns `None` at the end of the source text, and once `None` is returned, it will
     /// always return `None` on subsequent calls.
     fn scan(&mut self) -> Option<Spanned> {
+        if !self.queued.is_empty() {
+            return self.queued.pop_front();
+        }
         self.advance_while(char::is_whitespace);
         self.mark();
-        match self.window {
+        match self.peek_triplet() {
             //-------------------------------------------------------------------------------------
             // Comments.
             //-------------------------------------------------------------------------------------
-            [Some('#'), Some('!'), ..] if self.shebang_comment_location() => {
-                self.advance_until(pat!('\n'));
-                self.emit(Token::ShebangComment)
-            }
-            [Some('/'), Some('/'), Some('/'), ..] => {
+            (Some('/'), Some('/'), Some('/'), ..) => {
                 self.advance_until(pat!('\n'));
                 self.emit(Token::DocComment)
             }
-            [Some('/'), Some('/'), ..] => {
+            (Some('/'), Some('/'), ..) => {
                 self.advance_until(pat!('\n'));
                 self.emit(Token::LineComment)
             }
             //-------------------------------------------------------------------------------------
             // Operators and punctuation.
             //-------------------------------------------------------------------------------------
-            [Some('#'), Some('['), ..] => self.advance_by_and_emit(2, Token::HashLeftSquare),
-            [Some('#'), ..] => self.advance_by_and_emit(1, Token::Hash),
-            [Some('%'), ..]
=> self.advance_by_and_emit(1, Token::Percent),
-            [Some('&'), Some('&'), ..] => self.advance_by_and_emit(2, Token::AmpAmp),
-            [Some('&'), Some('['), ..] => self.advance_by_and_emit(2, Token::AmpLeftSquare),
-            [Some('('), ..] => self.advance_by_and_emit(1, Token::LeftParen),
-            [Some(')'), ..] => self.advance_by_and_emit(1, Token::RightParen),
-            [Some('*'), Some('*'), ..] => self.advance_by_and_emit(2, Token::StarStar),
-            [Some('*'), ..] => self.advance_by_and_emit(1, Token::Star),
-            [Some('+'), ..] => self.advance_by_and_emit(1, Token::Plus),
-            [Some(','), ..] => self.advance_by_and_emit(1, Token::Comma),
-            [Some('-'), Some('>'), ..] => self.advance_by_and_emit(2, Token::MinusGt),
-            [Some('-'), ..] => self.advance_by_and_emit(1, Token::Minus),
-            [Some('.'), Some('.'), ..] => self.advance_by_and_emit(2, Token::DotDot),
-            [Some('.'), ..] => self.advance_by_and_emit(1, Token::Dot),
-            [Some('/'), Some('='), ..] => self.advance_by_and_emit(2, Token::SlashEq),
-            [Some('/'), ..] => self.advance_by_and_emit(1, Token::Slash),
-            [Some(':'), ..] => self.advance_by_and_emit(1, Token::Colon),
-            [Some(';'), ..] => self.advance_by_and_emit(1, Token::Semicolon),
-            [Some('<'), Some('='), ..] => self.advance_by_and_emit(2, Token::LtEq),
-            [Some('<'), ..] => self.advance_by_and_emit(1, Token::Lt),
-            [Some('='), Some('='), ..] => self.advance_by_and_emit(2, Token::EqEq),
-            [Some('='), ..] => self.advance_by_and_emit(1, Token::Eq),
-            [Some('>'), Some('='), ..] => self.advance_by_and_emit(2, Token::GtEq),
-            [Some('>'), ..] => self.advance_by_and_emit(1, Token::Gt),
-            [Some('@'), ..] => self.advance_by_and_emit(1, Token::At),
-            [Some('['), ..] => self.advance_by_and_emit(1, Token::LeftSquare),
-            [Some('\\'), ..] => self.advance_by_and_emit(1, Token::BackSlash),
-            [Some(']'), ..] => self.advance_by_and_emit(1, Token::RightSquare),
-            [Some('{'), ..] => self.advance_by_and_emit(1, Token::LeftBrace(None)),
-            [Some('|'), Some('|'), ..] => self.advance_by_and_emit(2, Token::PipePipe),
-            [Some('|'), ..] => self.advance_by_and_emit(1, Token::Pipe),
-            [Some('}'), ..] => self.advance_by_and_emit(1, Token::RightBrace(None)),
+            (Some('#'), Some('['), ..) => self.advance_by_and_emit(2, Token::HashLeftSquare),
+            (Some('#'), ..) => self.advance_by_and_emit(1, Token::Hash),
+            (Some('%'), ..) => self.advance_by_and_emit(1, Token::Percent),
+            (Some('&'), Some('&'), ..) => self.advance_by_and_emit(2, Token::AmpAmp),
+            (Some('&'), Some('['), ..) => self.advance_by_and_emit(2, Token::AmpLeftSquare),
+            (Some('('), ..) => self.advance_by_and_emit(1, Token::LeftParen),
+            (Some(')'), ..) => self.advance_by_and_emit(1, Token::RightParen),
+            (Some('*'), Some('*'), ..) => self.advance_by_and_emit(2, Token::StarStar),
+            (Some('*'), ..) => self.advance_by_and_emit(1, Token::Star),
+            (Some('+'), ..) => self.advance_by_and_emit(1, Token::Plus),
+            (Some(','), ..) => self.advance_by_and_emit(1, Token::Comma),
+            (Some('-'), Some('>'), ..) => self.advance_by_and_emit(2, Token::MinusGt),
+            (Some('-'), ..) => self.advance_by_and_emit(1, Token::Minus),
+            (Some('.'), Some('.'), ..) => self.advance_by_and_emit(2, Token::DotDot),
+            (Some('.'), ..) => self.advance_by_and_emit(1, Token::Dot),
+            (Some('/'), Some('='), ..) => self.advance_by_and_emit(2, Token::SlashEq),
+            (Some('/'), ..) => self.advance_by_and_emit(1, Token::Slash),
+            (Some(':'), ..) => self.advance_by_and_emit(1, Token::Colon),
+            (Some(';'), ..) => self.advance_by_and_emit(1, Token::Semicolon),
+            (Some('<'), Some('='), ..) => self.advance_by_and_emit(2, Token::LtEq),
+            (Some('<'), ..)
=> self.advance_by_and_emit(1, Token::Lt),
+            (Some('='), Some('='), ..) => self.advance_by_and_emit(2, Token::EqEq),
+            (Some('='), ..) => self.advance_by_and_emit(1, Token::Eq),
+            (Some('>'), Some('='), ..) => self.advance_by_and_emit(2, Token::GtEq),
+            (Some('>'), ..) => self.advance_by_and_emit(1, Token::Gt),
+            (Some('@'), ..) => self.advance_by_and_emit(1, Token::At),
+            (Some('['), ..) => self.advance_by_and_emit(1, Token::LeftSquare),
+            (Some('\\'), ..) => self.advance_by_and_emit(1, Token::BackSlash),
+            (Some(']'), ..) => self.advance_by_and_emit(1, Token::RightSquare),
+            (Some('{'), ..) => self.advance_by_and_emit(1, Token::LeftBrace(None)),
+            (Some('|'), Some('|'), ..) => self.advance_by_and_emit(2, Token::PipePipe),
+            (Some('|'), ..) => self.advance_by_and_emit(1, Token::Pipe),
+            (Some('}'), ..) => self.advance_by_and_emit(1, Token::RightBrace(None)),
             //-------------------------------------------------------------------------------------
             // Numbers.
             //-------------------------------------------------------------------------------------
-            [Some('0'), Some('b'), ..] => scan_int!(self, '0'..='1', Token::BinInt),
-            [Some('0'), Some('o'), ..] => scan_int!(self, '0'..='7', Token::OctInt),
-            [Some('0'), Some('x'), ..] => {
+            (Some('0'), Some('b'), ..) => scan_int!(self, '0'..='1', Token::BinInt),
+            (Some('0'), Some('o'), ..) => scan_int!(self, '0'..='7', Token::OctInt),
+            (Some('0'), Some('x'), ..) => {
                 scan_int!(self, '0'..='9' | 'A'..='F' | 'a'..='f', Token::HexInt)
             }
-            [Some('0'), Some('B' | 'O' | 'X'), ..] => {
+            (Some('0'), Some('B' | 'O' | 'X'), ..) => {
                 self.advance_while(is_text_like);
                 self.emit_error(LexicalError::BaseMarker)
             }
-            [Some('0'..='9'), ..] => {
+            (Some('0'..='9'), ..) => {
                 // Decimal number.
                 // Assume the number is an integer until proven otherwise.
                 let mut kind_or_error = Ok(Token::DecInt);
@@ -311,13 +317,13 @@
                 // least one character.
                 self.advance_while(pat!('0'..='9' | '_'));
                 // Phase 2: (\.\d[_\d]*)?
-                if matches!(self.window, [Some('.'), Some('0'..='9'), ..]) {
+                if matches!(self.peek_pair(), (Some('.'), Some('0'..='9'))) {
                     kind_or_error = Ok(Token::Float);
                     self.advance(); // Consume '.'.
                     self.advance_while(pat!('0'..='9' | '_'));
                 }
                 // Phase 3: (e[+-]?\d[_\d]*)?
-                if matches!(self.window, [Some('e'), ..]) {
+                if self.peek() == Some('e') {
                     kind_or_error = Ok(Token::Float);
                     self.advance(); // Consume `e`.
                     if self.advance_while(pat!('_')) > 0 {
@@ -329,7 +335,7 @@
                     }
                     self.advance_while(pat!('0'..='9' | '_'));
                 }
-                if self.window[0].is_some_and(is_text_like) {
+                if self.peek().is_some_and(is_text_like) {
                     kind_or_error = Err(LexicalError::Number);
                     self.advance_while(is_text_like);
                 }
@@ -341,15 +347,15 @@
             //-------------------------------------------------------------------------------------
             // Strings.
             //-------------------------------------------------------------------------------------
-            [Some('"'), Some('"'), Some('"'), ..] => {
+            (Some('"'), Some('"'), Some('"'), ..) => {
                 self.advance_by(3); // Consume `"""`.
                 self.scan_string(Token::BlockString, 3, 0, true)
             }
-            [Some('"'), ..] => {
+            (Some('"'), ..) => {
                 self.advance(); // Consume `"`.
                 self.scan_string(Token::String, 1, 0, true)
             }
-            [Some('r'), Some('"'), ..] | [Some('r'), Some('#'), Some('#' | '"'), ..] => {
+            (Some('r'), Some('"'), ..) | (Some('r'), Some('#'), Some('#' | '"'), ..) => {
                 self.advance(); // Consume `r`.
                let hashes = self.advance_while(pat!('#'));
                 if !self.advance_if(pat!('"')) {
@@ -361,7 +367,7 @@
             //-------------------------------------------------------------------------------------
             // Identifiers and keywords.
             //-------------------------------------------------------------------------------------
-            [Some('A'..='Z' | 'a'..='z' | '_'), ..] => {
+            (Some('A'..='Z' | 'a'..='z' | '_'), ..) => {
                 self.advance_while(|c| is_text_like(c) || c == '#');
                 match self.as_marked_str() {
                     // Keywords.
@@ -419,12 +425,12 @@
             // Other.
             //-------------------------------------------------------------------------------------
             // Illegal character to start a token.
-            [Some(_), ..] => {
+            (Some(_), ..) => {
                 self.advance();
                 self.emit_error(LexicalError::Char)
             }
             // All done.
-            [None, ..] => None,
+            (None, ..) => None,
         }
     }
 
@@ -445,20 +451,21 @@
         escapes: bool,
     ) -> Option<Spanned> {
         loop {
-            if matches!(self.window, [None, ..]) {
+            if self.peek().is_none() {
                 return self.emit_error(LexicalError::UnclosedString);
             }
             // Advance the lexer until it is sitting on a `"` that can start the closing delimiter.
             if escapes {
                 // Advance to the next string-like terminator or escape.
                 self.advance_until(pat!('"' | '\\'));
-                match self.window {
-                    [Some('"'), ..] => { /* Do nothing. */ }
-                    [Some('\\'), ..] => {
+                match self.peek() {
+                    Some('"') => { /* Do nothing. */ }
+                    Some('\\') => {
+                        // TODO: Need more advanced logic. What about `\u{...}` escapes?
                         self.advance_by(2); // Skip the escape (e.g., the sequence `\"`).
                         continue;
                     }
-                    [None, ..] => continue, // End of input.
+                    None => continue, // End of input.
                     _ => unreachable!(),
                 }
             } else {
@@ -487,7 +494,7 @@
         }
     }
 }
 
-impl iter::FusedIterator for Lexer<'_> {}
+impl FusedIterator for Lexer<'_> {}
 
 #[cfg(test)]
 mod tests {
@@ -524,34 +531,35 @@
 
     #[test]
     fn new() {
-        lexer!(lexer <- "abc");
+        lexer!(mut lexer <- "abc");
         assert_eq!(lexer.text.as_str(), "abc");
-        assert_eq!(lexer.window, [Some('a'), Some('b'), Some('c')]);
-        assert_eq!(lexer.idx, 0);
+        assert_eq!(lexer.peek(), Some('a'));
+        assert_eq!(lexer.idx(), 0);
         assert_eq!(lexer.mark_idx, 0);
         assert_eq!(lexer.text.as_str(), "abc");
     }
 
     #[test]
     fn new_with_bom() {
-        lexer!(lexer <- "\u{feff}abc");
+        // The byte order mark is U+FEFF.
+        lexer!(mut lexer <- "\u{feff}abc");
         assert_eq!(lexer.text.as_str(), "\u{feff}abc");
-        assert_eq!(lexer.window, [Some('a'), Some('b'), Some('c')]);
-        assert_eq!(lexer.idx, BOM.len_utf8());
-        assert_eq!(lexer.mark_idx, BOM.len_utf8());
+        assert_eq!(lexer.peek(), Some('a'));
+        assert_eq!(lexer.idx(), '\u{feff}'.len_utf8());
+        assert_eq!(lexer.mark_idx, '\u{feff}'.len_utf8());
         assert_eq!(lexer.text.as_str(), "\u{feff}abc");
     }
 
     #[test]
     fn mark() {
         lexer!(mut lexer <- "abcdef");
-        assert_eq!(lexer.idx, 0);
+        assert_eq!(lexer.idx(), 0);
         assert_eq!(lexer.mark_idx, 0);
         lexer.advance();
-        assert_eq!(lexer.idx, 1);
+        assert_eq!(lexer.idx(), 1);
         assert_eq!(lexer.mark_idx, 0);
         lexer.mark();
-        assert_eq!(lexer.idx, 1);
+        assert_eq!(lexer.idx(), 1);
         assert_eq!(lexer.mark_idx, 1);
     }
 
@@ -571,28 +579,28 @@
     #[test]
     fn advance() {
         lexer!(mut lexer <- "abc");
-        assert_eq!(lexer.idx, 0);
-        assert_eq!(lexer.window, [Some('a'), Some('b'), Some('c')]);
+        assert_eq!(lexer.idx(), 0);
+        assert_eq!(lexer.peek(), Some('a'));
         lexer.advance();
-        assert_eq!(lexer.idx, 1);
-        assert_eq!(lexer.window, [Some('b'), Some('c'), None]);
+        assert_eq!(lexer.idx(), 1);
+        assert_eq!(lexer.peek(), Some('b'));
         lexer.advance();
-        assert_eq!(lexer.idx, 2);
-        assert_eq!(lexer.window, [Some('c'), None, None]);
+        assert_eq!(lexer.idx(), 2);
+        assert_eq!(lexer.peek(), Some('c'));
         lexer.advance();
-        assert_eq!(lexer.idx, 3);
-        assert_eq!(lexer.window, [None, None, None]);
+        assert_eq!(lexer.idx(), 3);
+        assert_eq!(lexer.peek(), None);
         lexer.advance(); // Test idempotency.
-        assert_eq!(lexer.idx, 3);
-        assert_eq!(lexer.window, [None, None, None]);
+        assert_eq!(lexer.idx(), 3);
+        assert_eq!(lexer.peek(), None);
     }
 
     #[test]
     fn advance_by() {
         lexer!(mut lexer <- "abcdef");
         lexer.advance_by(3);
-        assert_eq!(lexer.idx, 3);
+        assert_eq!(lexer.idx(), 3);
     }
 
     #[test]
@@ -602,42 +610,42 @@
         // - U+001E8D (ẍ): Latin Extended Additional requires 3 bytes in UTF-8.
         // - U+01D50D (𝔍): Mathematical Alphanumeric Symbols require 4 bytes in UTF-8.
        lexer!(mut lexer <- "\u{0041}\u{003B2}\u{001E8D}\u{01D50D}");
-        assert_eq!(lexer.idx, 0);
+        assert_eq!(lexer.idx(), 0);
         lexer.advance();
-        assert_eq!(lexer.idx, 1);
+        assert_eq!(lexer.idx(), 1);
         lexer.advance();
-        assert_eq!(lexer.idx, 3);
+        assert_eq!(lexer.idx(), 3);
         lexer.advance();
-        assert_eq!(lexer.idx, 6);
+        assert_eq!(lexer.idx(), 6);
         lexer.advance();
-        assert_eq!(lexer.idx, 10);
+        assert_eq!(lexer.idx(), 10);
     }
 
     #[test]
     fn advance_if() {
         lexer!(mut lexer <- "abcdef");
         lexer.advance_if(|_| false);
-        assert_eq!(lexer.idx, 0);
+        assert_eq!(lexer.idx(), 0);
         lexer.advance_if(|_| true);
-        assert_eq!(lexer.idx, 1);
+        assert_eq!(lexer.idx(), 1);
     }
 
     #[test]
     fn advance_while() {
         lexer!(mut lexer <- "abcdef");
         lexer.advance_while(|_| false);
-        assert_eq!(lexer.idx, 0);
+        assert_eq!(lexer.idx(), 0);
         lexer.advance_while(pat!('a'..='c'));
-        assert_eq!(lexer.idx, 3);
+        assert_eq!(lexer.idx(), 3);
     }
 
     #[test]
     fn advance_until() {
         lexer!(mut lexer <- "abcdef");
         lexer.advance_until(|_| true);
-        assert_eq!(lexer.idx, 0);
+        assert_eq!(lexer.idx(), 0);
         lexer.advance_until(pat!('d'..='f'));
-        assert_eq!(lexer.idx, 3);
+        assert_eq!(lexer.idx(), 3);
     }
 
     #[test]
@@ -666,35 +674,12 @@
         assert_eq!(error, (0, LexicalError::Ident, 3));
     }
 
-    #[test]
-    fn shebang_comment_location() {
-        lexer!(mut lexer <- "#!shebang");
-        assert!(lexer.shebang_comment_location());
-        lexer.advance();
-        assert!(!lexer.shebang_comment_location());
-    }
-
-    #[test]
-    fn shebang_comment_location_with_bom() {
-        lexer!(mut lexer <- "\u{feff}#!shebang");
-        assert!(lexer.shebang_comment_location());
-        lexer.advance();
-        assert!(!lexer.shebang_comment_location());
-    }
-
     #[test]
     fn scan_shebang_comment() {
         // Valid.
         scan!("#!x", ok: Token::ShebangComment);
         scan!("#!x\na", Some(Ok((0, Token::ShebangComment, 3))));
-        scan!(
-            "\u{feff}#!x\na",
-            Some(Ok((
-                BOM.len_utf8(),
-                Token::ShebangComment,
-                BOM.len_utf8() + 3
-            )))
-        );
+        scan!("\u{feff}#!bom\na", Some(Ok((3, Token::ShebangComment, 8))));
         // Not a shebang comment.
         scan!(" #!x\na", Some(Ok((1, Token::Hash, 2))));
     }
@@ -992,7 +977,7 @@
 
     #[test]
     fn scan_illegal_char() {
-        // There are lots of illegal chars but only 2 are tested.
+        // There are lots of illegal characters, but only 2 are tested.
         scan!("$", err: LexicalError::Char);
         scan!("Γ€", err: LexicalError::Char);
     }
@@ -1000,7 +985,7 @@
     #[test]
     fn scan_eof() {
         scan!("", None);
-        scan!("\r\n \t", None);
+        scan!("\n \r\n ", None);
     }
 
     #[test]
diff --git a/fuyu-core/src/parse/text.rs b/fuyu-core/src/parse/text.rs
index 494136d..d046242 100644
--- a/fuyu-core/src/parse/text.rs
+++ b/fuyu-core/src/parse/text.rs
@@ -4,6 +4,9 @@
 //!
 //! [`Text`] is optimized for quick lookups of line and column position based on a [`ByteIdx`].
 
+use std::collections::VecDeque;
+use std::str::Chars;
+
 /// An index into a [`Text`] aligned at the byte level.
 ///
 /// Byte indices are used rather than `char` indices for performance.
@@ -119,6 +122,133 @@
 pub struct LineCol {
     /// The line in the text.
     pub line: usize,
     /// The column in the text.
     pub col: usize,
 }
 
+/// The Unicode byte order mark (BOM).
+const BOM: char = '\u{feff}';
+
+// TODO: Tests for this.
+/// An iterator over the characters of a [`Text`].
+///
+/// This iterator has some benefits that simplify processing:
+///
+/// - The byte order mark (BOM) (U+FEFF) is automatically skipped if it is the first character.
+/// - Sequences of `\r\n` are collapsed into `\n`. The column is the location of the `\r`.
+///
+/// # πŸ’€ Warning πŸ’€
+///
+/// The current implementation of this iterator does not play "nice".
It panics when any of the
+/// following are encountered:
+///
+/// - An `\r` not immediately followed by an `\n`.
+/// - Any `\t` (tabs are not allowed).
+/// - In general, any control character other than `\r` or `\n`.
+///
+/// These rules may be relaxed in the future, but for now are present to simplify error handling
+/// during development.
+#[derive(Debug)]
+pub(super) struct TextChars<'a> {
+    /// The underlying iterator.
+    chars: Chars<'a>,
+    /// The characters that were peaked.
+    peaked: VecDeque<char>,
+    /// The byte index of the next character (if any) to be emitted.
+    idx: ByteIdx,
+    /// The 0-indexed column of the next character (if any) to be emitted.
+    ///
+    /// Columns are counted in Unicode scalars.
+    column: usize,
+}
+
+impl<'a> TextChars<'a> {
+    /// Create a new [`TextChars`] iterator over a [`Text`].
+    pub fn new(text: &'a Text) -> Self {
+        let mut text_chars = Self {
+            chars: text.as_str().chars(),
+            peaked: VecDeque::new(),
+            idx: 0,
+            column: 0,
+        };
+        // Skip the BOM without counting it toward the column position.
+        if text_chars.peek() == Some(BOM) {
+            text_chars.next_char();
+            text_chars.idx += BOM.len_utf8();
+        }
+        text_chars
+    }
+
+    /// Get the next character in the text.
+    ///
+    /// This pulls from the queue (if possible) before advancing the underlying iterator.
+    fn next_char(&mut self) -> Option<char> {
+        if !self.peaked.is_empty() {
+            self.peaked.pop_front()
+        } else {
+            self.chars.next()
+        }
+    }
+
+    /// Peek the next character in the text without advancing the iterator.
+    pub fn peek(&mut self) -> Option<char> {
+        self.peek_nth(0)
+    }
+
+    /// Peek the next character `n` positions ahead in the text without advancing the iterator.
+    pub fn peek_nth(&mut self, n: usize) -> Option<char> {
+        // Make sure that the peak buffer has at least `n + 1` characters (unless the end of the
+        // iterator is encountered).
+        while n >= self.peaked.len() {
+            match self.chars.next() {
+                Some(c) => self.peaked.push_back(c),
+                None => return None,
+            }
+        }
+        // At this point it is guaranteed that `n < self.peaked.len()`.
+        Some(self.peaked[n])
+    }
+
+    /// The byte index of the next character (if any) to be emitted.
+    pub fn idx(&self) -> usize {
+        self.idx
+    }
+
+    /// The 0-indexed column of the next character (if any) to be emitted.
+    ///
+    /// Columns are counted in Unicode scalars.
+    pub fn column(&self) -> usize {
+        self.column
+    }
+}
+
+impl Iterator for TextChars<'_> {
+    type Item = char;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match self.next_char() {
+            // Tabs are specific enough to get their own special case.
+            Some('\t') => panic!("Tabs (U+0009) are illegal."),
+            // The only control characters that are allowed are `\r` and `\n`.
+            Some(c) if c.is_control() && c != '\r' && c != '\n' => {
+                panic!("Illegal control character: U+{:06X}.", c as u32)
+            }
+            // All `\r` must be followed by `\n`.
+            Some('\r') => {
+                if let Some('\n') = self.next_char() {
+                    // Treat this sequence as if it was just a `\n`.
+                    self.idx += '\r'.len_utf8() + '\n'.len_utf8();
+                    self.column = 0;
+                    Some('\n')
+                } else {
+                    panic!("Carriage return (U+000D) must be followed by linefeed (U+000A).")
+                }
+            }
+            Some(c) => {
+                self.idx += c.len_utf8();
+                self.column = if c == '\n' { 0 } else { self.column + 1 };
+                Some(c)
+            }
+            None => None,
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;

From f8ebb3a64ad191e90528e5ea57ba67613f40ea92 Mon Sep 17 00:00:00 2001
From: Adam Christiansen
Date: Tue, 8 Apr 2025 16:57:56 -0600
Subject: [PATCH 3/3] feat: add `;`, `{`, and `}` inference

---
 fuyu-core/src/parse/lexer.rs | 351 ++++++++++++++++++++++++++++++++---
 fuyu-core/src/parse/text.rs  |  40 ++--
 fuyu-core/src/parse/token.rs |   6 +-
 3 files changed, 351 insertions(+), 46 deletions(-)

diff --git a/fuyu-core/src/parse/lexer.rs b/fuyu-core/src/parse/lexer.rs
index c74e398..1062f54 100644
--- a/fuyu-core/src/parse/lexer.rs
+++ b/fuyu-core/src/parse/lexer.rs
@@ -3,6 +3,7 @@
 use crate::parse::{ByteIdx, Text, TextChars, Token};
 use std::collections::VecDeque;
 use std::iter::FusedIterator;
+use std::num::NonZeroUsize;
 use std::str::Chars;
 
 /// The `Lexer` produces an iterator of [`Spanned`] [`Token`]s from a source [`Text`].
 #[derive(Debug)]
 pub struct Lexer<'a> {
     /// The source text.
     text: &'a Text,
     /// An iterator over the characters in the source text.
     text_chars: TextChars<'a>,
     /// The marked start index (inclusive) that tracks the start of a marked region (e.g., token in
     /// a lexer).
     ///
     /// It is guaranteed that `self.text_chars.idx() >= self.mark_idx`.
     mark_idx: ByteIdx,
-    // TODO: Use this to (1) track indentation and (2) emit `{`, `}`, and `;`.
     /// The indentations tracked by the lexer.
-    indent_stack: Vec<usize>,
+    ///
+    /// The indentation is 1-indexed. When an indentation is `Some`, it corresponds to a real
+    /// indentation level; when an indentation is `None`, it corresponds to `{` and
+    /// `}` explicitly present in the source.
+    indent_stack: Vec<Option<NonZeroUsize>>,
     /// Queued tokens to emit from the iterator the next time it is advanced.
     queued: VecDeque<Spanned>,
+    /// Tracks whether a token was just emitted that should be followed by a `{`.
+    next_must_be_brace: bool,
+    /// The previous token that was emitted.
+    prev_token: Option<Token>,
 }
 
 /// The values that are produced by the [`Lexer`].
     pub fn new(text: &'a Text) -> Self {
         let mut lexer = Self {
             text,
             text_chars: TextChars::new(text),
             mark_idx: 0,
-            indent_stack: vec![],
+            indent_stack: vec![NonZeroUsize::new(1)], // `NonZeroUsize::new(1)` is `Some(1)`.
             queued: VecDeque::new(),
+            next_must_be_brace: false,
+            prev_token: None,
         };
         // The index might not start at 0.
         lexer.mark();
@@ -123,6 +133,11 @@
         self.text_chars.idx()
     }
 
+    /// Shorthand for accessing the column.
+    fn column(&self) -> NonZeroUsize {
+        self.text_chars.column()
+    }
+
     /// Set the mark index to the current index.
     fn mark(&mut self) {
         self.mark_idx = self.idx();
 
     /// Create a token that spans from the marked start index to the current index.
     fn emit(&mut self, kind: Token) -> Option<Spanned> {
+        self.prev_token = Some(kind.clone());
         Some(Ok((self.mark_idx, kind, self.idx())))
     }
 
     /// This returns `None` at the end of the source text, and once `None` is returned, it will
     /// always return `None` on subsequent calls.
     fn scan(&mut self) -> Option<Spanned> {
+        // Pop from the queue if there is anything ready.
+        if !self.queued.is_empty() {
+            return self.queued.pop_front();
+        }
+        // Skip the whitespace and maybe put tokens in the queue.
+        self.scan_through_whitespace();
         if !self.queued.is_empty() {
             return self.queued.pop_front();
         }
+        // At the end of the text, go through the indentation stack and emit `}` for all unclosed
+        // indentation.
+        if self.peek().is_none() && self.indent_stack.len() > 1 {
+            if let Some(Some(indent)) = self.indent_stack.pop() {
+                return self.emit(Token::RightBrace(Some(indent)));
+            } else {
+                todo!("TODO: There was an unclosed `{{` in the source.")
+            }
+            // TODO: Does this also need to emit a semicolon?
+        }
+        // Nothing was found so try to find the next token normally. The lexer is now sitting on a
+        // character that can start a token.
+        self.scan_next()
+    }
+
+    /// Scan for tokens in the whitespace that follows.
+    ///
+    /// This does not return any tokens, but may put some in the queue. All whitespace is skipped.
+    fn scan_through_whitespace(&mut self) {
+        // Skip all whitespace and detect if a newline was seen.
+        self.advance_while(|c| char::is_whitespace(c) && c != '\n');
+        let saw_newline = self.advance_if(pat!('\n'));
         self.advance_while(char::is_whitespace);
         self.mark();
+        // It is possible to emit a `;` and zero or more `}` based on indentation.
+        if saw_newline {
+            self.scan_through_whitespace_handle_newline();
+        }
+        // When a brace is expected, either confirm that it is there or insert one.
+        self.scan_through_whitespace_infer_braces();
+    }
+
+    /// A helper for [`Self::scan_through_whitespace()`] that emits `;` and `}` tokens as required
+    /// after a newline.
+    ///
+    /// This should be called after skipping whitespace if at least one newline was present in the
+    /// skipped space.
+    fn scan_through_whitespace_handle_newline(&mut self) {
+        macro_rules! emit_semicolon {
+            ($lexer:expr) => {
+                // A semicolon is emitted when all of the following hold:
+                //
+                // - The top of the indent stack is an inferred `{`.
+                // - The previous token was not a semicolon.
+                // - At least one token has been emitted so far.
+                if $lexer.indent_stack.last().unwrap().is_some()
+                    && $lexer.prev_token != Some(Token::Semicolon)
+                    && $lexer.prev_token.is_some()
+                {
+                    $lexer.emit_queue(Token::Semicolon);
+                }
+            };
+        }
+        let Some(Some(indent)) = self.indent_stack.last() else {
+            return;
+        };
+        if self.column() == *indent {
+            emit_semicolon!(self);
+        } else if self.column() > *indent {
+            // Do nothing, as the expression on the previous line continues on this one.
+        } else {
+            emit_semicolon!(self);
+            loop {
+                let Some(Some(indent)) = self.indent_stack.last() else {
+                    return;
+                };
+                if *indent > self.column() {
+                    // Close this inferred block: its indentation is deeper than the column where
+                    // the new line starts.
+                    self.emit_queue(Token::RightBrace(Some(*indent)));
+                    self.indent_stack.pop();
+                } else {
+                    break;
+                }
+            }
+            // The top of the indentation stack must equal the current column, otherwise the
+            // program is malformed.
+            let indent = self.indent_stack.last().unwrap().unwrap();
+            if indent != self.column() {
+                // TODO: Emit an error instead of panicking.
+                panic!("Indentation did not return to an allowed level.");
+            }
+        }
+    }
+
+    /// A helper for [`Self::scan_through_whitespace()`] that infers `{` if it is flagged that a
+    /// `{` must appear at the current location.
+    fn scan_through_whitespace_infer_braces(&mut self) {
+        // Condition for whether to run this logic.
+        if !self.next_must_be_brace {
+            return;
+        }
+        self.next_must_be_brace = false;
+        // Handle the `{` token.
+        if self.peek() == Some('{') {
+            // Do nothing, as this token will be lexed normally.
+        } else {
+            // No explicit `{` follows, so infer one: record the current column as a new
+            // indentation level and queue a `{` carrying that column so the matching `}` can be
+            // inferred later.
+            let indent = Some(self.column());
+            self.indent_stack.push(indent);
+            self.emit_queue(Token::LeftBrace(indent));
+        }
+    }
+
+    /// Scan the next token.
+    ///
+    /// It is assumed that the lexer is either on a character that can start a token or at the end
+    /// of the text.
+    fn scan_next(&mut self) -> Option<Spanned> {
         self.mark();
         match self.peek_triplet() {
             //-------------------------------------------------------------------------------------
@@ -282,10 +414,17 @@
             (Some('['), ..) => self.advance_by_and_emit(1, Token::LeftSquare),
             (Some('\\'), ..) => self.advance_by_and_emit(1, Token::BackSlash),
             (Some(']'), ..) => self.advance_by_and_emit(1, Token::RightSquare),
-            (Some('{'), ..) => self.advance_by_and_emit(1, Token::LeftBrace(None)),
+            (Some('{'), ..) => {
+                self.indent_stack.push(None);
+                self.advance_by_and_emit(1, Token::LeftBrace(None))
+            }
             (Some('|'), Some('|'), ..) => self.advance_by_and_emit(2, Token::PipePipe),
             (Some('|'), ..) => self.advance_by_and_emit(1, Token::Pipe),
-            (Some('}'), ..) => self.advance_by_and_emit(1, Token::RightBrace(None)),
+            (Some('}'), ..) => {
+                // TODO: Check that the popped entry is an explicit `{` (i.e., `None`).
+                self.indent_stack.pop();
+                self.advance_by_and_emit(1, Token::RightBrace(None))
+            }
             //-------------------------------------------------------------------------------------
             // Numbers.
             //-------------------------------------------------------------------------------------
@@ -372,14 +511,20 @@
             match self.as_marked_str() {
                 // Keywords.
                 "as" => self.emit(Token::KwAs),
-                "do" => self.emit(Token::KwDo),
+                "do" => {
+                    self.next_must_be_brace = true;
+                    self.emit(Token::KwDo)
+                }
                 "else" => self.emit(Token::KwElse),
                 "for" => self.emit(Token::KwFor),
                 "if" => self.emit(Token::KwIf),
                 "immediate" => self.emit(Token::KwImmediate),
                 "import" => self.emit(Token::KwImport),
                 "let" => self.emit(Token::KwLet),
-                "match" => self.emit(Token::KwMatch),
+                "match" => {
+                    self.next_must_be_brace = true;
+                    self.emit(Token::KwMatch)
+                }
                 "panic" => self.emit(Token::KwPanic),
                 "require" => self.emit(Token::KwRequire),
                 "return" => self.emit(Token::KwReturn),
 
                 "unimplemented" => self.emit(Token::KwUnimplemented),
                 "unreachable" => self.emit(Token::KwUnreachable),
                 "use" => self.emit(Token::KwUse),
-                "when" => self.emit(Token::KwWhen),
+                "when" => {
+                    self.next_must_be_brace = true;
+                    self.emit(Token::KwWhen)
+                }
                 // If not a keyword then this must be an identifier.
                 "_" => self.emit(Token::Underscore),
                 ident => {
@@ -512,6 +660,15 @@
         };
     }
 
+    /// Collect all of the tokens from the lexer into a vector, ignoring the positions.
+    ///
+    /// This panics if the lexer emits any errors.
+    macro_rules! collect_tokens {
+        ($lexer:expr) => {
+            $lexer.map(|spanned| spanned.unwrap().1).collect::<Vec<_>>()
+        };
+    }
+
     /// Shorthand for asserting that a scan produces a value.
     macro_rules! scan {
         // Assert that the `source` string produces the specified [`Spanned`] `token`.
 
     #[test]
-    fn iter() {
+    fn iter_basic() {
         // This does not test every token, only a handful are needed to test the iterator.
        lexer!(lexer <- "let x = 123;");
         assert_eq!(
-            lexer.collect::<Vec<_>>(),
+            collect_tokens!(lexer),
             vec![
-                Ok((0, Token::KwLet, 3)),
-                Ok((4, Token::LowerIdent, 5)),
-                Ok((6, Token::Eq, 7)),
-                Ok((8, Token::DecInt, 11)),
-                Ok((11, Token::Semicolon, 12)),
+                Token::KwLet,
+                Token::LowerIdent,
+                Token::Eq,
+                Token::DecInt,
+                Token::Semicolon,
             ]
         );
         lexer!(lexer <- "  let x = 123;\n");
         assert_eq!(
-            lexer.collect::<Vec<_>>(),
+            collect_tokens!(lexer),
             vec![
-                Ok((2, Token::KwLet, 5)),
-                Ok((6, Token::LowerIdent, 7)),
-                Ok((8, Token::Eq, 9)),
-                Ok((10, Token::DecInt, 13)),
-                Ok((13, Token::Semicolon, 14)),
+                Token::KwLet,
+                Token::LowerIdent,
+                Token::Eq,
+                Token::DecInt,
+                Token::Semicolon,
             ]
         );
         lexer!(lexer <- "let x=123;");
         assert_eq!(
-            lexer.collect::<Vec<_>>(),
+            collect_tokens!(lexer),
+            vec![
+                Token::KwLet,
+                Token::LowerIdent,
+                Token::Eq,
+                Token::DecInt,
+                Token::Semicolon,
+            ]
+        );
+    }
+
+    #[test]
+    fn iter_infer_semicolons() {
+        // Semicolons should be inferred after `x`, `y`, and `z`.
+        lexer!(lexer <- concat!(
+            "x\n",
+            "y\n",
+            "z\n",
+        ));
+        assert_eq!(
+            collect_tokens!(lexer),
+            vec![
+                Token::LowerIdent,
+                Token::Semicolon,
+                Token::LowerIdent,
+                Token::Semicolon,
+                Token::LowerIdent,
+                Token::Semicolon,
+            ]
+        );
+        // Semicolons should be inferred after `x` and `z`.
+        lexer!(lexer <- concat!(
+            "x\n",
+            "  y\n",
+            "z\n",
+        ));
+        assert_eq!(
+            collect_tokens!(lexer),
+            vec![
+                Token::LowerIdent,
+                Token::LowerIdent,
+                Token::Semicolon,
+                Token::LowerIdent,
+                Token::Semicolon,
+            ]
+        );
+    }
+
+    #[test]
+    fn iter_infer_braces() {
+        lexer!(lexer <- "do");
+        assert_eq!(
+            collect_tokens!(lexer),
+            vec![
+                Token::KwDo,
+                Token::LeftBrace(Some(NonZeroUsize::new(3).unwrap())),
+                Token::RightBrace(Some(NonZeroUsize::new(3).unwrap())),
+            ]
+        );
+        lexer!(lexer <- "match");
+        assert_eq!(
+            collect_tokens!(lexer),
+            vec![
+                Token::KwMatch,
+                Token::LeftBrace(Some(NonZeroUsize::new(6).unwrap())),
+                Token::RightBrace(Some(NonZeroUsize::new(6).unwrap())),
+            ]
+        );
+        lexer!(lexer <- "when");
+        assert_eq!(
+            collect_tokens!(lexer),
+            vec![
+                Token::KwWhen,
+                Token::LeftBrace(Some(NonZeroUsize::new(5).unwrap())),
+                Token::RightBrace(Some(NonZeroUsize::new(5).unwrap())),
+            ]
+        );
+    }
+
+    #[test]
+    fn iter_infer_nested_braces() {
+        lexer!(lexer <- concat!(
+            "do\n",
+            "  do\n",
+            "    x\n",
+        ));
+        assert_eq!(
+            collect_tokens!(lexer),
+            vec![
+                Token::KwDo,
+                Token::LeftBrace(Some(NonZeroUsize::new(3).unwrap())),
+                Token::KwDo,
+                Token::LeftBrace(Some(NonZeroUsize::new(5).unwrap())),
+                Token::LowerIdent,
+                Token::Semicolon,
+                Token::RightBrace(Some(NonZeroUsize::new(5).unwrap())),
+                Token::RightBrace(Some(NonZeroUsize::new(3).unwrap())),
+            ]
+        );
+    }
+
+    #[test]
+    fn iter_do_not_infer_semicolons_in_explicit_braces() {
+        // Do not infer semicolons within `{ ... }`.
+        lexer!(lexer <- concat!(
+            "do {\n",
+            "    x\n",
+            "    y\n",
+            "}\n",
+        ));
+        assert_eq!(
+            collect_tokens!(lexer),
+            vec![
+                Token::KwDo,
+                Token::LeftBrace(None),
+                Token::LowerIdent,
+                Token::LowerIdent,
+                Token::RightBrace(None),
+                Token::Semicolon,
+            ]
+        );
+    }
+
+    #[test]
+    fn iter_infer_semicolons_in_inferred_braces() {
+        // Infer semicolons inside an inferred brace block inside an explicit `{ ... }`.
+        lexer!(lexer <- concat!(
+            "do {\n",
+            "  do \n",
+            "    x\n",
+            "    y\n",
+            "}\n",
+        ));
+        assert_eq!(
+            collect_tokens!(lexer),
             vec![
-                Ok((0, Token::KwLet, 3)),
-                Ok((4, Token::LowerIdent, 5)),
-                Ok((5, Token::Eq, 6)),
-                Ok((6, Token::DecInt, 9)),
-                Ok((9, Token::Semicolon, 10)),
+                Token::KwDo,
+                Token::LeftBrace(None),
+                Token::KwDo,
+                Token::LeftBrace(Some(NonZeroUsize::new(5).unwrap())),
+                Token::LowerIdent,
+                Token::Semicolon,
+                Token::LowerIdent,
+                Token::Semicolon,
+                Token::RightBrace(Some(NonZeroUsize::new(5).unwrap())),
+                Token::RightBrace(None),
+                Token::Semicolon,
             ]
         );
     }
diff --git a/fuyu-core/src/parse/text.rs b/fuyu-core/src/parse/text.rs
index d046242..e103cf2 100644
--- a/fuyu-core/src/parse/text.rs
+++ b/fuyu-core/src/parse/text.rs
@@ -5,6 +5,7 @@
 //! [`Text`] is optimized for quick lookups of line and column position based on a [`ByteIdx`].
 
 use std::collections::VecDeque;
+use std::num::NonZeroUsize;
 use std::str::Chars;
 
 /// An index into a [`Text`] aligned at the byte level.
@@ -111,6 +112,7 @@
     }
 }
 
+// TODO: Make these NonZeroUsize.
 /// A line and column position in a source [`Text`].
 ///
 /// Both the line and column are 0-indexed.
@@ -148,14 +150,14 @@
 pub(super) struct TextChars<'a> {
     /// The underlying iterator.
     chars: Chars<'a>,
-    /// The characters that were peaked.
-    peaked: VecDeque<char>,
+    /// The characters that were peeked.
+    peeked: VecDeque<char>,
     /// The byte index of the next character (if any) to be emitted.
     idx: ByteIdx,
-    /// The 0-indexed column of the next character (if any) to be emitted.
+    /// The 1-indexed column of the next character (if any) to be emitted.
     ///
     /// Columns are counted in Unicode scalars.
-    column: usize,
+    column: NonZeroUsize,
 }
 
 impl<'a> TextChars<'a> {
     /// Create a new [`TextChars`] iterator over a [`Text`].
     pub fn new(text: &'a Text) -> Self {
         let mut text_chars = Self {
             chars: text.as_str().chars(),
-            peaked: VecDeque::new(),
+            peeked: VecDeque::new(),
             idx: 0,
-            column: 0,
+            column: NonZeroUsize::new(1).unwrap(),
         };
         // Skip the BOM without counting it toward the column position.
         if text_chars.peek() == Some(BOM) {
@@ -178,8 +180,8 @@
     fn next_char(&mut self) -> Option<char> {
-        if !self.peaked.is_empty() {
-            self.peaked.pop_front()
+        if !self.peeked.is_empty() {
+            self.peeked.pop_front()
         } else {
             self.chars.next()
         }
@@ -192,16 +194,16 @@
 
     /// Peek the next character `n` positions ahead in the text without advancing the iterator.
     pub fn peek_nth(&mut self, n: usize) -> Option<char> {
-        // Make sure that the peak buffer has at least `n + 1` characters (unless the end of the
+        // Make sure that the peek buffer has at least `n + 1` characters (unless the end of the
         // iterator is encountered).
-        while n >= self.peaked.len() {
+        while n >= self.peeked.len() {
             match self.chars.next() {
-                Some(c) => self.peaked.push_back(c),
+                Some(c) => self.peeked.push_back(c),
                 None => return None,
             }
         }
-        // At this point it is guaranteed that `n < self.peaked.len()`.
-        Some(self.peaked[n])
+        // At this point it is guaranteed that `n < self.peeked.len()`.
+        Some(self.peeked[n])
     }
 
     /// The byte index of the next character (if any) to be emitted.
@@ -209,10 +211,10 @@
         self.idx
     }
 
-    /// The 0-indexed column of the next character (if any) to be emitted.
+    /// The 1-indexed column of the next character (if any) to be emitted.
     ///
     /// Columns are counted in Unicode scalars.
-    pub fn column(&self) -> usize {
+    pub fn column(&self) -> NonZeroUsize {
         self.column
     }
 }
@@ -233,7 +235,7 @@
             // All `\r` must be followed by `\n`.
             Some('\r') => {
                 if let Some('\n') = self.next_char() {
                     // Treat this sequence as if it was just a `\n`.
                     self.idx += '\r'.len_utf8() + '\n'.len_utf8();
-                    self.column = 0;
+                    self.column = NonZeroUsize::new(1).unwrap();
                     Some('\n')
                 } else {
                     panic!("Carriage return (U+000D) must be followed by linefeed (U+000A).")
                 }
             }
             Some(c) => {
                 self.idx += c.len_utf8();
-                self.column = if c == '\n' { 0 } else { self.column + 1 };
+                self.column = if c == '\n' {
+                    NonZeroUsize::new(1).unwrap()
+                } else {
+                    self.column.checked_add(1).unwrap()
+                };
                 Some(c)
             }
             None => None,
diff --git a/fuyu-core/src/parse/token.rs b/fuyu-core/src/parse/token.rs
index cc4a6cd..5d70387 100644
--- a/fuyu-core/src/parse/token.rs
+++ b/fuyu-core/src/parse/token.rs
@@ -1,5 +1,7 @@
 // SPDX-License-Identifier: MPL-2.0
 
+use std::num::NonZeroUsize;
+
 /// A `Token` is an abstract representation of the token type not tied to any value.
 ///
 /// This value is often called the token "kind" throughout this project.
     /// The associated value is `None` if the brace appears in the source. When the brace is
     /// inserted by the lexer, the value is `Some` with the 1-indexed indentation of the column
     /// where the brace is inserted.
-    LeftBrace(Option<usize>),
+    LeftBrace(Option<NonZeroUsize>),
     /// `[`.
     LeftSquare,
     /// `(`.
     LeftParen,
     /// `}`.
     /// The associated value is `None` if the brace appears in the source. When the brace is
     /// inserted by the lexer, the value is `Some` with the 1-indexed indentation of the column
     /// of the matching automatically inserted [`LeftBrace`][Token::LeftBrace].
-    RightBrace(Option<usize>),
+    RightBrace(Option<NonZeroUsize>),
     /// `]`.
     RightSquare,
     /// `)`.
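Example of the combined effect of this series (a sketch, not part of any patch;
the 1-indexed columns assume the four-space indentation used in the tests, so
the block body starts at column 5). A layout-based block such as

    do
        x
        y

is expected to lex as if it were written `do { x; y; }`, producing the token
stream:

    KwDo
    LeftBrace(Some(5))   // Inferred `{` at column 5.
    LowerIdent           // `x`
    Semicolon            // Inferred `;` at the end of the line.
    LowerIdent           // `y`
    Semicolon
    RightBrace(Some(5))  // Inferred `}` closing the block at end of input.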