From 0ad848c0167fa9fd88fc5b68d3001a4e1c3d8b95 Mon Sep 17 00:00:00 2001
From: Eyal Leshem
Date: Tue, 21 Oct 2025 15:26:23 +0300
Subject: [PATCH] Prepare tokenizer for using borrowed strings instead of
 allocations.

Key points for this commit:

- `Peekable` by itself isn't sufficient for producing string slices: we
  need the byte indexes (start/end) of each token to slice into the
  original query, so the current byte position was added to the `State`
  struct. (Note: in the long term we could potentially drop `Peekable`
  and iterate using only the current position.)
- Added internal functions that borrow slices from the original query
  instead of allocating strings, then wrapped them to return `String`
  for compatibility. The idea is to keep this commit small and
  reviewable, without changing the `Token` struct or the parser.
---
 src/ast/mod.rs   |   9 +--
 src/tokenizer.rs | 168 ++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 130 insertions(+), 47 deletions(-)

diff --git a/src/ast/mod.rs b/src/ast/mod.rs
index 176d36545..3a448312b 100644
--- a/src/ast/mod.rs
+++ b/src/ast/mod.rs
@@ -2787,10 +2787,11 @@ impl fmt::Display for Declare {
 }
 
 /// Sql options of a `CREATE TABLE` statement.
-#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
+#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash, Default)]
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 #[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
 pub enum CreateTableOptions {
+    #[default]
     None,
     /// Options specified using the `WITH` keyword.
     /// e.g. `WITH (description = "123")`
@@ -2819,12 +2820,6 @@ pub enum CreateTableOptions {
     TableProperties(Vec<SqlOption>),
 }
 
-impl Default for CreateTableOptions {
-    fn default() -> Self {
-        Self::None
-    }
-}
-
 impl fmt::Display for CreateTableOptions {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         match self {
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 54a158c1f..6fbf59dc6 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -743,8 +743,12 @@ impl std::error::Error for TokenizerError {}
 
 struct State<'a> {
     peekable: Peekable<Chars<'a>>,
-    pub line: u64,
-    pub col: u64,
+    /// Reference to the original source string being tokenized
+    source: &'a str,
+    line: u64,
+    col: u64,
+    /// Byte position in the source string
+    byte_pos: usize,
 }
 
 impl State<'_> {
@@ -759,6 +763,8 @@
         } else {
             self.col += 1;
         }
+        // Update byte position (characters can be multi-byte in UTF-8)
+        self.byte_pos += s.len_utf8();
         Some(s)
     }
 }
@@ -769,6 +775,16 @@
         self.peekable.peek()
     }
 
+    /// Return the character `n` positions ahead without advancing the stream.
+    /// For example, `peek_nth(0)` returns the current character (same as peek),
+    /// and `peek_nth(1)` returns the next character.
+    pub fn peek_nth(&self, n: usize) -> Option<char> {
+        if self.byte_pos >= self.source.len() {
+            return None;
+        }
+        self.source[self.byte_pos..].chars().nth(n)
+    }
+
     pub fn location(&self) -> Location {
         Location {
             line: self.line,
@@ -893,8 +909,10 @@ impl<'a> Tokenizer<'a> {
     ) -> Result<(), TokenizerError> {
         let mut state = State {
             peekable: self.query.chars().peekable(),
+            source: self.query,
             line: 1,
             col: 1,
+            byte_pos: 0,
         };
 
         let mut location = state.location();
@@ -908,22 +926,24 @@
         Ok(())
     }
 
-    // Tokenize the identifier or keywords in `ch`
+    /// Tokenize an identifier or keyword after consuming the first character(s).
+    /// `consumed_byte_len` is the total byte length of the character(s) already consumed.
     fn tokenize_identifier_or_keyword(
         &self,
-        ch: impl IntoIterator<Item = char>,
-        chars: &mut State,
+        consumed_byte_len: usize,
+        chars: &mut State<'a>,
     ) -> Result<Option<Token>, TokenizerError> {
         chars.next(); // consume the first char
-        let ch: String = ch.into_iter().collect();
-        let word = self.tokenize_word(ch, chars);
+        let word = self.tokenize_word(consumed_byte_len, chars);
 
         // TODO: implement parsing of exponent here
         if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
             let mut inner_state = State {
                 peekable: word.chars().peekable(),
+                source: &word,
                 line: 0,
                 col: 0,
+                byte_pos: 0,
             };
             let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
             let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
@@ -937,7 +957,7 @@
     /// Get the next token or return None
     fn next_token(
         &self,
-        chars: &mut State,
+        chars: &mut State<'a>,
         prev_token: Option<&Token>,
     ) -> Result<Option<Token>, TokenizerError> {
         match chars.peek() {
@@ -988,7 +1008,7 @@
                     }
                     _ => {
                         // regular identifier starting with an "b" or "B"
-                        let s = self.tokenize_word(b, chars);
+                        let s = self.tokenize_word(b.len_utf8(), chars);
                         Ok(Some(Token::make_word(&s, None)))
                     }
                 }
@@ -1015,7 +1035,7 @@
                 ),
                 _ => {
                     // regular identifier starting with an "r" or "R"
-                    let s = self.tokenize_word(b, chars);
+                    let s = self.tokenize_word(b.len_utf8(), chars);
                     Ok(Some(Token::make_word(&s, None)))
                 }
             }
@@ -1034,7 +1054,7 @@
                     }
                     _ => {
                         // regular identifier starting with an "N"
-                        let s = self.tokenize_word(n, chars);
+                        let s = self.tokenize_word(n.len_utf8(), chars);
                         Ok(Some(Token::make_word(&s, None)))
                     }
                 }
@@ -1051,7 +1071,7 @@
                     }
                     _ => {
                         // regular identifier starting with an "E" or "e"
-                        let s = self.tokenize_word(x, chars);
+                        let s = self.tokenize_word(x.len_utf8(), chars);
                         Ok(Some(Token::make_word(&s, None)))
                     }
                 }
@@ -1070,7 +1090,7 @@
                     }
                 }
                 // regular identifier starting with an "U" or "u"
-                let s = self.tokenize_word(x, chars);
+                let s = self.tokenize_word(x.len_utf8(), chars);
                 Ok(Some(Token::make_word(&s, None)))
             }
             // The spec only allows an uppercase 'X' to introduce a hex
@@ -1085,7 +1105,7 @@
                     }
                     _ => {
                         // regular identifier starting with an "X"
-                        let s = self.tokenize_word(x, chars);
+                        let s = self.tokenize_word(x.len_utf8(), chars);
                         Ok(Some(Token::make_word(&s, None)))
                     }
                 }
@@ -1382,7 +1402,8 @@
                 match chars.peek() {
                     Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
                     Some(sch) if self.dialect.is_identifier_start('%') => {
-                        self.tokenize_identifier_or_keyword([ch, *sch], chars)
+                        let consumed_byte_len = ch.len_utf8() + sch.len_utf8();
+                        self.tokenize_identifier_or_keyword(consumed_byte_len, chars)
                     }
                     _ => self.start_binop(chars, "%", Token::Mod),
                 }
@@ -1610,7 +1631,8 @@
                         self.consume_for_binop(chars, "##", Token::DoubleSharp)
                     }
                     Some(sch) if self.dialect.is_identifier_start('#') => {
-                        self.tokenize_identifier_or_keyword([ch, *sch], chars)
+                        let consumed_byte_len = ch.len_utf8() + sch.len_utf8();
+                        self.tokenize_identifier_or_keyword(consumed_byte_len, chars)
                     }
                     _ => self.start_binop(chars, "#", Token::Sharp),
                 }
@@ -1635,7 +1657,9 @@
                 match chars.peek() {
                     Some(' ') => Ok(Some(Token::AtAt)),
                     Some(tch) if self.dialect.is_identifier_start('@') => {
-                        self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
+                        let consumed_byte_len =
+                            ch.len_utf8() + '@'.len_utf8() + tch.len_utf8();
+                        self.tokenize_identifier_or_keyword(consumed_byte_len, chars)
                     }
                     _ => Ok(Some(Token::AtAt)),
                 }
@@ -1654,7 +1678,8 @@
                     Some('\"') => Ok(Some(Token::AtSign)),
                     Some('`') => Ok(Some(Token::AtSign)),
                     Some(sch) if self.dialect.is_identifier_start('@') => {
-                        self.tokenize_identifier_or_keyword([ch, *sch], chars)
+                        let consumed_byte_len = ch.len_utf8() + sch.len_utf8();
+                        self.tokenize_identifier_or_keyword(consumed_byte_len, chars)
                     }
                     _ => Ok(Some(Token::AtSign)),
                 }
@@ -1695,7 +1720,8 @@
 
             // identifier or keyword
             ch if self.dialect.is_identifier_start(ch) => {
-                self.tokenize_identifier_or_keyword([ch], chars)
+                let consumed_byte_len = ch.len_utf8();
+                self.tokenize_identifier_or_keyword(consumed_byte_len, chars)
             }
             '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),
 
@@ -1876,13 +1902,36 @@
         comment
     }
 
-    /// Tokenize an identifier or keyword, after the first char is already consumed.
-    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
-        let mut s = first_chars.into();
-        s.push_str(&peeking_take_while(chars, |ch| {
-            self.dialect.is_identifier_part(ch)
-        }));
-        s
+    /// Tokenize an identifier or keyword, after the first char(s) have already been consumed.
+    /// `consumed_byte_len` is the byte length of the consumed character(s).
+    fn tokenize_word(&self, consumed_byte_len: usize, chars: &mut State<'a>) -> String {
+        // Overflow check: ensure we can safely subtract
+        if consumed_byte_len > chars.byte_pos {
+            return String::new();
+        }
+
+        // Calculate where the first character started
+        let first_char_byte_pos = chars.byte_pos - consumed_byte_len;
+
+        // Use the zero-copy version and convert to String
+        self.tokenize_word_borrowed(first_char_byte_pos, chars)
+            .to_string()
+    }
+
+    /// Tokenize an identifier or keyword, returning a borrowed slice when possible.
+    /// The first character position must be provided (before it was consumed).
+    /// Returns a slice with the same lifetime as the State's source.
+    fn tokenize_word_borrowed(&self, first_char_byte_pos: usize, chars: &mut State<'a>) -> &'a str {
+        // Consume the rest of the word
+        peeking_take_while_ref(chars, |ch| self.dialect.is_identifier_part(ch));
+
+        // Boundary check: ensure first_char_byte_pos is valid
+        if first_char_byte_pos > chars.byte_pos || first_char_byte_pos > chars.source.len() {
+            return "";
+        }
+
+        // Return a slice from the first char to the current position
+        &chars.source[first_char_byte_pos..chars.byte_pos]
     }
 
     /// Read a quoted identifier
@@ -2176,35 +2225,72 @@
 /// Read from `chars` until `predicate` returns `false` or EOF is hit.
 /// Return the characters read as String, and keep the first non-matching
 /// char available as `chars.next()`.
-fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
-    let mut s = String::new();
+fn peeking_take_while(chars: &mut State, predicate: impl FnMut(char) -> bool) -> String {
+    peeking_take_while_ref(chars, predicate).to_string()
+}
+
+/// Borrow a slice from the original string until `predicate` returns `false` or EOF is hit.
+/// Returns a borrowed slice of the source string containing the matched characters.
+/// This is the zero-copy version of `peeking_take_while`.
+fn peeking_take_while_ref<'a>(
+    chars: &mut State<'a>,
+    mut predicate: impl FnMut(char) -> bool,
+) -> &'a str {
+    // Record the starting byte position
+    let start_pos = chars.byte_pos;
+
+    // Consume characters while predicate is true
     while let Some(&ch) = chars.peek() {
         if predicate(ch) {
-            chars.next(); // consume
-            s.push(ch);
+            chars.next(); // consume (this updates byte_pos)
         } else {
             break;
         }
     }
-    s
+
+    // Get the ending byte position
+    let end_pos = chars.byte_pos;
+
+    // Return the slice from the original source
+    &chars.source[start_pos..end_pos]
 }
 
-/// Same as peeking_take_while, but also passes the next character to the predicate.
-fn peeking_next_take_while(
-    chars: &mut State,
+/// Borrow a slice from the original string until `predicate` returns `false` or EOF is hit.
+/// This version also passes the next character to the predicate for lookahead, taking
+/// both the current char and optional next char. Returns a borrowed slice of the source
+/// string containing the matched characters.
+///
+/// This is a zero-copy version of `peeking_next_take_while`.
+fn peeking_take_while_next_ref<'a>(
+    chars: &mut State<'a>,
     mut predicate: impl FnMut(char, Option<char>) -> bool,
-) -> String {
-    let mut s = String::new();
+) -> &'a str {
+    // Record the starting byte position
+    let start_pos = chars.byte_pos;
+
+    // Consume characters while predicate is true
     while let Some(&ch) = chars.peek() {
-        let next_char = chars.peekable.clone().nth(1);
+        let next_char = chars.peek_nth(1);
         if predicate(ch, next_char) {
-            chars.next(); // consume
-            s.push(ch);
+            chars.next(); // consume (this updates byte_pos)
         } else {
             break;
         }
     }
-    s
+
+    // Get the ending byte position
+    let end_pos = chars.byte_pos;
+
+    // Return the slice from the original source
+    &chars.source[start_pos..end_pos]
+}
+
+/// Same as peeking_take_while, but also passes the next character to the predicate.
+fn peeking_next_take_while(
+    chars: &mut State,
+    predicate: impl FnMut(char, Option<char>) -> bool,
+) -> String {
+    peeking_take_while_next_ref(chars, predicate).to_string()
 }
 
 fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
@@ -3496,8 +3582,10 @@ mod tests {
         let s = format!("'{s}'");
         let mut state = State {
             peekable: s.chars().peekable(),
+            source: &s,
             line: 0,
             col: 0,
+            byte_pos: 0,
         };
 
         assert_eq!(
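
Below is a minimal, self-contained sketch of the technique this commit
relies on: advance a byte offset by each character's UTF-8 width while
scanning, then slice the original source instead of pushing characters
into a new String. It is not part of the patch, and the names
(`take_while_borrowed`, `pred`) are illustrative only:

    // Illustrative only -- not crate code. Borrow the longest prefix of
    // `src[*pos..]` whose chars satisfy `pred`, advancing `pos` in bytes.
    fn take_while_borrowed<'a>(
        src: &'a str,
        pos: &mut usize,
        pred: impl Fn(char) -> bool,
    ) -> &'a str {
        let start = *pos;
        for ch in src[*pos..].chars() {
            if !pred(ch) {
                break;
            }
            // Advance by the UTF-8 byte width, not by 1, so the slice
            // endpoints stay on character boundaries.
            *pos += ch.len_utf8();
        }
        &src[start..*pos]
    }

    fn main() {
        let src = "sélect * FROM t";
        let mut pos = 0;
        let word = take_while_borrowed(src, &mut pos, |c| c.is_alphanumeric());
        assert_eq!(word, "sélect"); // 'é' is two bytes: 6 chars, 7 bytes
        assert_eq!(pos, 7);
    }

This mirrors what `peeking_take_while_ref` does with `State::byte_pos`:
because `byte_pos` only ever grows by `len_utf8()`, every recorded
start/end pair lands on a char boundary and the slice cannot panic.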