9 changes: 2 additions & 7 deletions src/ast/mod.rs
@@ -2787,10 +2787,11 @@ impl fmt::Display for Declare {
}

/// Sql options of a `CREATE TABLE` statement.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash, Default)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum CreateTableOptions {
#[default]
None,
/// Options specified using the `WITH` keyword.
/// e.g. `WITH (description = "123")`
@@ -2819,12 +2820,6 @@ pub enum CreateTableOptions {
TableProperties(Vec<SqlOption>),
}

impl Default for CreateTableOptions {
fn default() -> Self {
Self::None
}
}

impl fmt::Display for CreateTableOptions {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
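For context on the change above: since Rust 1.62, `Default` can be derived for an enum by marking one variant with `#[default]`, which expands to the same impl that the PR deletes. A minimal sketch of the equivalence, using a hypothetical enum:

// Minimal sketch (hypothetical enum): `#[derive(Default)]` with a
// `#[default]` variant is equivalent to the hand-written impl removed above.
#[derive(Debug, PartialEq, Default)]
enum Options {
    #[default]
    None,
    With(Vec<String>),
}

fn main() {
    assert_eq!(Options::default(), Options::None);
}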
168 changes: 128 additions & 40 deletions src/tokenizer.rs
@@ -743,8 +743,12 @@ impl std::error::Error for TokenizerError {}

struct State<'a> {
peekable: Peekable<Chars<'a>>,
pub line: u64,
pub col: u64,
/// Reference to the original source string being tokenized
source: &'a str,
line: u64,
col: u64,
/// Byte position in the source string
byte_pos: usize,
}

impl State<'_> {
@@ -759,6 +763,8 @@ impl State<'_> {
} else {
self.col += 1;
}
// Update byte position (characters can be multi-byte in UTF-8)
self.byte_pos += s.len_utf8();
Some(s)
}
}
@@ -769,6 +775,16 @@ impl State<'_> {
self.peekable.peek()
}

/// Return the character `n` positions ahead without advancing the stream.
/// For example, `peek_nth(0)` returns the current character (same as peek),
/// and `peek_nth(1)` returns the next character.
pub fn peek_nth(&self, n: usize) -> Option<char> {
if self.byte_pos >= self.source.len() {
return None;
}
self.source[self.byte_pos..].chars().nth(n)
}
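A standalone sketch (hypothetical, not part of the diff) of the bookkeeping these two methods rely on: `byte_pos` advances by `len_utf8()` so it always lands on a character boundary, and `peek_nth` counts characters, not bytes, within the remaining slice:

// Hypothetical sketch: byte offsets vs. character indexes in UTF-8.
fn main() {
    let source = "héllo";
    let mut byte_pos = 0;
    for ch in source.chars().take(2) {
        byte_pos += ch.len_utf8(); // 'h' is 1 byte, 'é' is 2 bytes
    }
    assert_eq!(byte_pos, 3); // "hé" occupies 3 bytes
    // peek_nth-style lookahead over the rest of the source:
    assert_eq!(source[byte_pos..].chars().nth(0), Some('l'));
    assert_eq!(source[byte_pos..].chars().nth(1), Some('l'));
}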

pub fn location(&self) -> Location {
Location {
line: self.line,
@@ -893,8 +909,10 @@ impl<'a> Tokenizer<'a> {
) -> Result<(), TokenizerError> {
let mut state = State {
peekable: self.query.chars().peekable(),
source: self.query,
line: 1,
col: 1,
byte_pos: 0,
};

let mut location = state.location();
@@ -908,22 +926,24 @@ impl<'a> Tokenizer<'a> {
Ok(())
}

// Tokenize the identifier or keywords in `ch`
/// Tokenize an identifier or keyword after consuming the first character(s).
/// `consumed_byte_len` is the total byte length of the character(s) already consumed.
fn tokenize_identifier_or_keyword(
&self,
ch: impl IntoIterator<Item = char>,
chars: &mut State,
consumed_byte_len: usize,
chars: &mut State<'a>,
) -> Result<Option<Token>, TokenizerError> {
chars.next(); // consume the first char
let ch: String = ch.into_iter().collect();
let word = self.tokenize_word(ch, chars);
let word = self.tokenize_word(consumed_byte_len, chars);

// TODO: implement parsing of exponent here
if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
let mut inner_state = State {
peekable: word.chars().peekable(),
source: &word,
line: 0,
col: 0,
byte_pos: 0,
};
let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
@@ -937,7 +957,7 @@ impl<'a> Tokenizer<'a> {
/// Get the next token or return None
fn next_token(
&self,
chars: &mut State,
chars: &mut State<'a>,
prev_token: Option<&Token>,
) -> Result<Option<Token>, TokenizerError> {
match chars.peek() {
@@ -988,7 +1008,7 @@ impl<'a> Tokenizer<'a> {
}
_ => {
// regular identifier starting with an "b" or "B"
let s = self.tokenize_word(b, chars);
let s = self.tokenize_word(b.len_utf8(), chars);
Ok(Some(Token::make_word(&s, None)))
}
}
@@ -1015,7 +1035,7 @@ impl<'a> Tokenizer<'a> {
),
_ => {
// regular identifier starting with an "r" or "R"
let s = self.tokenize_word(b, chars);
let s = self.tokenize_word(b.len_utf8(), chars);
Ok(Some(Token::make_word(&s, None)))
}
}
@@ -1034,7 +1054,7 @@ impl<'a> Tokenizer<'a> {
}
_ => {
// regular identifier starting with an "N"
let s = self.tokenize_word(n, chars);
let s = self.tokenize_word(n.len_utf8(), chars);
Ok(Some(Token::make_word(&s, None)))
}
}
@@ -1051,7 +1071,7 @@ impl<'a> Tokenizer<'a> {
}
_ => {
// regular identifier starting with an "E" or "e"
let s = self.tokenize_word(x, chars);
let s = self.tokenize_word(x.len_utf8(), chars);
Ok(Some(Token::make_word(&s, None)))
}
}
@@ -1070,7 +1090,7 @@ impl<'a> Tokenizer<'a> {
}
}
// regular identifier starting with an "U" or "u"
let s = self.tokenize_word(x, chars);
let s = self.tokenize_word(x.len_utf8(), chars);
Ok(Some(Token::make_word(&s, None)))
}
// The spec only allows an uppercase 'X' to introduce a hex
@@ -1085,7 +1105,7 @@ impl<'a> Tokenizer<'a> {
}
_ => {
// regular identifier starting with an "X"
let s = self.tokenize_word(x, chars);
let s = self.tokenize_word(x.len_utf8(), chars);
Ok(Some(Token::make_word(&s, None)))
}
}
@@ -1382,7 +1402,8 @@ impl<'a> Tokenizer<'a> {
match chars.peek() {
Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
Some(sch) if self.dialect.is_identifier_start('%') => {
self.tokenize_identifier_or_keyword([ch, *sch], chars)
let consumed_byte_len = ch.len_utf8() + sch.len_utf8();
self.tokenize_identifier_or_keyword(consumed_byte_len, chars)
}
_ => self.start_binop(chars, "%", Token::Mod),
}
@@ -1610,7 +1631,8 @@ impl<'a> Tokenizer<'a> {
self.consume_for_binop(chars, "##", Token::DoubleSharp)
}
Some(sch) if self.dialect.is_identifier_start('#') => {
self.tokenize_identifier_or_keyword([ch, *sch], chars)
let consumed_byte_len = ch.len_utf8() + sch.len_utf8();
self.tokenize_identifier_or_keyword(consumed_byte_len, chars)
}
_ => self.start_binop(chars, "#", Token::Sharp),
}
@@ -1635,7 +1657,9 @@ impl<'a> Tokenizer<'a> {
match chars.peek() {
Some(' ') => Ok(Some(Token::AtAt)),
Some(tch) if self.dialect.is_identifier_start('@') => {
self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
let consumed_byte_len =
ch.len_utf8() + '@'.len_utf8() + tch.len_utf8();
self.tokenize_identifier_or_keyword(consumed_byte_len, chars)
}
_ => Ok(Some(Token::AtAt)),
}
@@ -1654,7 +1678,8 @@ impl<'a> Tokenizer<'a> {
Some('\"') => Ok(Some(Token::AtSign)),
Some('`') => Ok(Some(Token::AtSign)),
Some(sch) if self.dialect.is_identifier_start('@') => {
self.tokenize_identifier_or_keyword([ch, *sch], chars)
let consumed_byte_len = ch.len_utf8() + sch.len_utf8();
self.tokenize_identifier_or_keyword(consumed_byte_len, chars)
}
_ => Ok(Some(Token::AtSign)),
}
@@ -1695,7 +1720,8 @@ impl<'a> Tokenizer<'a> {

// identifier or keyword
ch if self.dialect.is_identifier_start(ch) => {
self.tokenize_identifier_or_keyword([ch], chars)
let consumed_byte_len = ch.len_utf8();
self.tokenize_identifier_or_keyword(consumed_byte_len, chars)
}
'$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),

@@ -1876,13 +1902,36 @@ impl<'a> Tokenizer<'a> {
comment
}

/// Tokenize an identifier or keyword, after the first char is already consumed.
fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
let mut s = first_chars.into();
s.push_str(&peeking_take_while(chars, |ch| {
self.dialect.is_identifier_part(ch)
}));
s
/// Tokenize an identifier or keyword, after the first char(s) have already been consumed.
/// `consumed_byte_len` is the byte length of the consumed character(s).
fn tokenize_word(&self, consumed_byte_len: usize, chars: &mut State<'a>) -> String {
// Overflow check: ensure we can safely subtract
if consumed_byte_len > chars.byte_pos {
return String::new();
}

// Calculate where the first character started
let first_char_byte_pos = chars.byte_pos - consumed_byte_len;
Contributor: Can we add a check that the operation doesn't overflow? e.g.

if consumed_byte_len >= chars.byte_pos {
    return "".to_string()
}

Contributor Author: done

// Use the zero-copy version and convert to String
self.tokenize_word_borrowed(first_char_byte_pos, chars)
.to_string()
}

/// Tokenize an identifier or keyword, returning a borrowed slice when possible.
/// The first character position must be provided (before it was consumed).
/// Returns a slice with the same lifetime as the State's source.
fn tokenize_word_borrowed(&self, first_char_byte_pos: usize, chars: &mut State<'a>) -> &'a str {
// Consume the rest of the word
peeking_take_while_ref(chars, |ch| self.dialect.is_identifier_part(ch));

// Boundary check: ensure first_char_byte_pos is valid
if first_char_byte_pos > chars.byte_pos || first_char_byte_pos > chars.source.len() {
return "";
}

// Return a slice from the first char to the current position
&chars.source[first_char_byte_pos..chars.byte_pos]
Contributor: Similarly here we can check that the indexing is safe and maybe return a literal "" if not?

Contributor Author: done
}
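The point of the borrowed variant is that a word can be returned as a slice of the original source between the recorded start offset and the current `byte_pos`, with no per-token allocation. A minimal sketch of the idea (hypothetical helper, simplified identifier rules):

// Hypothetical sketch of zero-copy word extraction: record the start
// offset, advance past identifier characters, then slice the source.
fn take_word(source: &str, start: usize) -> &str {
    let mut end = start;
    for ch in source[start..].chars() {
        if ch.is_alphanumeric() || ch == '_' {
            end += ch.len_utf8(); // stay on char boundaries
        } else {
            break;
        }
    }
    &source[start..end] // borrows from `source`; no allocation
}

fn main() {
    assert_eq!(take_word("SELECT * FROM t", 0), "SELECT");
    assert_eq!(take_word("SELECT * FROM t", 9), "FROM");
}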

/// Read a quoted identifier
@@ -2176,35 +2225,72 @@ impl<'a> Tokenizer<'a> {
/// Read from `chars` until `predicate` returns `false` or EOF is hit.
/// Return the characters read as String, and keep the first non-matching
/// char available as `chars.next()`.
fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
let mut s = String::new();
fn peeking_take_while(chars: &mut State, predicate: impl FnMut(char) -> bool) -> String {
peeking_take_while_ref(chars, predicate).to_string()
}

/// Borrow a slice from the original string until `predicate` returns `false` or EOF is hit.
/// Returns a borrowed slice of the source string containing the matched characters.
/// This is the zero-copy version of `peeking_take_while`.
fn peeking_take_while_ref<'a>(
chars: &mut State<'a>,
mut predicate: impl FnMut(char) -> bool,
) -> &'a str {
// Record the starting byte position
Contributor: We can sanity check the index before using it here as well?

Contributor Author: Is a sanity check needed here? The start_pos and end_pos are taken from the iterator, and the iterator is incremented according to the characters in the buffer.

let start_pos = chars.byte_pos;

// Consume characters while predicate is true
while let Some(&ch) = chars.peek() {
if predicate(ch) {
chars.next(); // consume
s.push(ch);
chars.next(); // consume (this updates byte_pos)
} else {
break;
}
}
s

// Get the ending byte position
let end_pos = chars.byte_pos;

// Return the slice from the original source
&chars.source[start_pos..end_pos]
}
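With the owned `peeking_take_while` reduced to a `.to_string()` over the borrowed result, the allocation is paid only at the API boundary. A hypothetical sketch of that delegation pattern:

// Hypothetical sketch: delegate the owned API to the borrowing one,
// allocating only when ownership is actually required.
fn digits_ref(s: &str) -> &str {
    let end = s.find(|c: char| !c.is_ascii_digit()).unwrap_or(s.len());
    &s[..end] // zero-copy
}

fn digits_owned(s: &str) -> String {
    digits_ref(s).to_string()
}

fn main() {
    assert_eq!(digits_ref("123abc"), "123");
    assert_eq!(digits_owned("123abc"), "123");
}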

/// Same as peeking_take_while, but also passes the next character to the predicate.
fn peeking_next_take_while(
chars: &mut State,
/// Borrow a slice from the original string until `predicate` returns `false` or EOF is hit.
Contributor: Just flagging that similar comments for borrow_while_until apply to this function.

Contributor Author: I think it's ok now, let me know if not.
/// This version also passes the next character to the predicate for lookahead, taking
/// both the current char and optional next char. Returns a borrowed slice of the source
/// string containing the matched characters.
///
/// This is a zero-copy version of `peeking_next_take_while`.
fn peeking_take_while_next_ref<'a>(
chars: &mut State<'a>,
mut predicate: impl FnMut(char, Option<char>) -> bool,
) -> String {
let mut s = String::new();
) -> &'a str {
// Record the starting byte position
let start_pos = chars.byte_pos;

// Consume characters while predicate is true
while let Some(&ch) = chars.peek() {
let next_char = chars.peekable.clone().nth(1);
let next_char = chars.peek_nth(1);
if predicate(ch, next_char) {
chars.next(); // consume
s.push(ch);
chars.next(); // consume (this updates byte_pos)
} else {
break;
}
}
s

// Get the ending byte position
let end_pos = chars.byte_pos;

// Return the slice from the original source
&chars.source[start_pos..end_pos]
}
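The two-argument predicate gives one character of lookahead, which is what number tokenization needs (e.g. accept a '.' only when a digit follows). A hypothetical sketch of the same shape:

// Hypothetical sketch: take-while with one character of lookahead.
fn take_number(s: &str) -> &str {
    let mut end = 0;
    let mut it = s.chars().peekable();
    while let Some(ch) = it.next() {
        let next = it.peek().copied();
        let keep = ch.is_ascii_digit()
            || (ch == '.' && next.map_or(false, |n| n.is_ascii_digit()));
        if !keep {
            break;
        }
        end += ch.len_utf8();
    }
    &s[..end]
}

fn main() {
    assert_eq!(take_number("3.14 + x"), "3.14");
    assert_eq!(take_number("3. + x"), "3"); // '.' rejected: no digit follows
}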

/// Same as peeking_take_while, but also passes the next character to the predicate.
fn peeking_next_take_while(
chars: &mut State,
predicate: impl FnMut(char, Option<char>) -> bool,
) -> String {
peeking_take_while_next_ref(chars, predicate).to_string()
}

fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
@@ -3496,8 +3582,10 @@ mod tests {
let s = format!("'{s}'");
let mut state = State {
peekable: s.chars().peekable(),
source: &s,
line: 0,
col: 0,
byte_pos: 0,
};

assert_eq!(