From f2235305f440a080e3dc7f7246fe303479f30c31 Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Wed, 14 Aug 2024 16:42:40 +0100 Subject: [PATCH] Simple custom lexical precedence in PostgreSQL dialect (#1379) --- src/dialect/mod.rs | 189 ++++++++++++++++---------------------- src/dialect/postgresql.rs | 118 ++++++------------------ src/dialect/snowflake.rs | 4 +- src/parser/mod.rs | 38 +++++--- 4 files changed, 137 insertions(+), 212 deletions(-) diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 9033ecc78..6ec2e28d8 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -354,13 +354,18 @@ pub trait Dialect: Debug + Any { if let Some(precedence) = self.get_next_precedence(parser) { return precedence; } + macro_rules! p { + ($precedence:ident) => { + self.prec_value(Precedence::$precedence) + }; + } let token = parser.peek_token(); debug!("get_next_precedence_full() {:?}", token); match token.token { - Token::Word(w) if w.keyword == Keyword::OR => Ok(OR_PREC), - Token::Word(w) if w.keyword == Keyword::AND => Ok(AND_PREC), - Token::Word(w) if w.keyword == Keyword::XOR => Ok(XOR_PREC), + Token::Word(w) if w.keyword == Keyword::OR => Ok(p!(Or)), + Token::Word(w) if w.keyword == Keyword::AND => Ok(p!(And)), + Token::Word(w) if w.keyword == Keyword::XOR => Ok(p!(Xor)), Token::Word(w) if w.keyword == Keyword::AT => { match ( @@ -370,9 +375,9 @@ pub trait Dialect: Debug + Any { (Token::Word(w), Token::Word(w2)) if w.keyword == Keyword::TIME && w2.keyword == Keyword::ZONE => { - Ok(AT_TZ_PREC) + Ok(p!(AtTz)) } - _ => Ok(UNKNOWN_PREC), + _ => Ok(self.prec_unknown()), } } @@ -382,25 +387,25 @@ pub trait Dialect: Debug + Any { // it takes on the precedence of those tokens. Otherwise, it // is not an infix operator, and therefore has zero // precedence. 
- Token::Word(w) if w.keyword == Keyword::IN => Ok(BETWEEN_PREC), - Token::Word(w) if w.keyword == Keyword::BETWEEN => Ok(BETWEEN_PREC), - Token::Word(w) if w.keyword == Keyword::LIKE => Ok(LIKE_PREC), - Token::Word(w) if w.keyword == Keyword::ILIKE => Ok(LIKE_PREC), - Token::Word(w) if w.keyword == Keyword::RLIKE => Ok(LIKE_PREC), - Token::Word(w) if w.keyword == Keyword::REGEXP => Ok(LIKE_PREC), - Token::Word(w) if w.keyword == Keyword::SIMILAR => Ok(LIKE_PREC), - _ => Ok(UNKNOWN_PREC), + Token::Word(w) if w.keyword == Keyword::IN => Ok(p!(Between)), + Token::Word(w) if w.keyword == Keyword::BETWEEN => Ok(p!(Between)), + Token::Word(w) if w.keyword == Keyword::LIKE => Ok(p!(Like)), + Token::Word(w) if w.keyword == Keyword::ILIKE => Ok(p!(Like)), + Token::Word(w) if w.keyword == Keyword::RLIKE => Ok(p!(Like)), + Token::Word(w) if w.keyword == Keyword::REGEXP => Ok(p!(Like)), + Token::Word(w) if w.keyword == Keyword::SIMILAR => Ok(p!(Like)), + _ => Ok(self.prec_unknown()), }, - Token::Word(w) if w.keyword == Keyword::IS => Ok(IS_PREC), - Token::Word(w) if w.keyword == Keyword::IN => Ok(BETWEEN_PREC), - Token::Word(w) if w.keyword == Keyword::BETWEEN => Ok(BETWEEN_PREC), - Token::Word(w) if w.keyword == Keyword::LIKE => Ok(LIKE_PREC), - Token::Word(w) if w.keyword == Keyword::ILIKE => Ok(LIKE_PREC), - Token::Word(w) if w.keyword == Keyword::RLIKE => Ok(LIKE_PREC), - Token::Word(w) if w.keyword == Keyword::REGEXP => Ok(LIKE_PREC), - Token::Word(w) if w.keyword == Keyword::SIMILAR => Ok(LIKE_PREC), - Token::Word(w) if w.keyword == Keyword::OPERATOR => Ok(BETWEEN_PREC), - Token::Word(w) if w.keyword == Keyword::DIV => Ok(MUL_DIV_MOD_OP_PREC), + Token::Word(w) if w.keyword == Keyword::IS => Ok(p!(Is)), + Token::Word(w) if w.keyword == Keyword::IN => Ok(p!(Between)), + Token::Word(w) if w.keyword == Keyword::BETWEEN => Ok(p!(Between)), + Token::Word(w) if w.keyword == Keyword::LIKE => Ok(p!(Like)), + Token::Word(w) if w.keyword == Keyword::ILIKE => Ok(p!(Like)), + 
Token::Word(w) if w.keyword == Keyword::RLIKE => Ok(p!(Like)), + Token::Word(w) if w.keyword == Keyword::REGEXP => Ok(p!(Like)), + Token::Word(w) if w.keyword == Keyword::SIMILAR => Ok(p!(Like)), + Token::Word(w) if w.keyword == Keyword::OPERATOR => Ok(p!(Between)), + Token::Word(w) if w.keyword == Keyword::DIV => Ok(p!(MulDivModOp)), Token::Eq | Token::Lt | Token::LtEq @@ -416,20 +421,19 @@ pub trait Dialect: Debug + Any { | Token::DoubleTildeAsterisk | Token::ExclamationMarkDoubleTilde | Token::ExclamationMarkDoubleTildeAsterisk - | Token::Spaceship => Ok(EQ_PREC), - Token::Pipe => Ok(PIPE_PREC), - Token::Caret | Token::Sharp | Token::ShiftRight | Token::ShiftLeft => Ok(CARET_PREC), - Token::Ampersand => Ok(AMPERSAND_PREC), - Token::Plus | Token::Minus => Ok(PLUS_MINUS_PREC), + | Token::Spaceship => Ok(p!(Eq)), + Token::Pipe => Ok(p!(Pipe)), + Token::Caret | Token::Sharp | Token::ShiftRight | Token::ShiftLeft => Ok(p!(Caret)), + Token::Ampersand => Ok(p!(Ampersand)), + Token::Plus | Token::Minus => Ok(p!(PlusMinus)), Token::Mul | Token::Div | Token::DuckIntDiv | Token::Mod | Token::StringConcat => { - Ok(MUL_DIV_MOD_OP_PREC) + Ok(p!(MulDivModOp)) } Token::DoubleColon | Token::ExclamationMark | Token::LBracket | Token::Overlap - | Token::CaretAt => Ok(DOUBLE_COLON_PREC), - // Token::Colon if (self as dyn Dialect).is::() => Ok(DOUBLE_COLON_PREC), + | Token::CaretAt => Ok(p!(DoubleColon)), Token::Arrow | Token::LongArrow | Token::HashArrow @@ -442,8 +446,8 @@ pub trait Dialect: Debug + Any { | Token::Question | Token::QuestionAnd | Token::QuestionPipe - | Token::CustomBinaryOperator(_) => Ok(PG_OTHER_PREC), - _ => Ok(UNKNOWN_PREC), + | Token::CustomBinaryOperator(_) => Ok(p!(PgOther)), + _ => Ok(self.prec_unknown()), } } @@ -457,88 +461,57 @@ pub trait Dialect: Debug + Any { None } - // The following precedence values are used directly by `Parse` or in dialects, - // so have to be made public by the dialect. - - /// Return the precedence of the `::` operator. 
+ /// Decide the lexical Precedence of operators. /// - /// Default is 50. - fn prec_double_colon(&self) -> u8 { - DOUBLE_COLON_PREC - } - - /// Return the precedence of `*`, `/`, and `%` operators. - /// - /// Default is 40. - fn prec_mul_div_mod_op(&self) -> u8 { - MUL_DIV_MOD_OP_PREC - } - - /// Return the precedence of the `+` and `-` operators. - /// - /// Default is 30. - fn prec_plus_minus(&self) -> u8 { - PLUS_MINUS_PREC - } - - /// Return the precedence of the `BETWEEN` operator. - /// - /// For example `BETWEEN <low> AND <high>` - /// - /// Default is 22. - fn prec_between(&self) -> u8 { - BETWEEN_PREC - } - - /// Return the precedence of the `LIKE` operator. - /// - /// Default is 19. - fn prec_like(&self) -> u8 { - LIKE_PREC - } - - /// Return the precedence of the unary `NOT` operator. - /// - /// For example `NOT (a OR b)` - /// - /// Default is 15. - fn prec_unary_not(&self) -> u8 { - UNARY_NOT_PREC + /// Uses (APPROXIMATELY) <https://www.postgresql.org/docs/7.0/operators.htm#AEN2026> as a reference + fn prec_value(&self, prec: Precedence) -> u8 { + match prec { + Precedence::DoubleColon => 50, + Precedence::AtTz => 41, + Precedence::MulDivModOp => 40, + Precedence::PlusMinus => 30, + Precedence::Xor => 24, + Precedence::Ampersand => 23, + Precedence::Caret => 22, + Precedence::Pipe => 21, + Precedence::Between => 20, + Precedence::Eq => 20, + Precedence::Like => 19, + Precedence::Is => 17, + Precedence::PgOther => 16, + Precedence::UnaryNot => 15, + Precedence::And => 10, + Precedence::Or => 5, + } } - /// Return the default (unknown) precedence. - /// - /// Default is 0. fn prec_unknown(&self) -> u8 { - UNKNOWN_PREC + 0 } } -// Define the lexical Precedence of operators. -// -// Uses (APPROXIMATELY) <https://www.postgresql.org/docs/7.0/operators.htm#AEN2026> as a reference -// higher number = higher precedence -// -// NOTE: The pg documentation is incomplete, e.g. the AT TIME ZONE operator -// actually has higher precedence than addition. -// See <https://postgrespro.com/list/thread-id/2673331>.
-const DOUBLE_COLON_PREC: u8 = 50; -const AT_TZ_PREC: u8 = 41; -const MUL_DIV_MOD_OP_PREC: u8 = 40; -const PLUS_MINUS_PREC: u8 = 30; -const XOR_PREC: u8 = 24; -const AMPERSAND_PREC: u8 = 23; -const CARET_PREC: u8 = 22; -const PIPE_PREC: u8 = 21; -const BETWEEN_PREC: u8 = 20; -const EQ_PREC: u8 = 20; -const LIKE_PREC: u8 = 19; -const IS_PREC: u8 = 17; -const PG_OTHER_PREC: u8 = 16; -const UNARY_NOT_PREC: u8 = 15; -const AND_PREC: u8 = 10; -const OR_PREC: u8 = 5; -const UNKNOWN_PREC: u8 = 0; +/// This represents the operators for which precedence must be defined +/// +/// higher number -> higher precedence +#[derive(Debug, Clone, Copy)] +pub enum Precedence { + DoubleColon, + AtTz, + MulDivModOp, + PlusMinus, + Xor, + Ampersand, + Caret, + Pipe, + Between, + Eq, + Like, + Is, + PgOther, + UnaryNot, + And, + Or, +} impl dyn Dialect { #[inline] diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs index 293fb9e7d..c25a80f67 100644 --- a/src/dialect/postgresql.rs +++ b/src/dialect/postgresql.rs @@ -12,7 +12,7 @@ use log::debug; use crate::ast::{CommentObject, Statement}; -use crate::dialect::Dialect; +use crate::dialect::{Dialect, Precedence}; use crate::keywords::Keyword; use crate::parser::{Parser, ParserError}; use crate::tokenizer::Token; @@ -89,71 +89,11 @@ impl Dialect for PostgreSqlDialect { let token = parser.peek_token(); debug!("get_next_precedence() {:?}", token); - let precedence = match token.token { - Token::Word(w) if w.keyword == Keyword::OR => OR_PREC, - Token::Word(w) if w.keyword == Keyword::XOR => XOR_PREC, - Token::Word(w) if w.keyword == Keyword::AND => AND_PREC, - Token::Word(w) if w.keyword == Keyword::AT => { - match ( - parser.peek_nth_token(1).token, - parser.peek_nth_token(2).token, - ) { - (Token::Word(w), Token::Word(w2)) - if w.keyword == Keyword::TIME && w2.keyword == Keyword::ZONE => - { - AT_TZ_PREC - } - _ => self.prec_unknown(), - } - } - - Token::Word(w) if w.keyword == Keyword::NOT => match 
parser.peek_nth_token(1).token { - // The precedence of NOT varies depending on keyword that - // follows it. If it is followed by IN, BETWEEN, or LIKE, - // it takes on the precedence of those tokens. Otherwise, it - // is not an infix operator, and therefore has zero - // precedence. - Token::Word(w) if w.keyword == Keyword::IN => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::BETWEEN => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::LIKE => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::ILIKE => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::RLIKE => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::REGEXP => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::SIMILAR => BETWEEN_LIKE_PREC, - _ => self.prec_unknown(), - }, - Token::Word(w) if w.keyword == Keyword::IS => IS_PREC, - Token::Word(w) if w.keyword == Keyword::IN => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::BETWEEN => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::LIKE => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::ILIKE => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::RLIKE => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::REGEXP => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::SIMILAR => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::OPERATOR => BETWEEN_LIKE_PREC, - Token::Word(w) if w.keyword == Keyword::DIV => MUL_DIV_MOD_OP_PREC, - Token::Word(w) if w.keyword == Keyword::COLLATE => COLLATE_PREC, - Token::Eq - | Token::Lt - | Token::LtEq - | Token::Neq - | Token::Gt - | Token::GtEq - | Token::DoubleEq - | Token::Tilde - | Token::TildeAsterisk - | Token::ExclamationMarkTilde - | Token::ExclamationMarkTildeAsterisk - | Token::DoubleTilde - | Token::DoubleTildeAsterisk - | Token::ExclamationMarkDoubleTilde - | Token::ExclamationMarkDoubleTildeAsterisk - | Token::Spaceship => EQ_PREC, - Token::Caret => CARET_PREC, - 
Token::Plus | Token::Minus => PLUS_MINUS_PREC, - Token::Mul | Token::Div | Token::Mod => MUL_DIV_MOD_OP_PREC, - Token::DoubleColon => DOUBLE_COLON_PREC, - Token::LBracket => BRACKET_PREC, + // we only return some custom value here when the behaviour (not merely the numeric value) differs + // from the default implementation + match token.token { + Token::Word(w) if w.keyword == Keyword::COLLATE => Some(Ok(COLLATE_PREC)), + Token::LBracket => Some(Ok(BRACKET_PREC)), Token::Arrow | Token::LongArrow | Token::HashArrow @@ -173,12 +113,9 @@ impl Dialect for PostgreSqlDialect { | Token::Sharp | Token::ShiftRight | Token::ShiftLeft - | Token::Pipe - | Token::Ampersand - | Token::CustomBinaryOperator(_) => PG_OTHER_PREC, - _ => self.prec_unknown(), - }; - Some(Ok(precedence)) + | Token::CustomBinaryOperator(_) => Some(Ok(PG_OTHER_PREC)), + _ => None, + } } fn parse_statement(&self, parser: &mut Parser) -> Option> { @@ -197,24 +134,25 @@ impl Dialect for PostgreSqlDialect { true } - fn prec_mul_div_mod_op(&self) -> u8 { - MUL_DIV_MOD_OP_PREC - } - - fn prec_plus_minus(&self) -> u8 { - PLUS_MINUS_PREC - } - - fn prec_between(&self) -> u8 { - BETWEEN_LIKE_PREC - } - - fn prec_like(&self) -> u8 { - BETWEEN_LIKE_PREC - } - - fn prec_unary_not(&self) -> u8 { - NOT_PREC + fn prec_value(&self, prec: Precedence) -> u8 { + match prec { + Precedence::DoubleColon => DOUBLE_COLON_PREC, + Precedence::AtTz => AT_TZ_PREC, + Precedence::MulDivModOp => MUL_DIV_MOD_OP_PREC, + Precedence::PlusMinus => PLUS_MINUS_PREC, + Precedence::Xor => XOR_PREC, + Precedence::Ampersand => PG_OTHER_PREC, + Precedence::Caret => CARET_PREC, + Precedence::Pipe => PG_OTHER_PREC, + Precedence::Between => BETWEEN_LIKE_PREC, + Precedence::Eq => EQ_PREC, + Precedence::Like => BETWEEN_LIKE_PREC, + Precedence::Is => IS_PREC, + Precedence::PgOther => PG_OTHER_PREC, + Precedence::UnaryNot => NOT_PREC, + Precedence::And => AND_PREC, + Precedence::Or => OR_PREC, + } } } diff --git a/src/dialect/snowflake.rs 
b/src/dialect/snowflake.rs index fe35d8da3..b22eb236f 100644 --- a/src/dialect/snowflake.rs +++ b/src/dialect/snowflake.rs @@ -20,7 +20,7 @@ use crate::ast::helpers::stmt_data_loading::{ use crate::ast::{ CommentDef, Ident, ObjectName, RowAccessPolicy, Statement, Tag, WrappedCollection, }; -use crate::dialect::Dialect; +use crate::dialect::{Dialect, Precedence}; use crate::keywords::Keyword; use crate::parser::{Parser, ParserError}; use crate::tokenizer::Token; @@ -150,7 +150,7 @@ impl Dialect for SnowflakeDialect { let token = parser.peek_token(); // Snowflake supports the `:` cast operator unlike other dialects match token.token { - Token::Colon => Some(Ok(self.prec_double_colon())), + Token::Colon => Some(Ok(self.prec_value(Precedence::DoubleColon))), _ => None, } } diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 293b817be..5d7abaa93 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1079,7 +1079,7 @@ impl<'a> Parser<'a> { self.parse_bigquery_struct_literal() } Keyword::PRIOR if matches!(self.state, ParserState::ConnectBy) => { - let expr = self.parse_subexpr(self.dialect.prec_plus_minus())?; + let expr = self.parse_subexpr(self.dialect.prec_value(Precedence::PlusMinus))?; Ok(Expr::Prior(Box::new(expr))) } Keyword::MAP if self.peek_token() == Token::LBrace && self.dialect.support_map_literal_syntax() => { @@ -1167,7 +1167,9 @@ impl<'a> Parser<'a> { }; Ok(Expr::UnaryOp { op, - expr: Box::new(self.parse_subexpr(self.dialect.prec_mul_div_mod_op())?), + expr: Box::new( + self.parse_subexpr(self.dialect.prec_value(Precedence::MulDivModOp))?, + ), }) } tok @ Token::DoubleExclamationMark @@ -1187,7 +1189,9 @@ impl<'a> Parser<'a> { }; Ok(Expr::UnaryOp { op, - expr: Box::new(self.parse_subexpr(self.dialect.prec_plus_minus())?), + expr: Box::new( + self.parse_subexpr(self.dialect.prec_value(Precedence::PlusMinus))?, + ), }) } Token::EscapedStringLiteral(_) if dialect_of!(self is PostgreSqlDialect | GenericDialect) => @@ -1739,7 +1743,7 @@ impl<'a> 
Parser<'a> { } pub fn parse_position_expr(&mut self, ident: Ident) -> Result { - let between_prec = self.dialect.prec_between(); + let between_prec = self.dialect.prec_value(Precedence::Between); let position_expr = self.maybe_parse(|p| { // PARSE SELECT POSITION('@' in field) p.expect_token(&Token::LParen)?; @@ -1993,12 +1997,14 @@ impl<'a> Parser<'a> { } _ => Ok(Expr::UnaryOp { op: UnaryOperator::Not, - expr: Box::new(self.parse_subexpr(self.dialect.prec_unary_not())?), + expr: Box::new( + self.parse_subexpr(self.dialect.prec_value(Precedence::UnaryNot))?, + ), }), }, _ => Ok(Expr::UnaryOp { op: UnaryOperator::Not, - expr: Box::new(self.parse_subexpr(self.dialect.prec_unary_not())?), + expr: Box::new(self.parse_subexpr(self.dialect.prec_value(Precedence::UnaryNot))?), }), } } @@ -2671,7 +2677,9 @@ impl<'a> Parser<'a> { Ok(Expr::RLike { negated, expr: Box::new(expr), - pattern: Box::new(self.parse_subexpr(self.dialect.prec_like())?), + pattern: Box::new( + self.parse_subexpr(self.dialect.prec_value(Precedence::Like))?, + ), regexp, }) } else if self.parse_keyword(Keyword::IN) { @@ -2682,21 +2690,27 @@ impl<'a> Parser<'a> { Ok(Expr::Like { negated, expr: Box::new(expr), - pattern: Box::new(self.parse_subexpr(self.dialect.prec_like())?), + pattern: Box::new( + self.parse_subexpr(self.dialect.prec_value(Precedence::Like))?, + ), escape_char: self.parse_escape_char()?, }) } else if self.parse_keyword(Keyword::ILIKE) { Ok(Expr::ILike { negated, expr: Box::new(expr), - pattern: Box::new(self.parse_subexpr(self.dialect.prec_like())?), + pattern: Box::new( + self.parse_subexpr(self.dialect.prec_value(Precedence::Like))?, + ), escape_char: self.parse_escape_char()?, }) } else if self.parse_keywords(&[Keyword::SIMILAR, Keyword::TO]) { Ok(Expr::SimilarTo { negated, expr: Box::new(expr), - pattern: Box::new(self.parse_subexpr(self.dialect.prec_like())?), + pattern: Box::new( + self.parse_subexpr(self.dialect.prec_value(Precedence::Like))?, + ), escape_char: 
self.parse_escape_char()?, }) } else { @@ -2971,9 +2985,9 @@ impl<'a> Parser<'a> { pub fn parse_between(&mut self, expr: Expr, negated: bool) -> Result { // Stop parsing subexpressions for <low> and <high> on tokens with // precedence lower than that of `BETWEEN`, such as `AND`, `IS`, etc. - let low = self.parse_subexpr(self.dialect.prec_between())?; + let low = self.parse_subexpr(self.dialect.prec_value(Precedence::Between))?; self.expect_keyword(Keyword::AND)?; - let high = self.parse_subexpr(self.dialect.prec_between())?; + let high = self.parse_subexpr(self.dialect.prec_value(Precedence::Between))?; Ok(Expr::Between { expr: Box::new(expr), negated,