diff --git a/src/ast/mod.rs b/src/ast/mod.rs
index d77186bc7..bdcadce9e 100644
--- a/src/ast/mod.rs
+++ b/src/ast/mod.rs
@@ -4654,22 +4654,34 @@ pub enum Statement {
         /// Legacy copy-style options.
         options: Vec<CopyLegacyOption>,
     },
+    /// ClickHouse:
     /// ```sql
     /// OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]]
     /// ```
-    ///
     /// See ClickHouse <https://clickhouse.com/docs/en/sql-reference/statements/optimize>
+    ///
+    /// Databricks:
+    /// ```sql
+    /// OPTIMIZE table_name [WHERE predicate] [ZORDER BY (col_name1 [, ...])]
+    /// ```
+    /// See Databricks <https://docs.databricks.com/en/sql/language-manual/delta-optimize.html>
     OptimizeTable {
         /// Table name to optimize.
         name: ObjectName,
-        /// Optional cluster identifier.
+        /// Whether the `TABLE` keyword was present (ClickHouse uses `OPTIMIZE TABLE`, Databricks uses `OPTIMIZE`).
+        has_table_keyword: bool,
+        /// Optional cluster identifier (ClickHouse).
         on_cluster: Option<Ident>,
-        /// Optional partition spec.
+        /// Optional partition spec (ClickHouse).
         partition: Option<Partition>,
-        /// Whether `FINAL` was specified.
+        /// Whether `FINAL` was specified (ClickHouse).
         include_final: bool,
-        /// Optional deduplication settings.
+        /// Optional deduplication settings (ClickHouse).
         deduplicate: Option<Deduplicate>,
+        /// Optional WHERE predicate (Databricks).
+        predicate: Option<Expr>,
+        /// Optional ZORDER BY columns (Databricks).
+        zorder: Option<Vec<Expr>>,
     },
     /// ```sql
     /// LISTEN
@@ -6243,12 +6255,19 @@ impl fmt::Display for Statement {
             }
             Statement::OptimizeTable {
                 name,
+                has_table_keyword,
                 on_cluster,
                 partition,
                 include_final,
                 deduplicate,
+                predicate,
+                zorder,
             } => {
-                write!(f, "OPTIMIZE TABLE {name}")?;
+                write!(f, "OPTIMIZE")?;
+                if *has_table_keyword {
+                    write!(f, " TABLE")?;
+                }
+                write!(f, " {name}")?;
                 if let Some(on_cluster) = on_cluster {
                     write!(f, " ON CLUSTER {on_cluster}")?;
                 }
@@ -6261,6 +6280,12 @@ impl fmt::Display for Statement {
                 if let Some(deduplicate) = deduplicate {
                     write!(f, " {deduplicate}")?;
                 }
+                if let Some(predicate) = predicate {
+                    write!(f, " WHERE {predicate}")?;
+                }
+                if let Some(zorder) = zorder {
+                    write!(f, " ZORDER BY ({})", display_comma_separated(zorder))?;
+                }
                 Ok(())
             }
             Statement::LISTEN { channel } => {
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index 149365c47..ea417109b 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -697,8 +697,10 @@ impl<'a> Parser<'a> {
                 self.parse_install()
             }
             Keyword::LOAD => self.parse_load(),
-            // `OPTIMIZE` is clickhouse specific https://clickhouse.tech/docs/en/sql-reference/statements/optimize/
-            Keyword::OPTIMIZE if dialect_of!(self is ClickHouseDialect | GenericDialect) => {
+            // `OPTIMIZE` is ClickHouse/Databricks specific
+            // ClickHouse: https://clickhouse.tech/docs/en/sql-reference/statements/optimize/
+            // Databricks: https://docs.databricks.com/en/sql/language-manual/delta-optimize.html
+            Keyword::OPTIMIZE if dialect_of!(self is ClickHouseDialect | DatabricksDialect | GenericDialect) => {
                 self.parse_optimize_table()
             }
             // `COMMENT` is snowflake specific https://docs.snowflake.com/en/sql-reference/sql/comment
@@ -3332,25 +3334,35 @@ impl<'a> Parser<'a> {
     /// Syntax:
     ///
     /// ```sql
+    /// -- BigQuery style
     /// [field_name] field_type
+    ///
+    /// -- Databricks/Hive style (colon separator)
+    /// field_name: field_type
     /// ```
     ///
     /// [struct]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#declaring_a_struct_type
     /// [tuple]: https://clickhouse.com/docs/en/sql-reference/data-types/tuple
+    /// [databricks]: https://docs.databricks.com/en/sql/language-manual/data-types/struct-type.html
     fn parse_struct_field_def(
         &mut self,
     ) -> Result<(StructField, MatchedTrailingBracket), ParserError> {
         // Look beyond the next item to infer whether both field name
         // and type are specified.
-        let is_anonymous_field = !matches!(
+        // Supports both:
+        // - `field_name field_type` (BigQuery style)
+        // - `field_name: field_type` (Databricks/Hive style)
+        let is_named_field = matches!(
             (self.peek_nth_token(0).token, self.peek_nth_token(1).token),
-            (Token::Word(_), Token::Word(_))
+            (Token::Word(_), Token::Word(_)) | (Token::Word(_), Token::Colon)
         );
 
-        let field_name = if is_anonymous_field {
-            None
+        let field_name = if is_named_field {
+            let name = self.parse_identifier()?;
+            // Consume the optional colon separator (Databricks/Hive style)
+            let _ = self.consume_token(&Token::Colon);
+            Some(name)
         } else {
-            Some(self.parse_identifier()?)
+            None
         };
 
         let (field_type, trailing_bracket) = self.parse_data_type_helper()?;
@@ -7878,7 +7890,7 @@ impl<'a> Parser<'a> {
     pub fn parse_hive_distribution(&mut self) -> Result<HiveDistributionStyle, ParserError> {
         if self.parse_keywords(&[Keyword::PARTITIONED, Keyword::BY]) {
             self.expect_token(&Token::LParen)?;
-            let columns = self.parse_comma_separated(Parser::parse_column_def)?;
+            let columns = self.parse_comma_separated(Parser::parse_column_def_for_partition)?;
             self.expect_token(&Token::RParen)?;
             Ok(HiveDistributionStyle::PARTITIONED { columns })
         } else {
@@ -7886,6 +7898,33 @@ impl<'a> Parser<'a> {
         }
     }
 
+    /// Parse a column definition within a PARTITIONED BY clause.
+    ///
+    /// Databricks allows partition columns without types when they reference
+    /// columns already defined in the table specification:
+    /// ```sql
+    /// CREATE TABLE t (col1 STRING, col2 INT) PARTITIONED BY (col1)
+    /// CREATE TABLE t (col1 STRING) PARTITIONED BY (col2 INT)
+    /// ```
+    ///
+    /// See [Databricks](https://docs.databricks.com/en/sql/language-manual/sql-ref-partition.html)
+    fn parse_column_def_for_partition(&mut self) -> Result<ColumnDef, ParserError> {
+        let name = self.parse_identifier()?;
+
+        // Check whether the next token indicates that no type was specified
+        // (a comma or closing paren ends this column definition)
+        let data_type = match self.peek_token().token {
+            Token::Comma | Token::RParen => DataType::Unspecified,
+            _ => self.parse_data_type()?,
+        };
+
+        Ok(ColumnDef {
+            name,
+            data_type,
+            options: vec![],
+        })
+    }
+
     /// Parse Hive formats.
     pub fn parse_hive_formats(&mut self) -> Result<Option<HiveFormat>, ParserError> {
         let mut hive_format: Option<HiveFormat> = None;
@@ -11781,7 +11820,8 @@ impl<'a> Parser<'a> {
             let field_defs = self.parse_duckdb_struct_type_def()?;
             Ok(DataType::Struct(field_defs, StructBracketKind::Parentheses))
         }
-        Keyword::STRUCT if dialect_is!(dialect is BigQueryDialect | GenericDialect) => {
+        Keyword::STRUCT if dialect_is!(dialect is BigQueryDialect | DatabricksDialect | GenericDialect) =>
+        {
             self.prev_token();
             let (field_defs, _trailing_bracket) =
                 self.parse_struct_type_def(Self::parse_struct_field_def)?;
@@ -18204,13 +18244,24 @@ impl<'a> Parser<'a> {
         }
     }
 
+    /// ClickHouse:
     /// ```sql
     /// OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]]
     /// ```
     /// [ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)
+    ///
+    /// Databricks:
+    /// ```sql
+    /// OPTIMIZE table_name [WHERE predicate] [ZORDER BY (col_name1 [, ...])]
+    /// ```
+    /// [Databricks](https://docs.databricks.com/en/sql/language-manual/delta-optimize.html)
     pub fn parse_optimize_table(&mut self) -> Result<Statement, ParserError> {
-        self.expect_keyword_is(Keyword::TABLE)?;
+        // Check for the TABLE keyword (ClickHouse uses it, Databricks does not)
+        let has_table_keyword = self.parse_keyword(Keyword::TABLE);
+
         let name = self.parse_object_name(false)?;
+
+        // ClickHouse-specific options
         let on_cluster = self.parse_optional_on_cluster()?;
 
         let partition = if self.parse_keyword(Keyword::PARTITION) {
@@ -18224,6 +18275,7 @@ impl<'a> Parser<'a> {
         };
 
         let include_final = self.parse_keyword(Keyword::FINAL);
+
         let deduplicate = if self.parse_keyword(Keyword::DEDUPLICATE) {
             if self.parse_keyword(Keyword::BY) {
                 Some(Deduplicate::ByExpression(self.parse_expr()?))
@@ -18234,12 +18286,31 @@ impl<'a> Parser<'a> {
             None
         };
 
+        // Databricks-specific options
+        let predicate = if self.parse_keyword(Keyword::WHERE) {
+            Some(self.parse_expr()?)
+        } else {
+            None
+        };
+
+        let zorder = if self.parse_keywords(&[Keyword::ZORDER, Keyword::BY]) {
+            self.expect_token(&Token::LParen)?;
+            let columns = self.parse_comma_separated(|p| p.parse_expr())?;
+            self.expect_token(&Token::RParen)?;
+            Some(columns)
+        } else {
+            None
+        };
+
         Ok(Statement::OptimizeTable {
             name,
+            has_table_keyword,
             on_cluster,
             partition,
             include_final,
             deduplicate,
+            predicate,
+            zorder,
         })
     }
 
diff --git a/tests/sqlparser_databricks.rs b/tests/sqlparser_databricks.rs
index 7f5ec6c3f..0512f60c5 100644
--- a/tests/sqlparser_databricks.rs
+++ b/tests/sqlparser_databricks.rs
@@ -389,3 +389,201 @@ fn parse_table_time_travel() {
         .parse_sql_statements("SELECT 1 FROM t1 VERSION AS OF 1 - 2",)
         .is_err())
 }
+
+#[test]
+fn parse_optimize_table() {
+    // Basic OPTIMIZE (Databricks style - no TABLE keyword)
+    databricks().verified_stmt("OPTIMIZE my_table");
+    databricks().verified_stmt("OPTIMIZE db.my_table");
+    databricks().verified_stmt("OPTIMIZE catalog.db.my_table");
+
+    // With WHERE clause
+    databricks().verified_stmt("OPTIMIZE my_table WHERE date = '2023-01-01'");
+    databricks()
+        .verified_stmt("OPTIMIZE my_table WHERE date >= '2023-01-01' AND date < '2023-02-01'");
+
+    // With ZORDER BY clause
+    databricks().verified_stmt("OPTIMIZE my_table ZORDER BY (col1)");
+    databricks().verified_stmt("OPTIMIZE my_table ZORDER BY (col1, col2)");
+    databricks().verified_stmt("OPTIMIZE my_table ZORDER BY (col1, col2, col3)");
+
+    // Combined WHERE and ZORDER BY
+    databricks().verified_stmt("OPTIMIZE my_table WHERE date = '2023-01-01' ZORDER BY (col1)");
+    databricks()
+        .verified_stmt("OPTIMIZE my_table WHERE date >= '2023-01-01' ZORDER BY (col1, col2)");
+
+    // Verify AST structure
+    match databricks()
+        .verified_stmt("OPTIMIZE my_table WHERE date = '2023-01-01' ZORDER BY (col1, col2)")
+    {
+        Statement::OptimizeTable {
+            name,
+            has_table_keyword,
+            on_cluster,
+            partition,
+            include_final,
+            deduplicate,
+            predicate,
+            zorder,
+        } => {
+            assert_eq!(name.to_string(), "my_table");
+            assert!(!has_table_keyword);
+            assert!(on_cluster.is_none());
+            assert!(partition.is_none());
+            assert!(!include_final);
+            assert!(deduplicate.is_none());
+            assert!(predicate.is_some());
+            assert_eq!(
+                zorder,
+                Some(vec![
+                    Expr::Identifier(Ident::new("col1")),
+                    Expr::Identifier(Ident::new("col2")),
+                ])
+            );
+        }
+        _ => unreachable!(),
+    }
+
+    // Negative cases
+    assert_eq!(
+        databricks()
+            .parse_sql_statements("OPTIMIZE my_table ZORDER BY")
+            .unwrap_err(),
+        ParserError::ParserError("Expected: (, found: EOF".to_string())
+    );
+    assert_eq!(
+        databricks()
+            .parse_sql_statements("OPTIMIZE my_table ZORDER BY ()")
+            .unwrap_err(),
+        ParserError::ParserError("Expected: an expression, found: )".to_string())
+    );
+}
+
+#[test]
+fn parse_create_table_partitioned_by() {
+    // Databricks allows PARTITIONED BY with just column names (referencing existing columns)
+    // https://docs.databricks.com/en/sql/language-manual/sql-ref-partition.html
+
+    // Single partition column without type
+    databricks().verified_stmt("CREATE TABLE t (col1 STRING, col2 INT) PARTITIONED BY (col1)");
+
+    // Multiple partition columns without types
+    databricks().verified_stmt(
+        "CREATE TABLE t (col1 STRING, col2 INT, col3 DATE) PARTITIONED BY (col1, col2)",
+    );
+
+    // Partition columns with types (new columns not in table spec)
+    databricks().verified_stmt("CREATE TABLE t (name STRING) PARTITIONED BY (year INT, month INT)");
+
+    // Mixed: some with types, some without
+    databricks()
+        .verified_stmt("CREATE TABLE t (id INT, name STRING) PARTITIONED BY (region, year INT)");
+
+    // Verify AST structure for column without type
+    match databricks().verified_stmt("CREATE TABLE t (col1 STRING) PARTITIONED BY (col1)") {
+        Statement::CreateTable(CreateTable {
+            name,
+            columns,
+            hive_distribution,
+            ..
+        }) => {
+            assert_eq!(name.to_string(), "t");
+            assert_eq!(columns.len(), 1);
+            assert_eq!(columns[0].name.to_string(), "col1");
+            match hive_distribution {
+                HiveDistributionStyle::PARTITIONED {
+                    columns: partition_cols,
+                } => {
+                    assert_eq!(partition_cols.len(), 1);
+                    assert_eq!(partition_cols[0].name.to_string(), "col1");
+                    assert_eq!(partition_cols[0].data_type, DataType::Unspecified);
+                }
+                _ => unreachable!(),
+            }
+        }
+        _ => unreachable!(),
+    }
+
+    // Verify AST structure for column with type
+    match databricks().verified_stmt("CREATE TABLE t (name STRING) PARTITIONED BY (year INT)") {
+        Statement::CreateTable(CreateTable {
+            hive_distribution:
+                HiveDistributionStyle::PARTITIONED {
+                    columns: partition_cols,
+                },
+            ..
+        }) => {
+            assert_eq!(partition_cols.len(), 1);
+            assert_eq!(partition_cols[0].name.to_string(), "year");
+            assert_eq!(partition_cols[0].data_type, DataType::Int(None));
+        }
+        _ => unreachable!(),
+    }
+}
+
+#[test]
+fn parse_databricks_struct_type() {
+    // Databricks uses colon-separated struct field syntax (colon is optional)
+    // https://docs.databricks.com/en/sql/language-manual/data-types/struct-type.html
+
+    // Basic struct with colon syntax - parses to canonical form without colons
+    databricks().one_statement_parses_to(
+        "CREATE TABLE t (col1 STRUCT<field1: STRING, field2: INT>)",
+        "CREATE TABLE t (col1 STRUCT<field1 STRING, field2 INT>)",
+    );
+
+    // Nested array of struct (the original issue case)
+    databricks().one_statement_parses_to(
+        "CREATE TABLE t (col1 ARRAY<STRUCT<field1: STRING>>)",
+        "CREATE TABLE t (col1 ARRAY<STRUCT<field1 STRING>>)",
+    );
+
+    // Multiple struct columns
+    databricks().one_statement_parses_to(
+        "CREATE TABLE t (col1 STRUCT<a: INT>, col2 STRUCT<b: STRING>)",
+        "CREATE TABLE t (col1 STRUCT<a INT>, col2 STRUCT<b STRING>)",
+    );
+
+    // Deeply nested structs
+    databricks().one_statement_parses_to(
+        "CREATE TABLE t (col1 STRUCT<nested: STRUCT<field1: INT>>)",
+        "CREATE TABLE t (col1 STRUCT<nested STRUCT<field1 INT>>)",
+    );
+
+    // Struct with array field
+    databricks().one_statement_parses_to(
+        "CREATE TABLE t (col1 STRUCT<ids: ARRAY<INT>, name: STRING>)",
+        "CREATE TABLE t (col1 STRUCT<ids ARRAY<INT>, name STRING>)",
+    );
+
+    // Syntax without colons should also work (BigQuery compatible)
+    databricks().verified_stmt("CREATE TABLE t (col1 STRUCT<field1 STRING, field2 INT>)");
+
+    // Verify AST structure
+    match databricks().one_statement_parses_to(
+        "CREATE TABLE t (col1 STRUCT<field1: STRING, field2: INT>)",
+        "CREATE TABLE t (col1 STRUCT<field1 STRING, field2 INT>)",
+    ) {
+        Statement::CreateTable(CreateTable { columns, .. }) => {
+            assert_eq!(columns.len(), 1);
+            assert_eq!(columns[0].name.to_string(), "col1");
+            match &columns[0].data_type {
+                DataType::Struct(fields, StructBracketKind::AngleBrackets) => {
+                    assert_eq!(fields.len(), 2);
+                    assert_eq!(
+                        fields[0].field_name.as_ref().map(|i| i.to_string()),
+                        Some("field1".to_string())
+                    );
+                    assert_eq!(fields[0].field_type, DataType::String(None));
+                    assert_eq!(
+                        fields[1].field_name.as_ref().map(|i| i.to_string()),
+                        Some("field2".to_string())
+                    );
+                    assert_eq!(fields[1].field_type, DataType::Int(None));
+                }
+                _ => unreachable!(),
+            }
+        }
+        _ => unreachable!(),
+    }
+}
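For reviewers who want to try the OPTIMIZE change locally, here is a minimal round-trip sketch. `Parser::parse_sql` and `DatabricksDialect` are the crate's existing public entry points; the `has_table_keyword`, `predicate`, and `zorder` fields are the ones this patch adds, and the table/column names are arbitrary.

```rust
use sqlparser::ast::Statement;
use sqlparser::dialect::DatabricksDialect;
use sqlparser::parser::Parser;

fn main() {
    let sql = "OPTIMIZE my_table WHERE date = '2023-01-01' ZORDER BY (col1, col2)";
    let stmts = Parser::parse_sql(&DatabricksDialect {}, sql).unwrap();

    match &stmts[0] {
        Statement::OptimizeTable {
            has_table_keyword,
            predicate,
            zorder,
            ..
        } => {
            // Databricks omits the TABLE keyword, so the new flag is false ...
            assert!(!*has_table_keyword);
            // ... while the Databricks-only clauses are captured in the new fields.
            assert!(predicate.is_some());
            assert_eq!(zorder.as_ref().map(|cols| cols.len()), Some(2));
        }
        _ => unreachable!(),
    }

    // Display round-trips the Databricks form unchanged, as the
    // verified_stmt tests in the patch assert.
    assert_eq!(stmts[0].to_string(), sql);
}
```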
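The colon-separated struct syntax can be spot-checked the same way. Under the same assumptions as above, the colon form parses and re-serializes in the space-separated canonical form that the `one_statement_parses_to` tests pin down:

```rust
use sqlparser::dialect::DatabricksDialect;
use sqlparser::parser::Parser;

fn main() {
    // Colon-separated fields are accepted on input ...
    let sql = "CREATE TABLE t (col1 STRUCT<a: INT, b: STRING>)";
    let stmts = Parser::parse_sql(&DatabricksDialect {}, sql).unwrap();

    // ... and serialized back without colons, matching BigQuery's form.
    assert_eq!(
        stmts[0].to_string(),
        "CREATE TABLE t (col1 STRUCT<a INT, b STRING>)"
    );
}
```

Treating the colon as an optional separator (rather than a dialect-gated token) is what keeps the BigQuery-compatible colon-free syntax working through the same code path.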
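Likewise for `PARTITIONED BY`: a bare partition column is expected to come back as a `ColumnDef` whose `data_type` is `DataType::Unspecified`, which is what lets the `verified_stmt` round trips above print just the column name. A sketch, with a hypothetical table layout:

```rust
use sqlparser::ast::{DataType, HiveDistributionStyle, Statement};
use sqlparser::dialect::DatabricksDialect;
use sqlparser::parser::Parser;

fn main() {
    let sql = "CREATE TABLE t (col1 STRING, col2 INT) PARTITIONED BY (col1)";
    let stmts = Parser::parse_sql(&DatabricksDialect {}, sql).unwrap();

    // Drill into the Hive-style distribution clause on the parsed statement.
    let Statement::CreateTable(ct) = &stmts[0] else { unreachable!() };
    let HiveDistributionStyle::PARTITIONED { columns } = &ct.hive_distribution else {
        unreachable!()
    };

    // The bare partition column carries no type annotation.
    assert_eq!(columns[0].name.to_string(), "col1");
    assert_eq!(columns[0].data_type, DataType::Unspecified);
}
```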