From 66718dda6b0b7dda98385979e217461df2cab916 Mon Sep 17 00:00:00 2001 From: lemonadern Date: Wed, 11 Dec 2024 18:05:01 +0900 Subject: [PATCH 01/18] reorder methods --- .../postgresql-cst-parser/src/tree_sitter.rs | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/crates/postgresql-cst-parser/src/tree_sitter.rs b/crates/postgresql-cst-parser/src/tree_sitter.rs index 1bac64b..b3de090 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter.rs @@ -133,6 +133,14 @@ impl<'a> From> for TreeCursor<'a> { } impl<'a> TreeCursor<'a> { + pub fn node(&self) -> Node<'a> { + Node { + input: self.input, + range_map: Rc::clone(&self.range_map), + node_or_token: self.node_or_token, + } + } + pub fn goto_first_child(&mut self) -> bool { if self.node_or_token.as_node().is_none() { return false; @@ -168,6 +176,20 @@ impl<'a> TreeCursor<'a> { } } + pub fn goto_parent(&mut self) -> bool { + while let Some(parent) = self.node_or_token.parent() { + self.node_or_token = NodeOrToken::Node(parent); + + if is_flatten(self.node_or_token) { + continue; + } + + return true; + } + + false + } + pub fn goto_next_sibling(&mut self) -> bool { let mut cursor = self.clone(); @@ -208,28 +230,6 @@ impl<'a> TreeCursor<'a> { } } - pub fn goto_parent(&mut self) -> bool { - while let Some(parent) = self.node_or_token.parent() { - self.node_or_token = NodeOrToken::Node(parent); - - if is_flatten(self.node_or_token) { - continue; - } - - return true; - } - - false - } - - pub fn node(&self) -> Node<'a> { - Node { - input: self.input, - range_map: Rc::clone(&self.range_map), - node_or_token: self.node_or_token, - } - } - pub fn is_comment(&self) -> bool { matches!( self.node_or_token.kind(), From 58304dbbd434a116e0a338ac0ea0ed7a3ba05449 Mon Sep 17 00:00:00 2001 From: lemonadern Date: Wed, 11 Dec 2024 20:23:05 +0900 Subject: [PATCH 02/18] feat: implement Display for Range --- crates/postgresql-cst-parser/src/tree_sitter.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/crates/postgresql-cst-parser/src/tree_sitter.rs b/crates/postgresql-cst-parser/src/tree_sitter.rs index b3de090..cd51e79 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter.rs @@ -38,6 +38,12 @@ pub struct Range { pub end_col: usize, } +impl std::fmt::Display for Range { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "[({}, {})-({}, {})]", self.start_row, self.start_col, self.end_row, self.end_col) + } +} + fn is_flatten_all(node_or_token: NodeOrToken) -> bool { matches!( node_or_token.kind(), From 4ab1be423423c1a68bda6130cc6235372c50aba5 Mon Sep 17 00:00:00 2001 From: lemonadern Date: Wed, 11 Dec 2024 20:23:21 +0900 Subject: [PATCH 03/18] feat: implement child_count --- crates/postgresql-cst-parser/src/tree_sitter.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/crates/postgresql-cst-parser/src/tree_sitter.rs b/crates/postgresql-cst-parser/src/tree_sitter.rs index cd51e79..9935d23 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter.rs @@ -109,9 +109,12 @@ impl<'a> Node<'a> { &self.input[start..end] } - pub fn children(&self) {} - - pub fn child_count(&self) {} + pub fn child_count(&self) -> usize { + if let Some(node) = self.node_or_token.as_node() { + return node.children_with_tokens().count(); + } + 0 + } pub fn next_sibling(&self) -> Option> { self.node_or_token From eb926f4656884d4f5503caf95debd9973d807f13 Mon Sep 17 00:00:00 2001 From: lemonadern Date: Wed, 11 Dec 2024 20:24:14 +0900 Subject: [PATCH 04/18] feat: implement tree-sitter like traverse API --- .../postgresql-cst-parser/src/tree_sitter.rs | 266 ++++++++++++++---- 1 file changed, 205 insertions(+), 61 deletions(-) diff --git a/crates/postgresql-cst-parser/src/tree_sitter.rs b/crates/postgresql-cst-parser/src/tree_sitter.rs index 9935d23..d1c688a 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter.rs @@ -151,48 +151,19 @@ impl<'a> TreeCursor<'a> { } pub fn goto_first_child(&mut self) -> bool { - if self.node_or_token.as_node().is_none() { - return false; - } - - let mut cursor = self.clone(); - - // TODO 書き捨てコードなのでリファクタ - loop { - if let Some(node) = cursor.node_or_token.as_node() { - if let Some(child) = node.first_child_or_token() { - cursor.node_or_token = child; - - if is_skip(child.kind()) || is_flatten(child) { - continue; - } - - self.node_or_token = cursor.node_or_token; - return true; - } - } - if let Some(sibling) = cursor.node_or_token.next_sibling_or_token() { - cursor.node_or_token = sibling; - - if is_skip(sibling.kind()) || is_flatten(sibling) { - continue; - } - + if let Some(current_node) = self.node_or_token.as_node() { + if let Some(child) = current_node.first_child_or_token() { + self.node_or_token = child; return true; - } else { - cursor.node_or_token = NodeOrToken::Node(cursor.node_or_token.parent().unwrap()); } } + false } pub fn goto_parent(&mut self) -> bool { - while let Some(parent) = self.node_or_token.parent() { + if let Some(parent) = self.node_or_token.parent() { self.node_or_token = NodeOrToken::Node(parent); - if is_flatten(self.node_or_token) { - continue; - } - return true; } @@ -200,35 +171,35 @@ impl<'a> TreeCursor<'a> { } pub fn goto_next_sibling(&mut self) -> bool { - let mut cursor = self.clone(); - - loop { - while let Some(sibling) = cursor.node_or_token.next_sibling_or_token() { - cursor.node_or_token = sibling; + if let Some(sibling) = self.node_or_token.next_sibling_or_token() { + self.node_or_token = sibling; + return true; + } + false + } - if is_skip(sibling.kind()) { - continue; - } + /// + /// These methods are unused in uroborosql-fmt + /// + // pub fn field_id(&self)-> Option { + // unimplemented!() + // } - if is_flatten(sibling) { - cursor.goto_first_child(); - } + // pub fn field_name(&self)-> Option<&'static str> { + // unimplemented!() + // } - self.node_or_token = cursor.node_or_token; - return true; - } + // pub fn goto_first_child_for_byte(&mut self, index: usize) -> Option { + // unimplemented!() + // } - if let Some(parent) = cursor.node_or_token.parent() { - if !is_flatten(NodeOrToken::Node(parent)) { - return false; - } + // pub fn goto_first_child_for_point(&mut self, point: Point) -> Option { + // unimplemented!() + // } - cursor.node_or_token = NodeOrToken::Node(parent); - } else { - return false; - } - } - } + // pub fn reset(&mut self, node: Node<'a>) { + // unimplemented!() + // } pub fn goto_direct_prev_sibling(&mut self) -> bool { if let Some(prev) = self.node_or_token.prev_sibling_or_token() { @@ -335,7 +306,7 @@ pub fn dump_as_tree_sitter_like(input: &str, node: &ResolvedNode) { let mut depth = 0; loop { - dbg!(cursor.node().kind(), cursor.node().text(), depth); + // dbg!(cursor.node().kind(), cursor.node().text(), depth); if cursor.goto_first_child() { depth += 1; @@ -357,7 +328,12 @@ pub fn dump_as_tree_sitter_like(input: &str, node: &ResolvedNode) { #[cfg(test)] mod tests { - use crate::{cst, tree_sitter::dump_as_tree_sitter_like, ParseError}; + use crate::{ + cst, parse, + syntax_kind::SyntaxKind, + tree_sitter::{as_tree_sitter_cursor, convert_cst, dump_as_tree_sitter_like, TreeCursor}, + ParseError, + }; #[test] fn test() -> Result<(), ParseError> { @@ -371,10 +347,178 @@ FROM , B"#; // dbg!(input); let node = cst::parse(input)?; - dbg!(&node); + // dbg!(&node); dump_as_tree_sitter_like(input, &node); Ok(()) } + + #[test] + fn tree_sitter_like_traverse() { + const UNIT: usize = 2; + + fn visit(cursor: &mut TreeCursor, depth: usize, src: &str) { + (0..(depth * UNIT)).for_each(|_| print!(" ")); + + print!("{}", cursor.node().kind()); + + if cursor.node().child_count() == 0 { + // print!(" \"{}\"", cursor.node().utf8_text(src.as_bytes()).unwrap()); + print!(" \"{}\"", cursor.node().text().escape_default()); + } + println!( + // " [{}-{}]", + // cursor.node().start_position(), + // cursor.node().end_position() + " {}", + cursor.node().range() + ); + + // 子供を走査 + if cursor.goto_first_child() { + visit(cursor, depth + 1, src); + while cursor.goto_next_sibling() { + visit(cursor, depth + 1, src); + } + cursor.goto_parent(); + } + } + + let src = r#" +-- comment +SELECT + 1 as X +, 2 -- comment +, 3 +FROM + A +, B"#; + + let node = parse(&src).unwrap(); + let node = convert_cst(&node); + let mut cursor = as_tree_sitter_cursor(src, &node); + + visit(&mut cursor, 0, &src); + } + + #[test] + fn goto_first_child_from_node() { + let src = "select a, b, c from tbl;"; + let root = convert_cst(&parse(&src).unwrap()); + let first_select = root + .descendants() + .find(|x| x.kind() == SyntaxKind::simple_select) + .unwrap(); + + let mut cursor = as_tree_sitter_cursor(src, &first_select); + assert_eq!(cursor.node().kind(), SyntaxKind::simple_select); + + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), SyntaxKind::SELECT); + } + + #[test] + fn goto_first_child_from_token() { + let src = "select a, b, c from tbl;"; + let root = convert_cst(&parse(&src).unwrap()); + let column_id_node = root + .descendants() + .find(|x| x.kind() == SyntaxKind::ColId) + .unwrap(); + + let mut cursor = as_tree_sitter_cursor(src, &column_id_node); + cursor.goto_first_child(); + assert_eq!(cursor.node().kind(), SyntaxKind::IDENT); + + assert!(!cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), SyntaxKind::IDENT); + } + + #[test] + fn goto_parent_from_root() { + let src = "select a, b, c from tbl;"; + let root = convert_cst(&parse(&src).unwrap()); + + let mut cursor = as_tree_sitter_cursor(src, &root); + assert_eq!(cursor.node().kind(), SyntaxKind::Root); + + assert!(!cursor.goto_parent()); + assert_eq!(cursor.node().kind(), SyntaxKind::Root); + } + + #[test] + fn goto_parent_from_node() { + let src = "select a, b, c from tbl;"; + let root = convert_cst(&parse(&src).unwrap()); + + let target_element = root + .descendants() + .find(|x| x.kind() == SyntaxKind::target_el) + .unwrap(); + let mut cursor = as_tree_sitter_cursor(src, &target_element); + assert_eq!(cursor.node().kind(), SyntaxKind::target_el); + + assert!(cursor.goto_parent()); + assert_eq!(cursor.node().kind(), SyntaxKind::target_list); + } + + #[test] + fn goto_parent_from_token() { + let src = "select a, b, c from tbl;"; + let root = convert_cst(&parse(&src).unwrap()); + + let column_id_node = root + .descendants() + .find(|x| x.kind() == SyntaxKind::ColId) + .unwrap(); + let mut cursor = as_tree_sitter_cursor(src, &column_id_node); + + cursor.goto_first_child(); + assert_eq!(cursor.node().kind(), SyntaxKind::IDENT); + + assert!(cursor.goto_parent()); + assert_eq!(cursor.node().kind(), SyntaxKind::ColId); + } + + #[test] + fn goto_next_sibling() { + let src = "select a,b,c from tbl;"; + let root = convert_cst(&parse(&src).unwrap()); + + let target_element = root + .descendants() + .find(|x| x.kind() == SyntaxKind::target_el) + .unwrap(); + let mut cursor = as_tree_sitter_cursor(src, &target_element); + // + // - target_list + // - target_el (1) + // - Comma "," + // - target_el (2) + // - Comma "," + // - target_el (3) + // + + // 1 + assert_eq!(cursor.node().kind(), SyntaxKind::target_el); + + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), SyntaxKind::Comma); + + // 2 + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), SyntaxKind::target_el); + + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), SyntaxKind::Comma); + + // 3 + assert!(cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), SyntaxKind::target_el); + + // No more siblings + assert!(!cursor.goto_next_sibling()); + assert_eq!(cursor.node().kind(), SyntaxKind::target_el); + } } From f316107fb690a5ff2c6763a0e803f39fdbfacdbe Mon Sep 17 00:00:00 2001 From: lemonadern Date: Wed, 18 Dec 2024 16:19:33 +0900 Subject: [PATCH 05/18] feat: implement relationship between Nodes and their Range --- .../src/tree_sitter/convert.rs | 111 +++++++++++++++--- 1 file changed, 92 insertions(+), 19 deletions(-) diff --git a/crates/postgresql-cst-parser/src/tree_sitter/convert.rs b/crates/postgresql-cst-parser/src/tree_sitter/convert.rs index bdeb8ab..52cf27c 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter/convert.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter/convert.rs @@ -1,28 +1,86 @@ -use cstree::{build::GreenNodeBuilder, syntax::SyntaxNode}; +use std::collections::HashMap; + +use cstree::{build::GreenNodeBuilder, syntax::SyntaxNode, text::TextRange}; use crate::{syntax_kind::SyntaxKind, PostgreSQLSyntax, ResolvedNode}; +use super::{traverse_pre_order, Range}; + /// Converts the given CST into a node structure and hierarchy that closely matches what `tree-sitter-sql` produces. -pub fn convert_cst(root: &ResolvedNode) -> ResolvedNode { +pub fn convert_cst(src: &str, root: &ResolvedNode) -> (ResolvedNode, HashMap) { let mut builder = GreenNodeBuilder::new(); + let mut range_vec = vec![]; // Build `Root` node builder.start_node(SyntaxKind::Root); - walk_and_build(&mut builder, root); + walk_and_build(src, &mut builder, &mut range_vec, root); builder.finish_node(); let (tree, cache) = builder.finish(); + let resolved_node = + SyntaxNode::new_root_with_resolver(tree, cache.unwrap().into_interner().unwrap()); + + let mut range_map = HashMap::new(); + let mut ranges = range_vec.iter(); + traverse_pre_order(&resolved_node, |node_or_token| { + if let Some(original_range) = ranges.next() { + // clone? + range_map.insert(node_or_token.text_range(), original_range.clone()); + } else { + unreachable!() + } + }); - SyntaxNode::new_root_with_resolver(tree, cache.unwrap().into_interner().unwrap()) + (resolved_node, range_map) } /// Traverse the CST and rewrite certain nodes /// e.g. flatten list node, remove option node fn walk_and_build( + input: &str, builder: &mut GreenNodeBuilder<'static, 'static, PostgreSQLSyntax>, + range_vec: &mut Vec, node: &ResolvedNode, ) { use cstree::util::NodeOrToken; + + let new_line_indices: Vec<_> = input + .char_indices() + .filter(|&(_, c)| c == '\n') + .map(|(i, _)| i) + .collect(); + + let text_range = node.text_range(); + + // text_range の初期部分が、改行のどの部分に現れるか + let before_start_new_line_count = + match new_line_indices.binary_search(&text_range.start().into()) { + Ok(i) => i, + Err(i) => i, + }; + + let before_end_new_line_count = match new_line_indices.binary_search(&text_range.end().into()) { + Ok(i) => i, + Err(i) => i, + }; + + range_vec.push(Range { + start_row: before_start_new_line_count, + start_col: usize::from(node.text_range().start()) + - match before_start_new_line_count { + 0 => 0, + // ひとつ前のインデックス(直前の改行文字)+1 + i => new_line_indices[i - 1] + 1, // +1 は改行文字分? + }, + end_row: before_end_new_line_count, + end_col: usize::from(node.text_range().end()) + - 1 + - match before_end_new_line_count { + 0 => 0, + i => new_line_indices[i - 1], + }, + }); + let parent_kind = node.kind(); let children = node.children_with_tokens(); @@ -44,15 +102,22 @@ fn walk_and_build( // +- target_el // +- ... // - walk_and_build(builder, n); + walk_and_build(input, builder, range_vec, n); } else { // Node is target for flattening, but at the top level of the nest builder.start_node(n.kind()); - walk_and_build(builder, n); + walk_and_build(input, builder, range_vec, n); builder.finish_node(); } } + // SyntaxKind::parse_toplevel + // | SyntaxKind::stmtmulti + // | SyntaxKind::toplevel_stmt + // | SyntaxKind::stmt + // | SyntaxKind::select_no_parens + // | SyntaxKind::simple_select + // | SyntaxKind::opt_target_list => { // [Removal] // @@ -64,18 +129,26 @@ fn walk_and_build( // +- child_1 +- child_2 // +- child_1 // - walk_and_build(builder, n); + walk_and_build(input, builder, range_vec, n); } // Default pattern _ => { builder.start_node(n.kind()); - walk_and_build(builder, n); + walk_and_build(input, builder, range_vec, n); builder.finish_node(); } } } NodeOrToken::Token(t) => { + // Remove Whitespace Token + // Note: + // This process will break the lossless property of the CST. + // `text()` for Nodes and `text_range()` for Nodes and Tokens will become incompatible with the original text. + if t.kind() == SyntaxKind::Whitespace { + continue; + } + builder.token(t.kind(), t.text()); } } @@ -87,9 +160,8 @@ mod tests { use crate::{cst, tree_sitter::convert::convert_cst}; #[test] - /// Assert that the CST is not broken by the conversion. - fn restored_texts_are_equal() { - let input = r#" + fn whitespace_is_removed() { + let original = r#" SELECT 1 as X , 2 @@ -98,11 +170,12 @@ FROM A , B"#; - let root = cst::parse(input).unwrap(); - let new_root = convert_cst(&root); + let root = cst::parse(&original).unwrap(); + let new_root = convert_cst(&original, &root); - // format!("{ResolvedNode}") returns original input str. - assert_eq!(format!("{root}"), format!("{new_root}")); + let whitespace_removed: String = original.split_whitespace().collect(); + dbg!(&whitespace_removed); + assert_eq!(new_root.text(), whitespace_removed.as_str()); } mod removal { @@ -121,7 +194,7 @@ FROM let root = cst::parse(input).unwrap(); assert_exists(&root, SyntaxKind::opt_target_list); - let new_root = convert_cst(&root); + let new_root = convert_cst(input, &root); assert_not_exists(&new_root, SyntaxKind::opt_target_list); } } @@ -143,7 +216,7 @@ FROM let root = cst::parse(input).unwrap(); assert_node_count(&root, SyntaxKind::target_list, 3); - let new_root = convert_cst(&root); + let new_root = convert_cst(input, &root); assert_node_count(&new_root, SyntaxKind::target_list, 1); assert_no_direct_nested_kind(&new_root, SyntaxKind::target_list); } @@ -152,7 +225,7 @@ FROM fn no_nested_stmtmulti() { let input = "select a,b,c;\nselect d,e from t;"; let root = cst::parse(input).unwrap(); - let new_root = convert_cst(&root); + let new_root = convert_cst(&input, &root); assert_no_direct_nested_kind(&new_root, SyntaxKind::stmtmulti); } @@ -161,7 +234,7 @@ FROM fn no_nested_from_list() { let input = "select * from t1, t2;"; let root = cst::parse(input).unwrap(); - let new_root = convert_cst(&root); + let new_root = convert_cst(input, &root); assert_no_direct_nested_kind(&new_root, SyntaxKind::from_list); } From 1353b71fa9ef344c0f46574d7e854c92be686262 Mon Sep 17 00:00:00 2001 From: lemonadern Date: Wed, 18 Dec 2024 20:11:49 +0900 Subject: [PATCH 06/18] checkpoint: range conversion worked --- .../src/tree_sitter/convert.rs | 210 +++++++++++++----- 1 file changed, 160 insertions(+), 50 deletions(-) diff --git a/crates/postgresql-cst-parser/src/tree_sitter/convert.rs b/crates/postgresql-cst-parser/src/tree_sitter/convert.rs index 52cf27c..76833dc 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter/convert.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter/convert.rs @@ -1,58 +1,81 @@ use std::collections::HashMap; -use cstree::{build::GreenNodeBuilder, syntax::SyntaxNode, text::TextRange}; +use cstree::{build::GreenNodeBuilder, syntax::SyntaxNode, text}; -use crate::{syntax_kind::SyntaxKind, PostgreSQLSyntax, ResolvedNode}; +use crate::{syntax_kind::SyntaxKind, NodeOrToken, PostgreSQLSyntax, ResolvedNode}; -use super::{traverse_pre_order, Range}; +use super::traverse_pre_order; -/// Converts the given CST into a node structure and hierarchy that closely matches what `tree-sitter-sql` produces. -pub fn convert_cst(src: &str, root: &ResolvedNode) -> (ResolvedNode, HashMap) { +type SequentialRange = cstree::text::TextRange; // Range representation by cstree +type RowColumnRange = super::Range; // tree-sitter like range representation + +pub fn get_ts_tree_and_range_map( + src: &str, + root: &ResolvedNode, +) -> (ResolvedNode, HashMap) { let mut builder = GreenNodeBuilder::new(); - let mut range_vec = vec![]; + let mut row_column_ranges: Vec = vec![]; - // Build `Root` node + let new_line_indices: Vec<_> = src + .char_indices() + .filter(|&(_, c)| c == '\n') + .map(|(i, _)| i) + .collect(); + + row_column_ranges.push(get_row_column_range( + &NodeOrToken::Node(root), + &new_line_indices, + )); builder.start_node(SyntaxKind::Root); - walk_and_build(src, &mut builder, &mut range_vec, root); + // process subtrees. + // These Nodes will be ignored: + // - unneeded node + // - nested node + // - Whitespace token + // + // Each Node in the tree: + // 1. Add new Node (or Token) to New Tree + // 2. Create tree-sitter compatible `Range`s based on the original text. + walk_and_build( + src, + &new_line_indices, + &mut builder, + &mut row_column_ranges, + &root, + ); builder.finish_node(); let (tree, cache) = builder.finish(); - let resolved_node = + let new_root = SyntaxNode::new_root_with_resolver(tree, cache.unwrap().into_interner().unwrap()); + assert_eq!( + new_root.descendants_with_tokens().count(), + row_column_ranges.len() + ); + let mut range_map = HashMap::new(); - let mut ranges = range_vec.iter(); - traverse_pre_order(&resolved_node, |node_or_token| { - if let Some(original_range) = ranges.next() { - // clone? - range_map.insert(node_or_token.text_range(), original_range.clone()); + let mut range_iter = row_column_ranges.iter(); + traverse_pre_order(&new_root, |node_or_token| { + if let Some(original_range) = range_iter.next() { + let byte_range = node_or_token.text_range(); + range_map.insert(byte_range, original_range.clone()); } else { - unreachable!() + unreachable!(); } }); - (resolved_node, range_map) -} - -/// Traverse the CST and rewrite certain nodes -/// e.g. flatten list node, remove option node -fn walk_and_build( - input: &str, - builder: &mut GreenNodeBuilder<'static, 'static, PostgreSQLSyntax>, - range_vec: &mut Vec, - node: &ResolvedNode, -) { - use cstree::util::NodeOrToken; + assert!(range_iter.next().is_none()); - let new_line_indices: Vec<_> = input - .char_indices() - .filter(|&(_, c)| c == '\n') - .map(|(i, _)| i) - .collect(); + (new_root, range_map) +} - let text_range = node.text_range(); +fn get_row_column_range( + node_or_token: &NodeOrToken, + new_line_indices: &Vec, +) -> RowColumnRange { + let text_range: SequentialRange = node_or_token.text_range(); - // text_range の初期部分が、改行のどの部分に現れるか let before_start_new_line_count = match new_line_indices.binary_search(&text_range.start().into()) { Ok(i) => i, @@ -64,30 +87,75 @@ fn walk_and_build( Err(i) => i, }; - range_vec.push(Range { + RowColumnRange { start_row: before_start_new_line_count, - start_col: usize::from(node.text_range().start()) + start_col: usize::from(text_range.start()) - match before_start_new_line_count { 0 => 0, - // ひとつ前のインデックス(直前の改行文字)+1 - i => new_line_indices[i - 1] + 1, // +1 は改行文字分? + i => new_line_indices[i - 1] + 1, }, end_row: before_end_new_line_count, - end_col: usize::from(node.text_range().end()) + end_col: usize::from(text_range.end()) - 1 - match before_end_new_line_count { 0 => 0, i => new_line_indices[i - 1], }, + } +} + +/// Converts the given CST into a node structure and hierarchy that closely matches what `tree-sitter-sql` produces. +pub fn convert_cst(src: &str, root: &ResolvedNode) -> ResolvedNode { + let mut builder = GreenNodeBuilder::new(); + let mut row_column_ranges: Vec = vec![]; + + // Build `Root` node + builder.start_node(SyntaxKind::Root); + walk_and_build( + src, + &vec![1_usize], + &mut builder, + &mut row_column_ranges, + root, + ); + builder.finish_node(); + + let (tree, cache) = builder.finish(); + let resolved_node = + SyntaxNode::new_root_with_resolver(tree, cache.unwrap().into_interner().unwrap()); + + let mut range_map = HashMap::new(); + let mut ranges = row_column_ranges.iter(); + traverse_pre_order(&resolved_node, |node_or_token| { + if let Some(original_range) = ranges.next() { + // clone? + range_map.insert(node_or_token.text_range(), original_range.clone()); + } else { + unreachable!() + } }); + resolved_node +} + +/// Traverse the CST and rewrite certain nodes +/// e.g. flatten list node, remove option node +fn walk_and_build( + input: &str, + new_line_indices: &Vec, + builder: &mut GreenNodeBuilder<'static, 'static, PostgreSQLSyntax>, + row_column_ranges: &mut Vec, + node: &ResolvedNode, +) { + use cstree::util::NodeOrToken; + let parent_kind = node.kind(); let children = node.children_with_tokens(); for child in children { match child { - NodeOrToken::Node(n) => { - match n.kind() { + NodeOrToken::Node(child_node) => { + match child_node.kind() { child_kind @ (SyntaxKind::stmtmulti | SyntaxKind::target_list | SyntaxKind::from_list) => { @@ -102,11 +170,28 @@ fn walk_and_build( // +- target_el // +- ... // - walk_and_build(input, builder, range_vec, n); + walk_and_build( + input, + new_line_indices, + builder, + row_column_ranges, + child_node, + ); } else { // Node is target for flattening, but at the top level of the nest - builder.start_node(n.kind()); - walk_and_build(input, builder, range_vec, n); + println!("push1: {child_kind}"); + row_column_ranges.push(get_row_column_range( + &NodeOrToken::Node(&child_node), + &new_line_indices, + )); + builder.start_node(child_node.kind()); + walk_and_build( + input, + new_line_indices, + builder, + row_column_ranges, + child_node, + ); builder.finish_node(); } } @@ -129,27 +214,52 @@ fn walk_and_build( // +- child_1 +- child_2 // +- child_1 // - walk_and_build(input, builder, range_vec, n); + println!("removal: opt_target_list"); + walk_and_build( + input, + new_line_indices, + builder, + row_column_ranges, + child_node, + ); } // Default pattern _ => { - builder.start_node(n.kind()); - walk_and_build(input, builder, range_vec, n); + let k = child_node.kind(); + println!("push2: {k}"); + row_column_ranges.push(get_row_column_range( + &NodeOrToken::Node(&child_node), + &new_line_indices, + )); + builder.start_node(child_node.kind()); + walk_and_build( + input, + new_line_indices, + builder, + row_column_ranges, + child_node, + ); builder.finish_node(); } } } - NodeOrToken::Token(t) => { + NodeOrToken::Token(child_token) => { // Remove Whitespace Token // Note: // This process will break the lossless property of the CST. // `text()` for Nodes and `text_range()` for Nodes and Tokens will become incompatible with the original text. - if t.kind() == SyntaxKind::Whitespace { + if child_token.kind() == SyntaxKind::Whitespace { continue; } - builder.token(t.kind(), t.text()); + let k = child_token.kind(); + println!("push3: {k}"); + row_column_ranges.push(get_row_column_range( + &NodeOrToken::Token(child_token), + &new_line_indices, + )); + builder.token(child_token.kind(), child_token.text()); } } } From a5c9c8ddf2e6b37fbc9d9f92f1d87882abd89996 Mon Sep 17 00:00:00 2001 From: lemonadern Date: Wed, 18 Dec 2024 20:43:19 +0900 Subject: [PATCH 07/18] implement Display trait for Range --- crates/postgresql-cst-parser/src/tree_sitter.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crates/postgresql-cst-parser/src/tree_sitter.rs b/crates/postgresql-cst-parser/src/tree_sitter.rs index d1c688a..d8c64e2 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter.rs @@ -40,7 +40,11 @@ pub struct Range { impl std::fmt::Display for Range { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "[({}, {})-({}, {})]", self.start_row, self.start_col, self.end_row, self.end_col) + write!( + f, + "[({}, {})-({}, {})]", + self.start_row, self.start_col, self.end_row, self.end_col + ) } } From 37a3a08fae9e046cc9504df26756b5241c92951f Mon Sep 17 00:00:00 2001 From: lemonadern Date: Wed, 18 Dec 2024 20:43:51 +0900 Subject: [PATCH 08/18] comment out legacy node lists --- .../postgresql-cst-parser/src/tree_sitter.rs | 74 +++++++++---------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/crates/postgresql-cst-parser/src/tree_sitter.rs b/crates/postgresql-cst-parser/src/tree_sitter.rs index d8c64e2..83d8571 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter.rs @@ -48,43 +48,43 @@ impl std::fmt::Display for Range { } } -fn is_flatten_all(node_or_token: NodeOrToken) -> bool { - matches!( - node_or_token.kind(), - SyntaxKind::parse_toplevel - | SyntaxKind::stmtmulti - | SyntaxKind::toplevel_stmt - | SyntaxKind::stmt - | SyntaxKind::select_clause - | SyntaxKind::select_with_parens - | SyntaxKind::select_no_parens - | SyntaxKind::simple_select - | SyntaxKind::opt_target_list - // | SyntaxKind::relation_expr - // | SyntaxKind::extended_relation_expr - // | SyntaxKind::qualified_name - // | SyntaxKind::indirection - // | SyntaxKind::indirection_el - // | SyntaxKind::table_ref - | SyntaxKind::alias_clause - | SyntaxKind::opt_alias_clause - ) -} - -fn is_flatten_except_top(node_or_token: NodeOrToken) -> bool { - matches!( - node_or_token.kind(), - SyntaxKind::target_list | SyntaxKind::from_list - ) && node_or_token.parent().unwrap().kind() == node_or_token.kind() -} - -fn is_flatten(node_or_token: NodeOrToken) -> bool { - is_flatten_all(node_or_token) || is_flatten_except_top(node_or_token) -} - -fn is_skip(kind: SyntaxKind) -> bool { - matches!(kind, SyntaxKind::Whitespace) -} +// fn is_flatten_all(node_or_token: NodeOrToken) -> bool { +// matches!( +// node_or_token.kind(), +// SyntaxKind::parse_toplevel +// | SyntaxKind::stmtmulti +// | SyntaxKind::toplevel_stmt +// | SyntaxKind::stmt +// | SyntaxKind::select_clause +// | SyntaxKind::select_with_parens +// | SyntaxKind::select_no_parens +// | SyntaxKind::simple_select +// | SyntaxKind::opt_target_list +// // | SyntaxKind::relation_expr +// // | SyntaxKind::extended_relation_expr +// // | SyntaxKind::qualified_name +// // | SyntaxKind::indirection +// // | SyntaxKind::indirection_el +// // | SyntaxKind::table_ref +// | SyntaxKind::alias_clause +// | SyntaxKind::opt_alias_clause +// ) +// } + +// fn is_flatten_except_top(node_or_token: NodeOrToken) -> bool { +// matches!( +// node_or_token.kind(), +// SyntaxKind::target_list | SyntaxKind::from_list +// ) && node_or_token.parent().unwrap().kind() == node_or_token.kind() +// } + +// fn is_flatten(node_or_token: NodeOrToken) -> bool { +// is_flatten_all(node_or_token) || is_flatten_except_top(node_or_token) +// } + +// fn is_skip(kind: SyntaxKind) -> bool { +// matches!(kind, SyntaxKind::Whitespace) +// } impl<'a> Node<'a> { pub fn walk(&self) -> TreeCursor<'a> { From 5ce503c411524e8cd04d320badefd42b77d4bfb5 Mon Sep 17 00:00:00 2001 From: lemonadern Date: Wed, 18 Dec 2024 20:48:26 +0900 Subject: [PATCH 09/18] wip: implement tree conversion (includes lifetime error) --- .../postgresql-cst-parser/src/tree_sitter.rs | 109 ++++++++++-------- .../src/tree_sitter/convert.rs | 63 +++------- 2 files changed, 71 insertions(+), 101 deletions(-) diff --git a/crates/postgresql-cst-parser/src/tree_sitter.rs b/crates/postgresql-cst-parser/src/tree_sitter.rs index 83d8571..35a4c99 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter.rs @@ -2,7 +2,7 @@ mod assert_util; mod convert; -pub use convert::convert_cst; +pub use convert::get_ts_tree_and_range_map; use std::{collections::HashMap, fmt::Display, rc::Rc}; @@ -223,48 +223,55 @@ impl<'a> TreeCursor<'a> { } pub fn as_tree_sitter_cursor<'a>(input: &'a str, node: &'a ResolvedNode) -> TreeCursor<'a> { - let mut range_map = HashMap::new(); - - let new_line_indices: Vec<_> = input - .char_indices() - .filter(|&(_, c)| c == '\n') - .map(|(i, _)| i) - .collect(); - - traverse_pre_order(node, |node_or_token| { - let text_range = node_or_token.text_range(); - - let before_start_new_line_count = - match new_line_indices.binary_search(&text_range.start().into()) { - Ok(i) => i, - Err(i) => i, - }; - - let before_end_new_line_count = - match new_line_indices.binary_search(&text_range.end().into()) { - Ok(i) => i, - Err(i) => i, - }; - - range_map.insert( - node_or_token.text_range(), - Range { - start_row: before_start_new_line_count, - start_col: usize::from(node_or_token.text_range().start()) - - match before_start_new_line_count { - 0 => 0, - i => new_line_indices[i - 1] + 1, - }, - end_row: before_end_new_line_count, - end_col: usize::from(node_or_token.text_range().end()) - - 1 - - match before_end_new_line_count { - 0 => 0, - i => new_line_indices[i - 1], - }, - }, - ); - }); + // let mut range_map = HashMap::new(); + + // // 改行がある場所のインデックスのベクタ + // // つまり、(v[n], v[n+1]) がソースコード上における `n 行目` の範囲 + // // n: 0.. + // let new_line_indices: Vec<_> = input + // .char_indices() + // .filter(|&(_, c)| c == '\n') + // .map(|(i, _)| i) + // .collect(); + + // traverse_pre_order(node, |node_or_token| { + // let text_range = node_or_token.text_range(); + + // // text_range の初期部分が、改行のどの部分に現れるか + // let before_start_new_line_count = + // match new_line_indices.binary_search(&text_range.start().into()) { + // Ok(i) => i, + // Err(i) => i, + // }; + + // let before_end_new_line_count = + // match new_line_indices.binary_search(&text_range.end().into()) { + // Ok(i) => i, + // Err(i) => i, + // }; + + // range_map.insert( + // node_or_token.text_range(), + // Range { + // start_row: before_start_new_line_count, + // start_col: usize::from(node_or_token.text_range().start()) + // - match before_start_new_line_count { + // 0 => 0, + // // ひとつ前のインデックス(直前の改行文字)+1 + // i => new_line_indices[i - 1] + 1, // +1 は改行文字分? + // }, + // end_row: before_end_new_line_count, + // end_col: usize::from(node_or_token.text_range().end()) + // - 1 + // - match before_end_new_line_count { + // 0 => 0, + // i => new_line_indices[i - 1], + // }, + // }, + // ); + // }); + // + let (node,range_map) = get_ts_tree_and_range_map(&input, &node); TreeCursor { input, @@ -335,7 +342,7 @@ mod tests { use crate::{ cst, parse, syntax_kind::SyntaxKind, - tree_sitter::{as_tree_sitter_cursor, convert_cst, dump_as_tree_sitter_like, TreeCursor}, + tree_sitter::{as_tree_sitter_cursor, dump_as_tree_sitter_like, get_ts_tree_and_range_map, TreeCursor}, ParseError, }; @@ -400,7 +407,7 @@ FROM , B"#; let node = parse(&src).unwrap(); - let node = convert_cst(&node); + let (node, _) = get_ts_tree_and_range_map(&src, &node); let mut cursor = as_tree_sitter_cursor(src, &node); visit(&mut cursor, 0, &src); @@ -409,7 +416,7 @@ FROM #[test] fn goto_first_child_from_node() { let src = "select a, b, c from tbl;"; - let root = convert_cst(&parse(&src).unwrap()); + let (root, _) = get_ts_tree_and_range_map(&src, &parse(&src).unwrap()); let first_select = root .descendants() .find(|x| x.kind() == SyntaxKind::simple_select) @@ -425,7 +432,7 @@ FROM #[test] fn goto_first_child_from_token() { let src = "select a, b, c from tbl;"; - let root = convert_cst(&parse(&src).unwrap()); + let (root, _) = get_ts_tree_and_range_map(&src, &parse(&src).unwrap()); let column_id_node = root .descendants() .find(|x| x.kind() == SyntaxKind::ColId) @@ -442,7 +449,7 @@ FROM #[test] fn goto_parent_from_root() { let src = "select a, b, c from tbl;"; - let root = convert_cst(&parse(&src).unwrap()); + let (root, _) = get_ts_tree_and_range_map(&src, &parse(&src).unwrap()); let mut cursor = as_tree_sitter_cursor(src, &root); assert_eq!(cursor.node().kind(), SyntaxKind::Root); @@ -454,7 +461,7 @@ FROM #[test] fn goto_parent_from_node() { let src = "select a, b, c from tbl;"; - let root = convert_cst(&parse(&src).unwrap()); + let (root, _) = get_ts_tree_and_range_map(&src, &parse(&src).unwrap()); let target_element = root .descendants() @@ -470,7 +477,7 @@ FROM #[test] fn goto_parent_from_token() { let src = "select a, b, c from tbl;"; - let root = convert_cst(&parse(&src).unwrap()); + let (root, _) = get_ts_tree_and_range_map(&src, &parse(&src).unwrap()); let column_id_node = root .descendants() @@ -488,7 +495,7 @@ FROM #[test] fn goto_next_sibling() { let src = "select a,b,c from tbl;"; - let root = convert_cst(&parse(&src).unwrap()); + let (root, _) = get_ts_tree_and_range_map(&src, &parse(&src).unwrap()); let target_element = root .descendants() diff --git a/crates/postgresql-cst-parser/src/tree_sitter/convert.rs b/crates/postgresql-cst-parser/src/tree_sitter/convert.rs index 76833dc..9af5d1f 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter/convert.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter/convert.rs @@ -1,6 +1,6 @@ use std::collections::HashMap; -use cstree::{build::GreenNodeBuilder, syntax::SyntaxNode, text}; +use cstree::{build::GreenNodeBuilder, syntax::SyntaxNode}; use crate::{syntax_kind::SyntaxKind, NodeOrToken, PostgreSQLSyntax, ResolvedNode}; @@ -26,6 +26,7 @@ pub fn get_ts_tree_and_range_map( &NodeOrToken::Node(root), &new_line_indices, )); + builder.start_node(SyntaxKind::Root); // process subtrees. // These Nodes will be ignored: @@ -54,7 +55,7 @@ pub fn get_ts_tree_and_range_map( row_column_ranges.len() ); - let mut range_map = HashMap::new(); + let mut range_map: HashMap = HashMap::new(); let mut range_iter = row_column_ranges.iter(); traverse_pre_order(&new_root, |node_or_token| { if let Some(original_range) = range_iter.next() { @@ -104,40 +105,6 @@ fn get_row_column_range( } } -/// Converts the given CST into a node structure and hierarchy that closely matches what `tree-sitter-sql` produces. -pub fn convert_cst(src: &str, root: &ResolvedNode) -> ResolvedNode { - let mut builder = GreenNodeBuilder::new(); - let mut row_column_ranges: Vec = vec![]; - - // Build `Root` node - builder.start_node(SyntaxKind::Root); - walk_and_build( - src, - &vec![1_usize], - &mut builder, - &mut row_column_ranges, - root, - ); - builder.finish_node(); - - let (tree, cache) = builder.finish(); - let resolved_node = - SyntaxNode::new_root_with_resolver(tree, cache.unwrap().into_interner().unwrap()); - - let mut range_map = HashMap::new(); - let mut ranges = row_column_ranges.iter(); - traverse_pre_order(&resolved_node, |node_or_token| { - if let Some(original_range) = ranges.next() { - // clone? - range_map.insert(node_or_token.text_range(), original_range.clone()); - } else { - unreachable!() - } - }); - - resolved_node -} - /// Traverse the CST and rewrite certain nodes /// e.g. flatten list node, remove option node fn walk_and_build( @@ -179,11 +146,12 @@ fn walk_and_build( ); } else { // Node is target for flattening, but at the top level of the nest - println!("push1: {child_kind}"); + row_column_ranges.push(get_row_column_range( &NodeOrToken::Node(&child_node), &new_line_indices, )); + builder.start_node(child_node.kind()); walk_and_build( input, @@ -214,7 +182,6 @@ fn walk_and_build( // +- child_1 +- child_2 // +- child_1 // - println!("removal: opt_target_list"); walk_and_build( input, new_line_indices, @@ -226,8 +193,6 @@ fn walk_and_build( // Default pattern _ => { - let k = child_node.kind(); - println!("push2: {k}"); row_column_ranges.push(get_row_column_range( &NodeOrToken::Node(&child_node), &new_line_indices, @@ -253,8 +218,6 @@ fn walk_and_build( continue; } - let k = child_token.kind(); - println!("push3: {k}"); row_column_ranges.push(get_row_column_range( &NodeOrToken::Token(child_token), &new_line_indices, @@ -267,7 +230,7 @@ fn walk_and_build( #[cfg(test)] mod tests { - use crate::{cst, tree_sitter::convert::convert_cst}; + use crate::{cst, tree_sitter::convert::get_ts_tree_and_range_map}; #[test] fn whitespace_is_removed() { @@ -281,7 +244,7 @@ FROM , B"#; let root = cst::parse(&original).unwrap(); - let new_root = convert_cst(&original, &root); + let (new_root, _) = get_ts_tree_and_range_map(&original, &root); let whitespace_removed: String = original.split_whitespace().collect(); dbg!(&whitespace_removed); @@ -294,7 +257,7 @@ FROM syntax_kind::SyntaxKind, tree_sitter::{ assert_util::{assert_exists, assert_not_exists}, - convert::convert_cst, + convert::get_ts_tree_and_range_map, }, }; @@ -304,7 +267,7 @@ FROM let root = cst::parse(input).unwrap(); assert_exists(&root, SyntaxKind::opt_target_list); - let new_root = convert_cst(input, &root); + let (new_root, _) = get_ts_tree_and_range_map(input, &root); assert_not_exists(&new_root, SyntaxKind::opt_target_list); } } @@ -315,7 +278,7 @@ FROM syntax_kind::SyntaxKind, tree_sitter::{ assert_util::{assert_no_direct_nested_kind, assert_node_count}, - convert::convert_cst, + convert::get_ts_tree_and_range_map, }, }; @@ -326,7 +289,7 @@ FROM let root = cst::parse(input).unwrap(); assert_node_count(&root, SyntaxKind::target_list, 3); - let new_root = convert_cst(input, &root); + let (new_root, _) = get_ts_tree_and_range_map(input, &root); assert_node_count(&new_root, SyntaxKind::target_list, 1); assert_no_direct_nested_kind(&new_root, SyntaxKind::target_list); } @@ -335,7 +298,7 @@ FROM fn no_nested_stmtmulti() { let input = "select a,b,c;\nselect d,e from t;"; let root = cst::parse(input).unwrap(); - let new_root = convert_cst(&input, &root); + let (new_root, _) = get_ts_tree_and_range_map(input, &root); assert_no_direct_nested_kind(&new_root, SyntaxKind::stmtmulti); } @@ -344,7 +307,7 @@ FROM fn no_nested_from_list() { let input = "select * from t1, t2;"; let root = cst::parse(input).unwrap(); - let new_root = convert_cst(input, &root); + let (new_root, _) = get_ts_tree_and_range_map(&input, &root); assert_no_direct_nested_kind(&new_root, SyntaxKind::from_list); } From 210cc1a385344813e5b4c4a660c1d867dc9a020c Mon Sep 17 00:00:00 2001 From: lemonadern Date: Fri, 20 Dec 2024 15:23:10 +0900 Subject: [PATCH 10/18] remove comments that is not needed --- .../postgresql-cst-parser/src/tree_sitter.rs | 23 ------------------- 1 file changed, 23 deletions(-) diff --git a/crates/postgresql-cst-parser/src/tree_sitter.rs b/crates/postgresql-cst-parser/src/tree_sitter.rs index 35a4c99..41cd09b 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter.rs @@ -182,29 +182,6 @@ impl<'a> TreeCursor<'a> { false } - /// - /// These methods are unused in uroborosql-fmt - /// - // pub fn field_id(&self)-> Option { - // unimplemented!() - // } - - // pub fn field_name(&self)-> Option<&'static str> { - // unimplemented!() - // } - - // pub fn goto_first_child_for_byte(&mut self, index: usize) -> Option { - // unimplemented!() - // } - - // pub fn goto_first_child_for_point(&mut self, point: Point) -> Option { - // unimplemented!() - // } - - // pub fn reset(&mut self, node: Node<'a>) { - // unimplemented!() - // } - pub fn goto_direct_prev_sibling(&mut self) -> bool { if let Some(prev) = self.node_or_token.prev_sibling_or_token() { self.node_or_token = prev; From 01edb95269a8817909a38fb5d7271d283acff045 Mon Sep 17 00:00:00 2001 From: lemonadern Date: Fri, 20 Dec 2024 16:27:46 +0900 Subject: [PATCH 11/18] Change the interface of the function that gets the Cursor --- .../postgresql-cst-parser/src/tree_sitter.rs | 95 ++++-------- .../src/tree_sitter/convert.rs | 136 +++++++++--------- 2 files changed, 96 insertions(+), 135 deletions(-) diff --git a/crates/postgresql-cst-parser/src/tree_sitter.rs b/crates/postgresql-cst-parser/src/tree_sitter.rs index 41cd09b..b86aa39 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter.rs @@ -199,57 +199,11 @@ impl<'a> TreeCursor<'a> { } } -pub fn as_tree_sitter_cursor<'a>(input: &'a str, node: &'a ResolvedNode) -> TreeCursor<'a> { - // let mut range_map = HashMap::new(); - - // // 改行がある場所のインデックスのベクタ - // // つまり、(v[n], v[n+1]) がソースコード上における `n 行目` の範囲 - // // n: 0.. - // let new_line_indices: Vec<_> = input - // .char_indices() - // .filter(|&(_, c)| c == '\n') - // .map(|(i, _)| i) - // .collect(); - - // traverse_pre_order(node, |node_or_token| { - // let text_range = node_or_token.text_range(); - - // // text_range の初期部分が、改行のどの部分に現れるか - // let before_start_new_line_count = - // match new_line_indices.binary_search(&text_range.start().into()) { - // Ok(i) => i, - // Err(i) => i, - // }; - - // let before_end_new_line_count = - // match new_line_indices.binary_search(&text_range.end().into()) { - // Ok(i) => i, - // Err(i) => i, - // }; - - // range_map.insert( - // node_or_token.text_range(), - // Range { - // start_row: before_start_new_line_count, - // start_col: usize::from(node_or_token.text_range().start()) - // - match before_start_new_line_count { - // 0 => 0, - // // ひとつ前のインデックス(直前の改行文字)+1 - // i => new_line_indices[i - 1] + 1, // +1 は改行文字分? - // }, - // end_row: before_end_new_line_count, - // end_col: usize::from(node_or_token.text_range().end()) - // - 1 - // - match before_end_new_line_count { - // 0 => 0, - // i => new_line_indices[i - 1], - // }, - // }, - // ); - // }); - // - let (node,range_map) = get_ts_tree_and_range_map(&input, &node); - +pub fn as_tree_sitter_cursor<'a>( + input: &'a str, + node: &'a ResolvedNode, + range_map: HashMap, +) -> TreeCursor<'a> { TreeCursor { input, range_map: Rc::new(range_map), @@ -290,11 +244,12 @@ fn traverse_pre_order(node: &ResolvedNode, mut f: F) { } pub fn dump_as_tree_sitter_like(input: &str, node: &ResolvedNode) { - let mut cursor = as_tree_sitter_cursor(input, node); + let (node, range_map) = get_ts_tree_and_range_map(input, node); + let mut cursor = as_tree_sitter_cursor(input, &node, range_map); let mut depth = 0; loop { - // dbg!(cursor.node().kind(), cursor.node().text(), depth); + dbg!(cursor.node().kind(), cursor.node().text(), depth); if cursor.goto_first_child() { depth += 1; @@ -319,7 +274,9 @@ mod tests { use crate::{ cst, parse, syntax_kind::SyntaxKind, - tree_sitter::{as_tree_sitter_cursor, dump_as_tree_sitter_like, get_ts_tree_and_range_map, TreeCursor}, + tree_sitter::{ + as_tree_sitter_cursor, dump_as_tree_sitter_like, get_ts_tree_and_range_map, TreeCursor, + }, ParseError, }; @@ -384,8 +341,8 @@ FROM , B"#; let node = parse(&src).unwrap(); - let (node, _) = get_ts_tree_and_range_map(&src, &node); - let mut cursor = as_tree_sitter_cursor(src, &node); + let (node, range_map) = get_ts_tree_and_range_map(&src, &node); + let mut cursor = as_tree_sitter_cursor(src, &node, range_map); visit(&mut cursor, 0, &src); } @@ -393,13 +350,13 @@ FROM #[test] fn goto_first_child_from_node() { let src = "select a, b, c from tbl;"; - let (root, _) = get_ts_tree_and_range_map(&src, &parse(&src).unwrap()); + let (root, range_map) = get_ts_tree_and_range_map(&src, &parse(&src).unwrap()); let first_select = root .descendants() .find(|x| x.kind() == SyntaxKind::simple_select) .unwrap(); - let mut cursor = as_tree_sitter_cursor(src, &first_select); + let mut cursor = as_tree_sitter_cursor(src, &first_select, range_map); assert_eq!(cursor.node().kind(), SyntaxKind::simple_select); assert!(cursor.goto_first_child()); @@ -409,13 +366,13 @@ FROM #[test] fn goto_first_child_from_token() { let src = "select a, b, c from tbl;"; - let (root, _) = get_ts_tree_and_range_map(&src, &parse(&src).unwrap()); + let (root, range_map) = get_ts_tree_and_range_map(&src, &parse(&src).unwrap()); let column_id_node = root .descendants() .find(|x| x.kind() == SyntaxKind::ColId) .unwrap(); - let mut cursor = as_tree_sitter_cursor(src, &column_id_node); + let mut cursor = as_tree_sitter_cursor(&src, column_id_node, range_map); cursor.goto_first_child(); assert_eq!(cursor.node().kind(), SyntaxKind::IDENT); @@ -426,11 +383,11 @@ FROM #[test] fn goto_parent_from_root() { let src = "select a, b, c from tbl;"; - let (root, _) = get_ts_tree_and_range_map(&src, &parse(&src).unwrap()); + let (root, range_map) = get_ts_tree_and_range_map(&src, &parse(&src).unwrap()); - let mut cursor = as_tree_sitter_cursor(src, &root); - assert_eq!(cursor.node().kind(), SyntaxKind::Root); + let mut cursor = as_tree_sitter_cursor(src, &root, range_map); + assert_eq!(cursor.node().kind(), SyntaxKind::Root); assert!(!cursor.goto_parent()); assert_eq!(cursor.node().kind(), SyntaxKind::Root); } @@ -438,13 +395,13 @@ FROM #[test] fn goto_parent_from_node() { let src = "select a, b, c from tbl;"; - let (root, _) = get_ts_tree_and_range_map(&src, &parse(&src).unwrap()); + let (root, range_map) = get_ts_tree_and_range_map(&src, &parse(&src).unwrap()); let target_element = root .descendants() .find(|x| x.kind() == SyntaxKind::target_el) .unwrap(); - let mut cursor = as_tree_sitter_cursor(src, &target_element); + let mut cursor = as_tree_sitter_cursor(src, &target_element, range_map); assert_eq!(cursor.node().kind(), SyntaxKind::target_el); assert!(cursor.goto_parent()); @@ -454,13 +411,13 @@ FROM #[test] fn goto_parent_from_token() { let src = "select a, b, c from tbl;"; - let (root, _) = get_ts_tree_and_range_map(&src, &parse(&src).unwrap()); + let (root, range_map) = get_ts_tree_and_range_map(&src, &parse(&src).unwrap()); let column_id_node = root .descendants() .find(|x| x.kind() == SyntaxKind::ColId) .unwrap(); - let mut cursor = as_tree_sitter_cursor(src, &column_id_node); + let mut cursor = as_tree_sitter_cursor(src, &column_id_node, range_map); cursor.goto_first_child(); assert_eq!(cursor.node().kind(), SyntaxKind::IDENT); @@ -472,13 +429,13 @@ FROM #[test] fn goto_next_sibling() { let src = "select a,b,c from tbl;"; - let (root, _) = get_ts_tree_and_range_map(&src, &parse(&src).unwrap()); + let (root, range_map) = get_ts_tree_and_range_map(&src, &parse(&src).unwrap()); let target_element = root .descendants() .find(|x| x.kind() == SyntaxKind::target_el) .unwrap(); - let mut cursor = as_tree_sitter_cursor(src, &target_element); + let mut cursor = as_tree_sitter_cursor(src, &target_element, range_map); // // - target_list // - target_el (1) diff --git a/crates/postgresql-cst-parser/src/tree_sitter/convert.rs b/crates/postgresql-cst-parser/src/tree_sitter/convert.rs index 9af5d1f..275a4e4 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter/convert.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter/convert.rs @@ -6,8 +6,8 @@ use crate::{syntax_kind::SyntaxKind, NodeOrToken, PostgreSQLSyntax, ResolvedNode use super::traverse_pre_order; -type SequentialRange = cstree::text::TextRange; // Range representation by cstree -type RowColumnRange = super::Range; // tree-sitter like range representation +type SequentialRange = cstree::text::TextRange; // Range representation by cstree (Sequential bytes) +type RowColumnRange = super::Range; // tree-sitter like range representation (Rows and Columns) pub fn get_ts_tree_and_range_map( src: &str, @@ -16,57 +16,45 @@ pub fn get_ts_tree_and_range_map( let mut builder = GreenNodeBuilder::new(); let mut row_column_ranges: Vec = vec![]; - let new_line_indices: Vec<_> = src - .char_indices() - .filter(|&(_, c)| c == '\n') - .map(|(i, _)| i) - .collect(); - - row_column_ranges.push(get_row_column_range( - &NodeOrToken::Node(root), - &new_line_indices, - )); - - builder.start_node(SyntaxKind::Root); - // process subtrees. - // These Nodes will be ignored: - // - unneeded node - // - nested node - // - Whitespace token - // - // Each Node in the tree: - // 1. Add new Node (or Token) to New Tree - // 2. Create tree-sitter compatible `Range`s based on the original text. - walk_and_build( - src, - &new_line_indices, - &mut builder, - &mut row_column_ranges, - &root, - ); - builder.finish_node(); + // Build new tree, and Collect row-column style Ranges + { + let new_line_indices: Vec<_> = src + .char_indices() + .filter(|&(_, c)| c == '\n') + .map(|(i, _)| i) + .collect(); + + row_column_ranges.push(get_row_column_range( + &NodeOrToken::Node(root), + &new_line_indices, + )); + + builder.start_node(SyntaxKind::Root); + // process subtrees + // These Nodes will be ignored: + // - Unneeded node + // - Nested node + // - Whitespace token + // + // Each Node in the tree: + // 1. Add new Node (or Token) to New Tree + // 2. Create tree-sitter compatible `Range`s based on the original text. + walk_and_build( + &root, + &new_line_indices, + &mut builder, + &mut row_column_ranges, + ); + builder.finish_node(); + } + // Get New tree let (tree, cache) = builder.finish(); let new_root = SyntaxNode::new_root_with_resolver(tree, cache.unwrap().into_interner().unwrap()); - assert_eq!( - new_root.descendants_with_tokens().count(), - row_column_ranges.len() - ); - - let mut range_map: HashMap = HashMap::new(); - let mut range_iter = row_column_ranges.iter(); - traverse_pre_order(&new_root, |node_or_token| { - if let Some(original_range) = range_iter.next() { - let byte_range = node_or_token.text_range(); - range_map.insert(byte_range, original_range.clone()); - } else { - unreachable!(); - } - }); - - assert!(range_iter.next().is_none()); + // Create a mapping between the TextRanges of nodes and tokens (in bytes) and the original text ranges (in rows and columns). + let range_map = create_mapping(&new_root, row_column_ranges); (new_root, range_map) } @@ -105,14 +93,33 @@ fn get_row_column_range( } } -/// Traverse the CST and rewrite certain nodes -/// e.g. flatten list node, remove option node +fn create_mapping( + root: &ResolvedNode, + row_column_ranges: Vec, +) -> HashMap { + assert_eq!( + root.descendants_with_tokens().count(), + row_column_ranges.len() + ); + + let mut range_map: HashMap = HashMap::new(); + let mut range_iter = row_column_ranges.iter(); + traverse_pre_order(&root, |node_or_token| { + if let Some(original_range) = range_iter.next() { + let byte_range = node_or_token.text_range(); + range_map.insert(byte_range, original_range.clone()); + } + }); + + assert!(range_iter.next().is_none()); + range_map +} + fn walk_and_build( - input: &str, + node: &ResolvedNode, new_line_indices: &Vec, builder: &mut GreenNodeBuilder<'static, 'static, PostgreSQLSyntax>, row_column_ranges: &mut Vec, - node: &ResolvedNode, ) { use cstree::util::NodeOrToken; @@ -127,7 +134,7 @@ fn walk_and_build( | SyntaxKind::target_list | SyntaxKind::from_list) => { if parent_kind == child_kind { - // [Flatten] + // [Node: Flatten] // // This patten does not construct node. // @@ -138,15 +145,14 @@ fn walk_and_build( // +- ... // walk_and_build( - input, + child_node, new_line_indices, builder, row_column_ranges, - child_node, ); } else { // Node is target for flattening, but at the top level of the nest - + row_column_ranges.push(get_row_column_range( &NodeOrToken::Node(&child_node), &new_line_indices, @@ -154,11 +160,10 @@ fn walk_and_build( builder.start_node(child_node.kind()); walk_and_build( - input, + child_node, new_line_indices, builder, row_column_ranges, - child_node, ); builder.finish_node(); } @@ -172,7 +177,7 @@ fn walk_and_build( // | SyntaxKind::simple_select // | SyntaxKind::opt_target_list => { - // [Removal] + // [Node: Removal] // // Ignore current node, and continue building its children. // @@ -183,15 +188,14 @@ fn walk_and_build( // +- child_1 // walk_and_build( - input, + child_node, new_line_indices, builder, row_column_ranges, - child_node, ); } - // Default pattern + // [Node: Default] _ => { row_column_ranges.push(get_row_column_range( &NodeOrToken::Node(&child_node), @@ -199,18 +203,17 @@ fn walk_and_build( )); builder.start_node(child_node.kind()); walk_and_build( - input, + child_node, new_line_indices, builder, row_column_ranges, - child_node, ); builder.finish_node(); } } } NodeOrToken::Token(child_token) => { - // Remove Whitespace Token + // [Token: Removal] // Note: // This process will break the lossless property of the CST. // `text()` for Nodes and `text_range()` for Nodes and Tokens will become incompatible with the original text. @@ -218,6 +221,7 @@ fn walk_and_build( continue; } + // [Token: Default] row_column_ranges.push(get_row_column_range( &NodeOrToken::Token(child_token), &new_line_indices, @@ -247,7 +251,7 @@ FROM let (new_root, _) = get_ts_tree_and_range_map(&original, &root); let whitespace_removed: String = original.split_whitespace().collect(); - dbg!(&whitespace_removed); + // Lossless property of the CST is broken. assert_eq!(new_root.text(), whitespace_removed.as_str()); } From f1b54cfa45630e3ad976f932cbb2fce438d875b3 Mon Sep 17 00:00:00 2001 From: lemonadern Date: Fri, 20 Dec 2024 16:28:51 +0900 Subject: [PATCH 12/18] Change the structure of Range --- .../postgresql-cst-parser/src/tree_sitter.rs | 58 +++++++++++++------ .../src/tree_sitter/convert.rs | 20 +++++-- 2 files changed, 55 insertions(+), 23 deletions(-) diff --git a/crates/postgresql-cst-parser/src/tree_sitter.rs b/crates/postgresql-cst-parser/src/tree_sitter.rs index b86aa39..b18c6e3 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter.rs @@ -30,21 +30,31 @@ pub struct TreeCursor<'a> { node_or_token: NodeOrToken<'a>, } +// https://github.com/tree-sitter/tree-sitter/blob/90666c951d53c13cc6cf5002d971a6debed74244/lib/binding_rust/lib.rs#L74-L78 +#[derive(Debug, Clone)] +pub struct Point { + pub row: usize, + pub column: usize, +} + +impl std::fmt::Display for Point { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "({}, {})", self.row, self.column) + } +} + +// https://github.com/tree-sitter/tree-sitter/blob/90666c951d53c13cc6cf5002d971a6debed74244/lib/binding_rust/lib.rs#L80-L88 #[derive(Debug, Clone)] pub struct Range { - pub start_row: usize, - pub start_col: usize, - pub end_row: usize, - pub end_col: usize, + start_byte: usize, + end_byte: usize, + start_position: Point, + end_position: Point, } impl std::fmt::Display for Range { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "[({}, {})-({}, {})]", - self.start_row, self.start_col, self.end_row, self.end_col - ) + write!(f, "[{}-{}]", self.start_position, self.end_position) } } @@ -103,14 +113,26 @@ impl<'a> Node<'a> { .unwrap() } + pub fn start_position(&self) -> Point { + self.range().start_position + } + + pub fn end_position(&self) -> Point { + self.range().end_position + } + pub fn text(&self) -> &'a str { - // self.node_or_token - // .as_token() - // .map(|t| t.text()) - // .unwrap_or_default() - let start = self.node_or_token.text_range().start().into(); - let end = self.node_or_token.text_range().end().into(); - &self.input[start..end] + let Range { + start_byte, + end_byte, + .. + } = self.range(); + + &self.input[start_byte..end_byte] + } + + pub fn utf8_text() { + unimplemented!() } pub fn child_count(&self) -> usize { @@ -309,8 +331,8 @@ FROM print!("{}", cursor.node().kind()); if cursor.node().child_count() == 0 { - // print!(" \"{}\"", cursor.node().utf8_text(src.as_bytes()).unwrap()); - print!(" \"{}\"", cursor.node().text().escape_default()); + // print!(" \"{}\"", cursor.node().utf8_text(src.as_bytes()).unwrap()); // tree-sitter style + print!(" \"{}\"", cursor.node().text().escape_default()); // postgresql-cst-parser style } println!( // " [{}-{}]", diff --git a/crates/postgresql-cst-parser/src/tree_sitter/convert.rs b/crates/postgresql-cst-parser/src/tree_sitter/convert.rs index 275a4e4..3f9c9da 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter/convert.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter/convert.rs @@ -76,20 +76,30 @@ fn get_row_column_range( Err(i) => i, }; - RowColumnRange { - start_row: before_start_new_line_count, - start_col: usize::from(text_range.start()) + let start_position = Point { + row: before_start_new_line_count, + column: usize::from(text_range.start()) - match before_start_new_line_count { 0 => 0, i => new_line_indices[i - 1] + 1, }, - end_row: before_end_new_line_count, - end_col: usize::from(text_range.end()) + }; + + let end_position = Point { + row: before_end_new_line_count, + column: usize::from(text_range.end()) - 1 - match before_end_new_line_count { 0 => 0, i => new_line_indices[i - 1], }, + }; + + RowColumnRange { + start_byte: text_range.start().into(), + end_byte: text_range.end().into(), + start_position, + end_position, } } From 5c10dc84a3bd5c1e51a20c9634a5e1a414f3f672 Mon Sep 17 00:00:00 2001 From: lemonadern Date: Fri, 20 Dec 2024 16:29:33 +0900 Subject: [PATCH 13/18] style: fmt --- .../src/tree_sitter/convert.rs | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/crates/postgresql-cst-parser/src/tree_sitter/convert.rs b/crates/postgresql-cst-parser/src/tree_sitter/convert.rs index 3f9c9da..94472fa 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter/convert.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter/convert.rs @@ -4,7 +4,7 @@ use cstree::{build::GreenNodeBuilder, syntax::SyntaxNode}; use crate::{syntax_kind::SyntaxKind, NodeOrToken, PostgreSQLSyntax, ResolvedNode}; -use super::traverse_pre_order; +use super::{traverse_pre_order, Point}; type SequentialRange = cstree::text::TextRange; // Range representation by cstree (Sequential bytes) type RowColumnRange = super::Range; // tree-sitter like range representation (Rows and Columns) @@ -197,12 +197,7 @@ fn walk_and_build( // +- child_1 +- child_2 // +- child_1 // - walk_and_build( - child_node, - new_line_indices, - builder, - row_column_ranges, - ); + walk_and_build(child_node, new_line_indices, builder, row_column_ranges); } // [Node: Default] @@ -212,12 +207,7 @@ fn walk_and_build( &new_line_indices, )); builder.start_node(child_node.kind()); - walk_and_build( - child_node, - new_line_indices, - builder, - row_column_ranges, - ); + walk_and_build(child_node, new_line_indices, builder, row_column_ranges); builder.finish_node(); } } From 86f99979347245b7ab78656c168e244a12926c71 Mon Sep 17 00:00:00 2001 From: lemonadern Date: Fri, 20 Dec 2024 17:45:54 +0900 Subject: [PATCH 14/18] add tests --- .../postgresql-cst-parser/src/tree_sitter.rs | 131 ++++++++++++++++-- 1 file changed, 123 insertions(+), 8 deletions(-) diff --git a/crates/postgresql-cst-parser/src/tree_sitter.rs b/crates/postgresql-cst-parser/src/tree_sitter.rs index b18c6e3..55186c0 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter.rs @@ -137,9 +137,10 @@ impl<'a> Node<'a> { pub fn child_count(&self) -> usize { if let Some(node) = self.node_or_token.as_node() { - return node.children_with_tokens().count(); + node.children_with_tokens().count() + } else { + 0 } - 0 } pub fn next_sibling(&self) -> Option> { @@ -189,19 +190,19 @@ impl<'a> TreeCursor<'a> { pub fn goto_parent(&mut self) -> bool { if let Some(parent) = self.node_or_token.parent() { self.node_or_token = NodeOrToken::Node(parent); - - return true; + true + } else { + false } - - false } pub fn goto_next_sibling(&mut self) -> bool { if let Some(sibling) = self.node_or_token.next_sibling_or_token() { self.node_or_token = sibling; - return true; + true + } else { + false } - false } pub fn goto_direct_prev_sibling(&mut self) -> bool { @@ -488,4 +489,118 @@ FROM assert!(!cursor.goto_next_sibling()); assert_eq!(cursor.node().kind(), SyntaxKind::target_el); } + + #[test] + fn range() { + let src = r#" +-- comment +SELECT + 1 as X +, 2 -- comment 2 +, 3 +FROM + A +, B"#; + + let node = parse(&src).unwrap(); + let (node, range_map) = get_ts_tree_and_range_map(&src, &node); + + let mut cursor = as_tree_sitter_cursor(&src, &node, range_map); + let mut text_buf = String::from("\n"); + + 'traverse: loop { + if cursor.node().child_count() == 0 { + text_buf.push_str(&format!("{}\n", cursor.node().range())); + } + + if cursor.goto_first_child() { + } else if cursor.goto_next_sibling() { + } else { + loop { + if !cursor.goto_parent() { + break 'traverse; + } + + if cursor.goto_next_sibling() { + break; + } + } + } + } + + let expected = r#" +[(1, 0)-(1, 10)] +[(2, 0)-(2, 6)] +[(3, 1)-(3, 2)] +[(3, 3)-(3, 5)] +[(3, 6)-(3, 7)] +[(4, 0)-(4, 1)] +[(4, 2)-(4, 3)] +[(4, 4)-(4, 16)] +[(5, 0)-(5, 1)] +[(5, 2)-(5, 3)] +[(6, 0)-(6, 4)] +[(7, 1)-(7, 2)] +[(8, 0)-(8, 1)] +[(8, 2)-(8, 3)] +"#; + + assert_eq!(text_buf, expected); + } + + #[test] + fn texts() { + let src = r#" +-- comment +SELECT + 1 as X +, 2 -- comment 2 +, 3 +FROM + A +, B"#; + + let node = parse(&src).unwrap(); + let (node, range_map) = get_ts_tree_and_range_map(&src, &node); + + let mut cursor = as_tree_sitter_cursor(&src, &node, range_map); + let mut text_buf = Vec::new(); + + 'traverse: loop { + if cursor.node().child_count() == 0 { + text_buf.push(cursor.node().text()); + } + + if cursor.goto_first_child() { + } else if cursor.goto_next_sibling() { + } else { + loop { + if !cursor.goto_parent() { + break 'traverse; + } + + if cursor.goto_next_sibling() { + break; + } + } + } + } + + let mut text_buf = text_buf.iter(); + assert_eq!(text_buf.next(), Some(&"-- comment")); + assert_eq!(text_buf.next(), Some(&"SELECT")); + assert_eq!(text_buf.next(), Some(&"1")); + assert_eq!(text_buf.next(), Some(&"as")); + assert_eq!(text_buf.next(), Some(&"X")); + assert_eq!(text_buf.next(), Some(&",")); + assert_eq!(text_buf.next(), Some(&"2")); + assert_eq!(text_buf.next(), Some(&"-- comment 2")); + assert_eq!(text_buf.next(), Some(&",")); + assert_eq!(text_buf.next(), Some(&"3")); + assert_eq!(text_buf.next(), Some(&"FROM")); + assert_eq!(text_buf.next(), Some(&"A")); + assert_eq!(text_buf.next(), Some(&",")); + assert_eq!(text_buf.next(), Some(&"B")); + assert_eq!(text_buf.next(), None); + } } From 1a3e55d6ddc9a00c05f2b6275518d71509b9f6f6 Mon Sep 17 00:00:00 2001 From: lemonadern Date: Fri, 20 Dec 2024 18:09:23 +0900 Subject: [PATCH 15/18] add flatten nodes --- .../postgresql-cst-parser/src/tree_sitter.rs | 49 ++++--------------- .../src/tree_sitter/convert.rs | 19 ++++--- 2 files changed, 18 insertions(+), 50 deletions(-) diff --git a/crates/postgresql-cst-parser/src/tree_sitter.rs b/crates/postgresql-cst-parser/src/tree_sitter.rs index 55186c0..350c5cb 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter.rs @@ -58,44 +58,6 @@ impl std::fmt::Display for Range { } } -// fn is_flatten_all(node_or_token: NodeOrToken) -> bool { -// matches!( -// node_or_token.kind(), -// SyntaxKind::parse_toplevel -// | SyntaxKind::stmtmulti -// | SyntaxKind::toplevel_stmt -// | SyntaxKind::stmt -// | SyntaxKind::select_clause -// | SyntaxKind::select_with_parens -// | SyntaxKind::select_no_parens -// | SyntaxKind::simple_select -// | SyntaxKind::opt_target_list -// // | SyntaxKind::relation_expr -// // | SyntaxKind::extended_relation_expr -// // | SyntaxKind::qualified_name -// // | SyntaxKind::indirection -// // | SyntaxKind::indirection_el -// // | SyntaxKind::table_ref -// | SyntaxKind::alias_clause -// | SyntaxKind::opt_alias_clause -// ) -// } - -// fn is_flatten_except_top(node_or_token: NodeOrToken) -> bool { -// matches!( -// node_or_token.kind(), -// SyntaxKind::target_list | SyntaxKind::from_list -// ) && node_or_token.parent().unwrap().kind() == node_or_token.kind() -// } - -// fn is_flatten(node_or_token: NodeOrToken) -> bool { -// is_flatten_all(node_or_token) || is_flatten_except_top(node_or_token) -// } - -// fn is_skip(kind: SyntaxKind) -> bool { -// matches!(kind, SyntaxKind::Whitespace) -// } - impl<'a> Node<'a> { pub fn walk(&self) -> TreeCursor<'a> { unimplemented!() @@ -327,7 +289,7 @@ FROM const UNIT: usize = 2; fn visit(cursor: &mut TreeCursor, depth: usize, src: &str) { - (0..(depth * UNIT)).for_each(|_| print!(" ")); + (0..(depth * UNIT)).for_each(|_| print!("-")); print!("{}", cursor.node().kind()); @@ -361,7 +323,14 @@ SELECT , 3 FROM A -, B"#; +, B +; +select + 1 +, 2 +; + +"#; let node = parse(&src).unwrap(); let (node, range_map) = get_ts_tree_and_range_map(&src, &node); diff --git a/crates/postgresql-cst-parser/src/tree_sitter/convert.rs b/crates/postgresql-cst-parser/src/tree_sitter/convert.rs index 94472fa..149e263 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter/convert.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter/convert.rs @@ -140,8 +140,8 @@ fn walk_and_build( match child { NodeOrToken::Node(child_node) => { match child_node.kind() { - child_kind @ (SyntaxKind::stmtmulti - | SyntaxKind::target_list + child_kind @ ( + SyntaxKind::target_list | SyntaxKind::from_list) => { if parent_kind == child_kind { // [Node: Flatten] @@ -179,14 +179,13 @@ fn walk_and_build( } } - // SyntaxKind::parse_toplevel - // | SyntaxKind::stmtmulti - // | SyntaxKind::toplevel_stmt - // | SyntaxKind::stmt - // | SyntaxKind::select_no_parens - // | SyntaxKind::simple_select - // | - SyntaxKind::opt_target_list => { + SyntaxKind::parse_toplevel + | SyntaxKind::stmtmulti + | SyntaxKind::toplevel_stmt + | SyntaxKind::stmt + | SyntaxKind::select_no_parens + | SyntaxKind::simple_select + | SyntaxKind::opt_target_list => { // [Node: Removal] // // Ignore current node, and continue building its children. From 16d1feddd4b4bcbcb6be8d435edacdc3223da052 Mon Sep 17 00:00:00 2001 From: lemonadern Date: Fri, 20 Dec 2024 18:27:04 +0900 Subject: [PATCH 16/18] fix test failure --- crates/postgresql-cst-parser/src/tree_sitter.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/postgresql-cst-parser/src/tree_sitter.rs b/crates/postgresql-cst-parser/src/tree_sitter.rs index 350c5cb..ebab607 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter.rs @@ -345,11 +345,11 @@ select let (root, range_map) = get_ts_tree_and_range_map(&src, &parse(&src).unwrap()); let first_select = root .descendants() - .find(|x| x.kind() == SyntaxKind::simple_select) + .find(|x| x.kind() == SyntaxKind::SelectStmt) .unwrap(); let mut cursor = as_tree_sitter_cursor(src, &first_select, range_map); - assert_eq!(cursor.node().kind(), SyntaxKind::simple_select); + assert_eq!(cursor.node().kind(), SyntaxKind::SelectStmt); assert!(cursor.goto_first_child()); assert_eq!(cursor.node().kind(), SyntaxKind::SELECT); From 8f93acbb7950324416f3f131c388a67321517342 Mon Sep 17 00:00:00 2001 From: lemonadern Date: Fri, 20 Dec 2024 18:29:16 +0900 Subject: [PATCH 17/18] move traverse_pre_order to convert.rs --- .../postgresql-cst-parser/src/tree_sitter.rs | 32 ---------------- .../src/tree_sitter/convert.rs | 38 +++++++++++++++++-- 2 files changed, 34 insertions(+), 36 deletions(-) diff --git a/crates/postgresql-cst-parser/src/tree_sitter.rs b/crates/postgresql-cst-parser/src/tree_sitter.rs index ebab607..b2893bd 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter.rs @@ -196,38 +196,6 @@ pub fn as_tree_sitter_cursor<'a>( } } -fn traverse_pre_order(node: &ResolvedNode, mut f: F) { - let mut node_or_token = NodeOrToken::Node(node); - - loop { - f(node_or_token); - - if let Some(node) = node_or_token.as_node() { - if let Some(child) = node.first_child_or_token() { - node_or_token = child; - continue; - } - } - - if let Some(sibling) = node_or_token.next_sibling_or_token() { - node_or_token = sibling; - } else { - loop { - if let Some(parent) = node_or_token.parent() { - node_or_token = NodeOrToken::Node(parent); - } else { - return; - } - - if let Some(sibling) = node_or_token.next_sibling_or_token() { - node_or_token = sibling; - break; - } - } - } - } -} - pub fn dump_as_tree_sitter_like(input: &str, node: &ResolvedNode) { let (node, range_map) = get_ts_tree_and_range_map(input, node); let mut cursor = as_tree_sitter_cursor(input, &node, range_map); diff --git a/crates/postgresql-cst-parser/src/tree_sitter/convert.rs b/crates/postgresql-cst-parser/src/tree_sitter/convert.rs index 149e263..532f0fa 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter/convert.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter/convert.rs @@ -4,7 +4,7 @@ use cstree::{build::GreenNodeBuilder, syntax::SyntaxNode}; use crate::{syntax_kind::SyntaxKind, NodeOrToken, PostgreSQLSyntax, ResolvedNode}; -use super::{traverse_pre_order, Point}; +use super::Point; type SequentialRange = cstree::text::TextRange; // Range representation by cstree (Sequential bytes) type RowColumnRange = super::Range; // tree-sitter like range representation (Rows and Columns) @@ -103,6 +103,38 @@ fn get_row_column_range( } } +fn traverse_pre_order(node: &ResolvedNode, mut f: F) { + let mut node_or_token = NodeOrToken::Node(node); + + loop { + f(node_or_token); + + if let Some(node) = node_or_token.as_node() { + if let Some(child) = node.first_child_or_token() { + node_or_token = child; + continue; + } + } + + if let Some(sibling) = node_or_token.next_sibling_or_token() { + node_or_token = sibling; + } else { + loop { + if let Some(parent) = node_or_token.parent() { + node_or_token = NodeOrToken::Node(parent); + } else { + return; + } + + if let Some(sibling) = node_or_token.next_sibling_or_token() { + node_or_token = sibling; + break; + } + } + } + } +} + fn create_mapping( root: &ResolvedNode, row_column_ranges: Vec, @@ -140,9 +172,7 @@ fn walk_and_build( match child { NodeOrToken::Node(child_node) => { match child_node.kind() { - child_kind @ ( - SyntaxKind::target_list - | SyntaxKind::from_list) => { + child_kind @ (SyntaxKind::target_list | SyntaxKind::from_list) => { if parent_kind == child_kind { // [Node: Flatten] // From d8b9272e878388026c25166ac7c0ba46c6d2dbdb Mon Sep 17 00:00:00 2001 From: lemonadern Date: Fri, 20 Dec 2024 18:32:02 +0900 Subject: [PATCH 18/18] apply clippy --- .../src/tree_sitter/convert.rs | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/crates/postgresql-cst-parser/src/tree_sitter/convert.rs b/crates/postgresql-cst-parser/src/tree_sitter/convert.rs index 532f0fa..d367dae 100644 --- a/crates/postgresql-cst-parser/src/tree_sitter/convert.rs +++ b/crates/postgresql-cst-parser/src/tree_sitter/convert.rs @@ -40,7 +40,7 @@ pub fn get_ts_tree_and_range_map( // 1. Add new Node (or Token) to New Tree // 2. Create tree-sitter compatible `Range`s based on the original text. walk_and_build( - &root, + root, &new_line_indices, &mut builder, &mut row_column_ranges, @@ -59,10 +59,7 @@ pub fn get_ts_tree_and_range_map( (new_root, range_map) } -fn get_row_column_range( - node_or_token: &NodeOrToken, - new_line_indices: &Vec, -) -> RowColumnRange { +fn get_row_column_range(node_or_token: &NodeOrToken, new_line_indices: &[usize]) -> RowColumnRange { let text_range: SequentialRange = node_or_token.text_range(); let before_start_new_line_count = @@ -146,7 +143,7 @@ fn create_mapping( let mut range_map: HashMap = HashMap::new(); let mut range_iter = row_column_ranges.iter(); - traverse_pre_order(&root, |node_or_token| { + traverse_pre_order(root, |node_or_token| { if let Some(original_range) = range_iter.next() { let byte_range = node_or_token.text_range(); range_map.insert(byte_range, original_range.clone()); @@ -194,8 +191,8 @@ fn walk_and_build( // Node is target for flattening, but at the top level of the nest row_column_ranges.push(get_row_column_range( - &NodeOrToken::Node(&child_node), - &new_line_indices, + &NodeOrToken::Node(child_node), + new_line_indices, )); builder.start_node(child_node.kind()); @@ -232,8 +229,8 @@ fn walk_and_build( // [Node: Default] _ => { row_column_ranges.push(get_row_column_range( - &NodeOrToken::Node(&child_node), - &new_line_indices, + &NodeOrToken::Node(child_node), + new_line_indices, )); builder.start_node(child_node.kind()); walk_and_build(child_node, new_line_indices, builder, row_column_ranges); @@ -253,7 +250,7 @@ fn walk_and_build( // [Token: Default] row_column_ranges.push(get_row_column_range( &NodeOrToken::Token(child_token), - &new_line_indices, + new_line_indices, )); builder.token(child_token.kind(), child_token.text()); }