From f7686d7af24e7927f3ce2fd130f38119672d19eb Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Sat, 20 Jul 2024 04:37:47 +0200 Subject: [PATCH] getting somewhere --- Cargo.lock | 1 + helix-syntax/src/injections_tree.rs | 216 ++++++++++-- helix-syntax/src/lib.rs | 2 +- helix-syntax/src/tree_sitter.rs | 19 +- helix-syntax/src/tree_sitter/parser.rs | 44 ++- helix-syntax/src/tree_sitter/query.rs | 56 +-- .../src/tree_sitter/query/predicate.rs | 137 +++++++- .../src/tree_sitter/query_captures.rs | 66 ---- helix-syntax/src/tree_sitter/query_cursor.rs | 319 ++++++++++++++++++ helix-syntax/src/tree_sitter/query_match.rs | 1 + helix-syntax/src/tree_sitter/ropey.rs | 40 ++- .../src/tree_sitter/syntax_tree_node.rs | 3 +- 12 files changed, 731 insertions(+), 173 deletions(-) delete mode 100644 helix-syntax/src/tree_sitter/query_captures.rs create mode 100644 helix-syntax/src/tree_sitter/query_cursor.rs create mode 100644 helix-syntax/src/tree_sitter/query_match.rs diff --git a/Cargo.lock b/Cargo.lock index 610f7e650..397606e34 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1433,6 +1433,7 @@ dependencies = [ "ahash", "arc-swap", "bitflags 2.6.0", + "cc", "hashbrown 0.14.5", "helix-stdx", "libloading", diff --git a/helix-syntax/src/injections_tree.rs b/helix-syntax/src/injections_tree.rs index 793039a38..2290a0e6d 100644 --- a/helix-syntax/src/injections_tree.rs +++ b/helix-syntax/src/injections_tree.rs @@ -1,15 +1,21 @@ use core::slice; +use std::cell::RefCell; use std::iter::Peekable; +use std::mem::replace; use std::sync::Arc; use hashbrown::HashMap; +use ropey::RopeSlice; use slotmap::{new_key_type, SlotMap}; use crate::parse::LayerUpdateFlags; -use crate::tree_sitter::SyntaxTree; -use crate::{HighlightConfiguration, RopeProvider}; +use crate::tree_sitter::{ + self, Capture, InactiveQueryCursor, Parser, Query, QueryCursor, RopeTsInput, SyntaxTree, + SyntaxTreeNode, +}; +use crate::HighlightConfiguration; -// TODO(perf): replace std::ops::Range with helix_core::Range once added +// TODO(perf): replace std::ops::Range with helix_stdx::Range once added type Range = std::ops::Range; new_key_type! { @@ -23,15 +29,16 @@ pub struct LanguageLayer { pub(crate) parse_tree: Option, /// internal flags used during parsing to track incremental invalidation pub(crate) flags: LayerUpdateFlags, + ranges: Vec, pub(crate) parent: Option, - /// a list of **sorted** non-overlapping injection ranges note that + /// a list of **sorted** non-overlapping injection ranges. Note that /// injection ranges are not relative to the start of this layer but the /// start of the root layer - pub(crate) injection_ranges: Box<[InjectionRange]>, + pub(crate) injections: Box<[Injection]>, } -#[derive(Debug)] -pub(crate) struct InjectionRange { +#[derive(Debug, Clone)] +pub(crate) struct Injection { pub byte_range: Range, pub layer: LayerId, } @@ -39,11 +46,11 @@ pub(crate) struct InjectionRange { impl LanguageLayer { /// Returns the injection range **within this layers** that contains `idx`. 
/// This function will not descend into nested injections - pub(crate) fn injection_at_byte_idx(&self, idx: usize) -> Option<&InjectionRange> { + pub(crate) fn injection_at_byte_idx(&self, idx: usize) -> Option<&Injection> { let i = self - .injection_ranges + .injections .partition_point(|range| range.byte_range.start <= idx); - self.injection_ranges + self.injections .get(i) .filter(|injection| injection.byte_range.end > idx) } @@ -75,20 +82,187 @@ pub fn layer_for_byte_range(&self, start: usize, end: usize) -> LayerId { } } -struct ActiveInjection<'a> { - injections: Peekable>, - range: InjectionRange, +#[derive(Clone)] +pub struct MatchedNode { + pub capture: Capture, + pub byte_range: Range, } -struct ActiveLayer<'a, State> { - state: State, - /// the query captures just for this layer - layer_captures: Peekable>, +struct LayerQueryIter<'a> { + cursor: QueryCursor<'a, 'a, RopeTsInput<'a>>, + peeked: Option, } -type LayerQueryCaptures<'a> = tree_sitter::QueryCaptures<'a, 'a, RopeProvider<'a>, &'a [u8]>; +impl<'a> LayerQueryIter<'a> { + fn peek(&mut self) -> Option<&MatchedNode> { + if self.peeked.is_none() { + let (query_match, node_idx) = self.cursor.next_matched_node()?; + let matched_node = query_match.matched_node(node_idx); + self.peeked = Some(MatchedNode { + capture: matched_node.capture, + byte_range: matched_node.syntax_node.byte_range(), + }); + } + self.peeked.as_ref() + } -pub struct QueryCaptures<'a> { - active_layers: HashMap>, - active_injections: Vec>, + fn consume(&mut self) -> MatchedNode { + self.peeked.take().unwrap() + } +} + +struct ActiveLayer<'a> { + query_iter: LayerQueryIter<'a>, + injections: Peekable>, +} + +struct QueryBuilder<'a, 'tree> { + query: &'a Query, + node: &'a SyntaxTreeNode<'tree>, + src: RopeSlice<'a>, + injection_tree: &'a InjectionTree, +} + +pub struct QueryIter<'a, 'tree> { + query_builder: Box>, + active_layers: HashMap>, + active_injections: Vec, + current_injection: Injection, +} + +impl<'a> QueryIter<'a, '_> { + fn enter_injection(&mut self, injection: Injection) -> bool { + self.active_layers + .entry(injection.layer) + .or_insert_with(|| { + let layer = &self.query_builder.injection_tree.layers[injection.layer]; + let injection_start = layer + .injections + .partition_point(|child| child.byte_range.start < injection.byte_range.start); + let cursor = get_cursor().execute_query( + self.query_builder.query, + self.query_builder.node, + RopeTsInput::new(self.query_builder.src), + ); + ActiveLayer { + query_iter: LayerQueryIter { + cursor, + peeked: None, + }, + injections: layer.injections[injection_start..].iter().peekable(), + } + }); + let old_injection = replace(&mut self.current_injection, injection); + self.active_injections.push(old_injection); + true + } + + fn exit_injection(&mut self) -> Option { + let injection = replace(&mut self.current_injection, self.active_injections.pop()?); + let finished_layer = self.active_layers[&injection.layer] + .query_iter + .peeked + .is_none(); + if finished_layer { + let layer = self.active_layers.remove(&injection.layer).unwrap(); + reuse_cursor(layer.query_iter.cursor.reuse()); + } + Some(injection) + } +} + +pub enum QueryIterEvent { + EnterInjection(Injection), + Match(MatchedNode), + ExitInjection(Injection), +} + +impl<'a> Iterator for QueryIter<'a, '_> { + type Item = QueryIterEvent; + + fn next(&mut self) -> Option { + loop { + let active_layer = self + .active_layers + .get_mut(&self.current_injection.layer) + .unwrap(); + let next_injection = 
active_layer.injections.peek().filter(|injection| { + injection.byte_range.start < self.current_injection.byte_range.end + }); + let next_match = active_layer.query_iter.peek().filter(|matched_node| { + matched_node.byte_range.start < self.current_injection.byte_range.end + }); + + match (next_match, next_injection) { + (None, None) => { + return self.exit_injection().map(QueryIterEvent::ExitInjection); + } + (Some(_), None) => { + // consume match + let matched_node = active_layer.query_iter.consume(); + return Some(QueryIterEvent::Match(matched_node)); + } + (Some(matched_node), Some(injection)) + if matched_node.byte_range.start <= injection.byte_range.end => + { + // consume match + let matched_node = active_layer.query_iter.consume(); + // ignore nodes that are overlapped by the injection + if matched_node.byte_range.start <= injection.byte_range.start { + return Some(QueryIterEvent::Match(matched_node)); + } + } + (Some(_), Some(_)) | (None, Some(_)) => { + // consume injection + let injection = active_layer.injections.next().unwrap(); + if self.enter_injection(injection.clone()) { + return Some(QueryIterEvent::EnterInjection(injection.clone())); + } + } + } + } + } +} + +struct TsParser { + parser: crate::tree_sitter::Parser, + pub cursors: Vec, +} + +// could also just use a pool, or a single instance? +thread_local! { + static PARSER: RefCell = RefCell::new(TsParser { + parser: Parser::new(), + cursors: Vec::new(), + }) +} + +pub fn with_cursor(f: impl FnOnce(&mut InactiveQueryCursor) -> T) -> T { + PARSER.with(|parser| { + let mut parser = parser.borrow_mut(); + let mut cursor = parser + .cursors + .pop() + .unwrap_or_else(InactiveQueryCursor::new); + let res = f(&mut cursor); + parser.cursors.push(cursor); + res + }) +} + +pub fn get_cursor() -> InactiveQueryCursor { + PARSER.with(|parser| { + let mut parser = parser.borrow_mut(); + parser + .cursors + .pop() + .unwrap_or_else(InactiveQueryCursor::new) + }) +} + +pub fn reuse_cursor(cursor: InactiveQueryCursor) { + PARSER.with(|parser| { + let mut parser = parser.borrow_mut(); + parser.cursors.push(cursor) + }) } diff --git a/helix-syntax/src/lib.rs b/helix-syntax/src/lib.rs index 074f87272..b7331a3aa 100644 --- a/helix-syntax/src/lib.rs +++ b/helix-syntax/src/lib.rs @@ -337,7 +337,7 @@ struct TsParser { pub fn with_cursor(f: impl FnOnce(&mut QueryCursor) -> T) -> T { PARSER.with(|parser| { let mut parser = parser.borrow_mut(); - let mut cursor = parser.cursors.pop().unwrap_or_else(QueryCursor::new); + let mut cursor = parser.cursors.pop().unwrap_or_default(); let res = f(&mut cursor); parser.cursors.push(cursor); res diff --git a/helix-syntax/src/tree_sitter.rs b/helix-syntax/src/tree_sitter.rs index c2aa6fad9..bb188d120 100644 --- a/helix-syntax/src/tree_sitter.rs +++ b/helix-syntax/src/tree_sitter.rs @@ -1,13 +1,19 @@ mod grammar; mod parser; mod query; -mod query_captures; +mod query_cursor; +mod query_match; mod ropey; mod syntax_tree; mod syntax_tree_node; +use std::ops; + pub use grammar::Grammar; pub use parser::{Parser, ParserInputRaw}; +pub use query::{Capture, ParserErrorLocation, Pattern, Query, QueryStr}; +pub use query_cursor::{InactiveQueryCursor, MatchedNode, MatchedNodeIdx, QueryCursor, QueryMatch}; +pub use ropey::RopeTsInput; pub use syntax_tree::{InputEdit, SyntaxTree}; pub use syntax_tree_node::SyntaxTreeNode; @@ -26,3 +32,14 @@ pub struct Range { pub start_byte: u32, pub end_byte: u32, } + +pub trait TsInput { + type Cursor: regex_cursor::Cursor; + fn cursor_at(&mut self, offset: usize) -> &mut 
Self::Cursor; + fn eq(&mut self, range1: ops::Range, range2: ops::Range) -> bool; +} + +pub trait IntoTsInput { + type TsInput: TsInput; + fn into_ts_input(self) -> Self::TsInput; +} diff --git a/helix-syntax/src/tree_sitter/parser.rs b/helix-syntax/src/tree_sitter/parser.rs index 611637390..bcd5fa790 100644 --- a/helix-syntax/src/tree_sitter/parser.rs +++ b/helix-syntax/src/tree_sitter/parser.rs @@ -1,10 +1,12 @@ use std::os::raw::c_void; -use std::panic::catch_unwind; +use std::panic::{catch_unwind, AssertUnwindSafe}; use std::ptr::NonNull; use std::{fmt, ptr}; +use regex_cursor::Cursor; + use crate::tree_sitter::syntax_tree::{SyntaxTree, SyntaxTreeData}; -use crate::tree_sitter::{Grammar, Point, Range}; +use crate::tree_sitter::{Grammar, IntoTsInput, Point, Range, TsInput}; // opaque data enum ParserData {} @@ -51,25 +53,28 @@ pub fn set_included_ranges(&mut self, ranges: &[Range]) -> Result<(), InvalidRan } #[must_use] - pub fn parse( + pub fn parse( &mut self, - input: impl IntoParserInput, + input: impl IntoTsInput, old_tree: Option<&SyntaxTree>, ) -> Option { - let mut input = input.into_parser_input(); - unsafe extern "C" fn read( + let mut input = input.into_ts_input(); + unsafe extern "C" fn read( payload: NonNull, byte_index: u32, _position: Point, - bytes_read: &mut u32, + bytes_read: *mut u32, ) -> *const u8 { - match catch_unwind(|| { - let cursor: &mut C = payload.cast().as_mut(); - cursor.read(byte_index as usize) - }) { - Ok(slice) => { - *bytes_read = slice.len() as u32; - slice.as_ptr() + let cursor = catch_unwind(AssertUnwindSafe(move || { + let input: &mut C = payload.cast().as_mut(); + let cursor = input.cursor_at(byte_index as usize); + let slice = cursor.chunk(); + (slice.as_ptr(), slice.len().try_into().unwrap()) + })); + match cursor { + Ok((ptr, len)) => { + *bytes_read = len; + ptr } Err(_) => { *bytes_read = 0; @@ -121,7 +126,7 @@ impl std::error::Error for InvalidRangesErrror {} payload: NonNull, byte_index: u32, position: Point, - bytes_read: &mut u32, + bytes_read: *mut u32, ) -> *const u8; #[repr(C)] @@ -132,15 +137,6 @@ pub struct ParserInputRaw { pub encoding: u32, } -pub trait ParserInput { - fn read(&mut self, offset: usize) -> &[u8]; -} - -pub trait IntoParserInput { - type ParserInput; - fn into_parser_input(self) -> Self::ParserInput; -} - extern "C" { /// Create a new parser fn ts_parser_new() -> NonNull; diff --git a/helix-syntax/src/tree_sitter/query.rs b/helix-syntax/src/tree_sitter/query.rs index 514b89908..3fb1fc188 100644 --- a/helix-syntax/src/tree_sitter/query.rs +++ b/helix-syntax/src/tree_sitter/query.rs @@ -4,8 +4,6 @@ use std::ptr::NonNull; use std::{slice, str}; -use regex_cursor::Cursor; - use crate::tree_sitter::query::predicate::{InvalidPredicateError, Predicate, TextPredicate}; use crate::tree_sitter::query::property::QueryProperty; use crate::tree_sitter::Grammar; @@ -13,20 +11,23 @@ mod predicate; mod property; +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct Pattern(pub(crate) u32); + pub enum QueryData {} -pub(super) struct Pattern { +pub(super) struct PatternData { text_predicates: Range, properties: Range, } pub struct Query { - raw: NonNull, + pub(crate) raw: NonNull, num_captures: u32, num_strings: u32, text_predicates: Vec, properties: Vec, - patterns: Box<[Pattern]>, + patterns: Box<[PatternData]>, } impl Query { @@ -40,7 +41,7 @@ pub fn new( grammar: Grammar, source: &str, path: impl AsRef, - mut custom_predicate: impl FnMut(Predicate) -> Result<(), InvalidPredicateError>, + mut custom_predicate: impl 
FnMut(Pattern, Predicate) -> Result<(), InvalidPredicateError>, ) -> Result { assert!( source.len() <= i32::MAX as usize, @@ -121,7 +122,7 @@ pub fn new( } RawQueryError::Language => unreachable!("should be handled at grammar load"), }; - return Err(err) + return Err(err); }; // I am not going to bother with safety comments here, all of these are @@ -141,7 +142,7 @@ pub fn new( let patterns: Result<_, ParseError> = (0..num_patterns) .map(|pattern| { query - .parse_pattern_predicates(pattern, &mut custom_predicate) + .parse_pattern_predicates(Pattern(pattern), &mut custom_predicate) .map_err(|err| ParseError::InvalidPredicate { message: err.msg.into(), location: ParserErrorLocation::new( @@ -157,35 +158,6 @@ pub fn new( Ok(query) } - pub fn satsifies_text_predicate( - &self, - cursor: &mut regex_cursor::Input, - pattern: u32, - ) { - let text_predicates = self.patterns[pattern as usize].text_predicates; - let text_predicates = - &self.text_predicates[text_predicates.start as usize..text_predicates.end as usize]; - for predicate in text_predicates { - match predicate.kind { - predicate::TextPredicateKind::EqString(_) => todo!(), - predicate::TextPredicateKind::EqCapture(_) => todo!(), - predicate::TextPredicateKind::MatchString(_) => todo!(), - predicate::TextPredicateKind::AnyString(_) => todo!(), - } - } - } - - // fn parse_predicates(&mut self) { - // let pattern_count = unsafe { ts_query_pattern_count(self.raw) }; - - // let mut text_predicates = Vec::with_capacity(pattern_count as usize); - // let mut property_predicates = Vec::with_capacity(pattern_count as usize); - // let mut property_settings = Vec::with_capacity(pattern_count as usize); - // let mut general_predicates = Vec::with_capacity(pattern_count as usize); - - // for i in 0..pattern_count {} - // } - #[inline] fn get_string(&self, str: QueryStr) -> &str { let value_id = str.0; @@ -218,10 +190,15 @@ pub fn capture_name(&self, capture_idx: Capture) -> &str { } } - pub fn pattern_properies(&self, pattern_idx: u32) -> &[QueryProperty] { - let range = self.patterns[pattern_idx as usize].properties.clone(); + pub fn pattern_properies(&self, pattern_idx: Pattern) -> &[QueryProperty] { + let range = self.patterns[pattern_idx.0 as usize].properties.clone(); &self.properties[range.start as usize..range.end as usize] } + + pub(crate) fn pattern_text_predicates(&self, pattern_idx: u16) -> &[TextPredicate] { + let range = self.patterns[pattern_idx as usize].text_predicates.clone(); + &self.text_predicates[range.start as usize..range.end as usize] + } } impl Drop for Query { @@ -231,6 +208,7 @@ fn drop(&mut self) { } #[derive(Clone, Copy, Debug, PartialEq, Eq)] +#[repr(transparent)] pub struct Capture(u32); impl Capture { diff --git a/helix-syntax/src/tree_sitter/query/predicate.rs b/helix-syntax/src/tree_sitter/query/predicate.rs index 8fac6cf7d..7a2f858ed 100644 --- a/helix-syntax/src/tree_sitter/query/predicate.rs +++ b/helix-syntax/src/tree_sitter/query/predicate.rs @@ -1,11 +1,16 @@ use std::error::Error; +use std::iter::zip; +use std::ops::Range; use std::ptr::NonNull; use std::{fmt, slice}; use crate::tree_sitter::query::property::QueryProperty; -use crate::tree_sitter::query::{Capture, Pattern, Query, QueryData, QueryStr}; +use crate::tree_sitter::query::{Capture, Pattern, PatternData, Query, QueryData, QueryStr}; +use crate::tree_sitter::query_cursor::MatchedNode; +use crate::tree_sitter::TsInput; use regex_cursor::engines::meta::Regex; +use regex_cursor::Cursor; macro_rules! 
bail { ($($args:tt)*) => {{ @@ -29,25 +34,141 @@ pub(super) enum TextPredicateKind { AnyString(Box<[QueryStr]>), } -pub(super) struct TextPredicate { +pub(crate) struct TextPredicate { capture: Capture, kind: TextPredicateKind, negated: bool, match_all: bool, } +fn input_matches_str(str: &str, range: Range, input: &mut I) -> bool { + if str.len() != range.len() { + return false; + } + let mut str = str.as_bytes(); + let cursor = input.cursor_at(range.start); + let start_in_chunk = range.start - cursor.offset(); + if range.end - cursor.offset() <= cursor.chunk().len() { + // hotpath + return &cursor.chunk()[start_in_chunk..range.end - cursor.offset()] == str; + } + if cursor.chunk()[start_in_chunk..] != str[..cursor.chunk().len() - start_in_chunk] { + return false; + } + str = &str[..cursor.chunk().len() - start_in_chunk]; + while cursor.advance() { + if str.len() <= cursor.chunk().len() { + return &cursor.chunk()[..range.end - cursor.offset()] == str; + } + if &str[..cursor.chunk().len()] != cursor.chunk() { + return false; + } + str = &str[cursor.chunk().len()..] + } + // buggy cursor/invalid range + false +} + +fn inputs_match(str: &str, range: Range, input: &mut I) -> bool { + if str.len() != range.len() { + return false; + } + let mut str = str.as_bytes(); + let cursor = input.cursor_at(range.start); + let start_in_chunk = range.start - cursor.offset(); + if range.end - cursor.offset() <= cursor.chunk().len() { + // hotpath + return &cursor.chunk()[start_in_chunk..range.end - cursor.offset()] == str; + } + if cursor.chunk()[start_in_chunk..] != str[..cursor.chunk().len() - start_in_chunk] { + return false; + } + str = &str[..cursor.chunk().len() - start_in_chunk]; + while cursor.advance() { + if str.len() <= cursor.chunk().len() { + return &cursor.chunk()[..range.end - cursor.offset()] == str; + } + if &str[..cursor.chunk().len()] != cursor.chunk() { + return false; + } + str = &str[cursor.chunk().len()..] 
+ } + // buggy cursor/invalid range + false +} + +impl TextPredicate { + /// handlers match_all and negated + fn satisfied_helper(&self, mut nodes: impl Iterator) -> bool { + if self.match_all { + nodes.all(|matched| matched != self.negated) + } else { + nodes.any(|matched| matched != self.negated) + } + } + + pub fn satsified( + &self, + input: &mut I, + matched_nodes: &[MatchedNode], + query: &Query, + ) -> bool { + let mut capture_nodes = matched_nodes + .iter() + .filter(|matched_node| matched_node.capture == self.capture); + match self.kind { + TextPredicateKind::EqString(str) => self.satisfied_helper(capture_nodes.map(|node| { + let range = node.syntax_node.byte_range(); + input_matches_str(query.get_string(str), range.clone(), input) + })), + TextPredicateKind::EqCapture(other_capture) => { + let mut other_nodes = matched_nodes + .iter() + .filter(|matched_node| matched_node.capture == other_capture); + + let res = self.satisfied_helper(zip(&mut capture_nodes, &mut other_nodes).map( + |(node1, node2)| { + let range1 = node1.syntax_node.byte_range(); + let range2 = node2.syntax_node.byte_range(); + input.eq(range1, range2) + }, + )); + let consumed_all = capture_nodes.next().is_none() && other_nodes.next().is_none(); + res && (!self.match_all || consumed_all) + } + TextPredicateKind::MatchString(ref regex) => { + self.satisfied_helper(capture_nodes.map(|node| { + let range = node.syntax_node.byte_range(); + let input = regex_cursor::Input::new(input.cursor_at(range.start)).range(range); + regex.is_match(input) + })) + } + TextPredicateKind::AnyString(ref strings) => { + let strings = strings.iter().map(|&str| query.get_string(str)); + self.satisfied_helper(capture_nodes.map(|node| { + let range = node.syntax_node.byte_range(); + strings + .clone() + .filter(|str| str.len() == range.len()) + .any(|str| input_matches_str(str, range.clone(), input)) + })) + } + } + } +} + impl Query { pub(super) fn parse_pattern_predicates( &mut self, - pattern_index: u32, - mut custom_predicate: impl FnMut(Predicate) -> Result<(), InvalidPredicateError>, - ) -> Result { + pattern: Pattern, + mut custom_predicate: impl FnMut(Pattern, Predicate) -> Result<(), InvalidPredicateError>, + ) -> Result { let text_predicate_start = self.text_predicates.len() as u32; let property_start = self.properties.len() as u32; let predicate_steps = unsafe { let mut len = 0u32; - let raw_predicates = ts_query_predicates_for_pattern(self.raw, pattern_index, &mut len); + let raw_predicates = ts_query_predicates_for_pattern(self.raw, pattern.0, &mut len); (len != 0) .then(|| slice::from_raw_parts(raw_predicates, len as usize)) .unwrap_or_default() @@ -118,10 +239,10 @@ pub(super) fn parse_pattern_predicates( // is and is-not are better handeled as custom predicates since interpreting is context dependent // "is?" => property_predicates.push((QueryProperty::parse(&predicate), false)), // "is-not?" 
=> property_predicates.push((QueryProperty::parse(&predicate), true)), - _ => custom_predicate(predicate)?, + _ => custom_predicate(pattern, predicate)?, } } - Ok(Pattern { + Ok(PatternData { text_predicates: text_predicate_start..self.text_predicates.len() as u32, properties: property_start..self.properties.len() as u32, }) diff --git a/helix-syntax/src/tree_sitter/query_captures.rs b/helix-syntax/src/tree_sitter/query_captures.rs deleted file mode 100644 index 41a2c0938..000000000 --- a/helix-syntax/src/tree_sitter/query_captures.rs +++ /dev/null @@ -1,66 +0,0 @@ -use std::ptr::{self, NonNull}; - -use regex_cursor::Cursor; - -use crate::tree_sitter::query::Query; -use crate::tree_sitter::syntax_tree_node::SyntaxTreeNodeRaw; - -enum QueryCursorData {} - -pub struct QueryCaptures<'a> { - query: &'a Query, - query_cursor: &'a mut QueryCursorData, - text_cursor: regex_cursor::RopeyCursor<'a>, -} - -impl QueryCaptures<'_, C> { - fn next(&mut self) { - let mut query_match = TSQueryMatch { - id: 0, - pattern_index: 0, - capture_count: 0, - captures: ptr::null(), - }; - let mut capture_idx = 0; - loop { - let success = unsafe { - ts_query_cursor_next_capture( - &mut self.query_cursor, - &mut query_match, - &mut capture_idx, - ) - }; - if !success { - break; - } - } - let mut input = regex_cursor::Input::new(self.text_cursor.clone()); - } -} - -#[repr(C)] -#[derive(Debug)] -struct TSQueryCapture { - node: SyntaxTreeNodeRaw, - index: u32, -} - -#[repr(C)] -#[derive(Debug)] -struct TSQueryMatch { - id: u32, - pattern_index: u16, - capture_count: u16, - captures: *const TSQueryCapture, -} - -extern "C" { - /// Advance to the next capture of the currently running query. - /// If there is a capture, write its match to `*match` and its index within - /// the matche's capture list to `*capture_index`. Otherwise, return `false`. 
- fn ts_query_cursor_next_capture( - self_: &mut QueryCursorData, - match_: &mut TSQueryMatch, - capture_index: &mut u32, - ) -> bool; -} diff --git a/helix-syntax/src/tree_sitter/query_cursor.rs b/helix-syntax/src/tree_sitter/query_cursor.rs new file mode 100644 index 000000000..368aeadfa --- /dev/null +++ b/helix-syntax/src/tree_sitter/query_cursor.rs @@ -0,0 +1,319 @@ +use core::slice; +use std::marker::PhantomData; +use std::mem::replace; +use std::ops::Range; +use std::ptr::{self, NonNull}; + +use crate::tree_sitter::query::{Capture, Pattern, Query, QueryData}; +use crate::tree_sitter::syntax_tree_node::SyntaxTreeNodeRaw; +use crate::tree_sitter::{SyntaxTree, SyntaxTreeNode, TsInput}; + +enum QueryCursorData {} + +pub struct QueryCursor<'a, 'tree, I: TsInput> { + query: &'a Query, + ptr: *mut QueryCursorData, + tree: PhantomData<&'tree SyntaxTree>, + input: I, +} + +impl<'tree, I: TsInput> QueryCursor<'_, 'tree, I> { + pub fn next_match(&mut self) -> Option> { + let mut query_match = TSQueryMatch { + id: 0, + pattern_index: 0, + capture_count: 0, + captures: ptr::null(), + }; + loop { + let success = unsafe { ts_query_cursor_next_match(self.ptr, &mut query_match) }; + if !success { + return None; + } + let matched_nodes = unsafe { + slice::from_raw_parts( + query_match.captures.cast(), + query_match.capture_count as usize, + ) + }; + let satisfies_predicates = self + .query + .pattern_text_predicates(query_match.pattern_index) + .iter() + .all(|predicate| predicate.satsified(&mut self.input, matched_nodes, self.query)); + if satisfies_predicates { + let res = QueryMatch { + id: query_match.id, + pattern: Pattern(query_match.pattern_index as u32), + matched_nodes, + query_cursor: unsafe { &mut *self.ptr }, + _tree: PhantomData, + }; + return Some(res); + } + } + } + + pub fn next_matched_node(&mut self) -> Option<(QueryMatch<'_, 'tree>, MatchedNodeIdx)> { + let mut query_match = TSQueryMatch { + id: 0, + pattern_index: 0, + capture_count: 0, + captures: ptr::null(), + }; + let mut capture_idx = 0; + loop { + let success = unsafe { + ts_query_cursor_next_capture(self.ptr, &mut query_match, &mut capture_idx) + }; + if !success { + return None; + } + let matched_nodes = unsafe { + slice::from_raw_parts( + query_match.captures.cast(), + query_match.capture_count as usize, + ) + }; + let satisfies_predicates = self + .query + .pattern_text_predicates(query_match.pattern_index) + .iter() + .all(|predicate| predicate.satsified(&mut self.input, matched_nodes, self.query)); + if satisfies_predicates { + let res = QueryMatch { + id: query_match.id, + pattern: Pattern(query_match.pattern_index as u32), + matched_nodes, + query_cursor: unsafe { &mut *self.ptr }, + _tree: PhantomData, + }; + return Some((res, capture_idx)); + } else { + unsafe { + ts_query_cursor_remove_match(self.ptr, query_match.id); + } + } + } + } + + pub fn set_byte_range(&mut self, range: Range) { + unsafe { + ts_query_cursor_set_byte_range(self.ptr, range.start as u32, range.end as u32); + } + } + + pub fn reuse(mut self) -> InactiveQueryCursor { + let ptr = replace(&mut self.ptr, ptr::null_mut()); + InactiveQueryCursor { + ptr: unsafe { NonNull::new_unchecked(ptr) }, + } + } +} + +impl Drop for QueryCursor<'_, '_, I> { + fn drop(&mut self) { + // we allow moving the cursor data out so we need the null check here + // would be cleaner with a subtype but doesn't really matter at the end of the day + if !self.ptr.is_null() { + unsafe { ts_query_cursor_delete(self.ptr) } + } + } +} + +/// A query cursor that is not actively 
associated with a query +pub struct InactiveQueryCursor { + ptr: NonNull, +} + +impl InactiveQueryCursor { + pub fn new() -> Self { + InactiveQueryCursor { + ptr: unsafe { NonNull::new_unchecked(ts_query_cursor_new()) }, + } + } + + /// Return the maximum number of in-progress matches for this cursor. + #[doc(alias = "ts_query_cursor_match_limit")] + #[must_use] + pub fn match_limit(&self) -> u32 { + unsafe { ts_query_cursor_match_limit(self.ptr.as_ptr()) } + } + + /// Set the maximum number of in-progress matches for this cursor. The + /// limit must be > 0 and <= 65536. + #[doc(alias = "ts_query_cursor_set_match_limit")] + pub fn set_match_limit(&mut self, limit: u32) { + unsafe { + ts_query_cursor_set_match_limit(self.ptr.as_ptr(), limit); + } + } + + /// Check if, on its last execution, this cursor exceeded its maximum number + /// of in-progress matches. + #[doc(alias = "ts_query_cursor_did_exceed_match_limit")] + #[must_use] + pub fn did_exceed_match_limit(&self) -> bool { + unsafe { ts_query_cursor_did_exceed_match_limit(self.ptr.as_ptr()) } + } + + pub fn set_byte_range(&mut self, range: Range) { + unsafe { + ts_query_cursor_set_byte_range(self.ptr.as_ptr(), range.start as u32, range.end as u32); + } + } + + pub fn execute_query<'a, 'tree, I: TsInput>( + self, + query: &'a Query, + node: &SyntaxTreeNode<'tree>, + input: I, + ) -> QueryCursor<'a, 'tree, I> { + let ptr = self.ptr.as_ptr(); + unsafe { ts_query_cursor_exec(self.ptr.as_ptr(), query.raw.as_ref(), node.as_raw()) }; + QueryCursor { + query, + ptr, + tree: PhantomData, + input, + } + } +} + +impl Drop for InactiveQueryCursor { + fn drop(&mut self) { + unsafe { ts_query_cursor_delete(self.ptr.as_ptr()) } + } +} + +pub type MatchedNodeIdx = u32; + +#[repr(C)] +#[derive(Clone)] +pub struct MatchedNode<'tree> { + pub syntax_node: SyntaxTreeNode<'tree>, + pub capture: Capture, +} + +pub struct QueryMatch<'cursor, 'tree> { + id: u32, + pattern: Pattern, + matched_nodes: &'cursor [MatchedNode<'tree>], + query_cursor: &'cursor mut QueryCursorData, + _tree: PhantomData<&'tree super::SyntaxTree>, +} + +impl<'tree> QueryMatch<'_, 'tree> { + pub fn matched_nodes(&self) -> impl Iterator> { + self.matched_nodes.iter() + } + + pub fn matched_node(&self, i: MatchedNodeIdx) -> &MatchedNode { + &self.matched_nodes[i as usize] + } + + #[must_use] + pub const fn id(&self) -> u32 { + self.id + } + + #[must_use] + pub const fn pattern(&self) -> Pattern { + self.pattern + } + + #[doc(alias = "ts_query_cursor_remove_match")] + /// removes this match from the cursor so that further captures + /// from its cursor so that future captures that belong to this match + /// are no longer returned by capture iterators + pub fn remove(self) { + unsafe { + ts_query_cursor_remove_match(self.query_cursor, self.id); + } + } +} + +#[repr(C)] +#[derive(Debug)] +struct TSQueryCapture { + node: SyntaxTreeNodeRaw, + index: u32, +} + +#[repr(C)] +#[derive(Debug)] +struct TSQueryMatch { + id: u32, + pattern_index: u16, + capture_count: u16, + captures: *const TSQueryCapture, +} + +extern "C" { + /// Advance to the next capture of the currently running query. + /// If there is a capture, write its match to `*match` and its index within + /// the matche's capture list to `*capture_index`. Otherwise, return `false`. + fn ts_query_cursor_next_capture( + self_: *mut QueryCursorData, + match_: &mut TSQueryMatch, + capture_index: &mut u32, + ) -> bool; + + /// Advance to the next match of the currently running query. 
+ /// + /// If there is a match, write it to `*match` and return `true`. + /// Otherwise, return `false`. + pub fn ts_query_cursor_next_match( + self_: *mut QueryCursorData, + match_: &mut TSQueryMatch, + ) -> bool; + fn ts_query_cursor_remove_match(self_: *mut QueryCursorData, match_id: u32); + /// Delete a query cursor, freeing all of the memory that it used + pub fn ts_query_cursor_delete(self_: *mut QueryCursorData); + /// Create a new cursor for executing a given query. + /// The cursor stores the state that is needed to iteratively search + /// for matches. To use the query cursor, first call [`ts_query_cursor_exec`] + /// to start running a given query on a given syntax node. Then, there are + /// two options for consuming the results of the query: + /// 1. Repeatedly call [`ts_query_cursor_next_match`] to iterate over all of the + /// *matches* in the order that they were found. Each match contains the + /// index of the pattern that matched, and an array of captures. Because + /// multiple patterns can match the same set of nodes, one match may contain + /// captures that appear *before* some of the captures from a previous match. + /// 2. Repeatedly call [`ts_query_cursor_next_capture`] to iterate over all of the + /// individual *captures* in the order that they appear. This is useful if + /// don't care about which pattern matched, and just want a single ordered + /// sequence of captures. + /// If you don't care about consuming all of the results, you can stop calling + /// [`ts_query_cursor_next_match`] or [`ts_query_cursor_next_capture`] at any point. + /// You can then start executing another query on another node by calling + /// [`ts_query_cursor_exec`] again."] + pub fn ts_query_cursor_new() -> *mut QueryCursorData; + + /// Start running a given query on a given node. + pub fn ts_query_cursor_exec( + self_: *mut QueryCursorData, + query: &QueryData, + node: SyntaxTreeNodeRaw, + ); + /// Manage the maximum number of in-progress matches allowed by this query + /// cursor. + /// + /// Query cursors have an optional maximum capacity for storing lists of + /// in-progress captures. If this capacity is exceeded, then the + /// earliest-starting match will silently be dropped to make room for further + /// matches. This maximum capacity is optional — by default, query cursors allow + /// any number of pending matches, dynamically allocating new space for them as + /// needed as the query is executed. + pub fn ts_query_cursor_did_exceed_match_limit(self_: *const QueryCursorData) -> bool; + pub fn ts_query_cursor_match_limit(self_: *const QueryCursorData) -> u32; + pub fn ts_query_cursor_set_match_limit(self_: *mut QueryCursorData, limit: u32); + /// Set the range of bytes or (row, column) positions in which the query + /// will be executed. 
+ pub fn ts_query_cursor_set_byte_range( + self_: *mut QueryCursorData, + start_byte: u32, + end_byte: u32, + ); + +} diff --git a/helix-syntax/src/tree_sitter/query_match.rs b/helix-syntax/src/tree_sitter/query_match.rs new file mode 100644 index 000000000..b0f1eaaaf --- /dev/null +++ b/helix-syntax/src/tree_sitter/query_match.rs @@ -0,0 +1 @@ +pub struct QueryMatch {} diff --git a/helix-syntax/src/tree_sitter/ropey.rs b/helix-syntax/src/tree_sitter/ropey.rs index aa59b2f27..3e2354da6 100644 --- a/helix-syntax/src/tree_sitter/ropey.rs +++ b/helix-syntax/src/tree_sitter/ropey.rs @@ -1,38 +1,54 @@ +use std::ops; + use regex_cursor::{Cursor, RopeyCursor}; use ropey::RopeSlice; -use crate::tree_sitter::parser::{IntoParserInput, ParserInput}; +use crate::tree_sitter::{IntoTsInput, TsInput}; -pub struct RopeParserInput<'a> { +pub struct RopeTsInput<'a> { src: RopeSlice<'a>, cursor: regex_cursor::RopeyCursor<'a>, } -impl<'a> IntoParserInput for RopeSlice<'a> { - type ParserInput = RopeParserInput<'a>; +impl<'a> RopeTsInput<'a> { + pub fn new(src: RopeSlice<'a>) -> Self { + RopeTsInput { + src, + cursor: regex_cursor::RopeyCursor::new(src), + } + } +} - fn into_parser_input(self) -> Self::ParserInput { - RopeParserInput { +impl<'a> IntoTsInput for RopeSlice<'a> { + type TsInput = RopeTsInput<'a>; + + fn into_ts_input(self) -> Self::TsInput { + RopeTsInput { src: self, cursor: RopeyCursor::new(self), } } } -impl ParserInput for RopeParserInput<'_> { - fn read(&mut self, offset: usize) -> &[u8] { +impl<'a> TsInput for RopeTsInput<'a> { + type Cursor = RopeyCursor<'a>; + fn cursor_at(&mut self, offset: usize) -> &mut RopeyCursor<'a> { // this cursor is optimized for contigous reads which are by far the most common during parsing // very far jumps (like injections at the other end of the document) are handelde - // by restarting a new cursor (new chunks iterator) - if offset < self.cursor.offset() && self.cursor.offset() - offset > 4906 { + // by starting a new cursor (new chunks iterator) + if offset < self.cursor.offset() || self.cursor.offset() - offset > 4906 { self.cursor = regex_cursor::RopeyCursor::at(self.src, offset); } else { while self.cursor.offset() + self.cursor.chunk().len() >= offset { if !self.cursor.advance() { - return &[]; + break; } } } - self.cursor.chunk() + &mut self.cursor + } + + fn eq(&mut self, range1: ops::Range, range2: ops::Range) -> bool { + self.src.byte_slice(range1) == self.src.byte_slice(range2) } } diff --git a/helix-syntax/src/tree_sitter/syntax_tree_node.rs b/helix-syntax/src/tree_sitter/syntax_tree_node.rs index b50c79696..1afca0e3d 100644 --- a/helix-syntax/src/tree_sitter/syntax_tree_node.rs +++ b/helix-syntax/src/tree_sitter/syntax_tree_node.rs @@ -25,6 +25,7 @@ fn from(node: SyntaxTreeNode) -> SyntaxTreeNodeRaw { } #[derive(Debug, Clone)] +#[repr(C)] pub struct SyntaxTreeNode<'tree> { context: [u32; 4], id: NonNull, @@ -44,7 +45,7 @@ pub(super) unsafe fn from_raw(raw: SyntaxTreeNodeRaw) -> Option { } #[inline] - fn as_raw(&self) -> SyntaxTreeNodeRaw { + pub(crate) fn as_raw(&self) -> SyntaxTreeNodeRaw { SyntaxTreeNodeRaw { context: self.context, id: self.id.as_ptr(),
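
Reviewer note (not part of the diff): below is a minimal sketch of how the new query-cursor API added in `query_cursor.rs` is intended to be driven end to end. It only uses items introduced by this patch (`InactiveQueryCursor`, `execute_query`, `next_matched_node`, `reuse`, `RopeTsInput`, `Query::capture_name`); the `tree.root_node()` accessor and the way `query`/`tree` are obtained are assumptions for illustration and are not shown here.

```rust
use ropey::RopeSlice;

use crate::tree_sitter::{InactiveQueryCursor, Query, RopeTsInput, SyntaxTree};

// Hypothetical helper: collect (capture name, byte range) pairs for every
// capture produced by `query` over `tree`.
fn capture_ranges(
    query: &Query,
    tree: &SyntaxTree,
    src: RopeSlice<'_>,
) -> Vec<(String, std::ops::Range<usize>)> {
    let mut out = Vec::new();
    // A fresh cursor; in-tree callers would go through the thread-local pool
    // (`get_cursor`/`reuse_cursor` in `injections_tree.rs`) instead.
    let cursor = InactiveQueryCursor::new();
    // Binding the cursor to a query, a node and an input turns it into an
    // active `QueryCursor`.
    let mut cursor =
        cursor.execute_query(query, &tree.root_node(), RopeTsInput::new(src));
    // Captures are yielded in document order; matches whose text predicates
    // fail are filtered out inside `next_matched_node`.
    while let Some((query_match, node_idx)) = cursor.next_matched_node() {
        let node = query_match.matched_node(node_idx);
        out.push((
            query.capture_name(node.capture).to_owned(),
            node.syntax_node.byte_range(),
        ));
    }
    // `reuse()` unbinds the cursor instead of dropping it so the allocation
    // can be returned to the pool.
    let _pooled: InactiveQueryCursor = cursor.reuse();
    out
}
```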
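
A second sketch, equally hypothetical, for the `QueryIter` event stream in `injections_tree.rs`: the iterator interleaves `Match` events with `EnterInjection`/`ExitInjection` pairs, so a consumer can track which injection layer a capture belongs to without re-sorting anything. How the iterator itself is constructed is not part of this WIP patch, so it is taken as a parameter here, and the module path is assumed.

```rust
use crate::injections_tree::{QueryIter, QueryIterEvent};

// Hypothetical consumer: walk the event stream and keep track of the current
// injection nesting depth while visiting captures.
fn walk_events(iter: &mut QueryIter<'_, '_>) {
    let mut depth = 0usize;
    for event in iter {
        match event {
            // the cursor for the entered layer is set up lazily,
            // see `QueryIter::enter_injection`
            QueryIterEvent::EnterInjection(_) => depth += 1,
            QueryIterEvent::Match(node) => {
                // captures arrive in document order and are limited to the
                // byte range of the injection currently being walked
                let _ = (node.capture, node.byte_range, depth);
            }
            // exhausted layers hand their cursor back to the pool,
            // see `QueryIter::exit_injection`
            QueryIterEvent::ExitInjection(_) => depth = depth.saturating_sub(1),
        }
    }
}
```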