From cd02976fa3a55c2c1f01b95c40d178061968f797 Mon Sep 17 00:00:00 2001 From: Pascal Kuthe Date: Mon, 26 Feb 2024 08:45:20 +0100 Subject: [PATCH] switch to regex-cursor (#9422) --- Cargo.lock | 18 ++++++- helix-core/src/selection.rs | 98 +++++++++++++++++++++++-------------- helix-core/src/syntax.rs | 12 +++-- helix-stdx/Cargo.toml | 1 + helix-stdx/src/rope.rs | 45 ++++++++++++++++- helix-term/src/commands.rs | 56 ++++++++------------- helix-term/src/ui/mod.rs | 33 ++++++++++--- 7 files changed, 176 insertions(+), 87 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2b8a25c85..b8d375c51 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1344,6 +1344,7 @@ version = "23.10.0" dependencies = [ "dunce", "etcetera", + "regex-cursor", "ropey", "tempfile", "which", @@ -1938,15 +1939,28 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.4" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b7fa1134405e2ec9353fd416b17f8dacd46c473d7d3fd1cf202706a14eb792a" +checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd" dependencies = [ "aho-corasick", "memchr", "regex-syntax", ] +[[package]] +name = "regex-cursor" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a43718aa0040434d45728c43f56bd53bda75a91c46954cdf0f2ff4dbc8aabbe7" +dependencies = [ + "log", + "memchr", + "regex-automata", + "regex-syntax", + "ropey", +] + [[package]] name = "regex-syntax" version = "0.8.2" diff --git a/helix-core/src/selection.rs b/helix-core/src/selection.rs index c44685eea..91f1d0de5 100644 --- a/helix-core/src/selection.rs +++ b/helix-core/src/selection.rs @@ -7,9 +7,11 @@ ensure_grapheme_boundary_next, ensure_grapheme_boundary_prev, next_grapheme_boundary, prev_grapheme_boundary, }, + line_ending::get_line_ending, movement::Direction, Assoc, ChangeSet, RopeGraphemes, RopeSlice, }; +use helix_stdx::rope::{self, RopeSliceExt}; use smallvec::{smallvec, SmallVec}; use 
std::borrow::Cow; @@ -708,12 +710,12 @@ impl IntoIterator for Selection { pub fn keep_or_remove_matches( text: RopeSlice, selection: &Selection, - regex: &crate::regex::Regex, + regex: &rope::Regex, remove: bool, ) -> Option { let result: SmallVec<_> = selection .iter() - .filter(|range| regex.is_match(&range.fragment(text)) ^ remove) + .filter(|range| regex.is_match(text.regex_input_at(range.from()..range.to())) ^ remove) .copied() .collect(); @@ -724,25 +726,20 @@ pub fn keep_or_remove_matches( None } +// TODO: support to split on capture #N instead of whole match pub fn select_on_matches( text: RopeSlice, selection: &Selection, - regex: &crate::regex::Regex, + regex: &rope::Regex, ) -> Option { let mut result = SmallVec::with_capacity(selection.len()); for sel in selection { - // TODO: can't avoid occasional allocations since Regex can't operate on chunks yet - let fragment = sel.fragment(text); - - let sel_start = sel.from(); - let start_byte = text.char_to_byte(sel_start); - - for mat in regex.find_iter(&fragment) { + for mat in regex.find_iter(text.regex_input_at(sel.from()..sel.to())) { // TODO: retain range direction - let start = text.byte_to_char(start_byte + mat.start()); - let end = text.byte_to_char(start_byte + mat.end()); + let start = text.byte_to_char(mat.start()); + let end = text.byte_to_char(mat.end()); let range = Range::new(start, end); // Make sure the match is not right outside of the selection. 
@@ -761,12 +758,7 @@ pub fn select_on_matches( None } -// TODO: support to split on capture #N instead of whole match -pub fn split_on_matches( - text: RopeSlice, - selection: &Selection, - regex: &crate::regex::Regex, -) -> Selection { +pub fn split_on_newline(text: RopeSlice, selection: &Selection) -> Selection { let mut result = SmallVec::with_capacity(selection.len()); for sel in selection { @@ -776,21 +768,47 @@ pub fn split_on_matches( continue; } - // TODO: can't avoid occasional allocations since Regex can't operate on chunks yet - let fragment = sel.fragment(text); - let sel_start = sel.from(); let sel_end = sel.to(); - let start_byte = text.char_to_byte(sel_start); - let mut start = sel_start; - for mat in regex.find_iter(&fragment) { + for mat in sel.slice(text).lines() { + let len = mat.len_chars(); + let line_end_len = get_line_ending(&mat).map(|le| le.len_chars()).unwrap_or(0); // TODO: retain range direction - let end = text.byte_to_char(start_byte + mat.start()); + result.push(Range::new(start, start + len - line_end_len)); + start += len; + } + + if start < sel_end { + result.push(Range::new(start, sel_end)); + } + } + + // TODO: figure out a new primary index + Selection::new(result, 0) +} + +pub fn split_on_matches(text: RopeSlice, selection: &Selection, regex: &rope::Regex) -> Selection { + let mut result = SmallVec::with_capacity(selection.len()); + + for sel in selection { + // Special case: zero-width selection. 
+ if sel.from() == sel.to() { + result.push(*sel); + continue; + } + + let sel_start = sel.from(); + let sel_end = sel.to(); + let mut start = sel_start; + + for mat in regex.find_iter(text.regex_input_at(sel_start..sel_end)) { + // TODO: retain range direction + let end = text.byte_to_char(mat.start()); result.push(Range::new(start, end)); - start = text.byte_to_char(start_byte + mat.end()); + start = text.byte_to_char(mat.end()); } if start < sel_end { @@ -1021,14 +1039,12 @@ fn test_min_width_1() { #[test] fn test_select_on_matches() { - use crate::regex::{Regex, RegexBuilder}; - let r = Rope::from_str("Nobody expects the Spanish inquisition"); let s = r.slice(..); let selection = Selection::single(0, r.len_chars()); assert_eq!( - select_on_matches(s, &selection, &Regex::new(r"[A-Z][a-z]*").unwrap()), + select_on_matches(s, &selection, &rope::Regex::new(r"[A-Z][a-z]*").unwrap()), Some(Selection::new( smallvec![Range::new(0, 6), Range::new(19, 26)], 0 @@ -1038,8 +1054,14 @@ fn test_select_on_matches() { let r = Rope::from_str("This\nString\n\ncontains multiple\nlines"); let s = r.slice(..); - let start_of_line = RegexBuilder::new(r"^").multi_line(true).build().unwrap(); - let end_of_line = RegexBuilder::new(r"$").multi_line(true).build().unwrap(); + let start_of_line = rope::RegexBuilder::new() + .syntax(rope::Config::new().multi_line(true)) + .build(r"^") + .unwrap(); + let end_of_line = rope::RegexBuilder::new() + .syntax(rope::Config::new().multi_line(true)) + .build(r"$") + .unwrap(); // line without ending assert_eq!( @@ -1077,9 +1099,9 @@ fn test_select_on_matches() { select_on_matches( s, &Selection::single(0, s.len_chars()), - &RegexBuilder::new(r"^[a-z ]*$") - .multi_line(true) - .build() + &rope::RegexBuilder::new() + .syntax(rope::Config::new().multi_line(true)) + .build(r"^[a-z ]*$") .unwrap() ), Some(Selection::new( @@ -1171,13 +1193,15 @@ fn test_put_cursor() { #[test] fn test_split_on_matches() { - use crate::regex::Regex; - let text = Rope::from(" 
abcd efg wrs xyz 123 456"); let selection = Selection::new(smallvec![Range::new(0, 9), Range::new(11, 20),], 0); - let result = split_on_matches(text.slice(..), &selection, &Regex::new(r"\s+").unwrap()); + let result = split_on_matches( + text.slice(..), + &selection, + &rope::Regex::new(r"\s+").unwrap(), + ); assert_eq!( result.ranges(), diff --git a/helix-core/src/syntax.rs b/helix-core/src/syntax.rs index a9344448f..0d8559ca9 100644 --- a/helix-core/src/syntax.rs +++ b/helix-core/src/syntax.rs @@ -12,6 +12,7 @@ use bitflags::bitflags; use globset::GlobSet; use hashbrown::raw::RawTable; +use helix_stdx::rope::{self, RopeSliceExt}; use slotmap::{DefaultKey as LayerId, HopSlotMap}; use std::{ @@ -1961,11 +1962,16 @@ fn injection_pair<'a>( node_slice }; - static SHEBANG_REGEX: Lazy = Lazy::new(|| Regex::new(SHEBANG).unwrap()); + static SHEBANG_REGEX: Lazy = + Lazy::new(|| rope::Regex::new(SHEBANG).unwrap()); injection_capture = SHEBANG_REGEX - .captures(&Cow::from(lines)) - .map(|cap| InjectionLanguageMarker::Shebang(cap[1].to_owned())) + .captures_iter(lines.regex_input()) + .map(|cap| { + let cap = lines.byte_slice(cap.get_group(1).unwrap().range()); + InjectionLanguageMarker::Shebang(cap.into()) + }) + .next() } else if index == self.injection_content_capture_index { content_node = Some(capture.node); } diff --git a/helix-stdx/Cargo.toml b/helix-stdx/Cargo.toml index 540a1b99a..5ac7c011f 100644 --- a/helix-stdx/Cargo.toml +++ b/helix-stdx/Cargo.toml @@ -16,6 +16,7 @@ dunce = "1.0" etcetera = "0.8" ropey = { version = "1.6.1", default-features = false } which = "6.0" +regex-cursor = "0.1.3" [dev-dependencies] tempfile = "3.10" diff --git a/helix-stdx/src/rope.rs b/helix-stdx/src/rope.rs index 4ee39d4a8..7b4edda4f 100644 --- a/helix-stdx/src/rope.rs +++ b/helix-stdx/src/rope.rs @@ -1,11 +1,22 @@ +use std::ops::{Bound, RangeBounds}; + +pub use regex_cursor::engines::meta::{Builder as RegexBuilder, Regex}; +pub use regex_cursor::regex_automata::util::syntax::Config; 
+use regex_cursor::{Input as RegexInput, RopeyCursor}; use ropey::RopeSlice; -pub trait RopeSliceExt: Sized { +pub trait RopeSliceExt<'a>: Sized { fn ends_with(self, text: &str) -> bool; fn starts_with(self, text: &str) -> bool; + fn regex_input(self) -> RegexInput>; + fn regex_input_at_bytes>( + self, + byte_range: R, + ) -> RegexInput>; + fn regex_input_at>(self, char_range: R) -> RegexInput>; } -impl RopeSliceExt for RopeSlice<'_> { +impl<'a> RopeSliceExt<'a> for RopeSlice<'a> { fn ends_with(self, text: &str) -> bool { let len = self.len_bytes(); if len < text.len() { @@ -23,4 +34,34 @@ fn starts_with(self, text: &str) -> bool { self.get_byte_slice(..len - text.len()) .map_or(false, |start| start == text) } + + fn regex_input(self) -> RegexInput> { + RegexInput::new(self) + } + + fn regex_input_at>(self, char_range: R) -> RegexInput> { + let start_bound = match char_range.start_bound() { + Bound::Included(&val) => Bound::Included(self.char_to_byte(val)), + Bound::Excluded(&val) => Bound::Excluded(self.char_to_byte(val)), + Bound::Unbounded => Bound::Unbounded, + }; + let end_bound = match char_range.end_bound() { + Bound::Included(&val) => Bound::Included(self.char_to_byte(val)), + Bound::Excluded(&val) => Bound::Excluded(self.char_to_byte(val)), + Bound::Unbounded => Bound::Unbounded, + }; + self.regex_input_at_bytes((start_bound, end_bound)) + } + fn regex_input_at_bytes>( + self, + byte_range: R, + ) -> RegexInput> { + let input = match byte_range.start_bound() { + Bound::Included(&pos) | Bound::Excluded(&pos) => { + RegexInput::new(RopeyCursor::at(self, pos)) + } + Bound::Unbounded => RegexInput::new(self), + }; + input.range(byte_range) + } } diff --git a/helix-term/src/commands.rs b/helix-term/src/commands.rs index 51a1ede9b..fdad31a81 100644 --- a/helix-term/src/commands.rs +++ b/helix-term/src/commands.rs @@ -3,6 +3,7 @@ pub(crate) mod typed; pub use dap::*; +use helix_stdx::rope::{self, RopeSliceExt}; use helix_vcs::Hunk; pub use lsp::*; use 
tui::widgets::Row; @@ -19,7 +20,7 @@ match_brackets, movement::{self, move_vertically_visual, Direction}, object, pos_at_coords, - regex::{self, Regex, RegexBuilder}, + regex::{self, Regex}, search::{self, CharMatcher}, selection, shellwords, surround, syntax::LanguageServerFeature, @@ -1907,11 +1908,7 @@ fn split_selection(cx: &mut Context) { fn split_selection_on_newline(cx: &mut Context) { let (view, doc) = current!(cx.editor); let text = doc.text().slice(..); - // only compile the regex once - #[allow(clippy::trivial_regex)] - static REGEX: Lazy = - Lazy::new(|| Regex::new(r"\r\n|[\n\r\u{000B}\u{000C}\u{0085}\u{2028}\u{2029}]").unwrap()); - let selection = selection::split_on_matches(text, doc.selection(view.id), &REGEX); + let selection = selection::split_on_newline(text, doc.selection(view.id)); doc.set_selection(view.id, selection); } @@ -1930,8 +1927,7 @@ fn merge_consecutive_selections(cx: &mut Context) { #[allow(clippy::too_many_arguments)] fn search_impl( editor: &mut Editor, - contents: &str, - regex: &Regex, + regex: &rope::Regex, movement: Movement, direction: Direction, scrolloff: usize, @@ -1959,23 +1955,20 @@ fn search_impl( // do a reverse search and wraparound to the end, we don't need to search // the text before the current cursor position for matches, but by slicing // it out, we need to add it back to the position of the selection. - let mut offset = 0; + let doc = doc!(editor).text().slice(..); // use find_at to find the next match after the cursor, loop around the end // Careful, `Regex` uses `bytes` as offsets, not character indices! 
let mut mat = match direction { - Direction::Forward => regex.find_at(contents, start), - Direction::Backward => regex.find_iter(&contents[..start]).last(), + Direction::Forward => regex.find(doc.regex_input_at_bytes(start..)), + Direction::Backward => regex.find_iter(doc.regex_input_at_bytes(..start)).last(), }; if mat.is_none() { if wrap_around { mat = match direction { - Direction::Forward => regex.find(contents), - Direction::Backward => { - offset = start; - regex.find_iter(&contents[start..]).last() - } + Direction::Forward => regex.find(doc.regex_input()), + Direction::Backward => regex.find_iter(doc.regex_input_at_bytes(start..)).last(), }; } if show_warnings { @@ -1992,8 +1985,8 @@ fn search_impl( let selection = doc.selection(view.id); if let Some(mat) = mat { - let start = text.byte_to_char(mat.start() + offset); - let end = text.byte_to_char(mat.end() + offset); + let start = text.byte_to_char(mat.start()); + let end = text.byte_to_char(mat.end()); if end == 0 { // skip empty matches that don't make sense @@ -2037,13 +2030,7 @@ fn searcher(cx: &mut Context, direction: Direction) { let scrolloff = config.scrolloff; let wrap_around = config.search.wrap_around; - let doc = doc!(cx.editor); - // TODO: could probably share with select_on_matches? 
- - // HAXX: sadly we can't avoid allocating a single string for the whole buffer since we can't - // feed chunks into the regex yet - let contents = doc.text().slice(..).to_string(); let completions = search_completions(cx, Some(reg)); ui::regex_prompt( @@ -2065,7 +2052,6 @@ fn searcher(cx: &mut Context, direction: Direction) { } search_impl( cx.editor, - &contents, &regex, Movement::Move, direction, @@ -2085,8 +2071,6 @@ fn search_next_or_prev_impl(cx: &mut Context, movement: Movement, direction: Dir let config = cx.editor.config(); let scrolloff = config.scrolloff; if let Some(query) = cx.editor.registers.first(register, cx.editor) { - let doc = doc!(cx.editor); - let contents = doc.text().slice(..).to_string(); let search_config = &config.search; let case_insensitive = if search_config.smart_case { !query.chars().any(char::is_uppercase) @@ -2094,15 +2078,17 @@ fn search_next_or_prev_impl(cx: &mut Context, movement: Movement, direction: Dir false }; let wrap_around = search_config.wrap_around; - if let Ok(regex) = RegexBuilder::new(&query) - .case_insensitive(case_insensitive) - .multi_line(true) - .build() + if let Ok(regex) = rope::RegexBuilder::new() + .syntax( + rope::Config::new() + .case_insensitive(case_insensitive) + .multi_line(true), + ) + .build(&query) { for _ in 0..count { search_impl( cx.editor, - &contents, &regex, movement, direction, @@ -2239,7 +2225,7 @@ fn format(&self, current_path: &Self::Data) -> Row { let reg = cx.register.unwrap_or('/'); let completions = search_completions(cx, Some(reg)); - ui::regex_prompt( + ui::raw_regex_prompt( cx, "global-search:".into(), Some(reg), @@ -2250,7 +2236,7 @@ fn format(&self, current_path: &Self::Data) -> Row { .map(|comp| (0.., std::borrow::Cow::Owned(comp.clone()))) .collect() }, - move |cx, regex, event| { + move |cx, _, input, event| { if event != PromptEvent::Validate { return; } @@ -2265,7 +2251,7 @@ fn format(&self, current_path: &Self::Data) -> Row { if let Ok(matcher) = RegexMatcherBuilder::new() 
.case_smart(smart_case) - .build(regex.as_str()) + .build(input) { let search_root = helix_stdx::env::current_working_dir(); if !search_root.exists() { diff --git a/helix-term/src/ui/mod.rs b/helix-term/src/ui/mod.rs index 0873116cb..a4b148af3 100644 --- a/helix-term/src/ui/mod.rs +++ b/helix-term/src/ui/mod.rs @@ -18,6 +18,7 @@ use crate::job::{self, Callback}; pub use completion::{Completion, CompletionItem}; pub use editor::EditorView; +use helix_stdx::rope; pub use markdown::Markdown; pub use menu::Menu; pub use picker::{DynamicPicker, FileLocation, Picker}; @@ -26,8 +27,6 @@ pub use spinner::{ProgressSpinners, Spinner}; pub use text::Text; -use helix_core::regex::Regex; -use helix_core::regex::RegexBuilder; use helix_view::Editor; use std::path::PathBuf; @@ -63,7 +62,22 @@ pub fn regex_prompt( prompt: std::borrow::Cow<'static, str>, history_register: Option, completion_fn: impl FnMut(&Editor, &str) -> Vec + 'static, - fun: impl Fn(&mut crate::compositor::Context, Regex, PromptEvent) + 'static, + fun: impl Fn(&mut crate::compositor::Context, rope::Regex, PromptEvent) + 'static, +) { + raw_regex_prompt( + cx, + prompt, + history_register, + completion_fn, + move |cx, regex, _, event| fun(cx, regex, event), + ); +} +pub fn raw_regex_prompt( + cx: &mut crate::commands::Context, + prompt: std::borrow::Cow<'static, str>, + history_register: Option, + completion_fn: impl FnMut(&Editor, &str) -> Vec + 'static, + fun: impl Fn(&mut crate::compositor::Context, rope::Regex, &str, PromptEvent) + 'static, ) { let (view, doc) = current!(cx.editor); let doc_id = view.doc; @@ -94,10 +108,13 @@ pub fn regex_prompt( false }; - match RegexBuilder::new(input) - .case_insensitive(case_insensitive) - .multi_line(true) - .build() + match rope::RegexBuilder::new() + .syntax( + rope::Config::new() + .case_insensitive(case_insensitive) + .multi_line(true), + ) + .build(input) { Ok(regex) => { let (view, doc) = current!(cx.editor); @@ -110,7 +127,7 @@ pub fn regex_prompt( 
view.jumps.push((doc_id, snapshot.clone())); } - fun(cx, regex, event); + fun(cx, regex, input, event); let (view, doc) = current!(cx.editor); view.ensure_cursor_in_view(doc, config.scrolloff);