switch to regex-cursor (#9422)

This commit is contained in:
Pascal Kuthe 2024-02-26 08:45:20 +01:00 committed by GitHub
parent c68ec92c5e
commit cd02976fa3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 176 additions and 87 deletions

18
Cargo.lock generated
View file

@ -1344,6 +1344,7 @@ version = "23.10.0"
dependencies = [
"dunce",
"etcetera",
"regex-cursor",
"ropey",
"tempfile",
"which",
@ -1938,15 +1939,28 @@ dependencies = [
[[package]]
name = "regex-automata"
version = "0.4.4"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b7fa1134405e2ec9353fd416b17f8dacd46c473d7d3fd1cf202706a14eb792a"
checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-cursor"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a43718aa0040434d45728c43f56bd53bda75a91c46954cdf0f2ff4dbc8aabbe7"
dependencies = [
"log",
"memchr",
"regex-automata",
"regex-syntax",
"ropey",
]
[[package]]
name = "regex-syntax"
version = "0.8.2"

View file

@ -7,9 +7,11 @@ use crate::{
ensure_grapheme_boundary_next, ensure_grapheme_boundary_prev, next_grapheme_boundary,
prev_grapheme_boundary,
},
line_ending::get_line_ending,
movement::Direction,
Assoc, ChangeSet, RopeGraphemes, RopeSlice,
};
use helix_stdx::rope::{self, RopeSliceExt};
use smallvec::{smallvec, SmallVec};
use std::borrow::Cow;
@ -708,12 +710,12 @@ impl IntoIterator for Selection {
pub fn keep_or_remove_matches(
text: RopeSlice,
selection: &Selection,
regex: &crate::regex::Regex,
regex: &rope::Regex,
remove: bool,
) -> Option<Selection> {
let result: SmallVec<_> = selection
.iter()
.filter(|range| regex.is_match(&range.fragment(text)) ^ remove)
.filter(|range| regex.is_match(text.regex_input_at(range.from()..range.to())) ^ remove)
.copied()
.collect();
@ -724,25 +726,20 @@ pub fn keep_or_remove_matches(
None
}
// TODO: support to split on capture #N instead of whole match
pub fn select_on_matches(
text: RopeSlice,
selection: &Selection,
regex: &crate::regex::Regex,
regex: &rope::Regex,
) -> Option<Selection> {
let mut result = SmallVec::with_capacity(selection.len());
for sel in selection {
// TODO: can't avoid occasional allocations since Regex can't operate on chunks yet
let fragment = sel.fragment(text);
let sel_start = sel.from();
let start_byte = text.char_to_byte(sel_start);
for mat in regex.find_iter(&fragment) {
for mat in regex.find_iter(text.regex_input_at(sel.from()..sel.to())) {
// TODO: retain range direction
let start = text.byte_to_char(start_byte + mat.start());
let end = text.byte_to_char(start_byte + mat.end());
let start = text.byte_to_char(mat.start());
let end = text.byte_to_char(mat.end());
let range = Range::new(start, end);
// Make sure the match is not right outside of the selection.
@ -761,12 +758,7 @@ pub fn select_on_matches(
None
}
// TODO: support to split on capture #N instead of whole match
pub fn split_on_matches(
text: RopeSlice,
selection: &Selection,
regex: &crate::regex::Regex,
) -> Selection {
pub fn split_on_newline(text: RopeSlice, selection: &Selection) -> Selection {
let mut result = SmallVec::with_capacity(selection.len());
for sel in selection {
@ -776,21 +768,47 @@ pub fn split_on_matches(
continue;
}
// TODO: can't avoid occasional allocations since Regex can't operate on chunks yet
let fragment = sel.fragment(text);
let sel_start = sel.from();
let sel_end = sel.to();
let start_byte = text.char_to_byte(sel_start);
let mut start = sel_start;
for mat in regex.find_iter(&fragment) {
for mat in sel.slice(text).lines() {
let len = mat.len_chars();
let line_end_len = get_line_ending(&mat).map(|le| le.len_chars()).unwrap_or(0);
// TODO: retain range direction
let end = text.byte_to_char(start_byte + mat.start());
result.push(Range::new(start, start + len - line_end_len));
start += len;
}
if start < sel_end {
result.push(Range::new(start, sel_end));
}
}
// TODO: figure out a new primary index
Selection::new(result, 0)
}
pub fn split_on_matches(text: RopeSlice, selection: &Selection, regex: &rope::Regex) -> Selection {
let mut result = SmallVec::with_capacity(selection.len());
for sel in selection {
// Special case: zero-width selection.
if sel.from() == sel.to() {
result.push(*sel);
continue;
}
let sel_start = sel.from();
let sel_end = sel.to();
let mut start = sel_start;
for mat in regex.find_iter(text.regex_input_at(sel_start..sel_end)) {
// TODO: retain range direction
let end = text.byte_to_char(mat.start());
result.push(Range::new(start, end));
start = text.byte_to_char(start_byte + mat.end());
start = text.byte_to_char(mat.end());
}
if start < sel_end {
@ -1021,14 +1039,12 @@ mod test {
#[test]
fn test_select_on_matches() {
use crate::regex::{Regex, RegexBuilder};
let r = Rope::from_str("Nobody expects the Spanish inquisition");
let s = r.slice(..);
let selection = Selection::single(0, r.len_chars());
assert_eq!(
select_on_matches(s, &selection, &Regex::new(r"[A-Z][a-z]*").unwrap()),
select_on_matches(s, &selection, &rope::Regex::new(r"[A-Z][a-z]*").unwrap()),
Some(Selection::new(
smallvec![Range::new(0, 6), Range::new(19, 26)],
0
@ -1038,8 +1054,14 @@ mod test {
let r = Rope::from_str("This\nString\n\ncontains multiple\nlines");
let s = r.slice(..);
let start_of_line = RegexBuilder::new(r"^").multi_line(true).build().unwrap();
let end_of_line = RegexBuilder::new(r"$").multi_line(true).build().unwrap();
let start_of_line = rope::RegexBuilder::new()
.syntax(rope::Config::new().multi_line(true))
.build(r"^")
.unwrap();
let end_of_line = rope::RegexBuilder::new()
.syntax(rope::Config::new().multi_line(true))
.build(r"$")
.unwrap();
// line without ending
assert_eq!(
@ -1077,9 +1099,9 @@ mod test {
select_on_matches(
s,
&Selection::single(0, s.len_chars()),
&RegexBuilder::new(r"^[a-z ]*$")
.multi_line(true)
.build()
&rope::RegexBuilder::new()
.syntax(rope::Config::new().multi_line(true))
.build(r"^[a-z ]*$")
.unwrap()
),
Some(Selection::new(
@ -1171,13 +1193,15 @@ mod test {
#[test]
fn test_split_on_matches() {
use crate::regex::Regex;
let text = Rope::from(" abcd efg wrs xyz 123 456");
let selection = Selection::new(smallvec![Range::new(0, 9), Range::new(11, 20),], 0);
let result = split_on_matches(text.slice(..), &selection, &Regex::new(r"\s+").unwrap());
let result = split_on_matches(
text.slice(..),
&selection,
&rope::Regex::new(r"\s+").unwrap(),
);
assert_eq!(
result.ranges(),

View file

@ -12,6 +12,7 @@ use arc_swap::{ArcSwap, Guard};
use bitflags::bitflags;
use globset::GlobSet;
use hashbrown::raw::RawTable;
use helix_stdx::rope::{self, RopeSliceExt};
use slotmap::{DefaultKey as LayerId, HopSlotMap};
use std::{
@ -1961,11 +1962,16 @@ impl HighlightConfiguration {
node_slice
};
static SHEBANG_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(SHEBANG).unwrap());
static SHEBANG_REGEX: Lazy<rope::Regex> =
Lazy::new(|| rope::Regex::new(SHEBANG).unwrap());
injection_capture = SHEBANG_REGEX
.captures(&Cow::from(lines))
.map(|cap| InjectionLanguageMarker::Shebang(cap[1].to_owned()))
.captures_iter(lines.regex_input())
.map(|cap| {
let cap = lines.byte_slice(cap.get_group(1).unwrap().range());
InjectionLanguageMarker::Shebang(cap.into())
})
.next()
} else if index == self.injection_content_capture_index {
content_node = Some(capture.node);
}

View file

@ -16,6 +16,7 @@ dunce = "1.0"
etcetera = "0.8"
ropey = { version = "1.6.1", default-features = false }
which = "6.0"
regex-cursor = "0.1.3"
[dev-dependencies]
tempfile = "3.10"

View file

@ -1,11 +1,22 @@
use std::ops::{Bound, RangeBounds};
pub use regex_cursor::engines::meta::{Builder as RegexBuilder, Regex};
pub use regex_cursor::regex_automata::util::syntax::Config;
use regex_cursor::{Input as RegexInput, RopeyCursor};
use ropey::RopeSlice;
pub trait RopeSliceExt: Sized {
pub trait RopeSliceExt<'a>: Sized {
fn ends_with(self, text: &str) -> bool;
fn starts_with(self, text: &str) -> bool;
fn regex_input(self) -> RegexInput<RopeyCursor<'a>>;
fn regex_input_at_bytes<R: RangeBounds<usize>>(
self,
byte_range: R,
) -> RegexInput<RopeyCursor<'a>>;
fn regex_input_at<R: RangeBounds<usize>>(self, char_range: R) -> RegexInput<RopeyCursor<'a>>;
}
impl RopeSliceExt for RopeSlice<'_> {
impl<'a> RopeSliceExt<'a> for RopeSlice<'a> {
fn ends_with(self, text: &str) -> bool {
let len = self.len_bytes();
if len < text.len() {
@ -23,4 +34,34 @@ impl RopeSliceExt for RopeSlice<'_> {
self.get_byte_slice(..len - text.len())
.map_or(false, |start| start == text)
}
/// Wraps the whole slice in a regex-cursor input.
///
/// The slice is handed directly to `RegexInput::new`; no search range is
/// applied here.
fn regex_input(self) -> RegexInput<RopeyCursor<'a>> {
RegexInput::new(self)
}
/// Builds a regex input restricted to `char_range`, given in **char** indices.
///
/// The bounds are translated to byte offsets with `char_to_byte` and then
/// forwarded to `regex_input_at_bytes`.
///
/// Bounds that refer to "the char after `val`" (an excluded start or an
/// included end) are converted via the byte offset of char `val + 1`.
/// Converting `val` itself and keeping the bound kind would land one *byte*
/// past the start of char `val`, which is inside the code point whenever that
/// char is encoded with more than one byte.
///
/// Half-open ranges (`a..b`), `a..` and `..b` behave exactly as before.
///
/// NOTE(review): an index past the end of the slice is passed straight to
/// `char_to_byte`, which is expected to panic for out-of-bounds indices —
/// confirm against the ropey documentation.
fn regex_input_at<R: RangeBounds<usize>>(self, char_range: R) -> RegexInput<RopeyCursor<'a>> {
    let start_bound = match char_range.start_bound() {
        Bound::Included(&val) => Bound::Included(self.char_to_byte(val)),
        // First char of the range is `val + 1`; `saturating_add` keeps an
        // absurd index out of bounds instead of wrapping around to 0.
        Bound::Excluded(&val) => Bound::Included(self.char_to_byte(val.saturating_add(1))),
        Bound::Unbounded => Bound::Unbounded,
    };
    let end_bound = match char_range.end_bound() {
        // Last char of the range is `val`, so the byte range ends where char
        // `val + 1` begins.
        Bound::Included(&val) => Bound::Excluded(self.char_to_byte(val.saturating_add(1))),
        Bound::Excluded(&val) => Bound::Excluded(self.char_to_byte(val)),
        Bound::Unbounded => Bound::Unbounded,
    };
    self.regex_input_at_bytes((start_bound, end_bound))
}
/// Builds a regex input restricted to `byte_range`, given in **byte** offsets.
///
/// When the range has a start bound (included or excluded), the cursor is
/// created with `RopeyCursor::at` at that offset; otherwise the input is
/// built from the slice itself. In both cases `byte_range` is then applied
/// through `range`.
fn regex_input_at_bytes<R: RangeBounds<usize>>(
    self,
    byte_range: R,
) -> RegexInput<RopeyCursor<'a>> {
    let haystack = if let Bound::Included(&offset) | Bound::Excluded(&offset) =
        byte_range.start_bound()
    {
        RegexInput::new(RopeyCursor::at(self, offset))
    } else {
        // No start bound: build the input from the slice directly.
        RegexInput::new(self)
    };
    haystack.range(byte_range)
}
}

View file

@ -3,6 +3,7 @@ pub(crate) mod lsp;
pub(crate) mod typed;
pub use dap::*;
use helix_stdx::rope::{self, RopeSliceExt};
use helix_vcs::Hunk;
pub use lsp::*;
use tui::widgets::Row;
@ -19,7 +20,7 @@ use helix_core::{
match_brackets,
movement::{self, move_vertically_visual, Direction},
object, pos_at_coords,
regex::{self, Regex, RegexBuilder},
regex::{self, Regex},
search::{self, CharMatcher},
selection, shellwords, surround,
syntax::LanguageServerFeature,
@ -1907,11 +1908,7 @@ fn split_selection(cx: &mut Context) {
fn split_selection_on_newline(cx: &mut Context) {
let (view, doc) = current!(cx.editor);
let text = doc.text().slice(..);
// only compile the regex once
#[allow(clippy::trivial_regex)]
static REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"\r\n|[\n\r\u{000B}\u{000C}\u{0085}\u{2028}\u{2029}]").unwrap());
let selection = selection::split_on_matches(text, doc.selection(view.id), &REGEX);
let selection = selection::split_on_newline(text, doc.selection(view.id));
doc.set_selection(view.id, selection);
}
@ -1930,8 +1927,7 @@ fn merge_consecutive_selections(cx: &mut Context) {
#[allow(clippy::too_many_arguments)]
fn search_impl(
editor: &mut Editor,
contents: &str,
regex: &Regex,
regex: &rope::Regex,
movement: Movement,
direction: Direction,
scrolloff: usize,
@ -1959,23 +1955,20 @@ fn search_impl(
// do a reverse search and wraparound to the end, we don't need to search
// the text before the current cursor position for matches, but by slicing
// it out, we need to add it back to the position of the selection.
let mut offset = 0;
let doc = doc!(editor).text().slice(..);
// use find_at to find the next match after the cursor, loop around the end
// Careful, `Regex` uses `bytes` as offsets, not character indices!
let mut mat = match direction {
Direction::Forward => regex.find_at(contents, start),
Direction::Backward => regex.find_iter(&contents[..start]).last(),
Direction::Forward => regex.find(doc.regex_input_at_bytes(start..)),
Direction::Backward => regex.find_iter(doc.regex_input_at_bytes(..start)).last(),
};
if mat.is_none() {
if wrap_around {
mat = match direction {
Direction::Forward => regex.find(contents),
Direction::Backward => {
offset = start;
regex.find_iter(&contents[start..]).last()
}
Direction::Forward => regex.find(doc.regex_input()),
Direction::Backward => regex.find_iter(doc.regex_input_at_bytes(start..)).last(),
};
}
if show_warnings {
@ -1992,8 +1985,8 @@ fn search_impl(
let selection = doc.selection(view.id);
if let Some(mat) = mat {
let start = text.byte_to_char(mat.start() + offset);
let end = text.byte_to_char(mat.end() + offset);
let start = text.byte_to_char(mat.start());
let end = text.byte_to_char(mat.end());
if end == 0 {
// skip empty matches that don't make sense
@ -2037,13 +2030,7 @@ fn searcher(cx: &mut Context, direction: Direction) {
let scrolloff = config.scrolloff;
let wrap_around = config.search.wrap_around;
let doc = doc!(cx.editor);
// TODO: could probably share with select_on_matches?
// HAXX: sadly we can't avoid allocating a single string for the whole buffer since we can't
// feed chunks into the regex yet
let contents = doc.text().slice(..).to_string();
let completions = search_completions(cx, Some(reg));
ui::regex_prompt(
@ -2065,7 +2052,6 @@ fn searcher(cx: &mut Context, direction: Direction) {
}
search_impl(
cx.editor,
&contents,
&regex,
Movement::Move,
direction,
@ -2085,8 +2071,6 @@ fn search_next_or_prev_impl(cx: &mut Context, movement: Movement, direction: Dir
let config = cx.editor.config();
let scrolloff = config.scrolloff;
if let Some(query) = cx.editor.registers.first(register, cx.editor) {
let doc = doc!(cx.editor);
let contents = doc.text().slice(..).to_string();
let search_config = &config.search;
let case_insensitive = if search_config.smart_case {
!query.chars().any(char::is_uppercase)
@ -2094,15 +2078,17 @@ fn search_next_or_prev_impl(cx: &mut Context, movement: Movement, direction: Dir
false
};
let wrap_around = search_config.wrap_around;
if let Ok(regex) = RegexBuilder::new(&query)
.case_insensitive(case_insensitive)
.multi_line(true)
.build()
if let Ok(regex) = rope::RegexBuilder::new()
.syntax(
rope::Config::new()
.case_insensitive(case_insensitive)
.multi_line(true),
)
.build(&query)
{
for _ in 0..count {
search_impl(
cx.editor,
&contents,
&regex,
movement,
direction,
@ -2239,7 +2225,7 @@ fn global_search(cx: &mut Context) {
let reg = cx.register.unwrap_or('/');
let completions = search_completions(cx, Some(reg));
ui::regex_prompt(
ui::raw_regex_prompt(
cx,
"global-search:".into(),
Some(reg),
@ -2250,7 +2236,7 @@ fn global_search(cx: &mut Context) {
.map(|comp| (0.., std::borrow::Cow::Owned(comp.clone())))
.collect()
},
move |cx, regex, event| {
move |cx, _, input, event| {
if event != PromptEvent::Validate {
return;
}
@ -2265,7 +2251,7 @@ fn global_search(cx: &mut Context) {
if let Ok(matcher) = RegexMatcherBuilder::new()
.case_smart(smart_case)
.build(regex.as_str())
.build(input)
{
let search_root = helix_stdx::env::current_working_dir();
if !search_root.exists() {

View file

@ -18,6 +18,7 @@ use crate::filter_picker_entry;
use crate::job::{self, Callback};
pub use completion::{Completion, CompletionItem};
pub use editor::EditorView;
use helix_stdx::rope;
pub use markdown::Markdown;
pub use menu::Menu;
pub use picker::{DynamicPicker, FileLocation, Picker};
@ -26,8 +27,6 @@ pub use prompt::{Prompt, PromptEvent};
pub use spinner::{ProgressSpinners, Spinner};
pub use text::Text;
use helix_core::regex::Regex;
use helix_core::regex::RegexBuilder;
use helix_view::Editor;
use std::path::PathBuf;
@ -63,7 +62,22 @@ pub fn regex_prompt(
prompt: std::borrow::Cow<'static, str>,
history_register: Option<char>,
completion_fn: impl FnMut(&Editor, &str) -> Vec<prompt::Completion> + 'static,
fun: impl Fn(&mut crate::compositor::Context, Regex, PromptEvent) + 'static,
fun: impl Fn(&mut crate::compositor::Context, rope::Regex, PromptEvent) + 'static,
) {
raw_regex_prompt(
cx,
prompt,
history_register,
completion_fn,
move |cx, regex, _, event| fun(cx, regex, event),
);
}
pub fn raw_regex_prompt(
cx: &mut crate::commands::Context,
prompt: std::borrow::Cow<'static, str>,
history_register: Option<char>,
completion_fn: impl FnMut(&Editor, &str) -> Vec<prompt::Completion> + 'static,
fun: impl Fn(&mut crate::compositor::Context, rope::Regex, &str, PromptEvent) + 'static,
) {
let (view, doc) = current!(cx.editor);
let doc_id = view.doc;
@ -94,10 +108,13 @@ pub fn regex_prompt(
false
};
match RegexBuilder::new(input)
.case_insensitive(case_insensitive)
.multi_line(true)
.build()
match rope::RegexBuilder::new()
.syntax(
rope::Config::new()
.case_insensitive(case_insensitive)
.multi_line(true),
)
.build(input)
{
Ok(regex) => {
let (view, doc) = current!(cx.editor);
@ -110,7 +127,7 @@ pub fn regex_prompt(
view.jumps.push((doc_id, snapshot.clone()));
}
fun(cx, regex, event);
fun(cx, regex, input, event);
let (view, doc) = current!(cx.editor);
view.ensure_cursor_in_view(doc, config.scrolloff);