Fix Unicode (#135)

* init * wip * wip * fix unicode break * fix unicode break * Update helix-core/src/transaction.rs Co-authored-by: Benoît Cortier <benoit.cortier@fried-world.eu> * clippy * fix * add changes * added test * wip * wip * wip * wip * fix * fix view * fix #88 Co-authored-by: Benoît Cortier <benoit.cortier@fried-world.eu>
2021-06-08 00:20:15 -04:00 · 2021-06-08 00:20:15 -04:00 · b873fb9897
commit b873fb9897
parent 8f1eb7b2b0
7 changed files with 94 additions and 26 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -265,6 +265,7 @@ dependencies = [
 "tendril",
 "toml",
 "tree-sitter",
+ "unicode-general-category",
 "unicode-segmentation",
 "unicode-width",
 ]
@ -969,6 +970,12 @@ dependencies = [
 "matches",
 ]

+[[package]]
+name = "unicode-general-category"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07547e3ee45e28326cc23faac56d44f58f16ab23e413db526debce3b0bfd2742"
+
 [[package]]
 name = "unicode-normalization"
 version = "0.1.19"
--- a/helix-core/Cargo.toml
+++ b/helix-core/Cargo.toml
@ -19,6 +19,7 @@ smallvec = "1.4"
 tendril = "0.4.2"
 unicode-segmentation = "1.6"
 unicode-width = "0.1"
+unicode-general-category = "0.4.0"
 # slab = "0.4.2"
 tree-sitter = "0.19"
 once_cell = "1.4"
--- a/helix-core/src/movement.rs
+++ b/helix-core/src/movement.rs
@ -88,11 +88,11 @@ pub fn move_next_word_start(slice: RopeSlice, mut begin: usize, count: usize) ->

        if is_word(ch) {
            skip_over_next(slice, &mut end, is_word);
-        } else if ch.is_ascii_punctuation() {
-            skip_over_next(slice, &mut end, |ch| ch.is_ascii_punctuation());
+        } else if is_punctuation(ch) {
+            skip_over_next(slice, &mut end, is_punctuation);
        }

-        skip_over_next(slice, &mut end, is_horiz_blank);
+        skip_over_next(slice, &mut end, char::is_whitespace);
    }

    Some(Range::new(begin, end - 1))
@ -119,15 +119,15 @@ pub fn move_prev_word_start(slice: RopeSlice, mut begin: usize, count: usize) ->

        end = begin;

-        with_end = skip_over_prev(slice, &mut end, is_horiz_blank);
+        with_end = skip_over_prev(slice, &mut end, char::is_whitespace);

        // refetch
        let ch = slice.char(end);

        if is_word(ch) {
            with_end = skip_over_prev(slice, &mut end, is_word);
-        } else if ch.is_ascii_punctuation() {
-            with_end = skip_over_prev(slice, &mut end, |ch| ch.is_ascii_punctuation());
+        } else if is_punctuation(ch) {
+            with_end = skip_over_prev(slice, &mut end, is_punctuation);
        }
    }

@ -155,15 +155,15 @@ pub fn move_next_word_end(slice: RopeSlice, mut begin: usize, count: usize) -> O

        end = begin;

-        skip_over_next(slice, &mut end, is_horiz_blank);
+        skip_over_next(slice, &mut end, char::is_whitespace);

        // refetch
        let ch = slice.char(end);

        if is_word(ch) {
            skip_over_next(slice, &mut end, is_word);
-        } else if ch.is_ascii_punctuation() {
-            skip_over_next(slice, &mut end, |ch| ch.is_ascii_punctuation());
+        } else if is_punctuation(ch) {
+            skip_over_next(slice, &mut end, is_punctuation);
        }
    }

@ -174,12 +174,28 @@ pub fn move_next_word_end(slice: RopeSlice, mut begin: usize, count: usize) -> O

 // used for by-word movement

+#[inline]
 pub(crate) fn is_word(ch: char) -> bool {
    ch.is_alphanumeric() || ch == '_'
 }

-pub(crate) fn is_horiz_blank(ch: char) -> bool {
-    matches!(ch, ' ' | '\t')
+#[inline]
+pub(crate) fn is_punctuation(ch: char) -> bool {
+    use unicode_general_category::{get_general_category, GeneralCategory};
+
+    matches!(
+        get_general_category(ch),
+        GeneralCategory::OtherPunctuation
+            | GeneralCategory::OpenPunctuation
+            | GeneralCategory::ClosePunctuation
+            | GeneralCategory::InitialPunctuation
+            | GeneralCategory::FinalPunctuation
+            | GeneralCategory::ConnectorPunctuation
+            | GeneralCategory::DashPunctuation
+            | GeneralCategory::MathSymbol
+            | GeneralCategory::CurrencySymbol
+            | GeneralCategory::ModifierSymbol
+    )
 }

 #[derive(Debug, Eq, PartialEq)]
@ -191,14 +207,15 @@ pub(crate) enum Category {
    Unknown,
 }

+#[inline]
 pub(crate) fn categorize(ch: char) -> Category {
    if ch == '\n' {
        Category::Eol
-    } else if ch.is_ascii_whitespace() {
+    } else if ch.is_whitespace() {
        Category::Whitespace
    } else if is_word(ch) {
        Category::Word
-    } else if ch.is_ascii_punctuation() {
+    } else if is_punctuation(ch) {
        Category::Punctuation
    } else {
        Category::Unknown
@ -213,6 +230,7 @@ where
 {
    let mut chars = slice.chars_at(*pos);

+    #[allow(clippy::while_let_on_iterator)]
    while let Some(ch) = chars.next() {
        if !fun(ch) {
            break;
@ -231,6 +249,7 @@ where
    // need to +1 so that prev() includes current char
    let mut chars = slice.chars_at(*pos + 1);

+    #[allow(clippy::while_let_on_iterator)]
    while let Some(ch) = chars.prev() {
        if !fun(ch) {
            break;
@ -259,4 +278,44 @@ mod test {
            (1, 2).into()
        );
    }
+
+    #[test]
+    fn test_categorize() {
+        const WORD_TEST_CASE: &'static str =
+            "_hello_world_あいうえおー1234567890１２３４５６７８９０";
+        const PUNCTUATION_TEST_CASE: &'static str = "!\"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~！”＃＄％＆’（）＊＋、。：；＜＝＞？＠「」＾｀｛｜｝～";
+        const WHITESPACE_TEST_CASE: &'static str = "  　   ";
+
+        assert_eq!(Category::Eol, categorize('\n'));
+
+        for ch in WHITESPACE_TEST_CASE.chars() {
+            assert_eq!(
+                Category::Whitespace,
+                categorize(ch),
+                "Testing '{}', but got `{:?}` instead of `Category::Whitespace`",
+                ch,
+                categorize(ch)
+            );
+        }
+
+        for ch in WORD_TEST_CASE.chars() {
+            assert_eq!(
+                Category::Word,
+                categorize(ch),
+                "Testing '{}', but got `{:?}` instead of `Category::Word`",
+                ch,
+                categorize(ch)
+            );
+        }
+
+        for ch in PUNCTUATION_TEST_CASE.chars() {
+            assert_eq!(
+                Category::Punctuation,
+                categorize(ch),
+                "Testing '{}', but got `{:?}` instead of `Category::Punctuation`",
+                ch,
+                categorize(ch)
+            );
+        }
+    }
 }
--- a/helix-core/src/transaction.rs
+++ b/helix-core/src/transaction.rs
@ -758,7 +758,7 @@ mod test {

    #[test]
    fn combine_with_utf8() {
-        const TEST_CASE: &'static str = "Hello, これはヒレクスエディターです！";
+        const TEST_CASE: &'static str = "Hello, これはヘリックスエディターです！";

        let empty = Rope::from("");
        let mut a = ChangeSet::new(&empty);
--- a/helix-core/src/words.rs
+++ b/helix-core/src/words.rs
@ -1,4 +1,4 @@
-use crate::movement::{categorize, is_horiz_blank, is_word, skip_over_prev};
+use crate::movement::{categorize, is_punctuation, is_word, skip_over_prev};
 use ropey::RopeSlice;

 #[must_use]
@ -13,15 +13,15 @@ pub fn nth_prev_word_boundary(slice: RopeSlice, mut char_idx: usize, count: usiz
        // return if not skip while?
        skip_over_prev(slice, &mut char_idx, |ch| ch == '\n');

-        with_end = skip_over_prev(slice, &mut char_idx, is_horiz_blank);
+        with_end = skip_over_prev(slice, &mut char_idx, char::is_whitespace);

        // refetch
        let ch = slice.char(char_idx);

        if is_word(ch) {
            with_end = skip_over_prev(slice, &mut char_idx, is_word);
-        } else if ch.is_ascii_punctuation() {
-            with_end = skip_over_prev(slice, &mut char_idx, |ch| ch.is_ascii_punctuation());
+        } else if is_punctuation(ch) {
+            with_end = skip_over_prev(slice, &mut char_idx, is_punctuation);
        }
    }

@ -47,11 +47,11 @@ fn different_prev_word_boundary() {
    t("hello, world", "hello, ");
    t("hello, ", "hello");
    t("hello", "");
-    t("こんにちは、世界！", "こんにちは、世界！"); // TODO: punctuation
+    t("こんにちは、世界！", "こんにちは、世界");
    t("こんにちは、世界", "こんにちは、");
-    t("こんにちは、", "こんにちは、"); // what?
+    t("こんにちは、", "こんにちは");
    t("こんにちは", "");
-    t("この世界。", "この世界。"); // what?
+    t("この世界。", "この世界");
    t("この世界", "");
    t("お前はもう死んでいる", "");
    t("その300円です", ""); // TODO: should stop at 300
--- a/helix-term/src/commands.rs
+++ b/helix-term/src/commands.rs
@ -654,9 +654,10 @@ pub fn split_selection_on_newline(cx: &mut Context) {
 fn _search(doc: &mut Document, view: &mut View, contents: &str, regex: &Regex, extend: bool) {
    let text = doc.text();
    let selection = doc.selection(view.id);
-    let start = selection.cursor();
+    let start = text.char_to_byte(selection.cursor());

    // use find_at to find the next match after the cursor, loop around the end
+    // Careful: `Regex` works with byte offsets, not character indices!
    let mat = regex
        .find_at(contents, start)
        .or_else(|| regex.find(contents));
@ -670,7 +671,7 @@ fn _search(doc: &mut Document, view: &mut View, contents: &str, regex: &Regex, e
            return;
        }

-        let head = end;
+        let head = end - 1;

        let selection = if extend {
            selection.clone().push(Range::new(start, head))
@ -1027,7 +1028,7 @@ pub fn command_mode(cx: &mut Context) {
    let mut prompt = Prompt::new(
        ":".to_owned(),
        |input: &str| {
-            // we use .this over split_ascii_whitespace() because we care about empty segments
+            // we use .split(' ') over split_whitespace() because we care about empty segments
            let parts = input.split(' ').collect::<Vec<&str>>();

            // simple heuristic: if there's no just one part, complete command name.
@ -1069,7 +1070,7 @@ pub fn command_mode(cx: &mut Context) {
                return;
            }

-            let parts = input.split_ascii_whitespace().collect::<Vec<&str>>();
+            let parts = input.split_whitespace().collect::<Vec<&str>>();
            if parts.is_empty() {
                return;
            }
--- a/helix-view/src/view.rs
+++ b/helix-view/src/view.rs
@ -106,7 +106,7 @@ impl View {
    /// Calculates the last visible line on screen
    #[inline]
    pub fn last_line(&self, doc: &Document) -> usize {
-        let height = self.area.height.saturating_sub(2); // - 2 for statusline
+        let height = self.area.height.saturating_sub(1); // - 1 for statusline
        std::cmp::min(
            self.first_line + height as usize,
            doc.text().len_lines() - 1,