2376fc875e
Ropey's non-unicode lines impl has some bugs still
137 lines
4.3 KiB
Rust
137 lines
4.3 KiB
Rust
//! Utility functions to categorize a `char`.
|
||
|
||
use crate::LineEnding;
|
||
|
||
#[derive(Debug, Eq, PartialEq)]
|
||
pub enum CharCategory {
|
||
Whitespace,
|
||
Eol,
|
||
Word,
|
||
Punctuation,
|
||
Unknown,
|
||
}
|
||
|
||
#[inline]
|
||
pub fn categorize_char(ch: char) -> CharCategory {
|
||
if char_is_line_ending(ch) {
|
||
CharCategory::Eol
|
||
} else if ch.is_whitespace() {
|
||
CharCategory::Whitespace
|
||
} else if char_is_word(ch) {
|
||
CharCategory::Word
|
||
} else if char_is_punctuation(ch) {
|
||
CharCategory::Punctuation
|
||
} else {
|
||
CharCategory::Unknown
|
||
}
|
||
}
|
||
|
||
/// Determine whether a character is a line ending.
|
||
#[inline]
|
||
pub fn char_is_line_ending(ch: char) -> bool {
|
||
LineEnding::from_char(ch).is_some()
|
||
}
|
||
|
||
/// Determine whether a character qualifies as (non-line-break)
|
||
/// whitespace.
|
||
#[inline]
|
||
pub fn char_is_whitespace(ch: char) -> bool {
|
||
// TODO: this is a naive binary categorization of whitespace
|
||
// characters. For display, word wrapping, etc. we'll need a better
|
||
// categorization based on e.g. breaking vs non-breaking spaces
|
||
// and whether they're zero-width or not.
|
||
match ch {
|
||
//'\u{1680}' | // Ogham Space Mark (here for completeness, but usually displayed as a dash, not as whitespace)
|
||
'\u{0009}' | // Character Tabulation
|
||
'\u{0020}' | // Space
|
||
'\u{00A0}' | // No-break Space
|
||
'\u{180E}' | // Mongolian Vowel Separator
|
||
'\u{202F}' | // Narrow No-break Space
|
||
'\u{205F}' | // Medium Mathematical Space
|
||
'\u{3000}' | // Ideographic Space
|
||
'\u{FEFF}' // Zero Width No-break Space
|
||
=> true,
|
||
|
||
// En Quad, Em Quad, En Space, Em Space, Three-per-em Space,
|
||
// Four-per-em Space, Six-per-em Space, Figure Space,
|
||
// Punctuation Space, Thin Space, Hair Space, Zero Width Space.
|
||
ch if ('\u{2000}' ..= '\u{200B}').contains(&ch) => true,
|
||
|
||
_ => false,
|
||
}
|
||
}
|
||
|
||
#[inline]
|
||
pub fn char_is_punctuation(ch: char) -> bool {
|
||
use unicode_general_category::{get_general_category, GeneralCategory};
|
||
|
||
matches!(
|
||
get_general_category(ch),
|
||
GeneralCategory::OtherPunctuation
|
||
| GeneralCategory::OpenPunctuation
|
||
| GeneralCategory::ClosePunctuation
|
||
| GeneralCategory::InitialPunctuation
|
||
| GeneralCategory::FinalPunctuation
|
||
| GeneralCategory::ConnectorPunctuation
|
||
| GeneralCategory::DashPunctuation
|
||
| GeneralCategory::MathSymbol
|
||
| GeneralCategory::CurrencySymbol
|
||
| GeneralCategory::ModifierSymbol
|
||
)
|
||
}
|
||
|
||
#[inline]
|
||
pub fn char_is_word(ch: char) -> bool {
|
||
ch.is_alphanumeric() || ch == '_'
|
||
}
|
||
|
||
#[cfg(test)]
|
||
mod test {
|
||
use super::*;
|
||
|
||
#[test]
|
||
fn test_categorize() {
|
||
#[cfg(not(feature = "unicode-lines"))]
|
||
const EOL_TEST_CASE: &str = "\n";
|
||
#[cfg(feature = "unicode-lines")]
|
||
const EOL_TEST_CASE: &str = "\n\u{000B}\u{000C}\u{0085}\u{2028}\u{2029}";
|
||
const WORD_TEST_CASE: &str = "_hello_world_あいうえおー12345678901234567890";
|
||
const PUNCTUATION_TEST_CASE: &str =
|
||
"!\"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~!”#$%&’()*+、。:;<=>?@「」^`{|}~";
|
||
const WHITESPACE_TEST_CASE: &str = " ";
|
||
|
||
for ch in EOL_TEST_CASE.chars() {
|
||
assert_eq!(CharCategory::Eol, categorize_char(ch));
|
||
}
|
||
|
||
for ch in WHITESPACE_TEST_CASE.chars() {
|
||
assert_eq!(
|
||
CharCategory::Whitespace,
|
||
categorize_char(ch),
|
||
"Testing '{}', but got `{:?}` instead of `Category::Whitespace`",
|
||
ch,
|
||
categorize_char(ch)
|
||
);
|
||
}
|
||
|
||
for ch in WORD_TEST_CASE.chars() {
|
||
assert_eq!(
|
||
CharCategory::Word,
|
||
categorize_char(ch),
|
||
"Testing '{}', but got `{:?}` instead of `Category::Word`",
|
||
ch,
|
||
categorize_char(ch)
|
||
);
|
||
}
|
||
|
||
for ch in PUNCTUATION_TEST_CASE.chars() {
|
||
assert_eq!(
|
||
CharCategory::Punctuation,
|
||
categorize_char(ch),
|
||
"Testing '{}', but got `{:?}` instead of `Category::Punctuation`",
|
||
ch,
|
||
categorize_char(ch)
|
||
);
|
||
}
|
||
}
|
||
}
|