2021-10-25 20:02:16 +04:00
|
|
|
|
//! Utility functions to categorize a `char`.
|
|
|
|
|
|
2021-06-21 02:09:10 +04:00
|
|
|
|
use crate::LineEnding;
|
|
|
|
|
|
|
|
|
|
/// The coarse class a single `char` is sorted into by [`categorize_char`].
///
/// The enum carries no data, so it is cheap to copy, compare and hash.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CharCategory {
    /// Whitespace that is not a line ending.
    Whitespace,
    /// A line-ending character (see [`char_is_line_ending`]).
    Eol,
    /// A word character (see [`char_is_word`]).
    Word,
    /// A punctuation or symbol character (see [`char_is_punctuation`]).
    Punctuation,
    /// Anything that matched none of the other categories.
    Unknown,
}
|
|
|
|
|
|
|
|
|
|
#[inline]
|
|
|
|
|
pub fn categorize_char(ch: char) -> CharCategory {
|
|
|
|
|
if char_is_line_ending(ch) {
|
|
|
|
|
CharCategory::Eol
|
|
|
|
|
} else if ch.is_whitespace() {
|
|
|
|
|
CharCategory::Whitespace
|
|
|
|
|
} else if char_is_word(ch) {
|
|
|
|
|
CharCategory::Word
|
|
|
|
|
} else if char_is_punctuation(ch) {
|
|
|
|
|
CharCategory::Punctuation
|
|
|
|
|
} else {
|
|
|
|
|
CharCategory::Unknown
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Determine whether a character is a line ending.
|
|
|
|
|
#[inline]
|
|
|
|
|
pub fn char_is_line_ending(ch: char) -> bool {
|
|
|
|
|
LineEnding::from_char(ch).is_some()
|
2021-06-14 06:13:31 +04:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Determine whether a character qualifies as (non-line-break)
/// whitespace.
///
/// Returns `true` for exactly the following code points and `false` for
/// every other character:
///
/// - U+0009 Character Tabulation
/// - U+0020 Space
/// - U+00A0 No-break Space
/// - U+180E Mongolian Vowel Separator
/// - U+2000 through U+200B (En Quad, Em Quad, En Space, Em Space,
///   Three-per-em Space, Four-per-em Space, Six-per-em Space, Figure Space,
///   Punctuation Space, Thin Space, Hair Space, Zero Width Space)
/// - U+202F Narrow No-break Space
/// - U+205F Medium Mathematical Space
/// - U+3000 Ideographic Space
/// - U+FEFF Zero Width No-break Space
///
/// U+1680 (Ogham Space Mark) is deliberately left out: it is usually
/// displayed as a dash, not as whitespace.
#[inline]
pub fn char_is_whitespace(ch: char) -> bool {
    // TODO: this is a naive binary categorization of whitespace
    // characters. For display, word wrapping, etc. we'll need a better
    // categorization based on e.g. breaking vs non-breaking spaces
    // and whether they're zero-width or not.
    matches!(
        ch,
        '\u{0009}' // Character Tabulation
            | '\u{0020}' // Space
            | '\u{00A0}' // No-break Space
            | '\u{180E}' // Mongolian Vowel Separator
            | '\u{2000}'..='\u{200B}' // En Quad ..= Zero Width Space
            | '\u{202F}' // Narrow No-break Space
            | '\u{205F}' // Medium Mathematical Space
            | '\u{3000}' // Ideographic Space
            | '\u{FEFF}' // Zero Width No-break Space
    )
}
|
2021-06-21 02:09:10 +04:00
|
|
|
|
|
|
|
|
|
#[inline]
|
|
|
|
|
pub fn char_is_punctuation(ch: char) -> bool {
|
|
|
|
|
use unicode_general_category::{get_general_category, GeneralCategory};
|
|
|
|
|
|
|
|
|
|
matches!(
|
|
|
|
|
get_general_category(ch),
|
|
|
|
|
GeneralCategory::OtherPunctuation
|
|
|
|
|
| GeneralCategory::OpenPunctuation
|
|
|
|
|
| GeneralCategory::ClosePunctuation
|
|
|
|
|
| GeneralCategory::InitialPunctuation
|
|
|
|
|
| GeneralCategory::FinalPunctuation
|
|
|
|
|
| GeneralCategory::ConnectorPunctuation
|
|
|
|
|
| GeneralCategory::DashPunctuation
|
|
|
|
|
| GeneralCategory::MathSymbol
|
|
|
|
|
| GeneralCategory::CurrencySymbol
|
|
|
|
|
| GeneralCategory::ModifierSymbol
|
|
|
|
|
)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Determine whether a character is a word character.
///
/// A word character is either the underscore (`_`) or any character for
/// which `char::is_alphanumeric` returns `true`.
#[inline]
pub fn char_is_word(ch: char) -> bool {
    ch == '_' || ch.is_alphanumeric()
}
|
|
|
|
|
|
|
|
|
|
#[cfg(test)]
mod test {
    use super::*;

    /// Assert that every character of `input` is categorized as `expected`,
    /// reporting the offending character (debug-escaped, so invisible
    /// characters are readable) and the category actually returned.
    fn assert_all_categorized(expected: CharCategory, input: &str) {
        for ch in input.chars() {
            let actual = categorize_char(ch);
            assert_eq!(
                expected, actual,
                "Testing {:?}, but got `CharCategory::{:?}` instead of `CharCategory::{:?}`",
                ch, actual, expected
            );
        }
    }

    #[test]
    fn test_categorize() {
        const EOL_TEST_CASE: &str = "\n\r\u{000B}\u{000C}\u{0085}\u{2028}\u{2029}";
        const WORD_TEST_CASE: &str = "_hello_world_あいうえおー12345678901234567890";
        const PUNCTUATION_TEST_CASE: &str =
            "!\"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~!”#$%&’()*+、。:;<=>?@「」^`{|}~";
        const WHITESPACE_TEST_CASE: &str = " ";
        // The same kind of input spelled with escapes, so the characters
        // under test stay visible in review: tab, space, no-break space,
        // en space, em space, narrow no-break space, medium mathematical
        // space and ideographic space.
        const ESCAPED_WHITESPACE_TEST_CASE: &str =
            "\u{0009}\u{0020}\u{00A0}\u{2002}\u{2003}\u{202F}\u{205F}\u{3000}";

        assert_all_categorized(CharCategory::Eol, EOL_TEST_CASE);
        assert_all_categorized(CharCategory::Whitespace, WHITESPACE_TEST_CASE);
        assert_all_categorized(CharCategory::Whitespace, ESCAPED_WHITESPACE_TEST_CASE);
        assert_all_categorized(CharCategory::Word, WORD_TEST_CASE);
        assert_all_categorized(CharCategory::Punctuation, PUNCTUATION_TEST_CASE);
    }
}
|