feat(stdx): add str mod with unescape function

This commit introduces a `str` module and an `unescape` function to
`helix-stdx`, which processes escape sequences in strings and converts
them into their corresponding literal characters. The function handles a
variety of escape sequences, including:

- `\n` for newlines
- `\t` for tabs
- `\u{...}` for Unicode characters

The function does not unescape sequences like `\\` to `\`, leaving them
as they are. This opinionated behavior ensures that only certain escape
sequences are processed, and is built around user input, not general
input.

Given that it's based around user input, a conservative approach was
taken for handling bad input: if the string cannot be processed as
expected, the original input is returned unchanged.

Examples:
- Converting escaped newlines: `unescape("hello\\nworld")` results in
`"hello\nworld"`.
- Converting escaped tabs: `unescape("hello\\tworld")` results in
`"hello\tworld"`.
- Converting Unicode escape sequences:
`unescape("hello\\u{1f929}world")` results in `"hello🤩world"`.
- Handling invalid Unicode escape sequence:
`unescape("hello\\u{999999999}world")` results in the original
`"hello\\u{999999999}world"`.

The implementation also includes tests, but makes no guarantees for edge cases.
This commit is contained in:
Rolo 2024-06-22 01:09:02 -07:00
parent b05ed9bf85
commit 6fbc85e168
4 changed files with 206 additions and 0 deletions

1
Cargo.lock generated
View File

@ -1416,6 +1416,7 @@ dependencies = [
"regex-cursor",
"ropey",
"rustix",
"smartstring",
"tempfile",
"which",
"windows-sys 0.52.0",

View File

@ -18,6 +18,7 @@ ropey = { version = "1.6.1", default-features = false }
which = "6.0"
regex-cursor = "0.1.4"
bitflags = "2.6"
smartstring = "1.0.1"
[target.'cfg(windows)'.dependencies]
windows-sys = { version = "0.52", features = ["Win32_Security", "Win32_Security_Authorization", "Win32_System_Threading"] }

View File

@ -2,3 +2,4 @@
pub mod faccess;
pub mod path;
pub mod rope;
pub mod str;

203
helix-stdx/src/str.rs Normal file
View File

@ -0,0 +1,203 @@
use smartstring::{LazyCompact, SmartString};
use std::borrow::Cow;
/// Unescapes a string, converting escape sequences into their literal characters.
///
/// The following escape sequences (a backslash followed by the shown text) are handled:
/// - `\n` is converted to a newline
/// - `\t` is converted to a tab
/// - `\u{...}` is converted to the Unicode character with that hexadecimal code point
///
/// A backslash followed by any other character (including another backslash) is left
/// unchanged, as is a lone backslash at the very end of the input.
///
/// If a `\u` escape is malformed (missing `{` or `}`, empty or non-hexadecimal digits, or a
/// value that is not a valid Unicode scalar such as `\u{999999999}`), the whole input is
/// returned as is.
///
/// # Returns
///
/// `Cow::Borrowed` when the input contains no backslash or a malformed `\u` escape was hit,
/// otherwise `Cow::Owned` with the processed text.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// # use helix_stdx::str::unescape;
/// let unescaped = unescape("hello\\nworld");
/// assert_eq!("hello\nworld", unescaped);
/// ```
///
/// Unescaping tabs:
///
/// ```
/// # use helix_stdx::str::unescape;
/// let unescaped = unescape("hello\\tworld");
/// assert_eq!("hello\tworld", unescaped);
/// ```
///
/// Unescaping Unicode characters:
///
/// ```
/// # use helix_stdx::str::unescape;
/// let unescaped = unescape("hello\\u{1f929}world");
/// assert_eq!("hello\u{1f929}world", unescaped);
/// assert_eq!("hello🤩world", unescaped);
/// ```
///
/// Handling backslashes:
///
/// ```
/// # use helix_stdx::str::unescape;
/// let unescaped = unescape(r"hello\\world");
/// assert_eq!(r"hello\\world", unescaped);
///
/// let unescaped = unescape(r"hello\\\\world");
/// assert_eq!(r"hello\\\\world", unescaped);
///
/// // A trailing backslash is preserved.
/// let unescaped = unescape(r"hello\");
/// assert_eq!(r"hello\", unescaped);
/// ```
///
/// Malformed Unicode escapes leave the input untouched:
///
/// ```
/// # use helix_stdx::str::unescape;
/// assert_eq!(r"hello\u{41", unescape(r"hello\u{41"));
/// assert_eq!(r"\user", unescape(r"\user"));
/// ```
///
/// # Note
///
/// This function is opinionated, with a clear purpose of handling user input, not a general or
/// generic unescaping utility, and does not unescape sequences like `\'` or `\"`, leaving them
/// as is.
#[inline]
pub fn unescape(s: &str) -> Cow<'_, str> {
    // PERF: Most inputs contain no escapes at all, so avoid allocating for them.
    if !s.contains('\\') {
        return Cow::Borrowed(s);
    }

    // Every handled escape is at least as long as the text it produces, so the result never
    // outgrows the input.
    let mut unescaped = String::with_capacity(s.len());
    let mut rest = s;

    while let Some(pos) = rest.find('\\') {
        // Copy the literal run preceding the backslash verbatim.
        unescaped.push_str(&rest[..pos]);

        let mut chars = rest[pos + 1..].chars();
        match chars.next() {
            Some('n') => unescaped.push('\n'),
            Some('t') => unescaped.push('\t'),
            Some('u') => {
                // Conservative handling of bad user input: hand back the original string.
                let Some((point, tail)) = parse_unicode_escape(chars.as_str()) else {
                    return Cow::Borrowed(s);
                };
                unescaped.push(point);
                rest = tail;
                continue;
            }
            // Unknown escape (this includes `\\`): keep both characters. The escaped character
            // is consumed here so that it cannot start a new escape sequence itself.
            Some(other) => {
                unescaped.push('\\');
                unescaped.push(other);
            }
            // Lone backslash at the very end of the input: keep it rather than dropping it.
            None => unescaped.push('\\'),
        }
        rest = chars.as_str();
    }

    unescaped.push_str(rest);
    Cow::Owned(unescaped)
}

/// Parses the `{XXXX}` part of a `\u{XXXX}` escape, given the text right after the `u`.
///
/// Returns the decoded character together with the text following the closing `}`, or `None`
/// if the braces are missing, the digits are empty or not hexadecimal, or the value is not a
/// valid Unicode scalar value.
fn parse_unicode_escape(after_u: &str) -> Option<(char, &str)> {
    let body = after_u.strip_prefix('{')?;
    let (hex, tail) = body.split_once('}')?;
    // `from_str_radix` would accept a leading `+`, which is not a valid escape.
    if hex.is_empty() || !hex.bytes().all(|b| b.is_ascii_hexdigit()) {
        return None;
    }
    let code = u32::from_str_radix(hex, 16).ok()?;
    char::from_u32(code).map(|point| (point, tail))
}
#[cfg(test)]
mod test {
    use super::*;

    /// An escaped `n` becomes a real newline.
    #[test]
    fn should_unescape_newline() {
        let input = "hello\\nworld";
        let expected = "hello\nworld";
        assert_eq!(unescape(input), expected);
    }

    /// An escaped `t` becomes a real tab.
    #[test]
    fn should_unescape_tab() {
        let input = "hello\\tworld";
        let expected = "hello\tworld";
        assert_eq!(unescape(input), expected);
    }

    /// A `\u{...}` escape becomes the character with that code point.
    #[test]
    fn should_unescape_unicode() {
        let result = unescape("hello\\u{1f929}world");
        assert_eq!(result, "hello\u{1f929}world", "char: 🤩 ");
        assert_eq!(result, "hello🤩world");
    }

    /// A code point outside the valid range leaves the input untouched.
    #[test]
    fn should_return_original_input_due_to_bad_unicode() {
        let input = "hello\\u{999999999}world";
        assert_eq!(unescape(input), "hello\\u{999999999}world");
    }

    /// Escaped backslashes are deliberately not collapsed.
    #[test]
    fn should_not_unescape_slash() {
        for input in [r"hello\\world", r"hello\\\\world"] {
            assert_eq!(unescape(input), input);
        }
    }

    /// Backslash + single quote stays as is.
    #[test]
    fn should_not_unescape_slash_single_quote() {
        let input = "\\'";
        assert_eq!(unescape(input), r"\'");
    }

    /// Backslash + double quote stays as is.
    #[test]
    fn should_not_unescape_slash_double_quote() {
        let input = "\\\"";
        assert_eq!(unescape(input), r#"\""#);
    }

    /// Inputs without any backslash come back unchanged.
    #[test]
    fn should_not_change_anything() {
        let single_quote = unescape("'");
        assert_eq!(single_quote, "'");

        let double_quote = unescape(r#"""#);
        assert_eq!(double_quote, r#"""#);
    }

    /// Only the newline escape is processed; the quote (escaped or not) is kept verbatim.
    #[test]
    fn should_only_unescape_newline_not_slash_single_quote() {
        let plain_quote = unescape("\\n\'");
        assert_eq!(plain_quote, "\n'");

        let escaped_quote = unescape("\\n\\'");
        assert_eq!(escaped_quote, "\n\\'");
    }
}