helix-mirror/helix-core/src/graphemes.rs
Nathan Vegdahl 2dba228c76 Fix highlight code splitting graphemes.
This resulted in phantom blank lines in files with CRLF line
endings, but could potentially have manifested with other
graphemes as well.
2021-06-26 11:09:50 +09:00

254 lines
8.9 KiB
Rust

// Based on https://github.com/cessen/led/blob/c4fa72405f510b7fd16052f90a598c429b3104a6/src/graphemes.rs
use ropey::{iter::Chunks, str_utils::byte_to_char_idx, RopeSlice};
use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
use unicode_width::UnicodeWidthStr;
use std::fmt;
#[must_use]
pub fn grapheme_width(g: &str) -> usize {
if g.as_bytes()[0] <= 127 {
// Fast-path ascii.
// Point 1: theoretically, ascii control characters should have zero
// width, but in our case we actually want them to have width: if they
// show up in text, we want to treat them as textual elements that can
// be editied. So we can get away with making all ascii single width
// here.
// Point 2: we're only examining the first codepoint here, which means
// we're ignoring graphemes formed with combining characters. However,
// if it starts with ascii, it's going to be a single-width grapeheme
// regardless, so, again, we can get away with that here.
// Point 3: we're only examining the first _byte_. But for utf8, when
// checking for ascii range values only, that works.
1
} else {
// We use max(1) here because all grapeheme clusters--even illformed
// ones--should have at least some width so they can be edited
// properly.
UnicodeWidthStr::width(g).max(1)
}
}
#[must_use]
pub fn nth_prev_grapheme_boundary(slice: RopeSlice, char_idx: usize, n: usize) -> usize {
// Bounds check
debug_assert!(char_idx <= slice.len_chars());
// We work with bytes for this, so convert.
let mut byte_idx = slice.char_to_byte(char_idx);
// Get the chunk with our byte index in it.
let (mut chunk, mut chunk_byte_idx, mut chunk_char_idx, _) = slice.chunk_at_byte(byte_idx);
// Set up the grapheme cursor.
let mut gc = GraphemeCursor::new(byte_idx, slice.len_bytes(), true);
// Find the previous grapheme cluster boundary.
for _ in 0..n {
loop {
match gc.prev_boundary(chunk, chunk_byte_idx) {
Ok(None) => return 0,
Ok(Some(n)) => {
byte_idx = n;
break;
}
Err(GraphemeIncomplete::PrevChunk) => {
let (a, b, c, _) = slice.chunk_at_byte(chunk_byte_idx - 1);
chunk = a;
chunk_byte_idx = b;
chunk_char_idx = c;
}
Err(GraphemeIncomplete::PreContext(n)) => {
let ctx_chunk = slice.chunk_at_byte(n - 1).0;
gc.provide_context(ctx_chunk, n - ctx_chunk.len());
}
_ => unreachable!(),
}
}
}
let tmp = byte_to_char_idx(chunk, byte_idx - chunk_byte_idx);
chunk_char_idx + tmp
}
/// Finds the previous grapheme boundary before the given char position.
pub fn prev_grapheme_boundary(slice: RopeSlice, char_idx: usize) -> usize {
nth_prev_grapheme_boundary(slice, char_idx, 1)
}
#[must_use]
pub fn nth_next_grapheme_boundary(slice: RopeSlice, char_idx: usize, n: usize) -> usize {
// Bounds check
debug_assert!(char_idx <= slice.len_chars());
// We work with bytes for this, so convert.
let mut byte_idx = slice.char_to_byte(char_idx);
// Get the chunk with our byte index in it.
let (mut chunk, mut chunk_byte_idx, mut chunk_char_idx, _) = slice.chunk_at_byte(byte_idx);
// Set up the grapheme cursor.
let mut gc = GraphemeCursor::new(byte_idx, slice.len_bytes(), true);
// Find the nth next grapheme cluster boundary.
for _ in 0..n {
loop {
match gc.next_boundary(chunk, chunk_byte_idx) {
Ok(None) => return slice.len_chars(),
Ok(Some(n)) => {
byte_idx = n;
break;
}
Err(GraphemeIncomplete::NextChunk) => {
chunk_byte_idx += chunk.len();
let (a, _, c, _) = slice.chunk_at_byte(chunk_byte_idx);
chunk = a;
chunk_char_idx = c;
}
Err(GraphemeIncomplete::PreContext(n)) => {
let ctx_chunk = slice.chunk_at_byte(n - 1).0;
gc.provide_context(ctx_chunk, n - ctx_chunk.len());
}
_ => unreachable!(),
}
}
}
let tmp = byte_to_char_idx(chunk, byte_idx - chunk_byte_idx);
chunk_char_idx + tmp
}
/// Finds the next grapheme boundary after the given char position.
pub fn next_grapheme_boundary(slice: RopeSlice, char_idx: usize) -> usize {
nth_next_grapheme_boundary(slice, char_idx, 1)
}
/// Returns the passed char index if it's already a grapheme boundary,
/// or the next grapheme boundary char index if not.
pub fn ensure_grapheme_boundary(slice: RopeSlice, char_idx: usize) -> usize {
if char_idx == 0 {
0
} else {
next_grapheme_boundary(slice, char_idx - 1)
}
}
/// Returns the passed byte index if it's already a grapheme boundary,
/// or the next grapheme boundary byte index if not.
pub fn ensure_grapheme_boundary_byte(slice: RopeSlice, byte_idx: usize) -> usize {
// TODO: we can avoid the byte/char conversions entirely
// if we also make byte versions of the other functions.
let char_idx = slice.byte_to_char(byte_idx);
let fixed_char_idx = ensure_grapheme_boundary(slice, char_idx);
if fixed_char_idx == char_idx {
byte_idx
} else {
slice.char_to_byte(fixed_char_idx)
}
}
/// Returns whether the given char position is a grapheme boundary.
pub fn is_grapheme_boundary(slice: RopeSlice, char_idx: usize) -> bool {
// Bounds check
debug_assert!(char_idx <= slice.len_chars());
// We work with bytes for this, so convert.
let byte_idx = slice.char_to_byte(char_idx);
// Get the chunk with our byte index in it.
let (chunk, chunk_byte_idx, _, _) = slice.chunk_at_byte(byte_idx);
// Set up the grapheme cursor.
let mut gc = GraphemeCursor::new(byte_idx, slice.len_bytes(), true);
// Determine if the given position is a grapheme cluster boundary.
loop {
match gc.is_boundary(chunk, chunk_byte_idx) {
Ok(n) => return n,
Err(GraphemeIncomplete::PreContext(n)) => {
let (ctx_chunk, ctx_byte_start, _, _) = slice.chunk_at_byte(n - 1);
gc.provide_context(ctx_chunk, ctx_byte_start);
}
Err(_) => unreachable!(),
}
}
}
/// An iterator over the graphemes of a `RopeSlice`.
#[derive(Clone)]
pub struct RopeGraphemes<'a> {
text: RopeSlice<'a>,
chunks: Chunks<'a>,
cur_chunk: &'a str,
cur_chunk_start: usize,
cursor: GraphemeCursor,
}
impl<'a> fmt::Debug for RopeGraphemes<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("RopeGraphemes")
.field("text", &self.text)
.field("chunks", &self.chunks)
.field("cur_chunk", &self.cur_chunk)
.field("cur_chunk_start", &self.cur_chunk_start)
// .field("cursor", &self.cursor)
.finish()
}
}
impl<'a> RopeGraphemes<'a> {
#[must_use]
pub fn new(slice: RopeSlice) -> RopeGraphemes {
let mut chunks = slice.chunks();
let first_chunk = chunks.next().unwrap_or("");
RopeGraphemes {
text: slice,
chunks,
cur_chunk: first_chunk,
cur_chunk_start: 0,
cursor: GraphemeCursor::new(0, slice.len_bytes(), true),
}
}
}
impl<'a> Iterator for RopeGraphemes<'a> {
type Item = RopeSlice<'a>;
fn next(&mut self) -> Option<RopeSlice<'a>> {
let a = self.cursor.cur_cursor();
let b;
loop {
match self
.cursor
.next_boundary(self.cur_chunk, self.cur_chunk_start)
{
Ok(None) => {
return None;
}
Ok(Some(n)) => {
b = n;
break;
}
Err(GraphemeIncomplete::NextChunk) => {
self.cur_chunk_start += self.cur_chunk.len();
self.cur_chunk = self.chunks.next().unwrap_or("");
}
Err(GraphemeIncomplete::PreContext(idx)) => {
let (chunk, byte_idx, _, _) = self.text.chunk_at_byte(idx.saturating_sub(1));
self.cursor.provide_context(chunk, byte_idx);
}
_ => unreachable!(),
}
}
if a < self.cur_chunk_start {
let a_char = self.text.byte_to_char(a);
let b_char = self.text.byte_to_char(b);
Some(self.text.slice(a_char..b_char))
} else {
let a2 = a - self.cur_chunk_start;
let b2 = b - self.cur_chunk_start;
Some((&self.cur_chunk[a2..b2]).into())
}
}
}