Don't crash when the line split boundary is on a combining character

Fixes #358
2022-09-12 23:02:20 +07:00 · 2022-09-12 23:02:20 +07:00 · aff584af46
parent c03808537a
commit aff584af46
2 changed files with 37 additions and 30 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -16,6 +16,9 @@ ZIP files are now always treated as binary files.
 Changed words in changed comments are now also underlined, to make
 them more visible.

+Fixed crash when word-wrapping lines containing Unicode combining
+diacritical marks.
+
 ## 0.35 (released 2nd September 2022)

 ### Diffing
--- a/src/display/style.rs
+++ b/src/display/style.rs
@ -24,33 +24,27 @@ impl BackgroundColor {
    }
 }

-/// Slice `s` from `start` inclusive to `end` exclusive by width.
-fn substring_by_width(s: &str, start: usize, end: usize) -> &str {
-    if start == end {
-        return &s[0..0];
+/// Find the largest byte offset in `s` that gives the longest
+/// starting substring whose display width does not exceed `width`.
+///
+/// Note that the resulting substring may have a display width less
+/// than `width`, if the string contains full-width or emoji
+/// characters which have a display width greater than 1.
+fn byte_offset_for_width(s: &str, width: usize) -> usize {
+    let mut current_offset = 0;
+    let mut current_width = 0;
+
+    for (offset, ch) in s.char_indices() {
+        current_offset = offset;
+        let char_width = ch.width().unwrap_or(0);
+        current_width += char_width;
+
+        if current_width > width {
+            break;
+        }
    }

-    assert!(end > start);
-
-    let mut idx_width_iter = s
-        .char_indices()
-        .scan(0, |w, (idx, ch)| {
-            let before = *w;
-            *w += ch.width().unwrap_or(0);
-            Some((idx, before, *w))
-        })
-        .skip_while(|(_, before, _)| *before < start);
-    let byte_start = idx_width_iter
-        .next()
-        .expect("Expected a width index inside `s`.")
-        .0;
-    match idx_width_iter
-        .skip_while(|(_, _, after)| *after <= end)
-        .next()
-    {
-        Some(byte_end) => &s[byte_start..byte_end.0],
-        None => &s[byte_start..],
-    }
+    current_offset
 }

 fn substring_by_byte(s: &str, start: usize, end: usize) -> &str {
@ -70,16 +64,18 @@ fn split_string_by_width(s: &str, max_width: usize, pad: bool) -> Vec<(&str, usi
    let mut s = s;

    while s.width() > max_width {
-        let l = substring_by_width(s, 0, max_width);
-        let used = l.width();
-        let padding = if pad && used < max_width {
+        let offset = byte_offset_for_width(s, max_width);
+
+        let part = substring_by_byte(s, 0, offset);
+        s = substring_by_byte(s, offset, s.len());
+
+        let padding = if pad && part.width() < max_width {
            // a fullwidth char is followed
            1
        } else {
            0
        };
-        res.push((l, padding));
-        s = substring_by_width(s, used, s.width());
+        res.push((part, padding));
    }

    if res.is_empty() || !s.is_empty() {
@ -436,6 +432,14 @@ mod tests {
        );
    }

+    #[test]
+    fn test_combining_char() {
+        assert_eq!(
+            split_string_by_width("aabbcc\u{300}x", 6, false),
+            vec![("aabbcc\u{300}", 0), ("x", 0)],
+        );
+    }
+
    #[test]
    fn split_string_cjk() {
        assert_eq!(