From 40d3ccd06cab98c5ca6e0bd259306d288451a250 Mon Sep 17 00:00:00 2001
From: Wilfred Hughes <me@wilfred.me.uk>
Date: Tue, 15 Mar 2022 09:50:13 -0700
Subject: [PATCH] Fix confusion between byte length and codepoint length in
 styling

We should split lines based on their codepoint length, so all our
lengths are on codepoint boundaries. We can then safely index by byte position.

All the positions are measured in bytes, not code points. Tweak
function names to make this explicit.

Fixes #149
---
 sample_files/compare.expected    |  3 +++
 sample_files/multibyte_after.py  |  1 +
 sample_files/multibyte_before.py |  1 +
 src/lines.rs                     |  8 ++++++
 src/style.rs                     | 44 +++++++++++++++++---------------
 5 files changed, 36 insertions(+), 21 deletions(-)
 create mode 100644 sample_files/multibyte_after.py
 create mode 100644 sample_files/multibyte_before.py
diff --git a/sample_files/compare.expected b/sample_files/compare.expected
index fa8f7a2b6..4663687c0 100644
--- a/sample_files/compare.expected
+++ b/sample_files/compare.expected
@@ -61,6 +61,9 @@ bf8527f91baff9a5f56380784b2a438e  -
 sample_files/modules_before.ml sample_files/modules_after.ml
 d182f73a8729211cc41e297aeab8f7a9  -
 
+sample_files/multibyte_before.py sample_files/multibyte_after.py
+8a5e31775d5e788e9158bd4a36e03254  -
+
 sample_files/nest_before.rs sample_files/nest_after.rs
 791901de922abc7f24e5b9068706417d  -
 
diff --git a/sample_files/multibyte_after.py b/sample_files/multibyte_after.py
new file mode 100644
index 000000000..6fcbc61c7
--- /dev/null
+++ b/sample_files/multibyte_after.py
@@ -0,0 +1 @@
+"bar€".format()
\ No newline at end of file
diff --git a/sample_files/multibyte_before.py b/sample_files/multibyte_before.py
new file mode 100644
index 000000000..3ed538d86
--- /dev/null
+++ b/sample_files/multibyte_before.py
@@ -0,0 +1 @@
+"foo€".format()
\ No newline at end of file
diff --git a/src/lines.rs b/src/lines.rs
index b595b2657..4076bb762 100644
--- a/src/lines.rs
+++ b/src/lines.rs
@@ -151,6 +151,14 @@ pub fn codepoint_len(s: &str) -> usize {
     s.chars().count()
 }
 
+/// Return the length of `s` in bytes.
+///
+/// This is a trivial wrapper to make it clear when we want bytes not
+/// codepoints.
+pub fn byte_len(s: &str) -> usize {
+    s.len()
+}
+
 pub trait MaxLine {
     fn max_line(&self) -> LineNumber;
 }
diff --git a/src/style.rs b/src/style.rs
index a97f76d8d..cc598fa70 100644
--- a/src/style.rs
+++ b/src/style.rs
@@ -1,7 +1,7 @@
 //! Apply colours and styling to strings.
 
 use crate::{
-    lines::{codepoint_len, LineNumber},
+    lines::{byte_len, codepoint_len, LineNumber},
     positions::SingleLineSpan,
     syntax::{AtomKind, MatchKind, MatchedPos, TokenKind},
 };
@@ -44,13 +44,17 @@ fn substring_by_codepoint(s: &str, start: usize, end: usize) -> &str {
     }
 }
 
+fn substring_by_byte(s: &str, start: usize, end: usize) -> &str {
+    &s[start..end]
+}
+
 /// Split a string into equal length parts, padding the last part if
 /// necessary.
 ///
 /// ```
-/// split_string("fooba", 3) // vec!["foo", "ba "]
+/// split_string_by_codepoint("fooba", 3) // vec!["foo", "ba "]
 /// ```
-fn split_string(s: &str, max_len: usize) -> Vec<String> {
+fn split_string_by_codepoint(s: &str, max_len: usize) -> Vec<String> {
     let mut res = vec![];
     let mut s = s;
 
@@ -81,7 +85,7 @@ pub fn split_and_apply(
 ) -> Vec<String> {
     if styles.is_empty() && !line.is_empty() {
         // Missing styles is a bug, so highlight in purple to make this obvious.
-        return split_string(line, max_len)
+        return split_string_by_codepoint(line, max_len)
             .into_iter()
             .map(|part| {
                 if use_color {
@@ -96,12 +100,12 @@ pub fn split_and_apply(
     let mut styled_parts = vec![];
     let mut part_start = 0;
 
-    for part in split_string(line, max_len) {
+    for part in split_string_by_codepoint(line, max_len) {
         let mut res = String::with_capacity(part.len());
         let mut prev_style_end = 0;
         for (span, style) in styles {
             // The remaining spans are beyond the end of this part.
-            if span.start_col >= part_start + codepoint_len(&part) {
+            if span.start_col >= part_start + byte_len(&part) {
                 break;
             }
 
@@ -109,7 +113,7 @@ pub fn split_and_apply(
             if span.start_col > part_start && prev_style_end < span.start_col {
                 // Then append that text without styling.
                 let unstyled_start = max(prev_style_end, part_start);
-                res.push_str(substring_by_codepoint(
+                res.push_str(substring_by_byte(
                     &part,
                     unstyled_start - part_start,
                     span.start_col - part_start,
@@ -118,10 +122,10 @@ pub fn split_and_apply(
 
             // Apply style to the substring in this span.
             if span.end_col > part_start {
-                let span_s = substring_by_codepoint(
+                let span_s = substring_by_byte(
                     &part,
                     max(0, span.start_col as isize - part_start as isize) as usize,
-                    min(codepoint_len(&part), span.end_col - part_start),
+                    min(byte_len(&part), span.end_col - part_start),
                 );
                 res.push_str(&span_s.style(*style).to_string());
             }
@@ -136,13 +140,12 @@ pub fn split_and_apply(
 
         // Unstyled text after the last span.
         if prev_style_end < part_start + codepoint_len(&part) {
-            let span_s =
-                substring_by_codepoint(&part, prev_style_end - part_start, codepoint_len(&part));
+            let span_s = substring_by_byte(&part, prev_style_end - part_start, byte_len(&part));
             res.push_str(span_s);
         }
 
         styled_parts.push(res);
-        part_start += codepoint_len(&part)
+        part_start += byte_len(&part)
     }
 
     styled_parts
@@ -155,31 +158,30 @@ fn apply_line(line: &str, styles: &[(SingleLineSpan, Style)]) -> String {
         return highlight_missing_style_bug(line);
     }
 
-    let line_codepoints = codepoint_len(line);
+    let line_bytes = byte_len(line);
     let mut res = String::with_capacity(line.len());
     let mut i = 0;
     for (span, style) in styles {
         // The remaining spans are beyond the end of this line. This
         // occurs when we truncate the line to fit on the display.
-        if span.start_col >= line_codepoints {
+        if span.start_col >= line_bytes {
             break;
         }
 
         // Unstyled text before the next span.
         if i < span.start_col {
-            res.push_str(substring_by_codepoint(line, i, span.start_col));
+            res.push_str(substring_by_byte(line, i, span.start_col));
         }
 
         // Apply style to the substring in this span.
-        let span_s =
-            substring_by_codepoint(line, span.start_col, min(line_codepoints, span.end_col));
+        let span_s = substring_by_byte(line, span.start_col, min(line_bytes, span.end_col));
         res.push_str(&span_s.style(*style).to_string());
         i = span.end_col;
     }
 
     // Unstyled text after the last span.
-    if i < line_codepoints {
-        let span_s = substring_by_codepoint(line, i, line_codepoints);
+    if i < line_bytes {
+        let span_s = substring_by_byte(line, i, line_bytes);
         res.push_str(span_s);
     }
     res
@@ -349,12 +351,12 @@ mod tests {
 
     #[test]
     fn split_string_simple() {
-        assert_eq!(split_string("fooba", 3), vec!["foo", "ba "]);
+        assert_eq!(split_string_by_codepoint("fooba", 3), vec!["foo", "ba "]);
     }
 
     #[test]
     fn split_string_unicode() {
-        assert_eq!(split_string("ab📦def", 3), vec!["ab📦", "def"]);
+        assert_eq!(split_string_by_codepoint("ab📦def", 3), vec!["ab📦", "def"]);
     }
 
     #[test]