Fix confusion between byte length and codepoint length in styling

We should split lines based on their codepoint length, so all our
lengths are on codepoint boundaries. We can then safely index by byte position.

All the positions are measured in bytes, not code points. Tweak
function names to make this explicit.

Fixes #149
pull/166/head
Wilfred Hughes 2022-03-15 09:50:13 +07:00
parent b2229d66a8
commit 40d3ccd06c
5 changed files with 36 additions and 21 deletions

@ -61,6 +61,9 @@ bf8527f91baff9a5f56380784b2a438e -
sample_files/modules_before.ml sample_files/modules_after.ml sample_files/modules_before.ml sample_files/modules_after.ml
d182f73a8729211cc41e297aeab8f7a9 - d182f73a8729211cc41e297aeab8f7a9 -
sample_files/multibyte_before.py sample_files/multibyte_after.py
8a5e31775d5e788e9158bd4a36e03254 -
sample_files/nest_before.rs sample_files/nest_after.rs sample_files/nest_before.rs sample_files/nest_after.rs
791901de922abc7f24e5b9068706417d - 791901de922abc7f24e5b9068706417d -

@ -0,0 +1 @@
"bar€".format()

@ -0,0 +1 @@
"foo€".format()

@ -151,6 +151,14 @@ pub fn codepoint_len(s: &str) -> usize {
s.chars().count() s.chars().count()
} }
/// Return the length of `s` in bytes.
///
/// This is a trivial wrapper to make it clear when we want bytes not
/// codepoints.
pub fn byte_len(s: &str) -> usize {
s.len()
}
pub trait MaxLine { pub trait MaxLine {
fn max_line(&self) -> LineNumber; fn max_line(&self) -> LineNumber;
} }

@ -1,7 +1,7 @@
//! Apply colours and styling to strings. //! Apply colours and styling to strings.
use crate::{ use crate::{
lines::{codepoint_len, LineNumber}, lines::{byte_len, codepoint_len, LineNumber},
positions::SingleLineSpan, positions::SingleLineSpan,
syntax::{AtomKind, MatchKind, MatchedPos, TokenKind}, syntax::{AtomKind, MatchKind, MatchedPos, TokenKind},
}; };
@ -44,13 +44,17 @@ fn substring_by_codepoint(s: &str, start: usize, end: usize) -> &str {
} }
} }
fn substring_by_byte(s: &str, start: usize, end: usize) -> &str {
&s[start..end]
}
/// Split a string into equal length parts, padding the last part if /// Split a string into equal length parts, padding the last part if
/// necessary. /// necessary.
/// ///
/// ``` /// ```
/// split_string("fooba", 3) // vec!["foo", "ba "] /// split_string_by_codepoint("fooba", 3) // vec!["foo", "ba "]
/// ``` /// ```
fn split_string(s: &str, max_len: usize) -> Vec<String> { fn split_string_by_codepoint(s: &str, max_len: usize) -> Vec<String> {
let mut res = vec![]; let mut res = vec![];
let mut s = s; let mut s = s;
@ -81,7 +85,7 @@ pub fn split_and_apply(
) -> Vec<String> { ) -> Vec<String> {
if styles.is_empty() && !line.is_empty() { if styles.is_empty() && !line.is_empty() {
// Missing styles is a bug, so highlight in purple to make this obvious. // Missing styles is a bug, so highlight in purple to make this obvious.
return split_string(line, max_len) return split_string_by_codepoint(line, max_len)
.into_iter() .into_iter()
.map(|part| { .map(|part| {
if use_color { if use_color {
@ -96,12 +100,12 @@ pub fn split_and_apply(
let mut styled_parts = vec![]; let mut styled_parts = vec![];
let mut part_start = 0; let mut part_start = 0;
for part in split_string(line, max_len) { for part in split_string_by_codepoint(line, max_len) {
let mut res = String::with_capacity(part.len()); let mut res = String::with_capacity(part.len());
let mut prev_style_end = 0; let mut prev_style_end = 0;
for (span, style) in styles { for (span, style) in styles {
// The remaining spans are beyond the end of this part. // The remaining spans are beyond the end of this part.
if span.start_col >= part_start + codepoint_len(&part) { if span.start_col >= part_start + byte_len(&part) {
break; break;
} }
@ -109,7 +113,7 @@ pub fn split_and_apply(
if span.start_col > part_start && prev_style_end < span.start_col { if span.start_col > part_start && prev_style_end < span.start_col {
// Then append that text without styling. // Then append that text without styling.
let unstyled_start = max(prev_style_end, part_start); let unstyled_start = max(prev_style_end, part_start);
res.push_str(substring_by_codepoint( res.push_str(substring_by_byte(
&part, &part,
unstyled_start - part_start, unstyled_start - part_start,
span.start_col - part_start, span.start_col - part_start,
@ -118,10 +122,10 @@ pub fn split_and_apply(
// Apply style to the substring in this span. // Apply style to the substring in this span.
if span.end_col > part_start { if span.end_col > part_start {
let span_s = substring_by_codepoint( let span_s = substring_by_byte(
&part, &part,
max(0, span.start_col as isize - part_start as isize) as usize, max(0, span.start_col as isize - part_start as isize) as usize,
min(codepoint_len(&part), span.end_col - part_start), min(byte_len(&part), span.end_col - part_start),
); );
res.push_str(&span_s.style(*style).to_string()); res.push_str(&span_s.style(*style).to_string());
} }
@ -136,13 +140,12 @@ pub fn split_and_apply(
// Unstyled text after the last span. // Unstyled text after the last span.
if prev_style_end < part_start + codepoint_len(&part) { if prev_style_end < part_start + codepoint_len(&part) {
let span_s = let span_s = substring_by_byte(&part, prev_style_end - part_start, byte_len(&part));
substring_by_codepoint(&part, prev_style_end - part_start, codepoint_len(&part));
res.push_str(span_s); res.push_str(span_s);
} }
styled_parts.push(res); styled_parts.push(res);
part_start += codepoint_len(&part) part_start += byte_len(&part)
} }
styled_parts styled_parts
@ -155,31 +158,30 @@ fn apply_line(line: &str, styles: &[(SingleLineSpan, Style)]) -> String {
return highlight_missing_style_bug(line); return highlight_missing_style_bug(line);
} }
let line_codepoints = codepoint_len(line); let line_bytes = byte_len(line);
let mut res = String::with_capacity(line.len()); let mut res = String::with_capacity(line.len());
let mut i = 0; let mut i = 0;
for (span, style) in styles { for (span, style) in styles {
// The remaining spans are beyond the end of this line. This // The remaining spans are beyond the end of this line. This
// occurs when we truncate the line to fit on the display. // occurs when we truncate the line to fit on the display.
if span.start_col >= line_codepoints { if span.start_col >= line_bytes {
break; break;
} }
// Unstyled text before the next span. // Unstyled text before the next span.
if i < span.start_col { if i < span.start_col {
res.push_str(substring_by_codepoint(line, i, span.start_col)); res.push_str(substring_by_byte(line, i, span.start_col));
} }
// Apply style to the substring in this span. // Apply style to the substring in this span.
let span_s = let span_s = substring_by_byte(line, span.start_col, min(line_bytes, span.end_col));
substring_by_codepoint(line, span.start_col, min(line_codepoints, span.end_col));
res.push_str(&span_s.style(*style).to_string()); res.push_str(&span_s.style(*style).to_string());
i = span.end_col; i = span.end_col;
} }
// Unstyled text after the last span. // Unstyled text after the last span.
if i < line_codepoints { if i < line_bytes {
let span_s = substring_by_codepoint(line, i, line_codepoints); let span_s = substring_by_byte(line, i, line_bytes);
res.push_str(span_s); res.push_str(span_s);
} }
res res
@ -349,12 +351,12 @@ mod tests {
#[test] #[test]
fn split_string_simple() { fn split_string_simple() {
assert_eq!(split_string("fooba", 3), vec!["foo", "ba "]); assert_eq!(split_string_by_codepoint("fooba", 3), vec!["foo", "ba "]);
} }
#[test] #[test]
fn split_string_unicode() { fn split_string_unicode() {
assert_eq!(split_string("ab📦def", 3), vec!["ab📦", "def"]); assert_eq!(split_string_by_codepoint("ab📦def", 3), vec!["ab📦", "def"]);
} }
#[test] #[test]