Replace tabs during display, so parsing sees the original source

Fixes #350
2023-01-01 22:28:08 +07:00 · 2023-01-01 22:28:08 +07:00 · e8e5ca8e47
parent 7e560ec943
commit e8e5ca8e47
7 changed files with 6750 additions and 50 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,11 @@
 ## 0.41 (unreleased)

+### Parsing
+
+Tab replacement is now done after parsing. If tab characters are
+syntactically important, they are now handled correctly. This was
+particularly an issue in Makefiles, where indentation must be tabs.
+
 ## 0.40 (released 28th December 2022)

 ### Diffing
--- a/sample_files/compare.expected
+++ b/sample_files/compare.expected
@ -106,6 +106,9 @@ sample_files/load_before.js sample_files/load_after.js
 sample_files/lua_before.lua sample_files/lua_after.lua
 9886d61f459cdf566be9c42f7fa61a12  -

+sample_files/makefile_before.mk sample_files/makefile_after.mk
+43524233cc128cbd5c23673874095251  -
+
 sample_files/metadata_before.clj sample_files/metadata_after.clj
 e8e76c97731accc5572708664bb61910  -

--- a/sample_files/makefile_after.mk
+++ b/sample_files/makefile_after.mk
--- a/sample_files/makefile_before.mk
+++ b/sample_files/makefile_before.mk
--- a/src/display/side_by_side.rs
+++ b/src/display/side_by_side.rs
@ -508,6 +508,7 @@ pub fn print(
                    Some(lhs_line_num) => split_and_apply(
                        lhs_lines[lhs_line_num.as_usize()],
                        source_dims.content_width,
+                        display_options.tab_width,
                        display_options.use_color,
                        lhs_highlights.get(lhs_line_num).unwrap_or(&vec![]),
                        Side::Left,
@ -518,6 +519,7 @@ pub fn print(
                    Some(rhs_line_num) => split_and_apply(
                        rhs_lines[rhs_line_num.as_usize()],
                        source_dims.content_width,
+                        display_options.tab_width,
                        display_options.use_color,
                        rhs_highlights.get(rhs_line_num).unwrap_or(&vec![]),
                        Side::Right,
--- a/src/display/style.rs
+++ b/src/display/style.rs
@ -30,16 +30,20 @@ impl BackgroundColor {
 /// Find the largest byte offset in `s` that gives the longest
 /// starting substring whose display width does not exceed `width`.
 ///
-/// Note that the resulting substring may have a display width less
-/// than `width`, if the string contains full-width or emoji
-/// characters which have a display width greater than 1.
-fn byte_offset_for_width(s: &str, width: usize) -> usize {
+/// If `s` contains full-width Unicode characters, or emoji, or tabs,
+/// its display width may be less than `width`.
+fn byte_offset_for_width(s: &str, width: usize, tab_width: usize) -> usize {
    let mut current_offset = 0;
    let mut current_width = 0;

    for (offset, ch) in s.char_indices() {
        current_offset = offset;
-        let char_width = ch.width().unwrap_or(0);
+
+        let char_width = if ch == '\t' {
+            tab_width
+        } else {
+            ch.width().unwrap_or(0)
+        };
        current_width += char_width;

        if current_width > width {
@ -54,27 +58,52 @@ fn substring_by_byte(s: &str, start: usize, end: usize) -> &str {
    &s[start..end]
 }

-/// Split a string into equal length parts and how many spaces should be padded.
+fn substring_by_byte_replace_tabs(s: &str, start: usize, end: usize, tab_width: usize) -> String {
+    let s = s[start..end].to_string();
+    s.replace('\t', &" ".repeat(tab_width))
+}
+
+fn width_respecting_tabs(s: &str, tab_width: usize) -> usize {
+    let display_width = s.width();
+
+    // .width() on tabs returns 0, wheras we want to model them as
+    // `tab_width` spaces.
+    debug_assert_eq!("\t".width(), 0);
+    let tab_count = s.matches('\t').count();
+    let tab_display_width_extra = tab_count * tab_width;
+
+    display_width + tab_display_width_extra
+}
+
+/// Split a string into parts whose display length does not
+/// exceed `max_width`.
 ///
-/// Return splitted strings and how many spaces each should be padded with.
+/// If any part has a display width less than `max_width`, also
+/// specify the number of spaces required to pad the part to reach the
+/// desired width.
 ///
 /// ```
 /// split_string_by_width("fooba", 3, true) // vec![("foo", 0), ("ba", 1)]
 /// split_string_by_width("一个汉字两列宽", 8, false) // vec![("一个汉字", 0), ("两列宽", 0)]
 /// ```
-fn split_string_by_width(s: &str, max_width: usize, pad: bool) -> Vec<(&str, usize)> {
-    let mut res = vec![];
+fn split_string_by_width(
+    s: &str,
+    max_width: usize,
+    tab_width: usize,
+    pad: bool,
+) -> Vec<(&str, usize)> {
+    let mut res: Vec<(&str, usize)> = vec![];
    let mut s = s;

-    while s.width() > max_width {
-        let offset = byte_offset_for_width(s, max_width);
+    while width_respecting_tabs(s, tab_width) > max_width {
+        let offset = byte_offset_for_width(s, max_width, tab_width);

        let part = substring_by_byte(s, 0, offset);
        s = substring_by_byte(s, offset, s.len());

-        let padding = if pad && part.width() < max_width {
-            // a fullwidth char is followed
-            1
+        let part_width = width_respecting_tabs(part, tab_width);
+        let padding = if pad && part_width < max_width {
+            max_width - part_width
        } else {
            0
        };
@ -82,7 +111,11 @@ fn split_string_by_width(s: &str, max_width: usize, pad: bool) -> Vec<(&str, usi
    }

    if res.is_empty() || !s.is_empty() {
-        let padding = if pad { max_width - s.width() } else { 0 };
+        let padding = if pad {
+            max_width - width_respecting_tabs(s, tab_width)
+        } else {
+            0
+        };
        res.push((s, padding));
    }

@ -93,29 +126,48 @@ fn highlight_missing_style_bug(s: &str) -> String {
    s.on_purple().to_string()
 }

+/// Return a copy of `src` with all the tab characters replaced by
+/// `tab_width` strings.
+fn replace_tabs(src: &str, tab_width: usize) -> String {
+    let tab_as_spaces = " ".repeat(tab_width);
+    src.replace('\t', &tab_as_spaces)
+}
+
 /// Split `line` (from the source code) into multiple lines of
 /// `max_len` (i.e. word wrapping), and apply `styles` to each part
 /// according to its original position in `line`.
 pub fn split_and_apply(
    line: &str,
    max_len: usize,
+    tab_width: usize,
    use_color: bool,
    styles: &[(SingleLineSpan, Style)],
    side: Side,
 ) -> Vec<String> {
-    assert!(max_len > 0);
+    assert!(
+        max_len > 0,
+        "Splitting lines into pieces of length 0 will never terminate"
+    );
+    assert!(
+        max_len > tab_width,
+        "Parts must be big enough to hold at least one tab (max_len = {} tab_width = {})",
+        max_len,
+        tab_width
+    );

    if styles.is_empty() && !line.trim().is_empty() {
-        return split_string_by_width(line, max_len, matches!(side, Side::Left))
+        return split_string_by_width(line, max_len, tab_width, matches!(side, Side::Left))
            .into_iter()
            .map(|(part, pad)| {
+                let part = replace_tabs(part, tab_width);
+
                let mut res = String::with_capacity(part.len() + pad);
                if use_color {
                    // If we're syntax highlighting and have no
                    // styles, that's a bug.
-                    res.push_str(&highlight_missing_style_bug(part));
+                    res.push_str(&highlight_missing_style_bug(&part));
                } else {
-                    res.push_str(part);
+                    res.push_str(&part);
                }

                res.push_str(&" ".repeat(pad));
@ -127,7 +179,9 @@ pub fn split_and_apply(
    let mut styled_parts = vec![];
    let mut part_start = 0;

-    for (line_part, pad) in split_string_by_width(line, max_len, matches!(side, Side::Left)) {
+    for (line_part, pad) in
+        split_string_by_width(line, max_len, tab_width, matches!(side, Side::Left))
+    {
        let mut res = String::with_capacity(line_part.len() + pad);
        let mut prev_style_end = 0;
        for (span, style) in styles {
@ -143,19 +197,21 @@ pub fn split_and_apply(
            if start_col > part_start && prev_style_end < start_col {
                // Then append that text without styling.
                let unstyled_start = max(prev_style_end, part_start);
-                res.push_str(substring_by_byte(
+                res.push_str(&substring_by_byte_replace_tabs(
                    line_part,
                    unstyled_start - part_start,
                    start_col - part_start,
+                    tab_width,
                ));
            }

            // Apply style to the substring in this span.
            if end_col > part_start {
-                let span_s = substring_by_byte(
+                let span_s = substring_by_byte_replace_tabs(
                    line_part,
                    max(0, span.start_col as isize - part_start as isize) as usize,
                    min(byte_len(line_part), end_col - part_start),
+                    tab_width,
                );
                res.push_str(&span_s.style(*style).to_string());
            }
@ -170,10 +226,15 @@ pub fn split_and_apply(

        // Unstyled text after the last span.
        if prev_style_end < part_start + byte_len(line_part) {
-            let span_s =
-                substring_by_byte(line_part, prev_style_end - part_start, byte_len(line_part));
+            let span_s = &substring_by_byte_replace_tabs(
+                line_part,
+                prev_style_end - part_start,
+                byte_len(line_part),
+                tab_width,
+            );
            res.push_str(span_s);
        }
+
        res.push_str(&" ".repeat(pad));

        styled_parts.push(res);
@ -460,13 +521,15 @@ pub fn header(

 #[cfg(test)]
 mod tests {
+    const TAB_WIDTH: usize = 2;
+
    use super::*;
    use pretty_assertions::assert_eq;

    #[test]
    fn split_string_simple() {
        assert_eq!(
-            split_string_by_width("fooba", 3, true),
+            split_string_by_width("fooba", 3, TAB_WIDTH, true),
            vec![("foo", 0), ("ba", 1)]
        );
    }
@ -474,7 +537,7 @@ mod tests {
    #[test]
    fn split_string_simple_no_pad() {
        assert_eq!(
-            split_string_by_width("fooba", 3, false),
+            split_string_by_width("fooba", 3, TAB_WIDTH, false),
            vec![("foo", 0), ("ba", 0)]
        );
    }
@ -482,7 +545,7 @@ mod tests {
    #[test]
    fn split_string_unicode() {
        assert_eq!(
-            split_string_by_width("ab📦def", 4, true),
+            split_string_by_width("ab📦def", 4, TAB_WIDTH, true),
            vec![("ab📦", 0), ("def", 1)]
        );
    }
@ -490,7 +553,7 @@ mod tests {
    #[test]
    fn test_combining_char() {
        assert_eq!(
-            split_string_by_width("aabbcc\u{300}x", 6, false),
+            split_string_by_width("aabbcc\u{300}x", 6, TAB_WIDTH, false),
            vec![("aabbcc\u{300}", 0), ("x", 0)],
        );
    }
@ -498,7 +561,7 @@ mod tests {
    #[test]
    fn split_string_cjk() {
        assert_eq!(
-            split_string_by_width("一个汉字两列宽", 8, false),
+            split_string_by_width("一个汉字两列宽", 8, TAB_WIDTH, false),
            vec![("一个汉字", 0), ("两列宽", 0)]
        );
    }
@ -506,14 +569,14 @@ mod tests {
    #[test]
    fn split_string_cjk2() {
        assert_eq!(
-            split_string_by_width("你好啊", 5, true),
+            split_string_by_width("你好啊", 5, TAB_WIDTH, true),
            vec![("你好", 1), ("啊", 3)]
        );
    }

    #[test]
    fn test_split_and_apply_missing() {
-        let res = split_and_apply("foo", 3, true, &[], Side::Left);
+        let res = split_and_apply("foo", 3, TAB_WIDTH, true, &[], Side::Left);
        assert_eq!(res, vec![highlight_missing_style_bug("foo")])
    }

@ -522,6 +585,7 @@ mod tests {
        let res = split_and_apply(
            "foo",
            3,
+            TAB_WIDTH,
            true,
            &[(
                SingleLineSpan {
@ -541,6 +605,7 @@ mod tests {
        let res = split_and_apply(
            "foobar",
            6,
+            TAB_WIDTH,
            true,
            &[(
                SingleLineSpan {
@ -560,6 +625,7 @@ mod tests {
        let res = split_and_apply(
            "foobar",
            3,
+            TAB_WIDTH,
            true,
            &[
                (
@ -589,6 +655,7 @@ mod tests {
        let res = split_and_apply(
            "foobar      ",
            6,
+            TAB_WIDTH,
            true,
            &[(
                SingleLineSpan {
--- a/src/main.rs
+++ b/src/main.rs
@ -57,7 +57,7 @@ use parse::guess_language::{guess, language_name};
 static GLOBAL: MiMalloc = MiMalloc;

 use diff::sliders::fix_all_sliders;
-use options::{DiffOptions, DisplayMode, DisplayOptions, FileArgument, Mode, DEFAULT_TAB_WIDTH};
+use options::{DiffOptions, DisplayMode, DisplayOptions, FileArgument, Mode};
 use owo_colors::OwoColorize;
 use rayon::prelude::*;
 use std::sync::atomic::{AtomicBool, Ordering};
@ -100,8 +100,6 @@ fn main() {
            let path = Path::new(&path);
            let bytes = read_or_die(path);
            let src = String::from_utf8_lossy(&bytes).to_string();
-            // TODO: Load display options rather than hard-coding.
-            let src = replace_tabs(&src, DEFAULT_TAB_WIDTH);

            let language = language_override.or_else(|| guess(path, &src));
            match language {
@ -122,8 +120,6 @@ fn main() {
            let path = Path::new(&path);
            let bytes = read_or_die(path);
            let src = String::from_utf8_lossy(&bytes).to_string();
-            // TODO: Load display options rather than hard-coding.
-            let src = replace_tabs(&src, DEFAULT_TAB_WIDTH);

            let language = language_override.or_else(|| guess(path, &src));
            match language {
@ -258,16 +254,6 @@ fn main() {
    };
 }

-/// Return a copy of `str` with all the tab characters replaced by
-/// `tab_width` strings.
-///
-/// TODO: This break parsers that require tabs, such as Makefile
-/// parsing. We shouldn't do this transform until after parsing.
-fn replace_tabs(src: &str, tab_width: usize) -> String {
-    let tab_as_spaces = " ".repeat(tab_width);
-    src.replace('\t', &tab_as_spaces)
-}
-
 /// Print a diff between two files.
 fn diff_file(
    lhs_display_path: &str,
@ -323,10 +309,6 @@ fn diff_file_content(
        (ProbableFileKind::Text(lhs_src), ProbableFileKind::Text(rhs_src)) => (lhs_src, rhs_src),
    };

-    // TODO: don't replace tab characters inside string literals.
-    lhs_src = replace_tabs(&lhs_src, display_options.tab_width);
-    rhs_src = replace_tabs(&rhs_src, display_options.tab_width);
-
    // Ignore the trailing newline, if present.
    // TODO: highlight if this has changes (#144).
    // TODO: factor out a string cleaning function.