Merge pull request #301 from lilydjwg/master

use unicode-width to align CJK characters
2022-07-04 15:07:25 +07:00 · 2022-07-04 15:07:25 +07:00 · 719654d462
parent 156c701459 0648b0a6cf
commit 719654d462
7 changed files with 203 additions and 51 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -211,6 +211,7 @@ dependencies = [
 "tree-sitter",
 "tree_magic_mini",
 "typed-arena",
+ "unicode-width",
 "walkdir",
 "wu-diff",
 ]
@ -626,6 +627,12 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d22af068fba1eb5edcb4aea19d382b2a3deb4c8f9d475c589b6ada9e0fd493ee"

+[[package]]
+name = "unicode-width"
+version = "0.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973"
+
 [[package]]
 name = "unicode-xid"
 version = "0.2.3"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -47,6 +47,7 @@ wu-diff = "0.1.2"
 rayon = "1.5.2"
 tree_magic_mini = "3.0.3"
 bumpalo = "3.9.1"
+unicode-width = "0.1.9"

 [dev-dependencies]
 pretty_assertions = "1.2.1"
--- a/sample_files/chinese_after.po
+++ b/sample_files/chinese_after.po
@ -0,0 +1,59 @@
+#: ../errors.h:589
+# reorder if possible
+#, fuzzy, c-format
+# msgid "E244: Illegal %s name \"%s\" in font name \"%s\""
+# msgstr "E244: 字体名 \"%3$s\" 中有非法 %1$s 名称 \"%2$s\""
+
+#: ../errors.h:591
+#, c-format
+msgid "E245: Illegal char '%c' in font name \"%s\""
+msgstr "E245: 不正确的字符 '%c' 出现在字体名称 \"%s\" 内"
+
+#: ../errors.h:594
+msgid "E246: FileChangedShell autocommand deleted buffer"
+msgstr "E246: FileChangedShell 自动命令删除了缓冲区"
+
+#: ../errors.h:597
+#, c-format
+msgid "E247: No registered server named \"%s\""
+msgstr "E247: 没有名叫 \"%s\" 的已注册的服务器"
+
+#: ../errors.h:788
+# reorder if possible
+#, fuzzy, c-format
+# msgid "E316: ml_get: Cannot find line %ld in buffer %d %s"
+# msgstr "E316: ml_get: 在缓冲区 %2$d %3$s 中找不到第 %1$ld 行"
+
+#: ../errors.h:790
+msgid "E317: Pointer block id wrong"
+msgstr "E317: 指针块 id 错误"
+
+#: ../errors.h:792
+msgid "E317: Pointer block id wrong 2"
+msgstr "E317: 指针块 id 错误 2"
+
+#: ../errors.h:794
+msgid "E317: Pointer block id wrong 3"
+msgstr "E317: 指针块 id 错误 3"
+
+#: ../errors.h:796
+msgid "E317: Pointer block id wrong 4"
+msgstr "E317: 指针块 id 错误 4"
+
+#: ../errors.h:2705
+# reorder if possible
+#, fuzzy, c-format
+# msgid "E1037: Cannot use \"%s\" with %s"
+# msgstr "E1037: 不能对 %2$s 使用 \"%1$s\""
+
+#: ../errors.h:2707
+msgid "E1038: \"vim9script\" can only be used in a script"
+msgstr "E1038: \"vim9script\" 只能在脚本中使用"
+
+#: ../errors.h:2709
+msgid "E1039: \"vim9script\" must be the first command in a script"
+msgstr "E1039: \"vim9script\" 必须是脚本中的第一条命令"
+
+#: ../errors.h:2712
+msgid "E1040: Cannot use :scriptversion after :vim9script"
+msgstr "E1040: :vim9script 之后不能使用 :scriptversion"
--- a/sample_files/chinese_before.po
+++ b/sample_files/chinese_before.po
@ -0,0 +1,59 @@
+#: ../errors.h:589
+# reorder if possible
+#, fuzzy, c-format
+msgid "E244: Illegal %s name \"%s\" in font name \"%s\""
+msgstr "E244: 字体名 \"%3$s\" 中有非法 %1$s 名称 \"%2$s\""
+
+#: ../errors.h:591
+#, c-format
+msgid "E245: Illegal char '%c' in font name \"%s\""
+msgstr "E245: 不正确的字符 '%c' 出现在字体名称 \"%s\" 内"
+
+#: ../errors.h:594
+msgid "E246: FileChangedShell autocommand deleted buffer"
+msgstr "E246: FileChangedShell 自动命令删除了缓冲区"
+
+#: ../errors.h:597
+#, c-format
+msgid "E247: No registered server named \"%s\""
+msgstr "E247: 没有名叫 \"%s\" 的已注册的服务器"
+
+#: ../errors.h:788
+# reorder if possible
+#, fuzzy, c-format
+msgid "E316: ml_get: Cannot find line %ld in buffer %d %s"
+msgstr "E316: ml_get: 在缓冲区 %2$d %3$s 中找不到第 %1$ld 行"
+
+#: ../errors.h:790
+msgid "E317: Pointer block id wrong"
+msgstr "E317: 指针块 id 错误"
+
+#: ../errors.h:792
+msgid "E317: Pointer block id wrong 2"
+msgstr "E317: 指针块 id 错误 2"
+
+#: ../errors.h:794
+msgid "E317: Pointer block id wrong 3"
+msgstr "E317: 指针块 id 错误 3"
+
+#: ../errors.h:796
+msgid "E317: Pointer block id wrong 4"
+msgstr "E317: 指针块 id 错误 4"
+
+#: ../errors.h:2705
+# reorder if possible
+#, fuzzy, c-format
+msgid "E1037: Cannot use \"%s\" with %s"
+msgstr "E1037: 不能对 %2$s 使用 \"%1$s\""
+
+#: ../errors.h:2707
+msgid "E1038: \"vim9script\" can only be used in a script"
+msgstr "E1038: \"vim9script\" 只能在脚本中使用"
+
+#: ../errors.h:2709
+msgid "E1039: \"vim9script\" must be the first command in a script"
+msgstr "E1039: \"vim9script\" 必须是脚本中的第一条命令"
+
+#: ../errors.h:2712
+msgid "E1040: Cannot use :scriptversion after :vim9script"
+msgstr "E1040: :vim9script 之后不能使用 :scriptversion"
--- a/sample_files/compare.expected
+++ b/sample_files/compare.expected
@ -7,11 +7,14 @@ sample_files/bad_combine_before.rs sample_files/bad_combine_after.rs
 sample_files/change_outer_before.el sample_files/change_outer_after.el
 1857b63ba1bfa0ccc0a4243db6b1c5c2  -

+sample_files/chinese_before.po sample_files/chinese_after.po
+56f0af341fd86727dbac522293e8e013  -
+
 sample_files/clojure_before.clj sample_files/clojure_after.clj
 b916e224f289888252cd7597bab339e6  -

 sample_files/comments_before.rs sample_files/comments_after.rs
-f7b56285b9db37d84405f647fb15412f  -
+0b2756c60659993310f899b131cca84f  -

 sample_files/context_before.rs sample_files/context_after.rs
 ef267b3bbea4b56a111427a11b24cc6a  -
@ -71,7 +74,7 @@ sample_files/janet_before.janet sample_files/janet_after.janet
 677604a16ef62f6b6252d76d76e86265  -

 sample_files/java_before.java sample_files/java_after.java
-22c27b91fd67d2b894de9a620bcf5c35  -
+d7cdb754cc9311e39c7aa402a8c51ab9  -

 sample_files/javascript_before.js sample_files/javascript_after.js
 f4bfe92df94f89942bacc73e4a9db882  -
@ -167,7 +170,7 @@ sample_files/text_before.txt sample_files/text_after.txt
 dfc3495b8d5931029b479f0c878a3219  -

 sample_files/todomvc_before.gleam sample_files/todomvc_after.gleam
-c1d8b44875121d81c583dd3a8fb43232  -
+6f2f2b3905fbff7e283a2d3b312dc658  -

 sample_files/toml_before.toml sample_files/toml_after.toml
 1e2de7235c339b07a0784498453e896c  -
--- a/src/display/side_by_side.rs
+++ b/src/display/side_by_side.rs
@ -520,7 +520,7 @@ pub fn print(
                            display_options.use_color,
                        );
                        if let Some(line_num) = lhs_line_num {
-                            if lhs_lines_with_novel.contains(line_num) {
+                            if display_options.use_color && lhs_lines_with_novel.contains(line_num) {
                                s = if display_options.background_color.is_dark() {
                                    s.bright_red().to_string()
                                } else {
@ -541,7 +541,7 @@ pub fn print(
                            display_options.use_color,
                        );
                        if let Some(line_num) = rhs_line_num {
-                            if rhs_lines_with_novel.contains(line_num) {
+                            if display_options.use_color && rhs_lines_with_novel.contains(line_num) {
                                s = if display_options.background_color.is_dark() {
                                    s.bright_green().to_string()
                                } else {
--- a/src/display/style.rs
+++ b/src/display/style.rs
@ -2,7 +2,7 @@

 use crate::{
    constants::Side,
-    lines::{byte_len, codepoint_len, split_on_newlines, LineNumber},
+    lines::{byte_len, split_on_newlines, LineNumber},
    options::DisplayOptions,
    parse::syntax::{AtomKind, MatchKind, MatchedPos, TokenKind},
    positions::SingleLineSpan,
@ -10,6 +10,7 @@ use crate::{
 use owo_colors::{OwoColorize, Style};
 use rustc_hash::FxHashMap;
 use std::cmp::{max, min};
+use unicode_width::{UnicodeWidthStr, UnicodeWidthChar};

 #[derive(Clone, Copy, Debug)]
 pub enum BackgroundColor {
@ -23,22 +24,26 @@ impl BackgroundColor {
    }
 }

-/// Slice `s` from `start` inclusive to `end` exclusive by codepoint. This is safer than
-/// slicing by bytes, which panics if the byte isn't on a codepoint
-/// boundary.
-fn substring_by_codepoint(s: &str, start: usize, end: usize) -> &str {
+/// Slice `s` from `start` inclusive to `end` exclusive, measured in display width (columns) rather than codepoints.
+fn substring_by_width(s: &str, start: usize, end: usize) -> &str {
    if start == end {
        return &s[0..0];
    }

    assert!(end > start);

-    let mut char_idx_iter = s.char_indices();
-    let byte_start = char_idx_iter
-        .nth(start)
-        .expect("Expected a codepoint index inside `s`.")
+    let mut idx_width_iter = s.char_indices()
+        .scan(0, |w, (idx, ch)| {
+          let before = *w;
+          *w += ch.width().unwrap_or(0);
+          Some((idx, before, *w))
+        })
+        .skip_while(|(_, before, _)| *before < start);
+    let byte_start = idx_width_iter
+        .next()
+        .expect("Expected a width index inside `s`.")
        .0;
-    match char_idx_iter.nth(end - start - 1) {
+    match idx_width_iter.skip_while(|(_, _, after)| *after <= end).next() {
        Some(byte_end) => &s[byte_start..byte_end.0],
        None => &s[byte_start..],
    }
@ -48,27 +53,38 @@ fn substring_by_byte(s: &str, start: usize, end: usize) -> &str {
    &s[start..end]
 }

-/// Split a string into equal length parts, padding the last part if
-/// necessary.
+/// Split a string into parts of equal width, and compute how many spaces each part should be padded with.
+///
+/// Returns the split strings, each paired with the number of spaces it should be padded with.
 ///
 /// ```
-/// split_string_by_codepoint("fooba", 3) // vec!["foo", "ba "]
+/// split_string_by_width("fooba", 3, true) // vec![("foo", 0), ("ba", 1)]
+/// split_string_by_width("一个汉字两列宽", 8, false) // vec![("一个汉字", 0), ("两列宽", 0)]
 /// ```
-fn split_string_by_codepoint(s: &str, max_len: usize, pad_last: bool) -> Vec<String> {
+fn split_string_by_width(s: &str, max_width: usize, pad: bool) -> Vec<(&str, usize)> {
    let mut res = vec![];
    let mut s = s;

-    while codepoint_len(s) > max_len {
-        res.push(substring_by_codepoint(s, 0, max_len).into());
-        s = substring_by_codepoint(s, max_len, codepoint_len(s));
+    while s.width() > max_width {
+        let l = substring_by_width(s, 0, max_width);
+        let used = l.width();
+        let padding = if pad && used < max_width {
+          // a fullwidth (two-column) char comes next and does not fit in the one remaining column
+          1
+        } else {
+          0
+        };
+        res.push((l, padding));
+        s = substring_by_width(s, used, s.width());
    }

    if res.is_empty() || !s.is_empty() {
-        if pad_last {
-            res.push(format!("{:width$}", s, width = max_len));
+        let padding = if pad {
+            max_width - s.width()
        } else {
-            res.push(s.to_string());
-        }
+            0
+        };
+        res.push((s, padding));
    }

    res
@ -90,13 +106,13 @@ pub fn split_and_apply(
 ) -> Vec<String> {
    if styles.is_empty() && !line.trim().is_empty() {
        // Missing styles is a bug, so highlight in purple to make this obvious.
-        return split_string_by_codepoint(line, max_len, matches!(side, Side::Left))
+        return split_string_by_width(line, max_len, matches!(side, Side::Left))
            .into_iter()
-            .map(|part| {
+            .map(|(part, _)| {
                if use_color {
-                    highlight_missing_style_bug(&part)
+                    highlight_missing_style_bug(part)
                } else {
-                    part
+                    part.to_owned()
                }
            })
            .collect();
@ -105,8 +121,8 @@ pub fn split_and_apply(
    let mut styled_parts = vec![];
    let mut part_start = 0;

-    for part in split_string_by_codepoint(line, max_len, matches!(side, Side::Left)) {
-        let mut res = String::with_capacity(part.len());
+    for (part, pad) in split_string_by_width(line, max_len, matches!(side, Side::Left)) {
+        let mut res = String::with_capacity(part.len() + pad);
        let mut prev_style_end = 0;
        for (span, style) in styles {
            let start_col = span.start_col as usize;
@ -122,7 +138,7 @@ pub fn split_and_apply(
                // Then append that text without styling.
                let unstyled_start = max(prev_style_end, part_start);
                res.push_str(substring_by_byte(
-                    &part,
+                    part,
                    unstyled_start - part_start,
                    start_col - part_start,
                ));
@ -131,9 +147,9 @@ pub fn split_and_apply(
            // Apply style to the substring in this span.
            if end_col > part_start {
                let span_s = substring_by_byte(
-                    &part,
+                    part,
                    max(0, span.start_col as isize - part_start as isize) as usize,
-                    min(byte_len(&part), end_col - part_start),
+                    min(byte_len(part), end_col - part_start),
                );
                res.push_str(&span_s.style(*style).to_string());
            }
@ -147,13 +163,14 @@ pub fn split_and_apply(
        }

        // Unstyled text after the last span.
-        if prev_style_end < part_start + codepoint_len(&part) {
-            let span_s = substring_by_byte(&part, prev_style_end - part_start, byte_len(&part));
+        if prev_style_end < part_start + byte_len(part) {
+            let span_s = substring_by_byte(part, prev_style_end - part_start, byte_len(part));
            res.push_str(span_s);
        }
+        res.push_str(&" ".repeat(pad));

        styled_parts.push(res);
-        part_start += byte_len(&part);
+        part_start += byte_len(part);
    }

    styled_parts
@ -385,36 +402,42 @@ mod tests {
    use pretty_assertions::assert_eq;

    #[test]
-    fn test_substring_by_codepoint() {
-        assert_eq!(substring_by_codepoint("abcd", 0, 2), "ab");
+    fn split_string_simple() {
+        assert_eq!(
+            split_string_by_width("fooba", 3, true),
+            vec![("foo", 0), ("ba", 1)]
+        );
    }

    #[test]
-    fn test_substring_by_codepoint_empty() {
-        assert_eq!(substring_by_codepoint("abcd", 0, 0), "");
+    fn split_string_simple_no_pad() {
+        assert_eq!(
+            split_string_by_width("fooba", 3, false),
+            vec![("foo", 0), ("ba", 0)]
+        );
    }

    #[test]
-    fn split_string_simple() {
+    fn split_string_unicode() {
        assert_eq!(
-            split_string_by_codepoint("fooba", 3, true),
-            vec!["foo", "ba "]
+            split_string_by_width("ab📦def", 4, true),
+            vec![("ab📦", 0), ("def", 1)]
        );
    }

    #[test]
-    fn split_string_simple_no_pad() {
+    fn split_string_cjk() {
        assert_eq!(
-            split_string_by_codepoint("fooba", 3, false),
-            vec!["foo", "ba"]
+            split_string_by_width("一个汉字两列宽", 8, false),
+            vec![("一个汉字", 0), ("两列宽", 0)]
        );
    }

    #[test]
-    fn split_string_unicode() {
+    fn split_string_cjk2() {
        assert_eq!(
-            split_string_by_codepoint("ab📦def", 3, true),
-            vec!["ab📦", "def"]
+            split_string_by_width("你好啊", 5, true),
+            vec![("你好", 1), ("啊", 3)]
        );
    }