Merge pull request #301 from lilydjwg/master

use unicode-width to align CJK characters
pull/315/head
Wilfred Hughes 2022-07-04 15:07:25 +07:00 committed by GitHub
commit 719654d462
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 203 additions and 51 deletions

7
Cargo.lock generated

@ -211,6 +211,7 @@ dependencies = [
"tree-sitter",
"tree_magic_mini",
"typed-arena",
"unicode-width",
"walkdir",
"wu-diff",
]
@ -626,6 +627,12 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d22af068fba1eb5edcb4aea19d382b2a3deb4c8f9d475c589b6ada9e0fd493ee"
[[package]]
name = "unicode-width"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973"
[[package]]
name = "unicode-xid"
version = "0.2.3"

@ -47,6 +47,7 @@ wu-diff = "0.1.2"
rayon = "1.5.2"
tree_magic_mini = "3.0.3"
bumpalo = "3.9.1"
unicode-width = "0.1.9"
[dev-dependencies]
pretty_assertions = "1.2.1"

@ -0,0 +1,59 @@
#: ../errors.h:589
# reorder if possible
#, fuzzy, c-format
# msgid "E244: Illegal %s name \"%s\" in font name \"%s\""
# msgstr "E244: 字体名 \"%3$s\" 中有非法 %1$s 名称 \"%2$s\""
#: ../errors.h:591
#, c-format
msgid "E245: Illegal char '%c' in font name \"%s\""
msgstr "E245: 不正确的字符 '%c' 出现在字体名称 \"%s\" 内"
#: ../errors.h:594
msgid "E246: FileChangedShell autocommand deleted buffer"
msgstr "E246: FileChangedShell 自动命令删除了缓冲区"
#: ../errors.h:597
#, c-format
msgid "E247: No registered server named \"%s\""
msgstr "E247: 没有名叫 \"%s\" 的已注册的服务器"
#: ../errors.h:788
# reorder if possible
#, fuzzy, c-format
# msgid "E316: ml_get: Cannot find line %ld in buffer %d %s"
# msgstr "E316: ml_get: 在缓冲区 %2$d %3$s 中找不到第 %1$ld 行"
#: ../errors.h:790
msgid "E317: Pointer block id wrong"
msgstr "E317: 指针块 id 错误"
#: ../errors.h:792
msgid "E317: Pointer block id wrong 2"
msgstr "E317: 指针块 id 错误 2"
#: ../errors.h:794
msgid "E317: Pointer block id wrong 3"
msgstr "E317: 指针块 id 错误 3"
#: ../errors.h:796
msgid "E317: Pointer block id wrong 4"
msgstr "E317: 指针块 id 错误 4"
#: ../errors.h:2705
# reorder if possible
#, fuzzy, c-format
# msgid "E1037: Cannot use \"%s\" with %s"
# msgstr "E1037: 不能对 %2$s 使用 \"%1$s\""
#: ../errors.h:2707
msgid "E1038: \"vim9script\" can only be used in a script"
msgstr "E1038: \"vim9script\" 只能在脚本中使用"
#: ../errors.h:2709
msgid "E1039: \"vim9script\" must be the first command in a script"
msgstr "E1039: \"vim9script\" 必须是脚本中的第一条命令"
#: ../errors.h:2712
msgid "E1040: Cannot use :scriptversion after :vim9script"
msgstr "E1040: :vim9script 之后不能使用 :scriptversion"

@ -0,0 +1,59 @@
#: ../errors.h:589
# reorder if possible
#, fuzzy, c-format
msgid "E244: Illegal %s name \"%s\" in font name \"%s\""
msgstr "E244: 字体名 \"%3$s\" 中有非法 %1$s 名称 \"%2$s\""
#: ../errors.h:591
#, c-format
msgid "E245: Illegal char '%c' in font name \"%s\""
msgstr "E245: 不正确的字符 '%c' 出现在字体名称 \"%s\" 内"
#: ../errors.h:594
msgid "E246: FileChangedShell autocommand deleted buffer"
msgstr "E246: FileChangedShell 自动命令删除了缓冲区"
#: ../errors.h:597
#, c-format
msgid "E247: No registered server named \"%s\""
msgstr "E247: 没有名叫 \"%s\" 的已注册的服务器"
#: ../errors.h:788
# reorder if possible
#, fuzzy, c-format
msgid "E316: ml_get: Cannot find line %ld in buffer %d %s"
msgstr "E316: ml_get: 在缓冲区 %2$d %3$s 中找不到第 %1$ld 行"
#: ../errors.h:790
msgid "E317: Pointer block id wrong"
msgstr "E317: 指针块 id 错误"
#: ../errors.h:792
msgid "E317: Pointer block id wrong 2"
msgstr "E317: 指针块 id 错误 2"
#: ../errors.h:794
msgid "E317: Pointer block id wrong 3"
msgstr "E317: 指针块 id 错误 3"
#: ../errors.h:796
msgid "E317: Pointer block id wrong 4"
msgstr "E317: 指针块 id 错误 4"
#: ../errors.h:2705
# reorder if possible
#, fuzzy, c-format
msgid "E1037: Cannot use \"%s\" with %s"
msgstr "E1037: 不能对 %2$s 使用 \"%1$s\""
#: ../errors.h:2707
msgid "E1038: \"vim9script\" can only be used in a script"
msgstr "E1038: \"vim9script\" 只能在脚本中使用"
#: ../errors.h:2709
msgid "E1039: \"vim9script\" must be the first command in a script"
msgstr "E1039: \"vim9script\" 必须是脚本中的第一条命令"
#: ../errors.h:2712
msgid "E1040: Cannot use :scriptversion after :vim9script"
msgstr "E1040: :vim9script 之后不能使用 :scriptversion"

@ -7,11 +7,14 @@ sample_files/bad_combine_before.rs sample_files/bad_combine_after.rs
sample_files/change_outer_before.el sample_files/change_outer_after.el
1857b63ba1bfa0ccc0a4243db6b1c5c2 -
sample_files/chinese_before.po sample_files/chinese_after.po
56f0af341fd86727dbac522293e8e013 -
sample_files/clojure_before.clj sample_files/clojure_after.clj
b916e224f289888252cd7597bab339e6 -
sample_files/comments_before.rs sample_files/comments_after.rs
f7b56285b9db37d84405f647fb15412f -
0b2756c60659993310f899b131cca84f -
sample_files/context_before.rs sample_files/context_after.rs
ef267b3bbea4b56a111427a11b24cc6a -
@ -71,7 +74,7 @@ sample_files/janet_before.janet sample_files/janet_after.janet
677604a16ef62f6b6252d76d76e86265 -
sample_files/java_before.java sample_files/java_after.java
22c27b91fd67d2b894de9a620bcf5c35 -
d7cdb754cc9311e39c7aa402a8c51ab9 -
sample_files/javascript_before.js sample_files/javascript_after.js
f4bfe92df94f89942bacc73e4a9db882 -
@ -167,7 +170,7 @@ sample_files/text_before.txt sample_files/text_after.txt
dfc3495b8d5931029b479f0c878a3219 -
sample_files/todomvc_before.gleam sample_files/todomvc_after.gleam
c1d8b44875121d81c583dd3a8fb43232 -
6f2f2b3905fbff7e283a2d3b312dc658 -
sample_files/toml_before.toml sample_files/toml_after.toml
1e2de7235c339b07a0784498453e896c -

@ -520,7 +520,7 @@ pub fn print(
display_options.use_color,
);
if let Some(line_num) = lhs_line_num {
if lhs_lines_with_novel.contains(line_num) {
if display_options.use_color && lhs_lines_with_novel.contains(line_num) {
s = if display_options.background_color.is_dark() {
s.bright_red().to_string()
} else {
@ -541,7 +541,7 @@ pub fn print(
display_options.use_color,
);
if let Some(line_num) = rhs_line_num {
if rhs_lines_with_novel.contains(line_num) {
if display_options.use_color && rhs_lines_with_novel.contains(line_num) {
s = if display_options.background_color.is_dark() {
s.bright_green().to_string()
} else {

@ -2,7 +2,7 @@
use crate::{
constants::Side,
lines::{byte_len, codepoint_len, split_on_newlines, LineNumber},
lines::{byte_len, split_on_newlines, LineNumber},
options::DisplayOptions,
parse::syntax::{AtomKind, MatchKind, MatchedPos, TokenKind},
positions::SingleLineSpan,
@ -10,6 +10,7 @@ use crate::{
use owo_colors::{OwoColorize, Style};
use rustc_hash::FxHashMap;
use std::cmp::{max, min};
use unicode_width::{UnicodeWidthStr, UnicodeWidthChar};
#[derive(Clone, Copy, Debug)]
pub enum BackgroundColor {
@ -23,22 +24,26 @@ impl BackgroundColor {
}
}
/// Slice `s` from `start` inclusive to `end` exclusive by codepoint. This is safer than
/// slicing by bytes, which panics if the byte isn't on a codepoint
/// boundary.
fn substring_by_codepoint(s: &str, start: usize, end: usize) -> &str {
/// Slice `s` from `start` inclusive to `end` exclusive by width.
fn substring_by_width(s: &str, start: usize, end: usize) -> &str {
if start == end {
return &s[0..0];
}
assert!(end > start);
let mut char_idx_iter = s.char_indices();
let byte_start = char_idx_iter
.nth(start)
.expect("Expected a codepoint index inside `s`.")
let mut idx_width_iter = s.char_indices()
.scan(0, |w, (idx, ch)| {
let before = *w;
*w += ch.width().unwrap_or(0);
Some((idx, before, *w))
})
.skip_while(|(_, before, _)| *before < start);
let byte_start = idx_width_iter
.next()
.expect("Expected a width index inside `s`.")
.0;
match char_idx_iter.nth(end - start - 1) {
match idx_width_iter.skip_while(|(_, _, after)| *after <= end).next() {
Some(byte_end) => &s[byte_start..byte_end.0],
None => &s[byte_start..],
}
@ -48,27 +53,38 @@ fn substring_by_byte(s: &str, start: usize, end: usize) -> &str {
&s[start..end]
}
/// Split a string into equal length parts, padding the last part if
/// necessary.
/// Split a string into parts of at most `max_width` display columns, noting how many spaces of padding each part needs.
///
/// Returns the split parts, each paired with the number of spaces it should be padded with.
///
/// ```
/// split_string_by_codepoint("fooba", 3) // vec!["foo", "ba "]
/// split_string_by_width("fooba", 3, true) // vec![("foo", 0), ("ba", 1)]
/// split_string_by_width("一个汉字两列宽", 8, false) // vec![("一个汉字", 0), ("两列宽", 0)]
/// ```
fn split_string_by_codepoint(s: &str, max_len: usize, pad_last: bool) -> Vec<String> {
fn split_string_by_width(s: &str, max_width: usize, pad: bool) -> Vec<(&str, usize)> {
let mut res = vec![];
let mut s = s;
while codepoint_len(s) > max_len {
res.push(substring_by_codepoint(s, 0, max_len).into());
s = substring_by_codepoint(s, max_len, codepoint_len(s));
while s.width() > max_width {
let l = substring_by_width(s, 0, max_width);
let used = l.width();
let padding = if pad && used < max_width {
// The next char is fullwidth and does not fit in the one remaining column.
1
} else {
0
};
res.push((l, padding));
s = substring_by_width(s, used, s.width());
}
if res.is_empty() || !s.is_empty() {
if pad_last {
res.push(format!("{:width$}", s, width = max_len));
let padding = if pad {
max_width - s.width()
} else {
res.push(s.to_string());
}
0
};
res.push((s, padding));
}
res
@ -90,13 +106,13 @@ pub fn split_and_apply(
) -> Vec<String> {
if styles.is_empty() && !line.trim().is_empty() {
// Missing styles is a bug, so highlight in purple to make this obvious.
return split_string_by_codepoint(line, max_len, matches!(side, Side::Left))
return split_string_by_width(line, max_len, matches!(side, Side::Left))
.into_iter()
.map(|part| {
.map(|(part, _)| {
if use_color {
highlight_missing_style_bug(&part)
highlight_missing_style_bug(part)
} else {
part
part.to_owned()
}
})
.collect();
@ -105,8 +121,8 @@ pub fn split_and_apply(
let mut styled_parts = vec![];
let mut part_start = 0;
for part in split_string_by_codepoint(line, max_len, matches!(side, Side::Left)) {
let mut res = String::with_capacity(part.len());
for (part, pad) in split_string_by_width(line, max_len, matches!(side, Side::Left)) {
let mut res = String::with_capacity(part.len() + pad);
let mut prev_style_end = 0;
for (span, style) in styles {
let start_col = span.start_col as usize;
@ -122,7 +138,7 @@ pub fn split_and_apply(
// Then append that text without styling.
let unstyled_start = max(prev_style_end, part_start);
res.push_str(substring_by_byte(
&part,
part,
unstyled_start - part_start,
start_col - part_start,
));
@ -131,9 +147,9 @@ pub fn split_and_apply(
// Apply style to the substring in this span.
if end_col > part_start {
let span_s = substring_by_byte(
&part,
part,
max(0, span.start_col as isize - part_start as isize) as usize,
min(byte_len(&part), end_col - part_start),
min(byte_len(part), end_col - part_start),
);
res.push_str(&span_s.style(*style).to_string());
}
@ -147,13 +163,14 @@ pub fn split_and_apply(
}
// Unstyled text after the last span.
if prev_style_end < part_start + codepoint_len(&part) {
let span_s = substring_by_byte(&part, prev_style_end - part_start, byte_len(&part));
if prev_style_end < part_start + byte_len(part) {
let span_s = substring_by_byte(part, prev_style_end - part_start, byte_len(part));
res.push_str(span_s);
}
res.push_str(&" ".repeat(pad));
styled_parts.push(res);
part_start += byte_len(&part);
part_start += byte_len(part);
}
styled_parts
@ -385,36 +402,42 @@ mod tests {
use pretty_assertions::assert_eq;
#[test]
fn test_substring_by_codepoint() {
assert_eq!(substring_by_codepoint("abcd", 0, 2), "ab");
fn split_string_simple() {
assert_eq!(
split_string_by_width("fooba", 3, true),
vec![("foo", 0), ("ba", 1)]
);
}
#[test]
fn test_substring_by_codepoint_empty() {
assert_eq!(substring_by_codepoint("abcd", 0, 0), "");
fn split_string_simple_no_pad() {
assert_eq!(
split_string_by_width("fooba", 3, false),
vec![("foo", 0), ("ba", 0)]
);
}
#[test]
fn split_string_simple() {
fn split_string_unicode() {
assert_eq!(
split_string_by_codepoint("fooba", 3, true),
vec!["foo", "ba "]
split_string_by_width("ab📦def", 4, true),
vec![("ab📦", 0), ("def", 1)]
);
}
#[test]
fn split_string_simple_no_pad() {
fn split_string_cjk() {
assert_eq!(
split_string_by_codepoint("fooba", 3, false),
vec!["foo", "ba"]
split_string_by_width("一个汉字两列宽", 8, false),
vec![("一个汉字", 0), ("两列宽", 0)]
);
}
#[test]
fn split_string_unicode() {
fn split_string_cjk2() {
assert_eq!(
split_string_by_codepoint("ab📦def", 3, true),
vec!["ab📦", "def"]
split_string_by_width("你好啊", 5, true),
vec![("你好", 1), ("啊", 3)]
);
}