Ensure line splitting distinguishes "foo" and "foo\n"

We rely on being able to split lines and rejoin them to obtain the
original string. `str::lines()` in the Rust stdlib does not have this
property.

This was causing crashes in word-diffing on textual diffing, where
code paths differed on the number of lines they thought a string had.

This was broken in 8b842387a1.

Fixes #688.
empty_delimiters_heuristic 0.59.0
Wilfred Hughes 2024-07-20 15:54:27 +07:00
parent efe1b10e8d
commit ffe27c575e
10 changed files with 166 additions and 25 deletions

@ -1,5 +1,10 @@
## 0.59 (unreleased)
### Diffing
Fixed crash on some textual files where a single change contained more than
1,000 words.
### Parsing
Added support for device tree and F#.

@ -0,0 +1,7 @@
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/naoina/go-stringutil v0.1.0 h1:rCUeRUHjBjGTSHl0VC00jUPLz8/F9dDzYI70Hzifhks=
github.com/naoina/go-stringutil v0.1.0/go.mod h1:XJ2SJL9jCtBh+P9q5btrd/Ylo8XwT/h1USek5+NqSA0=
github.com/naoina/toml v0.1.1 h1:PT/lllxVVN0gzzSqSlHEmP8MJB4MY2U7STGxiouV4X8=
github.com/naoina/toml v0.1.1/go.mod h1:NBIhNtsFMo3G2szEBne+bO4gS192HuIYRqfvOWb4i1E=
github.com/streadway/amqp v0.0.0-20160311215503-2e25825abdbd h1:625/bJvSNfQrzzK5ttnUqMqnVe8M5MILmf5ZRGgeeDY=
github.com/streadway/amqp v0.0.0-20160311215503-2e25825abdbd/go.mod h1:1WNBiOZtZQLpVAyu0iTduoJL9hEsMloAK5XWrtW0xdY=

@ -0,0 +1,49 @@
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/naoina/go-stringutil v0.1.0 h1:rCUeRUHjBjGTSHl0VC00jUPLz8/F9dDzYI70Hzifhks=
github.com/naoina/go-stringutil v0.1.0/go.mod h1:XJ2SJL9jCtBh+P9q5btrd/Ylo8XwT/h1USek5+NqSA0=
github.com/naoina/toml v0.1.1 h1:PT/lllxVVN0gzzSqSlHEmP8MJB4MY2U7STGxiouV4X8=
github.com/naoina/toml v0.1.1/go.mod h1:NBIhNtsFMo3G2szEBne+bO4gS192HuIYRqfvOWb4i1E=
github.com/streadway/amqp v0.0.0-20160311215503-2e25825abdbd h1:625/bJvSNfQrzzK5ttnUqMqnVe8M5MILmf5ZRGgeeDY=
github.com/streadway/amqp v0.0.0-20160311215503-2e25825abdbd/go.mod h1:1WNBiOZtZQLpVAyu0iTduoJL9hEsMloAK5XWrtW0xdY=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0=
go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20220708220712-1185a9018129/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.7.0 h1:qe6s0zUXlPX80/dITx3440hWZ7GwMwgDDyrSGTPJG/g=
golang.org/x/oauth2 v0.7.0/go.mod h1:hPLQkd9LyjfXTiRohC/41GhcFqxisoUQ99sCUOHO9x4=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE=
golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=

@ -19,6 +19,9 @@ sample_files/b2_math_1.h sample_files/b2_math_2.h
sample_files/bad_combine_1.rs sample_files/bad_combine_2.rs
f5051bf7d2b8afa3a677388cbd458891 -
sample_files/big_text_hunk_1.txt sample_files/big_text_hunk_2.txt
fd0c8912c094097f82c6b29ae66fb912 -
sample_files/change_outer_1.el sample_files/change_outer_2.el
2b9334a4cc72da63bba28eff958f0038 -
@ -140,7 +143,7 @@ sample_files/makefile_1.mk sample_files/makefile_2.mk
d0572210b5121ce68ac0ce45e43b922b -
sample_files/many_newlines_1.txt sample_files/many_newlines_2.txt
615de4b145b7b161e4fb285728280ed1 -
52ca05855e520876479e6f608c5e7831 -
sample_files/metadata_1.clj sample_files/metadata_2.clj
4b58ce366467c8cca46db53508e81323 -
@ -293,5 +296,5 @@ sample_files/yaml_1.yaml sample_files/yaml_2.yaml
f068239fc7bade0e6de96d81136c1ac5 -
sample_files/zig_1.zig sample_files/zig_2.zig
4516796003b81f35bfa57d84bb7c0cbe -
e36d1ea126b8b68e3344434bb63f205e -

@ -2,10 +2,12 @@
use crate::{
constants::Side,
display::context::{calculate_after_context, calculate_before_context, opposite_positions},
display::hunks::Hunk,
display::style::{self, apply_colors, apply_line_number_color},
lines::{format_line_num, MaxLine},
display::{
context::{calculate_after_context, calculate_before_context, opposite_positions},
hunks::Hunk,
style::{self, apply_colors, apply_line_number_color},
},
lines::{format_line_num, split_on_newlines, MaxLine},
options::DisplayOptions,
parse::syntax::MatchedPos,
summary::FileFormat,
@ -43,8 +45,12 @@ pub(crate) fn print(
)
} else {
(
lhs_src.lines().map(|s| format!("{}\n", s)).collect(),
rhs_src.lines().map(|s| format!("{}\n", s)).collect(),
split_on_newlines(lhs_src)
.map(|s| format!("{}\n", s))
.collect(),
split_on_newlines(rhs_src)
.map(|s| format!("{}\n", s))
.collect(),
)
};

@ -11,14 +11,16 @@ use owo_colors::{OwoColorize, Style};
use crate::{
constants::Side,
display::context::all_matched_lines_filled,
display::hunks::{matched_lines_indexes_for_hunk, Hunk},
display::style::{
self, apply_colors, apply_line_number_color, color_positions, novel_style, replace_tabs,
split_and_apply, BackgroundColor,
display::{
context::all_matched_lines_filled,
hunks::{matched_lines_indexes_for_hunk, Hunk},
style::{
self, apply_colors, apply_line_number_color, color_positions, novel_style,
replace_tabs, split_and_apply, BackgroundColor,
},
},
hash::DftHashMap,
lines::format_line_num,
lines::{format_line_num, split_on_newlines},
options::{DisplayMode, DisplayOptions},
parse::syntax::{zip_pad_shorter, MatchedPos},
summary::FileFormat,
@ -338,8 +340,12 @@ pub(crate) fn print(
)
} else {
(
lhs_src.lines().map(|s| format!("{}\n", s)).collect(),
rhs_src.lines().map(|s| format!("{}\n", s)).collect(),
split_on_newlines(lhs_src)
.map(|s| format!("{}\n", s))
.collect(),
split_on_newlines(rhs_src)
.map(|s| format!("{}\n", s))
.collect(),
)
};
@ -401,8 +407,21 @@ pub(crate) fn print(
let mut prev_lhs_line_num = None;
let mut prev_rhs_line_num = None;
let lhs_lines = lhs_src.lines().collect::<Vec<_>>();
let rhs_lines = rhs_src.lines().collect::<Vec<_>>();
let mut lhs_lines = split_on_newlines(lhs_src).collect::<Vec<_>>();
let mut rhs_lines = split_on_newlines(rhs_src).collect::<Vec<_>>();
// If "foo" is one line, is "foo\n" two lines? Generally we want
// to care about newlines when deciding whether content differs.
//
// Ending a file with a trailing newline is extremely common
// though. If both files have a trailing newline, consider "foo\n"
// to be "foo" so we don't end up displaying a blank line on both
// sides.
if lhs_lines.last() == Some(&"") && rhs_lines.last() == Some(&"") {
lhs_lines.pop();
rhs_lines.pop();
}
let matched_lines = all_matched_lines_filled(lhs_mps, rhs_mps, &lhs_lines, &rhs_lines);
let mut matched_lines_to_print = &matched_lines[..];

@ -7,6 +7,7 @@ use line_numbers::SingleLineSpan;
use owo_colors::{OwoColorize, Style};
use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};
use crate::lines::split_on_newlines;
use crate::parse::syntax::StringKind;
use crate::{
constants::Side,
@ -401,7 +402,7 @@ pub(crate) fn apply_colors(
positions: &[MatchedPos],
) -> Vec<String> {
let styles = color_positions(side, background, syntax_highlight, file_format, positions);
let lines = s.lines().collect::<Vec<_>>();
let lines = split_on_newlines(s).collect::<Vec<_>>();
style_lines(&lines, &styles)
}

@ -32,6 +32,21 @@ impl<S: AsRef<str>> MaxLine for S {
}
}
/// Split `s` on \n or \r\n. Always returns a non-empty vec. Each line
/// in the vec does not include the trailing newline.
///
/// This differs from `str::lines`, which considers `""` to be zero
/// lines and `"foo\n"` to be one line.
pub(crate) fn split_on_newlines(s: &str) -> impl Iterator<Item = &str> {
s.split('\n').map(|l| {
if let Some(l) = l.strip_suffix('\r') {
l
} else {
l
}
})
}
pub(crate) fn is_all_whitespace(s: &str) -> bool {
s.chars().all(|c| c.is_whitespace())
}
@ -66,6 +81,40 @@ mod tests {
assert_eq!(line.max_line().0, 1);
}
#[test]
fn test_split_line_empty() {
assert_eq!(split_on_newlines("").collect::<Vec<_>>(), vec![""]);
}
#[test]
fn test_split_line_single() {
assert_eq!(split_on_newlines("foo").collect::<Vec<_>>(), vec!["foo"]);
}
#[test]
fn test_split_line_with_newline() {
assert_eq!(
split_on_newlines("foo\nbar").collect::<Vec<_>>(),
vec!["foo", "bar"]
);
}
#[test]
fn test_split_line_with_crlf() {
assert_eq!(
split_on_newlines("foo\r\nbar").collect::<Vec<_>>(),
vec!["foo", "bar"]
);
}
#[test]
fn test_split_line_with_trailing_newline() {
assert_eq!(
split_on_newlines("foo\nbar\n").collect::<Vec<_>>(),
vec!["foo", "bar", ""]
);
}
#[test]
fn test_is_all_whitespace() {
assert!(is_all_whitespace(" \n\t"));

@ -179,6 +179,8 @@ pub(crate) fn language_name(language: Language) -> &'static str {
use Language::*;
use crate::lines::split_on_newlines;
/// File globs that identify languages based on the file path.
pub(crate) fn language_globs(language: Language) -> Vec<glob::Pattern> {
let glob_strs: &'static [&'static str] = match language {
@ -420,7 +422,7 @@ fn looks_like_hacklang(path: &Path, src: &str) -> bool {
fn looks_like_objc(path: &Path, src: &str) -> bool {
if let Some(extension) = path.extension() {
if extension == "h" {
return src.lines().take(100).any(|line| {
return split_on_newlines(src).take(100).any(|line| {
["#import", "@interface", "@protocol"]
.iter()
.any(|keyword| line.starts_with(keyword))
@ -497,7 +499,7 @@ fn from_emacs_mode_header(src: &str) -> Option<Language> {
// Emacs allows the mode header to occur on the second line if the
// first line is a shebang.
for line in src.lines().take(2) {
for line in split_on_newlines(src).take(2) {
let mode_name: String = match (MODE_RE.captures(line), SHORTHAND_RE.captures(line)) {
(Some(cap), _) | (_, Some(cap)) => cap[1].into(),
_ => "".into(),
@ -559,7 +561,7 @@ fn from_shebang(src: &str) -> Option<Language> {
lazy_static! {
static ref RE: Regex = Regex::new(r"#! *(?:/usr/bin/env )?([^ ]+)").unwrap();
}
if let Some(first_line) = src.lines().next() {
if let Some(first_line) = split_on_newlines(src).next() {
if let Some(cap) = RE.captures(first_line) {
let interpreter_path = Path::new(&cap[1]);
if let Some(name) = interpreter_path.file_name() {

@ -9,6 +9,7 @@ use line_numbers::SingleLineSpan;
use typed_arena::Arena;
use self::Syntax::*;
use crate::lines::split_on_newlines;
use crate::words::split_words_and_numbers;
use crate::{
diff::changes::ChangeKind,
@ -408,9 +409,8 @@ fn set_content_id(nodes: &[&Syntax], existing: &mut DftHashMap<ContentKey, u32>)
..
} => {
let is_comment = *highlight == AtomKind::Comment;
let clean_content = if is_comment && content.lines().count() > 1 {
content
.lines()
let clean_content = if is_comment && split_on_newlines(content).count() > 1 {
split_on_newlines(content)
.map(|l| l.trim_start())
.collect::<Vec<_>>()
.join("\n")