Switch to wu-diff for textual diffing

In #153 a user reported that difftastic never terminated on a
140,000-line file. This was due to the diff crate using a very large
amount of time and memory.

The diff crate does not use Myers' algorithm, which has a
divide-and-conquer approach using snakes:

https://blog.jcoglan.com/2017/03/22/myers-diff-in-linear-space-theory/

wu-diff does implement Myers' algorithm and performs much better on
these large files.
pull/297/head
Wilfred Hughes 2022-03-10 23:06:04 +07:00
parent e8d9ffa61c
commit afb1b369f4
4 changed files with 29 additions and 7 deletions

@ -1,5 +1,9 @@
## 0.23 (unreleased)
### Diffing
Improved performance on very large files that are compared by text.
## 0.22 (released 10th March 2022)
Difftastic now requires Rust 1.56 to build.

7
Cargo.lock generated

@ -187,6 +187,7 @@ dependencies = [
"tree-sitter",
"typed-arena",
"walkdir",
"wu-diff",
]
[[package]]
@ -574,3 +575,9 @@ name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "wu-diff"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e3e6735fcde06432870db8dc9d7e3ab1b93727c14eaef329969426299f28893"

@ -41,6 +41,7 @@ term_size = "0.3.2"
const_format = "0.2.22"
owo-colors = "3.2.0"
rpds = "0.11.0"
wu-diff = "0.1.2"
[dev-dependencies]
pretty_assertions = "1.0.0"

@ -109,15 +109,25 @@ fn diff_slice_by_hash<'a, T: Eq + Hash>(lhs: &'a [T], rhs: &'a [T]) -> Vec<diff:
rhs_ids.push(id);
}
diff::slice(&lhs_ids[..], &rhs_ids[..])
wu_diff::diff(&lhs_ids[..], &rhs_ids[..])
.into_iter()
.map(|result| match result {
diff::Result::Left(id) => diff::Result::Left(*id_values.get(id).unwrap()),
diff::Result::Both(lhs_id, rhs_id) => diff::Result::Both(
*id_values.get(lhs_id).unwrap(),
*id_values.get(rhs_id).unwrap(),
),
diff::Result::Right(id) => diff::Result::Right(*id_values.get(id).unwrap()),
wu_diff::DiffResult::Removed(r) => {
let id = lhs_ids[r.old_index.unwrap()];
diff::Result::Left(*id_values.get(&id).unwrap())
}
wu_diff::DiffResult::Common(c) => {
let lhs_id = lhs_ids[c.old_index.unwrap()];
let rhs_id = rhs_ids[c.new_index.unwrap()];
diff::Result::Both(
*id_values.get(&lhs_id).unwrap(),
*id_values.get(&rhs_id).unwrap(),
)
}
wu_diff::DiffResult::Added(a) => {
let id = rhs_ids[a.new_index.unwrap()];
diff::Result::Right(*id_values.get(&id).unwrap())
}
})
.collect::<Vec<diff::Result<&'a T>>>()
}