mirror of https://github.com/Wilfred/difftastic/
256 lines
7.7 KiB
Rust
256 lines
7.7 KiB
Rust
//! A fast diff for linear content, using Myer's diff algorithm.
|
|
|
|
use rustc_hash::FxHashSet;
|
|
use std::hash::Hash;
|
|
|
|
use crate::hash::DftHashMap;
|
|
|
|
#[derive(Debug, PartialEq)]
|
|
pub enum DiffResult<T> {
|
|
Left(T),
|
|
Both(T, T),
|
|
Right(T),
|
|
}
|
|
|
|
/// Compute a linear diff between `lhs` and `rhs`. This is the
|
|
/// traditional Myer's diff algorithm.
|
|
pub fn slice<'a, T: PartialEq + Clone>(lhs: &'a [T], rhs: &'a [T]) -> Vec<DiffResult<&'a T>> {
|
|
wu_diff::diff(lhs, rhs)
|
|
.into_iter()
|
|
.map(|result| match result {
|
|
wu_diff::DiffResult::Removed(r) => DiffResult::Left(&lhs[r.old_index.unwrap()]),
|
|
wu_diff::DiffResult::Common(c) => {
|
|
let lhs_id = c.old_index.unwrap();
|
|
let rhs_id = c.new_index.unwrap();
|
|
DiffResult::Both(&lhs[lhs_id], &rhs[rhs_id])
|
|
}
|
|
wu_diff::DiffResult::Added(a) => DiffResult::Right(&rhs[a.new_index.unwrap()]),
|
|
})
|
|
.collect::<Vec<_>>()
|
|
}
|
|
|
|
/// Compute a linear diff between `lhs` and `rhs`, but use hashed
|
|
/// values internally.
|
|
///
|
|
/// This is faster when equality checks on `T` are expensive, such as
|
|
/// large strings.
|
|
pub fn slice_by_hash<'a, T: Eq + Hash>(lhs: &'a [T], rhs: &'a [T]) -> Vec<DiffResult<&'a T>> {
|
|
// Compute a unique numeric value for each item, use that for
|
|
// diffing, then return diff results in terms of the original
|
|
// type.
|
|
//
|
|
// This is the decorate-sort-undecorate pattern, or Schwartzian
|
|
// transform, for diffing.
|
|
let mut value_ids: DftHashMap<&T, u32> = DftHashMap::default();
|
|
let mut id_values: DftHashMap<u32, &T> = DftHashMap::default();
|
|
|
|
let mut lhs_ids = Vec::with_capacity(lhs.len());
|
|
for value in lhs {
|
|
let id: u32 = match value_ids.get(value) {
|
|
Some(id) => *id,
|
|
None => {
|
|
let new_id = value_ids.len() as u32;
|
|
value_ids.insert(value, new_id);
|
|
id_values.insert(new_id, value);
|
|
new_id
|
|
}
|
|
};
|
|
lhs_ids.push(id);
|
|
}
|
|
|
|
let mut rhs_ids = Vec::with_capacity(rhs.len());
|
|
for value in rhs {
|
|
let id = match value_ids.get(value) {
|
|
Some(id) => *id,
|
|
None => {
|
|
let new_id = value_ids.len() as u32;
|
|
value_ids.insert(value, new_id);
|
|
id_values.insert(new_id, value);
|
|
new_id
|
|
}
|
|
};
|
|
rhs_ids.push(id);
|
|
}
|
|
|
|
slice(&lhs_ids[..], &rhs_ids[..])
|
|
.into_iter()
|
|
.map(|result| match result {
|
|
DiffResult::Left(id) => DiffResult::Left(*id_values.get(id).unwrap()),
|
|
DiffResult::Both(lhs_id, rhs_id) => DiffResult::Both(
|
|
*id_values.get(lhs_id).unwrap(),
|
|
*id_values.get(rhs_id).unwrap(),
|
|
),
|
|
DiffResult::Right(id) => DiffResult::Right(*id_values.get(id).unwrap()),
|
|
})
|
|
.collect::<Vec<_>>()
|
|
}
|
|
|
|
/// Compute the linear diff between `lhs` and `rhs`. If there are
|
|
/// items that only occur on a single side, mark them as novel without
|
|
/// processing them with Myer's diff.
|
|
///
|
|
/// This is substantially faster than `slice`, when `lhs` and `rhs`
|
|
/// have few items in common.
|
|
///
|
|
/// (This heuristic is used in traditional diff tools too, such as GNU
|
|
/// diff.)
|
|
pub fn slice_unique_by_hash<'a, T: Eq + Clone + Hash>(
|
|
lhs: &'a [T],
|
|
rhs: &'a [T],
|
|
) -> Vec<DiffResult<&'a T>> {
|
|
let mut lhs_set = FxHashSet::default();
|
|
for item in lhs {
|
|
lhs_set.insert(item);
|
|
}
|
|
let mut rhs_set = FxHashSet::default();
|
|
for item in rhs {
|
|
rhs_set.insert(item);
|
|
}
|
|
|
|
let lhs_without_unique: Vec<&'a T> = lhs.iter().filter(|n| rhs_set.contains(n)).collect();
|
|
let rhs_without_unique: Vec<&'a T> = rhs.iter().filter(|n| lhs_set.contains(n)).collect();
|
|
|
|
let mut res: Vec<DiffResult<&'a T>> = Vec::with_capacity(lhs.len());
|
|
let mut lhs_i = 0;
|
|
let mut rhs_i = 0;
|
|
|
|
for item in slice_by_hash(&lhs_without_unique, &rhs_without_unique) {
|
|
match item {
|
|
DiffResult::Left(lhs_item) => {
|
|
while lhs_i < lhs.len() {
|
|
if &lhs[lhs_i] != *lhs_item {
|
|
res.push(DiffResult::Left(&lhs[lhs_i]));
|
|
lhs_i += 1;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
res.push(DiffResult::Left(*lhs_item));
|
|
lhs_i += 1;
|
|
}
|
|
DiffResult::Both(lhs_item, rhs_item) => {
|
|
while lhs_i < lhs.len() {
|
|
if &lhs[lhs_i] != *lhs_item {
|
|
res.push(DiffResult::Left(&lhs[lhs_i]));
|
|
lhs_i += 1;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
while rhs_i < rhs.len() {
|
|
if &rhs[rhs_i] != *rhs_item {
|
|
res.push(DiffResult::Right(&rhs[rhs_i]));
|
|
rhs_i += 1;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
res.push(DiffResult::Both(*lhs_item, *rhs_item));
|
|
lhs_i += 1;
|
|
rhs_i += 1;
|
|
}
|
|
DiffResult::Right(rhs_item) => {
|
|
while rhs_i < rhs.len() {
|
|
if &rhs[rhs_i] != *rhs_item {
|
|
res.push(DiffResult::Right(&rhs[rhs_i]));
|
|
rhs_i += 1;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
res.push(DiffResult::Right(*rhs_item));
|
|
rhs_i += 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
while lhs_i < lhs.len() {
|
|
res.push(DiffResult::Left(&lhs[lhs_i]));
|
|
lhs_i += 1;
|
|
}
|
|
while rhs_i < rhs.len() {
|
|
res.push(DiffResult::Right(&rhs[rhs_i]));
|
|
rhs_i += 1;
|
|
}
|
|
|
|
res
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn test_slice_same_items() {
|
|
let diff_items = slice(&["a", "b"], &["a", "b"]);
|
|
assert_eq!(
|
|
diff_items,
|
|
vec![DiffResult::Both(&"a", &"a"), DiffResult::Both(&"b", &"b")]
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_slice_different_items() {
|
|
let diff_items = slice(&["a", "b"], &["c", "d"]);
|
|
assert_eq!(
|
|
diff_items,
|
|
vec![
|
|
DiffResult::Left(&"a"),
|
|
DiffResult::Left(&"b"),
|
|
DiffResult::Right(&"c"),
|
|
DiffResult::Right(&"d"),
|
|
]
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_slice_by_hash_same_items() {
|
|
let diff_items = slice_by_hash(&["a", "b"], &["a", "b"]);
|
|
assert_eq!(
|
|
diff_items,
|
|
vec![DiffResult::Both(&"a", &"a"), DiffResult::Both(&"b", &"b")]
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_slice_by_hash_different_items() {
|
|
let diff_items = slice_by_hash(&["a", "b"], &["c", "d"]);
|
|
assert_eq!(
|
|
diff_items,
|
|
vec![
|
|
DiffResult::Left(&"a"),
|
|
DiffResult::Left(&"b"),
|
|
DiffResult::Right(&"c"),
|
|
DiffResult::Right(&"d"),
|
|
]
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_slice_unique_same_items() {
|
|
let diff_items = slice_unique_by_hash(&["a", "b"], &["a", "b"]);
|
|
assert_eq!(
|
|
diff_items,
|
|
vec![DiffResult::Both(&"a", &"a"), DiffResult::Both(&"b", &"b")]
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_slice_unique_different_items() {
|
|
let diff_items = slice_unique_by_hash(&["a", "b"], &["c", "d"]);
|
|
assert_eq!(
|
|
diff_items,
|
|
vec![
|
|
DiffResult::Left(&"a"),
|
|
DiffResult::Left(&"b"),
|
|
DiffResult::Right(&"c"),
|
|
DiffResult::Right(&"d"),
|
|
]
|
|
);
|
|
}
|
|
}
|