Move slice_by_hash to myers_diff and add unit tests

pull/464/head
Wilfred Hughes 2023-01-15 11:03:31 +07:00
parent dd92af3643
commit c08eefb14a
2 changed files with 77 additions and 61 deletions

@ -1,5 +1,8 @@
//! A fast diff for linear content, using Myers' diff algorithm. //! A fast diff for linear content, using Myers' diff algorithm.
use rustc_hash::FxHashMap;
use std::hash::Hash;
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
pub enum DiffResult<T> { pub enum DiffResult<T> {
Left(T), Left(T),
@ -22,6 +25,56 @@ pub fn slice<'a, T: PartialEq + Clone>(lhs: &'a [T], rhs: &'a [T]) -> Vec<DiffRe
.collect::<Vec<_>>() .collect::<Vec<_>>()
} }
/// Compute a unique numeric value for each item, use that for
/// diffing, then return diff results in terms of the original type.
///
/// Every distinct item (according to `Eq` + `Hash`) in `lhs` and
/// `rhs` is assigned a `u32` ID in order of first appearance, scanning
/// `lhs` first and then `rhs`. The ID sequences are diffed with
/// [`slice`], so the diff itself only compares integers, and each ID
/// in the result is then mapped back to a reference to an item.
///
/// This is the decorate-sort-undecorate pattern, or Schwartzian
/// transform, for diffing.
///
/// Note that the reference returned for an ID is always the *first*
/// occurrence of that value. An item reported for the right-hand side
/// may therefore be a reference into `lhs` when an equal item occurs
/// there.
pub fn slice_by_hash<'a, T: Eq + Hash>(lhs: &'a [T], rhs: &'a [T]) -> Vec<DiffResult<&'a T>> {
    // Item -> ID, used while numbering the inputs.
    let mut value_ids: FxHashMap<&'a T, u32> = FxHashMap::default();
    // ID -> first item seen with that ID. IDs are handed out densely
    // starting at zero, so a plain vector indexed by ID is sufficient.
    let mut id_values: Vec<&'a T> = Vec::new();

    // Replace every item with its ID, allocating a fresh ID the first
    // time a value is seen. The entry API hashes each item only once.
    let mut assign_ids = |items: &'a [T]| -> Vec<u32> {
        items
            .iter()
            .map(|item| {
                *value_ids.entry(item).or_insert_with(|| {
                    // Matches the original numbering scheme. The cast
                    // could only truncate with more than `u32::MAX`
                    // distinct items.
                    let new_id = id_values.len() as u32;
                    id_values.push(item);
                    new_id
                })
            })
            .collect()
    };
    let lhs_ids = assign_ids(lhs);
    let rhs_ids = assign_ids(rhs);

    // Every ID below was produced by `assign_ids`, so indexing into
    // `id_values` cannot go out of bounds.
    slice(&lhs_ids[..], &rhs_ids[..])
        .into_iter()
        .map(|result| match result {
            DiffResult::Left(id) => DiffResult::Left(id_values[*id as usize]),
            DiffResult::Both(lhs_id, rhs_id) => DiffResult::Both(
                id_values[*lhs_id as usize],
                id_values[*rhs_id as usize],
            ),
            DiffResult::Right(id) => DiffResult::Right(id_values[*id as usize]),
        })
        .collect::<Vec<_>>()
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
@ -48,4 +101,27 @@ mod tests {
] ]
); );
} }
#[test]
fn test_slice_by_hash_same_items() {
let diff_items = slice_by_hash(&["a", "b"], &["a", "b"]);
assert_eq!(
diff_items,
vec![DiffResult::Both(&"a", &"a"), DiffResult::Both(&"b", &"b")]
);
}
#[test]
fn test_slice_by_hash_different_items() {
let diff_items = slice_by_hash(&["a", "b"], &["c", "d"]);
assert_eq!(
diff_items,
vec![
DiffResult::Left(&"a"),
DiffResult::Left(&"b"),
DiffResult::Right(&"c"),
DiffResult::Right(&"d"),
]
);
}
} }

@ -1,10 +1,7 @@
//! A fallback "parser" for plain text. //! A fallback "parser" for plain text.
use std::hash::Hash;
use lazy_static::lazy_static; use lazy_static::lazy_static;
use regex::Regex; use regex::Regex;
use rustc_hash::FxHashMap;
use crate::{ use crate::{
diff::myers_diff, diff::myers_diff,
@ -73,63 +70,6 @@ fn merge_novel<'a>(
res res
} }
/// Compute a unique numeric value for each item, use that for
/// diffing, then return diff results in terms of the original type.
///
/// Every distinct item (according to `Eq` + `Hash`) in `lhs` and
/// `rhs` is assigned a `u32` ID in order of first appearance, scanning
/// `lhs` first and then `rhs`. The ID sequences are diffed with
/// `myers_diff::slice`, and each ID in the result is then mapped back
/// to a reference to an item.
///
/// This is the decorate-sort-undecorate pattern, or Schwartzian
/// transform, for diffing.
///
/// Note that the reference returned for an ID is always the *first*
/// occurrence of that value. An item reported for the right-hand side
/// may therefore be a reference into `lhs` when an equal item occurs
/// there.
fn diff_slice_by_hash<'a, T: Eq + Hash>(
    lhs: &'a [T],
    rhs: &'a [T],
) -> Vec<myers_diff::DiffResult<&'a T>> {
    // Item -> ID, used while numbering the inputs.
    let mut value_ids: FxHashMap<&'a T, u32> = FxHashMap::default();
    // ID -> first item seen with that ID. IDs are handed out densely
    // starting at zero, so a plain vector indexed by ID is sufficient.
    let mut id_values: Vec<&'a T> = Vec::new();

    // Replace every item with its ID, allocating a fresh ID the first
    // time a value is seen. The entry API hashes each item only once.
    let mut assign_ids = |items: &'a [T]| -> Vec<u32> {
        items
            .iter()
            .map(|item| {
                *value_ids.entry(item).or_insert_with(|| {
                    // Matches the original numbering scheme. The cast
                    // could only truncate with more than `u32::MAX`
                    // distinct items.
                    let new_id = id_values.len() as u32;
                    id_values.push(item);
                    new_id
                })
            })
            .collect()
    };
    let lhs_ids = assign_ids(lhs);
    let rhs_ids = assign_ids(rhs);

    // Every ID below was produced by `assign_ids`, so indexing into
    // `id_values` cannot go out of bounds.
    myers_diff::slice(&lhs_ids[..], &rhs_ids[..])
        .into_iter()
        .map(|result| match result {
            myers_diff::DiffResult::Left(id) => {
                myers_diff::DiffResult::Left(id_values[*id as usize])
            }
            myers_diff::DiffResult::Both(lhs_id, rhs_id) => myers_diff::DiffResult::Both(
                id_values[*lhs_id as usize],
                id_values[*rhs_id as usize],
            ),
            myers_diff::DiffResult::Right(id) => {
                myers_diff::DiffResult::Right(id_values[*id as usize])
            }
        })
        .collect::<Vec<_>>()
}
fn changed_parts<'a>( fn changed_parts<'a>(
src: &'a str, src: &'a str,
opposite_src: &'a str, opposite_src: &'a str,
@ -138,7 +78,7 @@ fn changed_parts<'a>(
let opposite_src_lines = split_lines_keep_newline(opposite_src); let opposite_src_lines = split_lines_keep_newline(opposite_src);
let mut res: Vec<(TextChangeKind, Vec<&'a str>, Vec<&'a str>)> = vec![]; let mut res: Vec<(TextChangeKind, Vec<&'a str>, Vec<&'a str>)> = vec![];
for diff_res in diff_slice_by_hash(&src_lines, &opposite_src_lines) { for diff_res in myers_diff::slice_by_hash(&src_lines, &opposite_src_lines) {
match diff_res { match diff_res {
myers_diff::DiffResult::Left(line) => { myers_diff::DiffResult::Left(line) => {
res.push((TextChangeKind::Novel, vec![line], vec![])); res.push((TextChangeKind::Novel, vec![line], vec![]));