Use unique subtrees shared between LHS/RHS to determine similar lists

This makes the 'lists are sufficiently similar' heuristic more aggressive. Previously we'd look for lists with common start or end children and the same delimiters. This worked badly for cases like: LHS: (novel-lhs (a b c d e)) RHS: (novel-rhs (a b c d e)) Instead, look for sublists that are unique on both sides and occur on both the LHS and RHS root being considered. This allows us to match up many more cases. Consider lists to be sufficiently similar exclusively using this (surprisingly effective) heuristic, and don't consider outer delimiters. This substantially improves performance in many cases, particularly for files that are fairly flat (many toplevel lists with little nesting). Fixes #306
2022-07-18 22:32:00 +07:00 · 2022-07-18 22:32:00 +07:00 · bc283341f3
parent 97a29645f0
commit bc283341f3
3 changed files with 110 additions and 35 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,10 @@
 ## 0.32 (unreleased)

+### Diffing
+
+Improved performance in many cases, particularly for files with a
+fairly flat structure.
+
 ## 0.31 (released 11th July 2022)

 ### Parsing
--- a/src/diff/unchanged.rs
+++ b/src/diff/unchanged.rs
@ -1,13 +1,14 @@
 //! Find nodes that are obviously unchanged, so we can run the main
 //! diff on smaller inputs.

+use std::collections::HashSet;
+
 use crate::diff::changes::{insert_deep_unchanged, ChangeKind, ChangeMap};
 use crate::diff::myers_diff;

 use crate::parse::syntax::Syntax;

 const TINY_TREE_THRESHOLD: u32 = 10;
-const MOSTLY_UNCHANGED_MIN_NODES: usize = 4;
 const MOSTLY_UNCHANGED_MIN_COMMON_CHILDREN: usize = 4;

 /// Set [`ChangeKind`] on nodes that are obviously unchanged, and return a
@ -122,44 +123,55 @@ fn split_unchanged_singleton_list<'a>(
    res
 }

+fn find_unique_content_ids(node: &Syntax, unique_ids: &mut HashSet<u32>) {
+    if node.content_is_unique() {
+        unique_ids.insert(node.content_id());
+    }
+    if let Syntax::List { children, .. } = node {
+        for child in children {
+            find_unique_content_ids(child, unique_ids);
+        }
+    }
+}
+
+fn find_all_unique_content_ids(node: &Syntax) -> HashSet<u32> {
+    let mut unique_ids = HashSet::new();
+    find_unique_content_ids(node, &mut unique_ids);
+    unique_ids
+}
+
+fn count_unique_subtrees(node: &Syntax, opposite_unique_ids: &HashSet<u32>) -> usize {
+    if node.content_is_unique() && opposite_unique_ids.contains(&node.content_id()) {
+        // Ignore children as soon as we find a unique node, to avoid
+        // overcounting.
+        return 1;
+    }
+
+    if let Syntax::List { children, .. } = node {
+        return children
+            .iter()
+            .map(|child| count_unique_subtrees(child, opposite_unique_ids))
+            .sum();
+    }
+
+    0
+}
+
+/// Count the nodes in `lhs` that are unique on the LHS side and whose
+/// content also occurs in `rhs` as a node that is unique on the RHS side.
+///
+/// Ignores children of unique nodes, so we don't overcount.
+fn count_common_unique(lhs: &Syntax, rhs: &Syntax) -> usize {
+    let rhs_unique_ids = find_all_unique_content_ids(rhs);
+    count_unique_subtrees(lhs, &rhs_unique_ids)
+}
+
 /// Return true if both nodes are lists with same delimiters and have
 /// the same start and end children.
 fn is_mostly_unchanged_list(lhs: &Syntax, rhs: &Syntax) -> bool {
    match (lhs, rhs) {
-        (
-            Syntax::List {
-                open_content: lhs_open_content,
-                close_content: lhs_close_content,
-                children: lhs_children,
-                ..
-            },
-            Syntax::List {
-                open_content: rhs_open_content,
-                close_content: rhs_close_content,
-                children: rhs_children,
-                ..
-            },
-        ) if lhs_open_content == rhs_open_content && lhs_close_content == rhs_close_content => {
-            if lhs_children.len() < MOSTLY_UNCHANGED_MIN_NODES
-                || rhs_children.len() < MOSTLY_UNCHANGED_MIN_NODES
-            {
-                return false;
-            }
-
-            let first_children_unchanged = lhs_children
-                .iter()
-                .zip(rhs_children.iter())
-                .take(MOSTLY_UNCHANGED_MIN_COMMON_CHILDREN)
-                .all(|(lhs, rhs)| lhs.content_id() == rhs.content_id());
-
-            let last_children_unchanged = lhs_children
-                .iter()
-                .rev()
-                .zip(rhs_children.iter().rev())
-                .take(MOSTLY_UNCHANGED_MIN_COMMON_CHILDREN)
-                .all(|(lhs, rhs)| lhs.content_id() == rhs.content_id());
-
-            first_children_unchanged || last_children_unchanged
+        (Syntax::List { .. }, Syntax::List { .. }) => {
+            count_common_unique(lhs, rhs) >= MOSTLY_UNCHANGED_MIN_COMMON_CHILDREN
        }
        _ => false,
    }
@ -711,4 +723,58 @@ mod tests {
        let split = split_mostly_unchanged_toplevel(&lhs_nodes, &rhs_nodes);
        assert_eq!(split.len(), 2);
    }
+
+    #[test]
+    fn test_count_common_unique() {
+        let arena = Arena::new();
+        let config = from_language(guess_language::Language::EmacsLisp);
+
+        // There are two subtrees that are unique on both sides and
+        // shared between the two sides here:
+        //
+        // 1: shared-1
+        // 2: (shared-2a shared-2b)
+        let lhs_nodes = parse(
+            &arena,
+            "(shared-1 (shared-2a shared-2b) not-unique not-unique)",
+            &config,
+        );
+        let rhs_nodes = parse(
+            &arena,
+            "(shared-1 (shared-2a shared-2b) not-unique)",
+            &config,
+        );
+        init_all_info(&lhs_nodes, &rhs_nodes);
+
+        assert_eq!(count_common_unique(lhs_nodes[0], rhs_nodes[0]), 2);
+    }
+
+    #[test]
+    fn test_similar_with_common_grandchildren() {
+        let arena = Arena::new();
+        let config = from_language(guess_language::Language::EmacsLisp);
+
+        let lhs_nodes = parse(&arena, "((novel-lhs 1 2 3 4 5)) x", &config);
+        let rhs_nodes = parse(&arena, "((novel-rhs 1 2 3 4 5)) y", &config);
+        init_all_info(&lhs_nodes, &rhs_nodes);
+
+        assert_eq!(
+            split_mostly_unchanged_toplevel(&lhs_nodes, &rhs_nodes).len(),
+            2
+        );
+    }
+    #[test]
+    fn test_similar_ignore_delimiter() {
+        let arena = Arena::new();
+        let config = from_language(guess_language::Language::EmacsLisp);
+
+        let lhs_nodes = parse(&arena, "(novel-lhs 1 2 3 4 5) x", &config);
+        let rhs_nodes = parse(&arena, "[novel-rhs 1 2 3 4 5] y", &config);
+        init_all_info(&lhs_nodes, &rhs_nodes);
+
+        assert_eq!(
+            split_mostly_unchanged_toplevel(&lhs_nodes, &rhs_nodes).len(),
+            2
+        );
+    }
 }
--- a/src/parse/syntax.rs
+++ b/src/parse/syntax.rs
@ -283,6 +283,10 @@ impl<'a> Syntax<'a> {
        self.info().content_id.get()
    }

+    pub fn content_is_unique(&self) -> bool {
+        self.info().content_is_unique.get()
+    }
+
    pub fn num_ancestors(&self) -> u32 {
        self.info().num_ancestors.get()
    }