Improve word diffing heuristic and add another sample file

2023-07-12 12:12:12 +07:00 · 2023-07-12 12:12:12 +07:00 · a814e01d22
parent 1d3b6836ef
commit a814e01d22
4 changed files with 49 additions and 16 deletions
--- a/sample_files/compare.expected
+++ b/sample_files/compare.expected
@ -158,7 +158,7 @@ sample_files/pascal_before.pascal sample_files/pascal_after.pascal
 dfea5599b7f5e180d0fafab326f612cc  -

 sample_files/perl_before.pl sample_files/perl_after.pl
-09034cdf9cc4853ba7527de6d633e9be  -
+62ed7685bdfad901d1087e8bad399d86  -

 sample_files/prefer_outer_before.el sample_files/prefer_outer_after.el
 de31a80dc8a06987aeff4aaa04ce3b87  -
@ -199,6 +199,9 @@ sample_files/slow_before.rs sample_files/slow_after.rs
 sample_files/small_before.js sample_files/small_after.js
 b4300bfc0203acd8f2603b504b859dc8  -

+sample_files/string_subwords_before.el sample_files/string_subwords_after.el
+1154702ee8bc90407728871b94d12878  -
+
 sample_files/strings_before.el sample_files/strings_after.el
 adc1c8734906b83deff25b1567e46b56  -

--- a/sample_files/string_subwords_after.el
+++ b/sample_files/string_subwords_after.el
@ -0,0 +1,16 @@
+(format "%s: %s" (site-name) name)
+
+(defcustom deadgrep-max-buffers
+  4
+  "The maximum number of deadgrep results buffers.
+
+If the number of results buffers exceeds this value, deadgrep
+will kill results buffers. The least recently used buffers are
+killed first.
+
+To disable cleanup entirely, set this variable to nil."
+  :type '(choice
+          (number :tag "Maximum of buffers allowed")
+          (const :tag "Disable cleanup" nil))
+  :group 'deadgrep)
+
--- a/sample_files/string_subwords_before.el
+++ b/sample_files/string_subwords_before.el
@ -0,0 +1,13 @@
+(format "SoloWiki Viewing: %s" name)
+
+(defcustom deadgrep-max-buffers
+  4
+  "Deadgrep will kill the least recently used results buffer
+if there are more than this many.
+
+To disable cleanup entirely, set this variable to nil."
+  :type '(choice
+          (number :tag "Maximum of buffers allowed")
+          (const :tag "Disable cleanup" nil))
+  :group 'deadgrep)
+
--- a/src/parse/syntax.rs
+++ b/src/parse/syntax.rs
@ -790,29 +790,30 @@ fn split_atom_words(
 /// Are there sufficient common words that we should only highlight
 /// individual changed words?
 fn has_common_words(word_diffs: &Vec<myers_diff::DiffResult<&&str>>) -> bool {
-    let mut word_count = 0;
+    let mut novel_count = 0;
+    let mut unchanged_count = 0;
+
    for word_diff in word_diffs {
        match word_diff {
            myers_diff::DiffResult::Both(word, _) => {
-                // If we have at least one long word (i.e. not just
-                // punctuation), that's sufficient.
-                if word.len() > 2 {
-                    return true;
-                }
-
-                // If we have lots of common short words, not just the
-                // beginning/end comment delimiter, that qualifies
-                // too.
-                word_count += 1;
-                if word_count > 4 {
-                    return true;
+                if **word != " " {
+                    unchanged_count += 1;
                }
            }
-            _ => {}
+            _ => {
+                novel_count += 1;
+            }
        }
    }

-    false
+    // We want more than two unchanged words, because the text content
+    // includes the comment or string delimiters.
+    //
+    // A sufficiently similar set of words is when at least 50% of
+    // the words are common between the two sides. We multiply by two
+    // because non-matching words give us two novel words, whereas
+    // matched words only give us one unchanged word.
+    unchanged_count > 2 && unchanged_count * 2 >= novel_count
 }

 impl MatchedPos {