Require some common words to do subword highlighting

This is important when comparing short string literals. This change has improved several cases in sample_files/, but I've also added a new example that makes the previous unwanted behaviour much more obvious.
2023-07-10 09:01:45 +07:00 · 2023-07-10 09:01:45 +07:00 · 5824322244
parent 4aca79f220
commit 5824322244
4 changed files with 3884 additions and 16 deletions
--- a/sample_files/compare.expected
+++ b/sample_files/compare.expected
@ -77,7 +77,7 @@ sample_files/helpful_before.el sample_files/helpful_after.el
 a0f2e0115ea94c46d3650ba89b486f09  -

 sample_files/html_before.html sample_files/html_after.html
-ed77c9d76eefdc82cf52e089d268ac6c  -
+0cd91f89716413757ee300e0a6f13453  -

 sample_files/html_simple_before.html sample_files/html_simple_after.html
 ce3bfa12bc21d0eb5528766e18387e86  -
@ -104,7 +104,7 @@ sample_files/javascript_simple_before.js sample_files/javascript_simple_after.js
 3357d9d47a5e7efb3c7677745993ea2b  -

 sample_files/json_before.json sample_files/json_after.json
-11bd95ff0aff18781d3421f702d62c17  -
+bae479fb04e15baf9460c5274c77963b  -

 sample_files/jsx_before.jsx sample_files/jsx_after.jsx
 5784f67cac95fcdb621751aa80a3402b  -
@ -116,7 +116,7 @@ sample_files/load_before.js sample_files/load_after.js
 5cb293020a07b0635b864850c07458b3  -

 sample_files/lua_before.lua sample_files/lua_after.lua
-c12d85c8ffa7ad6b6ca931cf52ac5f3e  -
+9886d61f459cdf566be9c42f7fa61a12  -

 sample_files/makefile_before.mk sample_files/makefile_after.mk
 82ed37f60448e7402c62d5319f30fd3c  -
@ -199,6 +199,9 @@ sample_files/slow_before.rs sample_files/slow_after.rs
 sample_files/small_before.js sample_files/small_after.js
 b4300bfc0203acd8f2603b504b859dc8  -

+sample_files/strings_before.el sample_files/strings_after.el
+adc1c8734906b83deff25b1567e46b56  -
+
 sample_files/swift_before.swift sample_files/swift_after.swift
 4285db52158468d58d54115b6cb8f29b  -

--- a/sample_files/strings_after.el
+++ b/sample_files/strings_after.el
--- a/sample_files/strings_before.el
+++ b/sample_files/strings_before.el
--- a/src/parse/syntax.rs
+++ b/src/parse/syntax.rs
@ -695,6 +695,11 @@ pub fn split_words_and_numbers(s: &str) -> Vec<&str> {
    res
 }

+/// Given the text `content` from a comment or string, split it into
+/// MatchedPos values for the novel and unchanged words.
+///
+/// If there is negligible text in common with `opposite_content`,
+/// treat the whole `content` as a single novel region.
 fn split_atom_words(
    content: &str,
    pos: SingleLineSpan,
@ -709,6 +714,17 @@ fn split_atom_words(
    let content_parts = split_words_and_numbers(content);
    let other_parts = split_words_and_numbers(opposite_content);

+    let word_diffs = myers_diff::slice_by_hash(&content_parts, &other_parts);
+
+    if !has_common_words(&word_diffs) {
+        return vec![MatchedPos {
+            kind: MatchKind::Novel {
+                highlight: TokenKind::Atom(kind),
+            },
+            pos,
+        }];
+    }
+
    let content_newlines = NewlinePositions::from(content);
    let opposite_content_newlines = NewlinePositions::from(opposite_content);

@ -716,7 +732,7 @@ fn split_atom_words(
    let mut opposite_offset = 0;

    let mut res = vec![];
-    for diff_res in myers_diff::slice_by_hash(&content_parts, &other_parts) {
+    for diff_res in word_diffs {
        match diff_res {
            myers_diff::DiffResult::Left(word) => {
                // This word is novel to this side.
@ -765,6 +781,34 @@ fn split_atom_words(
    res
 }

+/// Return true if `word_diffs` has enough words common to both sides that
+/// only changed words should be highlighted: one long word, or five short ones.
+fn has_common_words(word_diffs: &Vec<myers_diff::DiffResult<&&str>>) -> bool {
+    let mut word_count = 0;
+    for word_diff in word_diffs {
+        match word_diff {
+            myers_diff::DiffResult::Both(word, _) => {
+                // A single common word longer than two bytes (i.e. not
+                // just punctuation) is sufficient on its own.
+                if word.len() > 2 {
+                    return true;
+                }
+
+                // Otherwise count the short common words: once there are
+                // five of them, more than just the beginning/end comment
+                // delimiters match, so that qualifies too.
+                word_count += 1;
+                if word_count > 4 {
+                    return true;
+                }
+            }
+            _ => {}
+        }
+    }
+
+    false
+}
+
 impl MatchedPos {
    fn new(
        ck: ChangeKind,
@ -1135,15 +1179,15 @@ mod tests {
    }

    #[test]
-    fn test_split_comment_words_basic() {
-        let content = "abc";
+    fn test_split_atom_words() {
+        let content = "abc def";
        let pos = SingleLineSpan {
            line: 0.into(),
            start_col: 0,
-            end_col: 3,
+            end_col: 7,
        };

-        let opposite_content = "def";
+        let opposite_content = "abc";
        let opposite_pos = SingleLineSpan {
            line: 0.into(),
            start_col: 0,
@ -1159,16 +1203,38 @@ mod tests {
        );
        assert_eq!(
            res,
-            vec![MatchedPos {
-                kind: MatchKind::NovelWord {
-                    highlight: TokenKind::Atom(AtomKind::Comment),
+            vec![
+                MatchedPos {
+                    kind: MatchKind::NovelLinePart {
+                        highlight: TokenKind::Atom(AtomKind::Comment),
+                        self_pos: SingleLineSpan {
+                            line: 0.into(),
+                            start_col: 0,
+                            end_col: 3
+                        },
+                        opposite_pos: vec![SingleLineSpan {
+                            line: 0.into(),
+                            start_col: 0,
+                            end_col: 3
+                        }]
+                    },
+                    pos: SingleLineSpan {
+                        line: 0.into(),
+                        start_col: 0,
+                        end_col: 3
+                    },
                },
-                pos: SingleLineSpan {
-                    line: 0.into(),
-                    start_col: 0,
-                    end_col: 3
+                MatchedPos {
+                    kind: MatchKind::NovelWord {
+                        highlight: TokenKind::Atom(AtomKind::Comment),
+                    },
+                    pos: SingleLineSpan {
+                        line: 0.into(),
+                        start_col: 4,
+                        end_col: 7
+                    },
                },
-            },]
+            ]
        );
    }