Improve word splitting heuristics

This is particularly noticeable when diffing comments that contain timestamps such as 2000-12-31T23:59:59, where we don't want 31T23 to be treated as a single word.
2023-06-29 08:31:51 +07:00 · 2023-06-29 08:31:51 +07:00 · 3730580ca3
parent 63a5eeaa98
commit 3730580ca3
3 changed files with 42 additions and 15 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -4,6 +4,10 @@

 Updated Scala parser.

+### Display
+
+Improved word highlighting in comments when they contain numbers.
+
 ### Internals

 Difftastic's logging is now configured with the environment variable
--- a/sample_files/compare.expected
+++ b/sample_files/compare.expected
@ -17,7 +17,7 @@ sample_files/change_outer_before.el sample_files/change_outer_after.el
 4d202515307556b443806ea25aac0b84  -

 sample_files/chinese_before.po sample_files/chinese_after.po
-46cc71b46688dd2abc51f9fd82864c90  -
+b2cace14b01c272217eec27d16adddbe  -

 sample_files/clojure_before.clj sample_files/clojure_after.clj
 b8e17b8eb649ba0b8d29b57a23e4ac81  -
@ -59,7 +59,7 @@ sample_files/hack_before.php sample_files/hack_after.php
 83d4a92c596b5d465ff024aa1b30be92  -

 sample_files/hare_before.ha sample_files/hare_after.ha
-2b3a9433cd692d9ffab872477312e3b8  -
+ea834f886bd44133115c567a200f1996  -

 sample_files/haskell_before.hs sample_files/haskell_after.hs
 5a2c0c5d4a04f79e2f8f32299e6cd364  -
@ -80,7 +80,7 @@ sample_files/html_simple_before.html sample_files/html_simple_after.html
 ce3bfa12bc21d0eb5528766e18387e86  -

 sample_files/huge_cpp_before.cpp sample_files/huge_cpp_after.cpp
-c879bed2d8551579975617262245337c  -
+b0dd65cc5431f3cca9e93ca6f4a64676  -

 sample_files/identical_before.scala sample_files/identical_after.scala
 9c7319f61833e46a0a8cb6c01cc997c9  -
@ -206,7 +206,7 @@ sample_files/tab_before.c sample_files/tab_after.c
 b652d15f3a05b82a7d871cfeca2f453f  -

 sample_files/tailwind_before.css sample_files/tailwind_after.css
-cee5ee7415b1bd50bdc2dacd11e7303a  -
+3e07691cbb537948db60bcc80813eaf9  -

 sample_files/text_before.txt sample_files/text_after.txt
 5fbdac2d1156ed8bb6b098e87f30d319  -
--- a/src/parse/syntax.rs
+++ b/src/parse/syntax.rs
@ -662,25 +662,41 @@ pub struct MatchedPos {
 /// Split `s` into a vec of things that look like words and individual
 /// non-word characters.
 ///
-/// "foo bar" -> vec!["foo", " ", "bar"]
+/// "foo..bar23" -> vec!["foo", ".", ".", "bar", "23"]
 pub fn split_words(s: &str) -> Vec<&str> {
    let mut res = vec![];
-    let mut word_start = None;
+    let mut word_start: Option<(usize, char)> = None;
    for (idx, c) in s.char_indices() {
-        if c.is_alphanumeric() || c == '-' || c == '_' {
-            if word_start.is_none() {
-                word_start = Some(idx);
+        match word_start {
+            Some((start, start_c)) => {
+                if c.is_alphanumeric() || c == '-' || c == '_' {
+                    // Word character. Stay in the current word only if it has
+                    // the same digit-ness as the word's first character.
+                    if c.is_ascii_digit() == start_c.is_ascii_digit() {
+                        // Just carry on in this word.
+                    } else {
+                        // Finish previous word, start a new one.
+                        res.push(&s[start..idx]);
+                        word_start = Some((idx, c));
+                    }
+                } else {
+                    // Push the previous word, then this non-word character.
+                    res.push(&s[start..idx]);
+                    res.push(&s[idx..idx + c.len_utf8()]);
+                    word_start = None;
+                }
            }
-        } else {
-            if let Some(start) = word_start {
-                res.push(&s[start..idx]);
-                word_start = None;
+            None => {
+                if c.is_alphanumeric() || c == '-' || c == '_' {
+                    word_start = Some((idx, c));
+                } else {
+                    res.push(&s[idx..idx + c.len_utf8()]);
+                }
            }
-            res.push(&s[idx..idx + c.len_utf8()]);
        }
    }

-    if let Some(start) = word_start {
+    if let Some((start, _)) = word_start {
        res.push(&s[start..]);
    }
    res
@ -1162,6 +1178,13 @@ mod tests {
        assert_eq!(res, vec!["example", ".", "."])
    }

+    #[test]
+    fn test_split_words_numbers() {
+        let s = "foo123bar";
+        let res = split_words(s);
+        assert_eq!(res, vec!["foo", "123", "bar"])
+    }
+
    #[test]
    fn test_split_words_treats_newline_separately() {
        let s = "example.\ncom";