From 3943c1401a631881abaddd3aeded72d55e4a8681 Mon Sep 17 00:00:00 2001
From: Wilfred Hughes <me@wilfred.me.uk>
Date: Fri, 14 Nov 2025 16:36:26 -0800
Subject: [PATCH] Don't consider - as a word character

Treating '-' as a word character produced some unfortunate subword diffs
when mixing words, numbers and hyphens, e.g. in "foo-d123-pretty-long".

Fixes #918
---
 sample_files/compare.expected       | 16 ++++++++--------
 sample_files/hyphen_subwords_1.json |  3 +++
 sample_files/hyphen_subwords_2.json |  3 +++
 src/words.rs                        | 22 ++++------------------
 4 files changed, 18 insertions(+), 26 deletions(-)
 create mode 100644 sample_files/hyphen_subwords_1.json
 create mode 100644 sample_files/hyphen_subwords_2.json

diff --git a/sample_files/compare.expected b/sample_files/compare.expected
index 6eb766bd3..db0c2b663 100644
--- a/sample_files/compare.expected
+++ b/sample_files/compare.expected
@@ -41,7 +41,7 @@ sample_files/comma_and_comment_1.js sample_files/comma_and_comment_2.js
 0a5ccbcd368607e62eaff0c4ae25049f  -
 
 sample_files/comments_1.rs sample_files/comments_2.rs
-cc3a5f9134765192e034c65fb9d02026  -
+a75827163654d6aed8f837bb586e733c  -
 
 sample_files/context_1.rs sample_files/context_2.rs
 1a1633bcf672a582867c815381ae1609  -
@@ -65,7 +65,7 @@ sample_files/elisp_contiguous_1.el sample_files/elisp_contiguous_2.el
 4a5a33873a4f84ee055d95e1448fba35  -
 
 sample_files/elixir_1.ex sample_files/elixir_2.ex
-6bcd4f912e6adfd9cf2f83c602d72415  -
+85494310196ac5065b3b4ce1d4b350fd  -
 
 sample_files/elm_1.elm sample_files/elm_2.elm
 ccc1f4bb568cd72781dbcd623b612c43  -
@@ -86,7 +86,7 @@ sample_files/hare_1.ha sample_files/hare_2.ha
 ef6fd59edc55241311a97d21dd81e4c0  -
 
 sample_files/haskell_1.hs sample_files/haskell_2.hs
-6791dd931d74391b3d9fb9e351a6de54  -
+68fd7f9865c2b1defe05ffd509e08b93  -
 
 sample_files/hcl_1.hcl sample_files/hcl_2.hcl
 7c2aaa3a8b401bc007817f5dd608946d  -
@@ -98,13 +98,13 @@ sample_files/helpful_1.el sample_files/helpful_2.el
 295640aa4cbc23640658a80ad2393ce4  -
 
 sample_files/html_1.html sample_files/html_2.html
-64285a8ed6ddecab1e24bcf0ce649b62  -
+3cc8b445a56b74f05e1d7bb84874edab  -
 
 sample_files/html_simple_1.html sample_files/html_simple_2.html
 bb129dce38cd26eac81ca52d2016bade  -
 
 sample_files/huge_cpp_1.cpp sample_files/huge_cpp_2.cpp
-7f65e42e16ee318bbfc342b8bcc03d2e  -
+09e8a30ad7be5686e4d03a3e6b2588aa  -
 
 sample_files/identical_1.scala sample_files/identical_2.scala
 15c5a789e644348cb7e0de051ff4b63e  -
@@ -146,7 +146,7 @@ sample_files/lua_1.lua sample_files/lua_2.lua
 81ad9478e64494320e96284cb7632ced  -
 
 sample_files/makefile_1.mk sample_files/makefile_2.mk
-d0572210b5121ce68ac0ce45e43b922b  -
+4759883325ade33566f2c8afa09e2d82  -
 
 sample_files/many_newlines_1.txt sample_files/many_newlines_2.txt
 52ca05855e520876479e6f608c5e7831  -
@@ -164,7 +164,7 @@ sample_files/multiline_string_1.ml sample_files/multiline_string_2.ml
 ed80815053ba156505d156277d0f4195  -
 
 sample_files/multiline_string_eof_1.yml sample_files/multiline_string_eof_2.yml
-ba8a8e7ed2f4b519feaa391fd05c95fe  -
+cd9cfd627c28b8ecd7c990adc683281a  -
 
 sample_files/nest_1.rs sample_files/nest_2.rs
 d3a799fe2cd9d81aa251c96af5cd9711  -
@@ -260,7 +260,7 @@ sample_files/string_subwords_1.el sample_files/string_subwords_2.el
 b66e960672189960c2d35ef68b47a195  -
 
 sample_files/strings_1.el sample_files/strings_2.el
-fe61803e3391fb14f5a3f05750bb94ff  -
+26ea57243abb16043088b17bfee482a4  -
 
 sample_files/swift_1.swift sample_files/swift_2.swift
 73830b14bd8aacac8d4590a3bed61c40  -
diff --git a/sample_files/hyphen_subwords_1.json b/sample_files/hyphen_subwords_1.json
new file mode 100644
index 000000000..5248f946b
--- /dev/null
+++ b/sample_files/hyphen_subwords_1.json
@@ -0,0 +1,3 @@
+{
+  "name": "foo-d123-pretty-long"
+}
diff --git a/sample_files/hyphen_subwords_2.json b/sample_files/hyphen_subwords_2.json
new file mode 100644
index 000000000..0ea55e8f6
--- /dev/null
+++ b/sample_files/hyphen_subwords_2.json
@@ -0,0 +1,3 @@
+{
+  "name": "d123-pretty-long"
+}
diff --git a/src/words.rs b/src/words.rs
index 9a1e6950b..2a51e5933 100644
--- a/src/words.rs
+++ b/src/words.rs
@@ -12,7 +12,7 @@ pub(crate) fn split_words(s: &str) -> Vec<&str> {
     for (idx, c) in s.char_indices() {
         match word_start {
             Some(start) => {
-                if c.is_alphanumeric() || c == '-' || c == '_' {
+                if c.is_alphanumeric() || c == '_' {
                     // Just carry on in this word.
                 } else {
                     // Push the previous word, then this non-word character.
@@ -22,7 +22,7 @@ pub(crate) fn split_words(s: &str) -> Vec<&str> {
                 }
             }
             None => {
-                if c.is_alphanumeric() || c == '-' || c == '_' {
+                if c.is_alphanumeric() || c == '_' {
                     word_start = Some(idx);
                 } else {
                     words.push(&s[idx..idx + c.len_utf8()]);
@@ -47,7 +47,7 @@ pub(crate) fn split_words_and_numbers(s: &str) -> Vec<&str> {
     for (idx, c) in s.char_indices() {
         match word_start {
             Some((start, start_c)) => {
-                if c.is_alphanumeric() || c == '-' || c == '_' {
+                if c.is_alphanumeric() || c == '_' {
                     // Word character, add to the current word if it's
                     // not a number.
                     if c.is_ascii_digit() == start_c.is_ascii_digit() {
@@ -65,7 +65,7 @@ pub(crate) fn split_words_and_numbers(s: &str) -> Vec<&str> {
                 }
             }
             None => {
-                if c.is_alphanumeric() || c == '-' || c == '_' {
+                if c.is_alphanumeric() || c == '_' {
                     word_start = Some((idx, c));
                 } else {
                     words.push(&s[idx..idx + c.len_utf8()]);
@@ -93,13 +93,6 @@ mod tests {
         assert_eq!(res, vec!["example", ".", "com"])
     }
 
-    #[test]
-    fn test_split_words_hyphens() {
-        let s = "foo -bar-baz-";
-        let res = split_words(s);
-        assert_eq!(res, vec!["foo", " ", "-bar-baz-"])
-    }
-
     #[test]
     fn test_split_words_punctuation() {
         let s = "example..";
@@ -149,13 +142,6 @@ mod tests {
         assert_eq!(res, vec!["a", "123", "b"])
     }
 
-    #[test]
-    fn test_split_words_and_numbers_hyphens() {
-        let s = "a-b -c-";
-        let res = split_words_and_numbers(s);
-        assert_eq!(res, vec!["a-b", " ", "-c-"])
-    }
-
     #[test]
     fn test_split_words_and_numbers_spaces() {
         let s = "foo bar";