From 3943c1401a631881abaddd3aeded72d55e4a8681 Mon Sep 17 00:00:00 2001 From: Wilfred Hughes Date: Fri, 14 Nov 2025 16:36:26 -0800 Subject: [PATCH] Don't consider - as a word character This produced some unfortunate subword diffs when mixing words, numbers and hyphens. Fixes #918 --- sample_files/compare.expected | 16 ++++++++-------- sample_files/hyphen_subwords_1.json | 3 +++ sample_files/hyphen_subwords_2.json | 3 +++ src/words.rs | 22 ++++------------------ 4 files changed, 18 insertions(+), 26 deletions(-) create mode 100644 sample_files/hyphen_subwords_1.json create mode 100644 sample_files/hyphen_subwords_2.json diff --git a/sample_files/compare.expected b/sample_files/compare.expected index 6eb766bd3..db0c2b663 100644 --- a/sample_files/compare.expected +++ b/sample_files/compare.expected @@ -41,7 +41,7 @@ sample_files/comma_and_comment_1.js sample_files/comma_and_comment_2.js 0a5ccbcd368607e62eaff0c4ae25049f - sample_files/comments_1.rs sample_files/comments_2.rs -cc3a5f9134765192e034c65fb9d02026 - +a75827163654d6aed8f837bb586e733c - sample_files/context_1.rs sample_files/context_2.rs 1a1633bcf672a582867c815381ae1609 - @@ -65,7 +65,7 @@ sample_files/elisp_contiguous_1.el sample_files/elisp_contiguous_2.el 4a5a33873a4f84ee055d95e1448fba35 - sample_files/elixir_1.ex sample_files/elixir_2.ex -6bcd4f912e6adfd9cf2f83c602d72415 - +85494310196ac5065b3b4ce1d4b350fd - sample_files/elm_1.elm sample_files/elm_2.elm ccc1f4bb568cd72781dbcd623b612c43 - @@ -86,7 +86,7 @@ sample_files/hare_1.ha sample_files/hare_2.ha ef6fd59edc55241311a97d21dd81e4c0 - sample_files/haskell_1.hs sample_files/haskell_2.hs -6791dd931d74391b3d9fb9e351a6de54 - +68fd7f9865c2b1defe05ffd509e08b93 - sample_files/hcl_1.hcl sample_files/hcl_2.hcl 7c2aaa3a8b401bc007817f5dd608946d - @@ -98,13 +98,13 @@ sample_files/helpful_1.el sample_files/helpful_2.el 295640aa4cbc23640658a80ad2393ce4 - sample_files/html_1.html sample_files/html_2.html -64285a8ed6ddecab1e24bcf0ce649b62 - +3cc8b445a56b74f05e1d7bb84874edab - sample_files/html_simple_1.html sample_files/html_simple_2.html bb129dce38cd26eac81ca52d2016bade - sample_files/huge_cpp_1.cpp sample_files/huge_cpp_2.cpp -7f65e42e16ee318bbfc342b8bcc03d2e - +09e8a30ad7be5686e4d03a3e6b2588aa - sample_files/identical_1.scala sample_files/identical_2.scala 15c5a789e644348cb7e0de051ff4b63e - @@ -146,7 +146,7 @@ sample_files/lua_1.lua sample_files/lua_2.lua 81ad9478e64494320e96284cb7632ced - sample_files/makefile_1.mk sample_files/makefile_2.mk -d0572210b5121ce68ac0ce45e43b922b - +4759883325ade33566f2c8afa09e2d82 - sample_files/many_newlines_1.txt sample_files/many_newlines_2.txt 52ca05855e520876479e6f608c5e7831 - @@ -164,7 +164,7 @@ sample_files/multiline_string_1.ml sample_files/multiline_string_2.ml ed80815053ba156505d156277d0f4195 - sample_files/multiline_string_eof_1.yml sample_files/multiline_string_eof_2.yml -ba8a8e7ed2f4b519feaa391fd05c95fe - +cd9cfd627c28b8ecd7c990adc683281a - sample_files/nest_1.rs sample_files/nest_2.rs d3a799fe2cd9d81aa251c96af5cd9711 - @@ -260,7 +260,7 @@ sample_files/string_subwords_1.el sample_files/string_subwords_2.el b66e960672189960c2d35ef68b47a195 - sample_files/strings_1.el sample_files/strings_2.el -fe61803e3391fb14f5a3f05750bb94ff - +26ea57243abb16043088b17bfee482a4 - sample_files/swift_1.swift sample_files/swift_2.swift 73830b14bd8aacac8d4590a3bed61c40 - diff --git a/sample_files/hyphen_subwords_1.json b/sample_files/hyphen_subwords_1.json new file mode 100644 index 000000000..5248f946b --- /dev/null +++ b/sample_files/hyphen_subwords_1.json @@ -0,0 +1,3 @@ +{ + "name": "foo-d123-pretty-long" +} diff --git a/sample_files/hyphen_subwords_2.json b/sample_files/hyphen_subwords_2.json new file mode 100644 index 000000000..0ea55e8f6 --- /dev/null +++ b/sample_files/hyphen_subwords_2.json @@ -0,0 +1,3 @@ +{ + "name": "d123-pretty-long" +} diff --git a/src/words.rs b/src/words.rs index 9a1e6950b..2a51e5933 100644 --- a/src/words.rs +++ b/src/words.rs @@ -12,7 +12,7 @@ pub(crate) fn split_words(s: &str) -> Vec<&str> { for (idx, c) in s.char_indices() { match word_start { Some(start) => { - if c.is_alphanumeric() || c == '-' || c == '_' { + if c.is_alphanumeric() || c == '_' { // Just carry on in this word. } else { // Push the previous word, then this non-word character. @@ -22,7 +22,7 @@ pub(crate) fn split_words(s: &str) -> Vec<&str> { } } None => { - if c.is_alphanumeric() || c == '-' || c == '_' { + if c.is_alphanumeric() || c == '_' { word_start = Some(idx); } else { words.push(&s[idx..idx + c.len_utf8()]); @@ -47,7 +47,7 @@ pub(crate) fn split_words_and_numbers(s: &str) -> Vec<&str> { for (idx, c) in s.char_indices() { match word_start { Some((start, start_c)) => { - if c.is_alphanumeric() || c == '-' || c == '_' { + if c.is_alphanumeric() || c == '_' { // Word character, add to the current word if it's // not a number. if c.is_ascii_digit() == start_c.is_ascii_digit() { @@ -65,7 +65,7 @@ pub(crate) fn split_words_and_numbers(s: &str) -> Vec<&str> { } } None => { - if c.is_alphanumeric() || c == '-' || c == '_' { + if c.is_alphanumeric() || c == '_' { word_start = Some((idx, c)); } else { words.push(&s[idx..idx + c.len_utf8()]); @@ -93,13 +93,6 @@ mod tests { assert_eq!(res, vec!["example", ".", "com"]) } - #[test] - fn test_split_words_hyphens() { - let s = "foo -bar-baz-"; - let res = split_words(s); - assert_eq!(res, vec!["foo", " ", "-bar-baz-"]) - } - #[test] fn test_split_words_punctuation() { let s = "example.."; @@ -149,13 +142,6 @@ mod tests { assert_eq!(res, vec!["a", "123", "b"]) } - #[test] - fn test_split_words_and_numbers_hyphens() { - let s = "a-b -c-"; - let res = split_words_and_numbers(s); - assert_eq!(res, vec!["a-b", " ", "-c-"]) - } - #[test] fn test_split_words_and_numbers_spaces() { let s = "foo bar";