Improve word splitting heuristics

This is particularly noticeable when diffing comments that contain
timestamps such as 2000-12-31T23:59:59, where we don't want 31T23 to be
treated as a single word.
text_sliders
Wilfred Hughes 2023-06-29 08:31:51 +07:00
parent 63a5eeaa98
commit 3730580ca3
3 changed files with 42 additions and 15 deletions

@ -4,6 +4,10 @@
Updated Scala parser.
### Display
Improved word highlighting in comments when they contain numbers.
### Internals
Difftastic's logging is now configured with the environment variable

@ -17,7 +17,7 @@ sample_files/change_outer_before.el sample_files/change_outer_after.el
4d202515307556b443806ea25aac0b84 -
sample_files/chinese_before.po sample_files/chinese_after.po
46cc71b46688dd2abc51f9fd82864c90 -
b2cace14b01c272217eec27d16adddbe -
sample_files/clojure_before.clj sample_files/clojure_after.clj
b8e17b8eb649ba0b8d29b57a23e4ac81 -
@ -59,7 +59,7 @@ sample_files/hack_before.php sample_files/hack_after.php
83d4a92c596b5d465ff024aa1b30be92 -
sample_files/hare_before.ha sample_files/hare_after.ha
2b3a9433cd692d9ffab872477312e3b8 -
ea834f886bd44133115c567a200f1996 -
sample_files/haskell_before.hs sample_files/haskell_after.hs
5a2c0c5d4a04f79e2f8f32299e6cd364 -
@ -80,7 +80,7 @@ sample_files/html_simple_before.html sample_files/html_simple_after.html
ce3bfa12bc21d0eb5528766e18387e86 -
sample_files/huge_cpp_before.cpp sample_files/huge_cpp_after.cpp
c879bed2d8551579975617262245337c -
b0dd65cc5431f3cca9e93ca6f4a64676 -
sample_files/identical_before.scala sample_files/identical_after.scala
9c7319f61833e46a0a8cb6c01cc997c9 -
@ -206,7 +206,7 @@ sample_files/tab_before.c sample_files/tab_after.c
b652d15f3a05b82a7d871cfeca2f453f -
sample_files/tailwind_before.css sample_files/tailwind_after.css
cee5ee7415b1bd50bdc2dacd11e7303a -
3e07691cbb537948db60bcc80813eaf9 -
sample_files/text_before.txt sample_files/text_after.txt
5fbdac2d1156ed8bb6b098e87f30d319 -

@ -662,25 +662,41 @@ pub struct MatchedPos {
/// Split `s` into a vec of things that look like words and individual
/// non-word characters.
///
/// "foo bar" -> vec!["foo", " ", "bar"]
/// "foo..bar23" -> vec!["foo", ".", ".", "bar", "23"]
pub fn split_words(s: &str) -> Vec<&str> {
let mut res = vec![];
let mut word_start = None;
let mut word_start: Option<(usize, char)> = None;
for (idx, c) in s.char_indices() {
if c.is_alphanumeric() || c == '-' || c == '_' {
if word_start.is_none() {
word_start = Some(idx);
match word_start {
Some((start, start_c)) => {
if c.is_alphanumeric() || c == '-' || c == '_' {
// Word character, add to the current word if it's
// not a number.
if c.is_ascii_digit() == start_c.is_ascii_digit() {
// Just carry on in this word.
} else {
// Finish previous word, start a new one.
res.push(&s[start..idx]);
word_start = Some((idx, c));
}
} else {
// Push the previous word, then this non-word character.
res.push(&s[start..idx]);
res.push(&s[idx..idx + c.len_utf8()]);
word_start = None;
}
}
} else {
if let Some(start) = word_start {
res.push(&s[start..idx]);
word_start = None;
None => {
if c.is_alphanumeric() || c == '-' || c == '_' {
word_start = Some((idx, c));
} else {
res.push(&s[idx..idx + c.len_utf8()]);
}
}
res.push(&s[idx..idx + c.len_utf8()]);
}
}
if let Some(start) = word_start {
if let Some((start, _)) = word_start {
res.push(&s[start..]);
}
res
@ -1162,6 +1178,13 @@ mod tests {
assert_eq!(res, vec!["example", ".", "."])
}
/// A run of ASCII digits embedded between letters is split out as its
/// own word, so "foo123bar" yields three separate pieces.
#[test]
fn test_split_words_numbers() {
    let words = split_words("foo123bar");
    let expected = vec!["foo", "123", "bar"];
    assert_eq!(words, expected);
}
#[test]
fn test_split_words_treats_newline_separately() {
let s = "example.\ncom";