Define a separate words module

2023-11-18 16:18:19 +07:00 · 2023-11-18 16:18:19 +07:00 · 60d0f61cbd
parent 635e62c19b
commit 60d0f61cbd
4 changed files with 155 additions and 146 deletions
--- a/src/line_parser.rs
+++ b/src/line_parser.rs
@@ -4,9 +4,10 @@ use lazy_static::lazy_static;
 use line_numbers::LinePositions;
 use regex::Regex;

+use crate::words::split_words;
 use crate::{
    diff::myers_diff,
-    parse::syntax::{split_words, AtomKind, MatchKind, MatchedPos, TokenKind},
+    parse::syntax::{AtomKind, MatchKind, MatchedPos, TokenKind},
 };

 fn split_lines_keep_newline(s: &str) -> Vec<&str> {
--- a/src/main.rs
+++ b/src/main.rs
@@ -39,6 +39,7 @@ mod options;
 mod parse;
 mod summary;
 mod version;
+mod words;

 #[macro_use]
 extern crate log;
--- a/src/parse/syntax.rs
+++ b/src/parse/syntax.rs
@@ -9,6 +9,7 @@ use line_numbers::SingleLineSpan;
 use typed_arena::Arena;

 use self::Syntax::*;
+use crate::words::split_words_and_numbers;
 use crate::{
    diff::changes::ChangeKind,
    diff::changes::{ChangeKind::*, ChangeMap},
@@ -627,88 +628,6 @@ pub struct MatchedPos {
    pub pos: SingleLineSpan,
 }

-/// Split `s` into a vec of things that look like words and individual
-/// non-word characters.
-///
-/// "foo..bar23" -> vec!["foo", ".", ".", "bar23"]
-///
-/// See also `split_words_and_numbers`. Both these functions are hot,
-/// so they are separate implementations rather than passing a bool to
-/// customise number handling.
-pub fn split_words(s: &str) -> Vec<&str> {
-    let mut res = vec![];
-    let mut word_start: Option<usize> = None;
-    for (idx, c) in s.char_indices() {
-        match word_start {
-            Some(start) => {
-                if c.is_alphanumeric() || c == '-' || c == '_' {
-                    // Just carry on in this word.
-                } else {
-                    // Push the previous word, then this non-word character.
-                    res.push(&s[start..idx]);
-                    res.push(&s[idx..idx + c.len_utf8()]);
-                    word_start = None;
-                }
-            }
-            None => {
-                if c.is_alphanumeric() || c == '-' || c == '_' {
-                    word_start = Some(idx);
-                } else {
-                    res.push(&s[idx..idx + c.len_utf8()]);
-                }
-            }
-        }
-    }
-
-    if let Some(start) = word_start {
-        res.push(&s[start..]);
-    }
-    res
-}
-
-/// Split `s` into a vec of things that look like words and individual
-/// non-word characters.
-///
-/// "foo..bar23" -> vec!["foo", ".", ".", "bar23"]
-pub fn split_words_and_numbers(s: &str) -> Vec<&str> {
-    let mut res = vec![];
-    let mut word_start: Option<(usize, char)> = None;
-    for (idx, c) in s.char_indices() {
-        match word_start {
-            Some((start, start_c)) => {
-                if c.is_alphanumeric() || c == '_' {
-                    // Word character, add to the current word if it's
-                    // not a number.
-                    if c.is_ascii_digit() == start_c.is_ascii_digit() {
-                        // Just carry on in this word.
-                    } else {
-                        // Finish previous word, start a new one.
-                        res.push(&s[start..idx]);
-                        word_start = Some((idx, c));
-                    }
-                } else {
-                    // Push the previous word, then this non-word character.
-                    res.push(&s[start..idx]);
-                    res.push(&s[idx..idx + c.len_utf8()]);
-                    word_start = None;
-                }
-            }
-            None => {
-                if c.is_alphanumeric() || c == '-' || c == '_' {
-                    word_start = Some((idx, c));
-                } else {
-                    res.push(&s[idx..idx + c.len_utf8()]);
-                }
-            }
-        }
-    }
-
-    if let Some((start, _)) = word_start {
-        res.push(&s[start..]);
-    }
-    res
-}
-
 /// Given the text `content` from a comment or strings, split it into
 /// MatchedPos values for the novel and unchanged words.
 ///
@@ -1339,67 +1258,4 @@ mod tests {
            ],
        );
    }
-
-    #[test]
-    fn test_split_words() {
-        let s = "example.com";
-        let res = split_words(s);
-        assert_eq!(res, vec!["example", ".", "com"])
-    }
-
-    #[test]
-    fn test_split_words_punctuation() {
-        let s = "example..";
-        let res = split_words(s);
-        assert_eq!(res, vec!["example", ".", "."])
-    }
-
-    #[test]
-    fn test_split_words_numbers() {
-        let s = "foo123bar";
-        let res = split_words(s);
-        assert_eq!(res, vec!["foo123bar"])
-    }
-
-    #[test]
-    fn test_split_words_treats_newline_separately() {
-        let s = "example.\ncom";
-        let res = split_words(s);
-        assert_eq!(res, vec!["example", ".", "\n", "com"])
-    }
-
-    #[test]
-    fn test_split_words_single_unicode() {
-        let s = "a ö b";
-        let res = split_words(s);
-        assert_eq!(res, vec!["a", " ", "ö", " ", "b"])
-    }
-
-    #[test]
-    fn test_split_words_single_unicode_not_alphabetic() {
-        let s = "a 💝 b";
-        let res = split_words(s);
-        assert_eq!(res, vec!["a", " ", "💝", " ", "b"])
-    }
-
-    #[test]
-    fn test_split_words_unicode() {
-        let s = "a xöy b";
-        let res = split_words(s);
-        assert_eq!(res, vec!["a", " ", "xöy", " ", "b"])
-    }
-
-    #[test]
-    fn test_split_words_and_numbers() {
-        let s = "a123b";
-        let res = split_words_and_numbers(s);
-        assert_eq!(res, vec!["a", "123", "b"])
-    }
-
-    #[test]
-    fn test_split_words_and_numbers_spaces() {
-        let s = "foo bar";
-        let res = split_words_and_numbers(s);
-        assert_eq!(res, vec!["foo", " ", "bar"])
-    }
 }
--- a/src/words.rs
+++ b/src/words.rs
@@ -0,0 +1,151 @@
+/// Split `s` into a vec of word-like runs (alphanumerics, `-`, `_`)
+/// and individual non-word characters, each a slice of `s`.
+///
+/// "foo..bar23" -> vec!["foo", ".", ".", "bar23"]
+///
+/// See also `split_words_and_numbers`. Both these functions are hot,
+/// so they are separate implementations rather than passing a bool to
+/// customise number handling.
+pub fn split_words(s: &str) -> Vec<&str> {
+    let mut res = vec![];
+    let mut word_start: Option<usize> = None;
+    for (idx, c) in s.char_indices() {
+        match word_start {
+            Some(start) => {
+                if c.is_alphanumeric() || c == '-' || c == '_' {
+                    // Still inside the current word, keep scanning.
+                } else {
+                    // Push the finished word, then this non-word char alone.
+                    res.push(&s[start..idx]);
+                    res.push(&s[idx..idx + c.len_utf8()]);
+                    word_start = None;
+                }
+            }
+            None => {
+                if c.is_alphanumeric() || c == '-' || c == '_' {
+                    word_start = Some(idx);
+                } else {
+                    res.push(&s[idx..idx + c.len_utf8()]);
+                }
+            }
+        }
+    }
+
+    if let Some(start) = word_start {
+        res.push(&s[start..]);
+    }
+    res
+}
+
+/// Split `s` into a vec of things that look like words, runs of ASCII
+/// digits, and individual non-word characters.
+///
+/// "foo..bar23" -> vec!["foo", ".", ".", "bar", "23"]
+pub fn split_words_and_numbers(s: &str) -> Vec<&str> {
+    let mut res = vec![];
+    let mut word_start: Option<(usize, char)> = None;
+    for (idx, c) in s.char_indices() {
+        match word_start {
+            Some((start, start_c)) => {
+                if c.is_alphanumeric() || c == '_' {
+                    // Word character: it stays in the current token only if
+                    // its ASCII-digit-ness matches the token's first char.
+                    if c.is_ascii_digit() == start_c.is_ascii_digit() {
+                        // Just carry on in this word.
+                    } else {
+                        // Finish previous word, start a new one.
+                        res.push(&s[start..idx]);
+                        word_start = Some((idx, c));
+                    }
+                } else {
+                    // Push the previous word, then this non-word character.
+                    res.push(&s[start..idx]);
+                    res.push(&s[idx..idx + c.len_utf8()]);
+                    word_start = None;
+                }
+            }
+            None => {
+                if c.is_alphanumeric() || c == '-' || c == '_' {
+                    word_start = Some((idx, c));
+                } else {
+                    res.push(&s[idx..idx + c.len_utf8()]);
+                }
+            }
+        }
+    }
+
+    if let Some((start, _)) = word_start {
+        res.push(&s[start..]);
+    }
+    res
+}
+
+#[cfg(test)]
+mod tests {
+    use pretty_assertions::assert_eq;
+
+    use super::*;
+
+    #[test]
+    fn test_split_words() {
+        let s = "example.com";
+        let res = split_words(s);
+        assert_eq!(res, vec!["example", ".", "com"])
+    }
+
+    #[test]
+    fn test_split_words_punctuation() {
+        let s = "example..";
+        let res = split_words(s);
+        assert_eq!(res, vec!["example", ".", "."])
+    }
+
+    #[test]
+    fn test_split_words_numbers() {
+        let s = "foo123bar";
+        let res = split_words(s);
+        assert_eq!(res, vec!["foo123bar"])
+    }
+
+    #[test]
+    fn test_split_words_treats_newline_separately() {
+        let s = "example.\ncom";
+        let res = split_words(s);
+        assert_eq!(res, vec!["example", ".", "\n", "com"])
+    }
+
+    #[test]
+    fn test_split_words_single_unicode() {
+        let s = "a ö b";
+        let res = split_words(s);
+        assert_eq!(res, vec!["a", " ", "ö", " ", "b"])
+    }
+
+    #[test]
+    fn test_split_words_single_unicode_not_alphabetic() {
+        let s = "a 💝 b";
+        let res = split_words(s);
+        assert_eq!(res, vec!["a", " ", "💝", " ", "b"])
+    }
+
+    #[test]
+    fn test_split_words_unicode() {
+        let s = "a xöy b";
+        let res = split_words(s);
+        assert_eq!(res, vec!["a", " ", "xöy", " ", "b"])
+    }
+
+    #[test]
+    fn test_split_words_and_numbers() {
+        let s = "a123b";
+        let res = split_words_and_numbers(s);
+        assert_eq!(res, vec!["a", "123", "b"])
+    }
+
+    #[test]
+    fn test_split_words_and_numbers_spaces() {
+        let s = "foo bar";
+        let res = split_words_and_numbers(s);
+        assert_eq!(res, vec!["foo", " ", "bar"])
+    }
+}