Define a separate words module

pull/593/head^2
Wilfred Hughes 2023-11-18 16:18:19 +07:00
parent 635e62c19b
commit 60d0f61cbd
4 changed files with 155 additions and 146 deletions

@ -4,9 +4,10 @@ use lazy_static::lazy_static;
use line_numbers::LinePositions;
use regex::Regex;
use crate::words::split_words;
use crate::{
diff::myers_diff,
parse::syntax::{split_words, AtomKind, MatchKind, MatchedPos, TokenKind},
parse::syntax::{AtomKind, MatchKind, MatchedPos, TokenKind},
};
fn split_lines_keep_newline(s: &str) -> Vec<&str> {

@ -39,6 +39,7 @@ mod options;
mod parse;
mod summary;
mod version;
mod words;
#[macro_use]
extern crate log;

@ -9,6 +9,7 @@ use line_numbers::SingleLineSpan;
use typed_arena::Arena;
use self::Syntax::*;
use crate::words::split_words_and_numbers;
use crate::{
diff::changes::ChangeKind,
diff::changes::{ChangeKind::*, ChangeMap},
@ -627,88 +628,6 @@ pub struct MatchedPos {
pub pos: SingleLineSpan,
}
/// Tokenise `s` into word-like runs and single non-word characters.
///
/// A word is a maximal run of alphanumeric characters, `-` and `_`.
/// Every other character (whitespace and newlines included) is
/// returned as a slice containing just that character. The slices
/// borrow from `s` and appear in input order.
///
/// "foo..bar23" -> vec!["foo", ".", ".", "bar23"]
///
/// See also `split_words_and_numbers`. Both these functions are hot,
/// so they are kept as separate implementations instead of sharing
/// one body with a flag for number handling.
pub fn split_words(s: &str) -> Vec<&str> {
    let mut pieces = Vec::new();
    // Byte offset at which the word currently being scanned began.
    let mut open_word: Option<usize> = None;

    for (offset, ch) in s.char_indices() {
        if ch.is_alphanumeric() || ch == '-' || ch == '_' {
            // Word character: open a word here unless one is already running.
            if open_word.is_none() {
                open_word = Some(offset);
            }
            continue;
        }

        // Non-word character: flush any running word, then emit this
        // character as a slice of its own.
        if let Some(begin) = open_word.take() {
            pieces.push(&s[begin..offset]);
        }
        pieces.push(&s[offset..offset + ch.len_utf8()]);
    }

    // A word that runs up to the end of the input still needs emitting.
    if let Some(begin) = open_word {
        pieces.push(&s[begin..]);
    }
    pieces
}
/// Tokenise `s` into word-like runs, digit runs and single non-word
/// characters.
///
/// Works like `split_words`, except that a run is also cut whenever it
/// crosses between ASCII digits and other word characters. Whether a
/// run counts as a number is decided by its first character.
///
/// "foo..bar23" -> vec!["foo", ".", ".", "bar", "23"]
/// "a123b" -> vec!["a", "123", "b"]
///
/// NOTE(review): `-` may begin a run but never extends one, so
/// "-foo" stays whole while "foo-bar" becomes "foo", "-", "bar".
/// This differs from `split_words`; confirm whether it is intended.
pub fn split_words_and_numbers(s: &str) -> Vec<&str> {
    let mut pieces = Vec::new();
    // Byte offset and first character of the run currently being scanned.
    let mut open_run: Option<(usize, char)> = None;

    for (offset, ch) in s.char_indices() {
        if let Some((begin, first)) = open_run {
            if !(ch.is_alphanumeric() || ch == '_') {
                // Not a word character: flush the run, then emit this
                // character as a slice of its own.
                pieces.push(&s[begin..offset]);
                pieces.push(&s[offset..offset + ch.len_utf8()]);
                open_run = None;
            } else if ch.is_ascii_digit() != first.is_ascii_digit() {
                // Crossing between digits and non-digits: close the run
                // and start a fresh one at this character.
                pieces.push(&s[begin..offset]);
                open_run = Some((offset, ch));
            }
            // Otherwise the character simply extends the current run.
        } else if ch.is_alphanumeric() || ch == '-' || ch == '_' {
            open_run = Some((offset, ch));
        } else {
            pieces.push(&s[offset..offset + ch.len_utf8()]);
        }
    }

    // A run that reaches the end of the input still needs emitting.
    if let Some((begin, _)) = open_run {
        pieces.push(&s[begin..]);
    }
    pieces
}
/// Given the text `content` from a comment or strings, split it into
/// MatchedPos values for the novel and unchanged words.
///
@ -1339,67 +1258,4 @@ mod tests {
],
);
}
#[test]
fn test_split_words() {
    // A dot between two words comes back as its own token.
    let tokens = split_words("example.com");
    assert_eq!(tokens, vec!["example", ".", "com"]);
}
#[test]
fn test_split_words_punctuation() {
    // Consecutive punctuation characters are never merged together.
    let tokens = split_words("example..");
    assert_eq!(tokens, vec!["example", ".", "."]);
}
#[test]
fn test_split_words_numbers() {
    // Digits embedded in a word do not split it.
    let tokens = split_words("foo123bar");
    assert_eq!(tokens, vec!["foo123bar"]);
}
#[test]
fn test_split_words_treats_newline_separately() {
    // A newline is a non-word character, so it gets a token of its own.
    let tokens = split_words("example.\ncom");
    assert_eq!(tokens, vec!["example", ".", "\n", "com"]);
}
#[test]
fn test_split_words_single_unicode() {
    // A lone non-ASCII letter is still a word.
    let tokens = split_words("a ö b");
    assert_eq!(tokens, vec!["a", " ", "ö", " ", "b"]);
}
#[test]
fn test_split_words_single_unicode_not_alphabetic() {
    // A multi-byte non-word character must be sliced on char boundaries.
    let tokens = split_words("a 💝 b");
    assert_eq!(tokens, vec!["a", " ", "💝", " ", "b"]);
}
#[test]
fn test_split_words_unicode() {
    // A non-ASCII letter inside a word does not break the word.
    let tokens = split_words("a xöy b");
    assert_eq!(tokens, vec!["a", " ", "xöy", " ", "b"]);
}
#[test]
fn test_split_words_and_numbers() {
    // Digit runs are separated from the letters around them.
    let tokens = split_words_and_numbers("a123b");
    assert_eq!(tokens, vec!["a", "123", "b"]);
}
#[test]
fn test_split_words_and_numbers_spaces() {
    // Whitespace separates words and is kept as its own token.
    let tokens = split_words_and_numbers("foo bar");
    assert_eq!(tokens, vec!["foo", " ", "bar"]);
}
}

@ -0,0 +1,151 @@
/// Tokenise `s` into word-like runs and single non-word characters.
///
/// A word is a maximal run of alphanumeric characters, `-` and `_`.
/// Every other character (whitespace and newlines included) is
/// returned as a slice containing just that character. The slices
/// borrow from `s` and appear in input order.
///
/// "foo..bar23" -> vec!["foo", ".", ".", "bar23"]
///
/// See also `split_words_and_numbers`. Both these functions are hot,
/// so they are kept as separate implementations instead of sharing
/// one body with a flag for number handling.
pub fn split_words(s: &str) -> Vec<&str> {
    let mut pieces = Vec::new();
    // Byte offset at which the word currently being scanned began.
    let mut open_word: Option<usize> = None;

    for (offset, ch) in s.char_indices() {
        if ch.is_alphanumeric() || ch == '-' || ch == '_' {
            // Word character: open a word here unless one is already running.
            if open_word.is_none() {
                open_word = Some(offset);
            }
            continue;
        }

        // Non-word character: flush any running word, then emit this
        // character as a slice of its own.
        if let Some(begin) = open_word.take() {
            pieces.push(&s[begin..offset]);
        }
        pieces.push(&s[offset..offset + ch.len_utf8()]);
    }

    // A word that runs up to the end of the input still needs emitting.
    if let Some(begin) = open_word {
        pieces.push(&s[begin..]);
    }
    pieces
}
/// Tokenise `s` into word-like runs, digit runs and single non-word
/// characters.
///
/// Works like `split_words`, except that a run is also cut whenever it
/// crosses between ASCII digits and other word characters. Whether a
/// run counts as a number is decided by its first character.
///
/// "foo..bar23" -> vec!["foo", ".", ".", "bar", "23"]
/// "a123b" -> vec!["a", "123", "b"]
///
/// NOTE(review): `-` may begin a run but never extends one, so
/// "-foo" stays whole while "foo-bar" becomes "foo", "-", "bar".
/// This differs from `split_words`; confirm whether it is intended.
pub fn split_words_and_numbers(s: &str) -> Vec<&str> {
    let mut pieces = Vec::new();
    // Byte offset and first character of the run currently being scanned.
    let mut open_run: Option<(usize, char)> = None;

    for (offset, ch) in s.char_indices() {
        if let Some((begin, first)) = open_run {
            if !(ch.is_alphanumeric() || ch == '_') {
                // Not a word character: flush the run, then emit this
                // character as a slice of its own.
                pieces.push(&s[begin..offset]);
                pieces.push(&s[offset..offset + ch.len_utf8()]);
                open_run = None;
            } else if ch.is_ascii_digit() != first.is_ascii_digit() {
                // Crossing between digits and non-digits: close the run
                // and start a fresh one at this character.
                pieces.push(&s[begin..offset]);
                open_run = Some((offset, ch));
            }
            // Otherwise the character simply extends the current run.
        } else if ch.is_alphanumeric() || ch == '-' || ch == '_' {
            open_run = Some((offset, ch));
        } else {
            pieces.push(&s[offset..offset + ch.len_utf8()]);
        }
    }

    // A run that reaches the end of the input still needs emitting.
    if let Some((begin, _)) = open_run {
        pieces.push(&s[begin..]);
    }
    pieces
}
#[cfg(test)]
mod tests {
    use pretty_assertions::assert_eq;

    use super::*;

    #[test]
    fn test_split_words() {
        // A dot between two words comes back as its own token.
        let tokens = split_words("example.com");
        assert_eq!(tokens, vec!["example", ".", "com"]);
    }

    #[test]
    fn test_split_words_punctuation() {
        // Consecutive punctuation characters are never merged together.
        let tokens = split_words("example..");
        assert_eq!(tokens, vec!["example", ".", "."]);
    }

    #[test]
    fn test_split_words_numbers() {
        // Digits embedded in a word do not split it.
        let tokens = split_words("foo123bar");
        assert_eq!(tokens, vec!["foo123bar"]);
    }

    #[test]
    fn test_split_words_treats_newline_separately() {
        // A newline is a non-word character, so it gets a token of its own.
        let tokens = split_words("example.\ncom");
        assert_eq!(tokens, vec!["example", ".", "\n", "com"]);
    }

    #[test]
    fn test_split_words_single_unicode() {
        // A lone non-ASCII letter is still a word.
        let tokens = split_words("a ö b");
        assert_eq!(tokens, vec!["a", " ", "ö", " ", "b"]);
    }

    #[test]
    fn test_split_words_single_unicode_not_alphabetic() {
        // A multi-byte non-word character must be sliced on char boundaries.
        let tokens = split_words("a 💝 b");
        assert_eq!(tokens, vec!["a", " ", "💝", " ", "b"]);
    }

    #[test]
    fn test_split_words_unicode() {
        // A non-ASCII letter inside a word does not break the word.
        let tokens = split_words("a xöy b");
        assert_eq!(tokens, vec!["a", " ", "xöy", " ", "b"]);
    }

    #[test]
    fn test_split_words_and_numbers() {
        // Digit runs are separated from the letters around them.
        let tokens = split_words_and_numbers("a123b");
        assert_eq!(tokens, vec!["a", "123", "b"]);
    }

    #[test]
    fn test_split_words_and_numbers_spaces() {
        // Whitespace separates words and is kept as its own token.
        let tokens = split_words_and_numbers("foo bar");
        assert_eq!(tokens, vec!["foo", " ", "bar"]);
    }
}