From 027856d70739a078fc296690d94a3717fdea7fb9 Mon Sep 17 00:00:00 2001 From: Wilfred Hughes Date: Sat, 1 Jan 2022 20:05:09 -0800 Subject: [PATCH] Adding a line-based textual differ that ignores trees --- src/line_parser.rs | 304 ++++++++++++++++++++++----------------------- src/main.rs | 35 +++--- src/syntax.rs | 2 +- 3 files changed, 170 insertions(+), 171 deletions(-) diff --git a/src/line_parser.rs b/src/line_parser.rs index e59db9ce6..e2672b5cb 100644 --- a/src/line_parser.rs +++ b/src/line_parser.rs @@ -1,187 +1,185 @@ //! A fallback "parser" for plain text. -use typed_arena::Arena; +use lazy_static::lazy_static; +use regex::Regex; use crate::{ - positions::SingleLineSpan, - syntax::{AtomKind, Syntax}, + lines::NewlinePositions, + syntax::{split_words, AtomKind, MatchKind, MatchedPos, TokenKind}, }; -/// Split `s` by lines, and treat each line as an atom. -/// -/// This is a fallback for files that we don't know how to parse. -pub fn parse<'a>(arena: &'a Arena>, s: &str) -> Vec<&'a Syntax<'a>> { +fn split_lines_keep_newline(s: &str) -> Vec { + lazy_static! { + static ref NEWLINE_RE: Regex = Regex::new("\n").unwrap(); + } + + let mut offset = 0; let mut res = vec![]; - // TODO: This scales poorly to large files (e.g. parser.c - // changes). Consider grouping Syntax items into lists when we - // encounter blank lines. - for (i, line) in s.lines().enumerate() { - // Mark each line as a comment atom, so we get word-level diffs. - // TODO: this is very hot on large files, such as parser.c, - // because we spend ~65% of execution time computing - // levenshtein distance. - res.push(Syntax::new_atom( - arena, - vec![SingleLineSpan { - line: i.into(), - start_col: 0, - end_col: line.len(), - }], - line, - AtomKind::Comment, // TODO: don't dim plain lines like other comments - )); + for newline_match in NEWLINE_RE.find_iter(s) { + res.push(s[offset..newline_match.end()].into()); + offset = newline_match.end(); + } + + if offset < s.len() { + res.push(s[offset..].into()); } res } -#[cfg(test)] -mod tests { - use super::*; - use Syntax::*; +#[derive(Debug)] +enum TextChangeKind { + Novel, + Unchanged, +} + +fn merge_novel( + lines: &[(TextChangeKind, Vec, Vec)], +) -> Vec<(TextChangeKind, Vec, Vec)> { + let mut lhs_novel: Vec = vec![]; + let mut rhs_novel: Vec = vec![]; + + let mut res: Vec<(TextChangeKind, Vec, Vec)> = vec![]; + for (kind, lhs_lines, rhs_lines) in lines { + match kind { + TextChangeKind::Novel => { + lhs_novel.extend(lhs_lines.iter().cloned()); + rhs_novel.extend(rhs_lines.iter().cloned()); + } + TextChangeKind::Unchanged => { + if !lhs_novel.is_empty() || !rhs_novel.is_empty() { + res.push((TextChangeKind::Novel, lhs_novel, rhs_novel)); + lhs_novel = vec![]; + rhs_novel = vec![]; + } - fn assert_syntaxes<'a>(actual: &[&'a Syntax<'a>], expected: &[&'a Syntax<'a>]) { - if !syntaxes_match(actual, expected) { - dbg!(expected, actual); - assert!(false); + res.push(( + TextChangeKind::Unchanged, + lhs_lines.clone(), + rhs_lines.clone(), + )); + } } } - fn syntaxes_match<'a>(actual: &[&'a Syntax<'a>], expected: &[&'a Syntax<'a>]) -> bool { - if actual.len() != expected.len() { - return false; - } else { - for (lhs_child, rhs_child) in actual.iter().zip(expected.iter()) { - if !syntax_matches(lhs_child, rhs_child) { - return false; - } + if !lhs_novel.is_empty() || !rhs_novel.is_empty() { + res.push((TextChangeKind::Novel, lhs_novel, rhs_novel)); + } + res +} + +fn changed_parts(src: &str, opposite_src: &str) -> Vec<(TextChangeKind, Vec, Vec)> { + let src_lines = split_lines_keep_newline(src); + let opposite_src_lines = split_lines_keep_newline(opposite_src); + + let mut res: Vec<(TextChangeKind, Vec, Vec)> = vec![]; + for diff_res in diff::slice(&src_lines, &opposite_src_lines) { + match diff_res { + diff::Result::Left(line) => { + res.push((TextChangeKind::Novel, vec![line.into()], vec![])); + } + diff::Result::Both(line, opposite_line) => { + res.push(( + TextChangeKind::Unchanged, + vec![line.into()], + vec![opposite_line.into()], + )); + } + diff::Result::Right(opposite_line) => { + res.push((TextChangeKind::Novel, vec![], vec![opposite_line.into()])); } } - true } - /// Compare all the fields in a Syntax value, not just - /// those used in its Eq implementation. - fn syntax_matches<'a>(actual: &'a Syntax<'a>, expected: &'a Syntax<'a>) -> bool { - match (actual, expected) { - ( - List { - open_position: lhs_open_position, - open_content: lhs_start_content, - children: lhs_children, - close_content: lhs_end_content, - close_position: lhs_close_position, - num_descendants: lhs_num_descendants, - .. - }, - List { - open_position: rhs_open_position, - open_content: rhs_start_content, - children: rhs_children, - close_content: rhs_end_content, - close_position: rhs_close_position, - num_descendants: rhs_num_descendants, - .. - }, - ) => { - if actual.change() != expected.change() { - dbg!(actual.change(), expected.change()); - return false; - } - if lhs_open_position != rhs_open_position { - dbg!(lhs_open_position, rhs_open_position); - return false; - } + merge_novel(&res) +} - if lhs_start_content != rhs_start_content { - dbg!(lhs_start_content, rhs_start_content); - return false; - } - if lhs_end_content != rhs_end_content { - dbg!(lhs_end_content, rhs_end_content); - return false; - } - if lhs_close_position != rhs_close_position { - dbg!(lhs_close_position, rhs_close_position); - return false; - } +// TODO: Prefer src/opposite_src nomenclature as this function is called from both sides. +pub fn change_positions(lhs_src: &str, rhs_src: &str) -> Vec { + let lhs_nlp = NewlinePositions::from(lhs_src); + let rhs_nlp = NewlinePositions::from(rhs_src); - if lhs_num_descendants != rhs_num_descendants { - dbg!(lhs_num_descendants, rhs_num_descendants); - return false; - } + let mut lhs_offset = 0; + let mut rhs_offset = 0; - if !syntaxes_match(lhs_children, rhs_children) { - return false; + let mut res = vec![]; + for (kind, lhs_lines, rhs_lines) in changed_parts(lhs_src, rhs_src) { + match kind { + TextChangeKind::Unchanged => { + for (lhs_line, rhs_line) in lhs_lines.iter().zip(rhs_lines) { + let lhs_pos = lhs_nlp.from_offsets(lhs_offset, lhs_offset + lhs_line.len() - 1); + let rhs_pos = rhs_nlp.from_offsets(rhs_offset, rhs_offset + rhs_line.len() - 1); + + res.push(MatchedPos { + kind: MatchKind::Unchanged { + highlight: TokenKind::Atom(AtomKind::Normal), + self_pos: lhs_pos.clone(), + opposite_pos: rhs_pos, + }, + pos: lhs_pos[0], + }); + + lhs_offset += lhs_line.len(); + rhs_offset += rhs_line.len(); } } - ( - Atom { - position: lhs_position, - content: lhs_content, - kind: lhs_highlight, - .. - }, - Atom { - position: rhs_position, - content: rhs_content, - kind: rhs_highlight, - .. - }, - ) => { - if actual.change() != expected.change() { - dbg!(actual.change(), expected.change()); - return false; - } - if lhs_position != rhs_position { - dbg!(lhs_position, rhs_position); - return false; - } + TextChangeKind::Novel => { + let lhs_part = lhs_lines.join(""); + let rhs_part = rhs_lines.join(""); - if lhs_content != rhs_content { - dbg!(lhs_content, rhs_content); - return false; - } - if lhs_highlight != rhs_highlight { - dbg!(lhs_highlight, rhs_highlight); - return false; + for diff_res in diff::slice(&split_words(&lhs_part), &split_words(&rhs_part)) { + match diff_res { + diff::Result::Left(lhs_word) => { + let lhs_pos = + lhs_nlp.from_offsets(lhs_offset, lhs_offset + lhs_word.len()); + res.push(MatchedPos { + // TODO: rename this kind to reflect + // that it's used for both code + // comments and plain text. + kind: MatchKind::ChangedCommentPart {}, + pos: lhs_pos[0], + }); + + lhs_offset += lhs_word.len(); + } + diff::Result::Both(lhs_word, rhs_word) => { + let lhs_pos = + lhs_nlp.from_offsets(lhs_offset, lhs_offset + lhs_word.len()); + let rhs_pos = + rhs_nlp.from_offsets(rhs_offset, rhs_offset + rhs_word.len()); + + res.push(MatchedPos { + kind: MatchKind::UnchangedCommentPart { + self_pos: lhs_pos[0], + opposite_pos: rhs_pos, + }, + pos: lhs_pos[0], + }); + + lhs_offset += lhs_word.len(); + rhs_offset += rhs_word.len(); + } + diff::Result::Right(rhs_word) => { + rhs_offset += rhs_word.len(); + } + } } } - _ => { - return false; - } } - true } + res +} + +#[cfg(test)] +mod tests { + use super::*; + use pretty_assertions::assert_eq; + #[test] - fn test_parse_lines() { - let arena = Arena::new(); - - assert_syntaxes( - &parse(&arena, "foo\nbar"), - &[ - Syntax::new_atom( - &arena, - vec![SingleLineSpan { - line: 0.into(), - start_col: 0, - end_col: 3, - }], - "foo", - AtomKind::Comment, - ), - Syntax::new_atom( - &arena, - vec![SingleLineSpan { - line: 1.into(), - start_col: 0, - end_col: 3, - }], - "bar", - AtomKind::Comment, - ), - ], - ); + fn test_split_newlines() { + let s = "foo\nbar\nbaz"; + let res = split_lines_keep_newline(s); + assert_eq!(res, vec!["foo\n", "bar\n", "baz"]) } } diff --git a/src/main.rs b/src/main.rs index d737df587..481b9dda2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -45,6 +45,7 @@ use walkdir::WalkDir; use crate::{ dijkstra::mark_syntax, files::{is_probably_binary, read_or_die}, + line_parser as lp, lines::MaxLine, syntax::{change_positions, init_info}, tree_sitter_parser as tsp, @@ -277,25 +278,25 @@ fn diff_file(display_path: &str, lhs_path: &Path, rhs_path: &Path) -> DiffResult }; let ts_lang = guess(path, guess_src).map(tsp::from_language); - let arena = Arena::new(); - let (lang_name, lhs, rhs) = match ts_lang { - Some(ts_lang) => ( - Some(ts_lang.name.into()), - tsp::parse(&arena, &lhs_src, &ts_lang), - tsp::parse(&arena, &rhs_src, &ts_lang), - ), - None => ( - None, - line_parser::parse(&arena, &lhs_src), - line_parser::parse(&arena, &rhs_src), - ), - }; + let (lang_name, lhs_positions, rhs_positions) = match ts_lang { + Some(ts_lang) => { + let arena = Arena::new(); + let lhs = tsp::parse(&arena, &lhs_src, &ts_lang); + let rhs = tsp::parse(&arena, &rhs_src, &ts_lang); - init_info(&lhs, &rhs); - mark_syntax(lhs.get(0).copied(), rhs.get(0).copied()); + init_info(&lhs, &rhs); + mark_syntax(lhs.get(0).copied(), rhs.get(0).copied()); - let lhs_positions = change_positions(&lhs_src, &rhs_src, &lhs); - let rhs_positions = change_positions(&rhs_src, &lhs_src, &rhs); + let lhs_positions = change_positions(&lhs_src, &rhs_src, &lhs); + let rhs_positions = change_positions(&rhs_src, &lhs_src, &rhs); + (Some(ts_lang.name.into()), lhs_positions, rhs_positions) + } + None => { + let lhs_positions = lp::change_positions(&lhs_src, &rhs_src); + let rhs_positions = lp::change_positions(&rhs_src, &lhs_src); + (None, lhs_positions, rhs_positions) + } + }; DiffResult { path: display_path.into(), diff --git a/src/syntax.rs b/src/syntax.rs index 21aaae76e..98a0f3ab2 100644 --- a/src/syntax.rs +++ b/src/syntax.rs @@ -527,7 +527,7 @@ pub struct MatchedPos { } // "foo bar" -> vec!["foo", " ", "bar"] -fn split_words(s: &str) -> Vec { +pub fn split_words(s: &str) -> Vec { lazy_static! { static ref RE: Regex = Regex::new(r"[a-zA-Z0-9]+|\n|[^a-zA-Z0-9\n]").unwrap(); }