From 027856d70739a078fc296690d94a3717fdea7fb9 Mon Sep 17 00:00:00 2001
From: Wilfred Hughes <me@wilfred.me.uk>
Date: Sat, 1 Jan 2022 20:05:09 -0800
Subject: [PATCH] Adding a line-based textual differ that ignores trees

---
 src/line_parser.rs | 304 ++++++++++++++++++++++-----------------------
 src/main.rs        |  35 +++---
 src/syntax.rs      |   2 +-
 3 files changed, 170 insertions(+), 171 deletions(-)
diff --git a/src/line_parser.rs b/src/line_parser.rs
index e59db9ce6..e2672b5cb 100644
--- a/src/line_parser.rs
+++ b/src/line_parser.rs
@@ -1,187 +1,185 @@
 //! A fallback "parser" for plain text.
 
-use typed_arena::Arena;
+use lazy_static::lazy_static;
+use regex::Regex;
 
 use crate::{
-    positions::SingleLineSpan,
-    syntax::{AtomKind, Syntax},
+    lines::NewlinePositions,
+    syntax::{split_words, AtomKind, MatchKind, MatchedPos, TokenKind},
 };
 
-/// Split `s` by lines, and treat each line as an atom.
-///
-/// This is a fallback for files that we don't know how to parse.
-pub fn parse<'a>(arena: &'a Arena<Syntax<'a>>, s: &str) -> Vec<&'a Syntax<'a>> {
+fn split_lines_keep_newline(s: &str) -> Vec<String> {
+    // Split `s` into lines, each keeping its trailing "\n" when it has one.
+    lazy_static! {
+        static ref NEWLINE_RE: Regex = Regex::new("\n").unwrap();
+    }
+    let mut offset = 0;
     let mut res = vec![];
-    // TODO: This scales poorly to large files (e.g. parser.c
-    // changes). Consider grouping Syntax items into lists when we
-    // encounter blank lines.
-    for (i, line) in s.lines().enumerate() {
-        // Mark each line as a comment atom, so we get word-level diffs.
-        // TODO: this is very hot on large files, such as parser.c,
-        // because we spend ~65% of execution time computing
-        // levenshtein distance.
-        res.push(Syntax::new_atom(
-            arena,
-            vec![SingleLineSpan {
-                line: i.into(),
-                start_col: 0,
-                end_col: line.len(),
-            }],
-            line,
-            AtomKind::Comment, // TODO: don't dim plain lines like other comments
-        ));
+    for newline_match in NEWLINE_RE.find_iter(s) {
+        res.push(s[offset..newline_match.end()].into());
+        offset = newline_match.end();
+    }
+    // Whatever follows the last newline is a final line with no terminator.
+    if offset < s.len() {
+        res.push(s[offset..].into());
     }
 
     res
 }
 
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use Syntax::*;
+#[derive(Debug)]
+enum TextChangeKind { // How an entry of the line-level diff relates to the other side.
+    Novel, // Lines found on only one side (diff::Result::Left or Right).
+    Unchanged, // Lines found on both sides (diff::Result::Both).
+}
+
+fn merge_novel(
+    lines: &[(TextChangeKind, Vec<String>, Vec<String>)],
+) -> Vec<(TextChangeKind, Vec<String>, Vec<String>)> {
+    // Coalesce consecutive Novel entries into one; Unchanged entries are copied through as-is.
+    let mut lhs_novel: Vec<String> = vec![];
+    let mut rhs_novel: Vec<String> = vec![];
+    let mut res: Vec<(TextChangeKind, Vec<String>, Vec<String>)> = vec![];
+    for (kind, lhs_lines, rhs_lines) in lines {
+        match kind {
+            TextChangeKind::Novel => {
+                lhs_novel.extend(lhs_lines.iter().cloned());
+                rhs_novel.extend(rhs_lines.iter().cloned());
+            }
+            TextChangeKind::Unchanged => {
+                if !lhs_novel.is_empty() || !rhs_novel.is_empty() { // Flush the pending run first.
+                    res.push((TextChangeKind::Novel, lhs_novel, rhs_novel));
+                    lhs_novel = vec![];
+                    rhs_novel = vec![];
+                }
 
-    fn assert_syntaxes<'a>(actual: &[&'a Syntax<'a>], expected: &[&'a Syntax<'a>]) {
-        if !syntaxes_match(actual, expected) {
-            dbg!(expected, actual);
-            assert!(false);
+                res.push((
+                    TextChangeKind::Unchanged,
+                    lhs_lines.clone(),
+                    rhs_lines.clone(),
+                ));
+            }
         }
     }
 
-    fn syntaxes_match<'a>(actual: &[&'a Syntax<'a>], expected: &[&'a Syntax<'a>]) -> bool {
-        if actual.len() != expected.len() {
-            return false;
-        } else {
-            for (lhs_child, rhs_child) in actual.iter().zip(expected.iter()) {
-                if !syntax_matches(lhs_child, rhs_child) {
-                    return false;
-                }
+    if !lhs_novel.is_empty() || !rhs_novel.is_empty() { // Flush a novel run that ends the input.
+        res.push((TextChangeKind::Novel, lhs_novel, rhs_novel));
+    }
+    res
+}
+
+fn changed_parts(src: &str, opposite_src: &str) -> Vec<(TextChangeKind, Vec<String>, Vec<String>)> {
+    // Line-diff `src` against `opposite_src`; each entry is (kind, src lines, opposite lines).
+    let src_lines = split_lines_keep_newline(src);
+    let opposite_src_lines = split_lines_keep_newline(opposite_src);
+    let mut res: Vec<(TextChangeKind, Vec<String>, Vec<String>)> = vec![];
+    for diff_res in diff::slice(&src_lines, &opposite_src_lines) {
+        match diff_res {
+            diff::Result::Left(line) => {
+                res.push((TextChangeKind::Novel, vec![line.into()], vec![]));
+            }
+            diff::Result::Both(line, opposite_line) => {
+                res.push((
+                    TextChangeKind::Unchanged,
+                    vec![line.into()],
+                    vec![opposite_line.into()],
+                ));
+            }
+            diff::Result::Right(opposite_line) => {
+                res.push((TextChangeKind::Novel, vec![], vec![opposite_line.into()]));
             }
         }
-        true
     }
 
-    /// Compare all the fields in a Syntax value, not just
-    /// those used in its Eq implementation.
-    fn syntax_matches<'a>(actual: &'a Syntax<'a>, expected: &'a Syntax<'a>) -> bool {
-        match (actual, expected) {
-            (
-                List {
-                    open_position: lhs_open_position,
-                    open_content: lhs_start_content,
-                    children: lhs_children,
-                    close_content: lhs_end_content,
-                    close_position: lhs_close_position,
-                    num_descendants: lhs_num_descendants,
-                    ..
-                },
-                List {
-                    open_position: rhs_open_position,
-                    open_content: rhs_start_content,
-                    children: rhs_children,
-                    close_content: rhs_end_content,
-                    close_position: rhs_close_position,
-                    num_descendants: rhs_num_descendants,
-                    ..
-                },
-            ) => {
-                if actual.change() != expected.change() {
-                    dbg!(actual.change(), expected.change());
-                    return false;
-                }
-                if lhs_open_position != rhs_open_position {
-                    dbg!(lhs_open_position, rhs_open_position);
-                    return false;
-                }
+    merge_novel(&res) // Fold each run of adjacent Novel entries into a single entry.
+}
 
-                if lhs_start_content != rhs_start_content {
-                    dbg!(lhs_start_content, rhs_start_content);
-                    return false;
-                }
-                if lhs_end_content != rhs_end_content {
-                    dbg!(lhs_end_content, rhs_end_content);
-                    return false;
-                }
-                if lhs_close_position != rhs_close_position {
-                    dbg!(lhs_close_position, rhs_close_position);
-                    return false;
-                }
+// TODO: Prefer src/opposite_src nomenclature as this function is called from both sides.
+/// Diff `lhs_src` against `rhs_src` line by line, then word by word inside changed runs, and
+/// return the positions on the `lhs_src` side (words only in `rhs_src` are skipped, not emitted).
+pub fn change_positions(lhs_src: &str, rhs_src: &str) -> Vec<MatchedPos> {
+    let lhs_nlp = NewlinePositions::from(lhs_src);
+    let rhs_nlp = NewlinePositions::from(rhs_src);
 
-                if lhs_num_descendants != rhs_num_descendants {
-                    dbg!(lhs_num_descendants, rhs_num_descendants);
-                    return false;
-                }
+    let (mut lhs_offset, mut rhs_offset) = (0, 0);
 
-                if !syntaxes_match(lhs_children, rhs_children) {
-                    return false;
+    let mut res = vec![];
+    // Length without the trailing "\n": the last line of a file may not end with a newline.
+    let text_len = |line: &str| line.strip_suffix('\n').unwrap_or(line).len();
+    for (kind, lhs_lines, rhs_lines) in changed_parts(lhs_src, rhs_src) {
+        match kind {
+            TextChangeKind::Unchanged => {
+                for (lhs_line, rhs_line) in lhs_lines.iter().zip(&rhs_lines) {
+                    let lhs_pos = lhs_nlp.from_offsets(lhs_offset, lhs_offset + text_len(lhs_line));
+                    let rhs_pos = rhs_nlp.from_offsets(rhs_offset, rhs_offset + text_len(rhs_line));
+                    res.push(MatchedPos {
+                        kind: MatchKind::Unchanged {
+                            highlight: TokenKind::Atom(AtomKind::Normal),
+                            self_pos: lhs_pos.clone(),
+                            opposite_pos: rhs_pos,
+                        },
+                        pos: lhs_pos[0],
+                    });
+                    lhs_offset += lhs_line.len();
+                    rhs_offset += rhs_line.len();
                 }
             }
-            (
-                Atom {
-                    position: lhs_position,
-                    content: lhs_content,
-                    kind: lhs_highlight,
-                    ..
-                },
-                Atom {
-                    position: rhs_position,
-                    content: rhs_content,
-                    kind: rhs_highlight,
-                    ..
-                },
-            ) => {
-                if actual.change() != expected.change() {
-                    dbg!(actual.change(), expected.change());
-                    return false;
-                }
-                if lhs_position != rhs_position {
-                    dbg!(lhs_position, rhs_position);
-                    return false;
-                }
+            TextChangeKind::Novel => {
+                let lhs_part = lhs_lines.join("");
+                let rhs_part = rhs_lines.join("");
 
-                if lhs_content != rhs_content {
-                    dbg!(lhs_content, rhs_content);
-                    return false;
-                }
-                if lhs_highlight != rhs_highlight {
-                    dbg!(lhs_highlight, rhs_highlight);
-                    return false;
+                for diff_res in diff::slice(&split_words(&lhs_part), &split_words(&rhs_part)) {
+                    match diff_res {
+                        diff::Result::Left(lhs_word) => {
+                            let lhs_pos =
+                                lhs_nlp.from_offsets(lhs_offset, lhs_offset + lhs_word.len());
+                            res.push(MatchedPos {
+                                // TODO: rename this kind to reflect that it's used for both
+                                // code comments and plain text.
+                                kind: MatchKind::ChangedCommentPart {},
+                                pos: lhs_pos[0],
+                            });
+
+                            lhs_offset += lhs_word.len();
+                        }
+                        diff::Result::Both(lhs_word, rhs_word) => {
+                            let lhs_pos =
+                                lhs_nlp.from_offsets(lhs_offset, lhs_offset + lhs_word.len());
+                            let rhs_pos =
+                                rhs_nlp.from_offsets(rhs_offset, rhs_offset + rhs_word.len());
+
+                            res.push(MatchedPos {
+                                kind: MatchKind::UnchangedCommentPart {
+                                    self_pos: lhs_pos[0],
+                                    opposite_pos: rhs_pos,
+                                },
+                                pos: lhs_pos[0],
+                            });
+
+                            lhs_offset += lhs_word.len();
+                            rhs_offset += rhs_word.len();
+                        }
+                        diff::Result::Right(rhs_word) => {
+                            rhs_offset += rhs_word.len();
+                        }
+                    }
                 }
             }
-            _ => {
-                return false;
-            }
         }
-        true
     }
 
+    res
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use pretty_assertions::assert_eq;
+
     #[test]
-    fn test_parse_lines() {
-        let arena = Arena::new();
-
-        assert_syntaxes(
-            &parse(&arena, "foo\nbar"),
-            &[
-                Syntax::new_atom(
-                    &arena,
-                    vec![SingleLineSpan {
-                        line: 0.into(),
-                        start_col: 0,
-                        end_col: 3,
-                    }],
-                    "foo",
-                    AtomKind::Comment,
-                ),
-                Syntax::new_atom(
-                    &arena,
-                    vec![SingleLineSpan {
-                        line: 1.into(),
-                        start_col: 0,
-                        end_col: 3,
-                    }],
-                    "bar",
-                    AtomKind::Comment,
-                ),
-            ],
-        );
+    fn test_split_newlines() {
+        let s = "foo\nbar\nbaz"; // The last line has no trailing newline.
+        let res = split_lines_keep_newline(s);
+        assert_eq!(res, vec!["foo\n", "bar\n", "baz"]) // Each "\n" stays attached to its line.
     }
 }
diff --git a/src/main.rs b/src/main.rs
index d737df587..481b9dda2 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -45,6 +45,7 @@ use walkdir::WalkDir;
 use crate::{
     dijkstra::mark_syntax,
     files::{is_probably_binary, read_or_die},
+    line_parser as lp,
     lines::MaxLine,
     syntax::{change_positions, init_info},
     tree_sitter_parser as tsp,
@@ -277,25 +278,25 @@ fn diff_file(display_path: &str, lhs_path: &Path, rhs_path: &Path) -> DiffResult
     };
     let ts_lang = guess(path, guess_src).map(tsp::from_language);
 
-    let arena = Arena::new();
-    let (lang_name, lhs, rhs) = match ts_lang {
-        Some(ts_lang) => (
-            Some(ts_lang.name.into()),
-            tsp::parse(&arena, &lhs_src, &ts_lang),
-            tsp::parse(&arena, &rhs_src, &ts_lang),
-        ),
-        None => (
-            None,
-            line_parser::parse(&arena, &lhs_src),
-            line_parser::parse(&arena, &rhs_src),
-        ),
-    };
+    let (lang_name, lhs_positions, rhs_positions) = match ts_lang {
+        Some(ts_lang) => {
+            let arena = Arena::new();
+            let lhs = tsp::parse(&arena, &lhs_src, &ts_lang);
+            let rhs = tsp::parse(&arena, &rhs_src, &ts_lang);
 
-    init_info(&lhs, &rhs);
-    mark_syntax(lhs.get(0).copied(), rhs.get(0).copied());
+            init_info(&lhs, &rhs);
+            mark_syntax(lhs.get(0).copied(), rhs.get(0).copied());
 
-    let lhs_positions = change_positions(&lhs_src, &rhs_src, &lhs);
-    let rhs_positions = change_positions(&rhs_src, &lhs_src, &rhs);
+            let lhs_positions = change_positions(&lhs_src, &rhs_src, &lhs);
+            let rhs_positions = change_positions(&rhs_src, &lhs_src, &rhs);
+            (Some(ts_lang.name.into()), lhs_positions, rhs_positions)
+        }
+        None => {
+            let lhs_positions = lp::change_positions(&lhs_src, &rhs_src);
+            let rhs_positions = lp::change_positions(&rhs_src, &lhs_src);
+            (None, lhs_positions, rhs_positions)
+        }
+    };
 
     DiffResult {
         path: display_path.into(),
diff --git a/src/syntax.rs b/src/syntax.rs
index 21aaae76e..98a0f3ab2 100644
--- a/src/syntax.rs
+++ b/src/syntax.rs
@@ -527,7 +527,7 @@ pub struct MatchedPos {
 }
 
 // "foo bar" -> vec!["foo", " ", "bar"]
-fn split_words(s: &str) -> Vec<String> {
+pub fn split_words(s: &str) -> Vec<String> {
     lazy_static! {
         static ref RE: Regex = Regex::new(r"[a-zA-Z0-9]+|\n|[^a-zA-Z0-9\n]").unwrap();
     }