Adding a line-based textual differ that ignores trees

a_star_module
Wilfred Hughes 2022-01-01 20:05:09 +07:00
parent b1793703fb
commit 027856d707
3 changed files with 170 additions and 171 deletions

@ -1,187 +1,185 @@
//! A fallback "parser" for plain text.
use typed_arena::Arena;
use lazy_static::lazy_static;
use regex::Regex;
use crate::{
positions::SingleLineSpan,
syntax::{AtomKind, Syntax},
lines::NewlinePositions,
syntax::{split_words, AtomKind, MatchKind, MatchedPos, TokenKind},
};
/// Split `s` by lines, and treat each line as an atom.
///
/// This is a fallback for files that we don't know how to parse.
pub fn parse<'a>(arena: &'a Arena<Syntax<'a>>, s: &str) -> Vec<&'a Syntax<'a>> {
/// Split `s` into pieces, cutting just after every `"\n"` match so that each
/// piece keeps its trailing newline.
///
/// Text remaining after the last newline is pushed as a final piece. When `s`
/// is empty no piece is pushed at all.
fn split_lines_keep_newline(s: &str) -> Vec<String> {
lazy_static! {
static ref NEWLINE_RE: Regex = Regex::new("\n").unwrap();
}
// Byte offset into `s` at which the next piece starts.
let mut offset = 0;
let mut res = vec![];
// NOTE(review): the `for (i, line)` loop below pushes `Syntax` atoms and reads
// `arena`, which this function never binds. It looks like the body of the
// removed `parse` function interleaved here by the diff view -- confirm
// against the repository before relying on it.
// TODO: This scales poorly to large files (e.g. parser.c
// changes). Consider grouping Syntax items into lists when we
// encounter blank lines.
for (i, line) in s.lines().enumerate() {
// Mark each line as a comment atom, so we get word-level diffs.
// TODO: this is very hot on large files, such as parser.c,
// because we spend ~65% of execution time computing
// levenshtein distance.
res.push(Syntax::new_atom(
arena,
vec![SingleLineSpan {
line: i.into(),
start_col: 0,
end_col: line.len(),
}],
line,
AtomKind::Comment, // TODO: don't dim plain lines like other comments
));
// Cut at the end of each newline match, so the "\n" stays attached to the
// piece that precedes it.
for newline_match in NEWLINE_RE.find_iter(s) {
res.push(s[offset..newline_match.end()].into());
offset = newline_match.end();
}
// Keep any trailing text that is not terminated by a newline.
if offset < s.len() {
res.push(s[offset..].into());
}
res
}
#[cfg(test)]
mod tests {
use super::*;
use Syntax::*;
/// Classifies one entry of a line-based text diff.
///
/// The enum carries no data, so it derives `Clone`/`Copy` (values can be
/// reused instead of rebuilt) and `PartialEq`/`Eq` (values can be compared
/// directly, e.g. in test assertions).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TextChangeKind {
    /// The entry's lines were reported for only one of the two inputs.
    Novel,
    /// The entry's lines were reported as present in both inputs.
    Unchanged,
}
/// Coalesce each run of consecutive `Novel` entries in `lines` into a single
/// `Novel` entry.
///
/// The merged entry's left and right vectors are the concatenation of the
/// run's left and right vectors, in order. `Unchanged` entries are copied
/// through unchanged; a pending merged `Novel` entry is emitted just before
/// the next `Unchanged` entry, or at the end of the input, and only when at
/// least one of its two vectors is non-empty.
fn merge_novel(
lines: &[(TextChangeKind, Vec<String>, Vec<String>)],
) -> Vec<(TextChangeKind, Vec<String>, Vec<String>)> {
// Lines accumulated from the current run of `Novel` entries.
let mut lhs_novel: Vec<String> = vec![];
let mut rhs_novel: Vec<String> = vec![];
let mut res: Vec<(TextChangeKind, Vec<String>, Vec<String>)> = vec![];
for (kind, lhs_lines, rhs_lines) in lines {
match kind {
TextChangeKind::Novel => {
lhs_novel.extend(lhs_lines.iter().cloned());
rhs_novel.extend(rhs_lines.iter().cloned());
}
TextChangeKind::Unchanged => {
// Flush the pending novel run before recording the unchanged entry.
if !lhs_novel.is_empty() || !rhs_novel.is_empty() {
res.push((TextChangeKind::Novel, lhs_novel, rhs_novel));
lhs_novel = vec![];
rhs_novel = vec![];
}
// NOTE(review): `assert_syntaxes` below operates on `Syntax` slices and never
// touches `res`; it looks like a removed test helper interleaved here by the
// diff view -- confirm against the repository.
fn assert_syntaxes<'a>(actual: &[&'a Syntax<'a>], expected: &[&'a Syntax<'a>]) {
if !syntaxes_match(actual, expected) {
dbg!(expected, actual);
assert!(false);
res.push((
TextChangeKind::Unchanged,
lhs_lines.clone(),
rhs_lines.clone(),
));
}
}
}
// NOTE(review): `syntaxes_match` below likewise compares `Syntax` slices and
// appears to be removed-side diff residue -- confirm.
fn syntaxes_match<'a>(actual: &[&'a Syntax<'a>], expected: &[&'a Syntax<'a>]) -> bool {
if actual.len() != expected.len() {
return false;
} else {
for (lhs_child, rhs_child) in actual.iter().zip(expected.iter()) {
if !syntax_matches(lhs_child, rhs_child) {
return false;
}
// Flush a novel run that reaches the end of the input.
if !lhs_novel.is_empty() || !rhs_novel.is_empty() {
res.push((TextChangeKind::Novel, lhs_novel, rhs_novel));
}
res
}
/// Diff `src` against `opposite_src` one line at a time and describe the
/// result as `(kind, src lines, opposite_src lines)` entries.
///
/// Both inputs are split with `split_lines_keep_newline`, so each line keeps
/// its trailing newline. Every `diff::slice` result becomes one entry, and
/// the entries are then passed through `merge_novel` before being returned.
fn changed_parts(src: &str, opposite_src: &str) -> Vec<(TextChangeKind, Vec<String>, Vec<String>)> {
let src_lines = split_lines_keep_newline(src);
let opposite_src_lines = split_lines_keep_newline(opposite_src);
let mut res: Vec<(TextChangeKind, Vec<String>, Vec<String>)> = vec![];
for diff_res in diff::slice(&src_lines, &opposite_src_lines) {
match diff_res {
// Reported for the first slice only: novel, stored in the `src` list.
diff::Result::Left(line) => {
res.push((TextChangeKind::Novel, vec![line.into()], vec![]));
}
// Reported for both slices: unchanged, one line stored per side.
diff::Result::Both(line, opposite_line) => {
res.push((
TextChangeKind::Unchanged,
vec![line.into()],
vec![opposite_line.into()],
));
}
// Reported for the second slice only: novel, stored in the `opposite_src` list.
diff::Result::Right(opposite_line) => {
res.push((TextChangeKind::Novel, vec![], vec![opposite_line.into()]));
}
}
// NOTE(review): from the `true` below through the `lhs_open_position`
// comparison, the lines destructure `Syntax::List` values and return `bool`,
// which does not match this function's return type. They look like the removed
// `syntaxes_match` / `syntax_matches` test helpers interleaved here by the
// diff view -- confirm against the repository.
true
}
/// Compare all the fields in a Syntax value, not just
/// those used in its Eq implementation.
fn syntax_matches<'a>(actual: &'a Syntax<'a>, expected: &'a Syntax<'a>) -> bool {
match (actual, expected) {
(
List {
open_position: lhs_open_position,
open_content: lhs_start_content,
children: lhs_children,
close_content: lhs_end_content,
close_position: lhs_close_position,
num_descendants: lhs_num_descendants,
..
},
List {
open_position: rhs_open_position,
open_content: rhs_start_content,
children: rhs_children,
close_content: rhs_end_content,
close_position: rhs_close_position,
num_descendants: rhs_num_descendants,
..
},
) => {
if actual.change() != expected.change() {
dbg!(actual.change(), expected.change());
return false;
}
if lhs_open_position != rhs_open_position {
dbg!(lhs_open_position, rhs_open_position);
return false;
}
// Collapse adjacent novel entries into single entries before returning.
merge_novel(&res)
}
if lhs_start_content != rhs_start_content {
dbg!(lhs_start_content, rhs_start_content);
return false;
}
if lhs_end_content != rhs_end_content {
dbg!(lhs_end_content, rhs_end_content);
return false;
}
if lhs_close_position != rhs_close_position {
dbg!(lhs_close_position, rhs_close_position);
return false;
}
/// Compute the `MatchedPos` entries for `lhs_src` when compared against
/// `rhs_src`, without parsing either input.
///
/// The inputs are first diffed by line via `changed_parts`. Each unchanged
/// line pair yields one `MatchKind::Unchanged` entry. Each novel region is
/// joined into a single string per side, split with `split_words`, and diffed
/// again by word: words reported for the left side only yield
/// `MatchKind::ChangedCommentPart`, words reported for both sides yield
/// `MatchKind::UnchangedCommentPart`, and words reported for the right side
/// only produce no entry. `lhs_offset` and `rhs_offset` track the running
/// byte offset into each input and are converted to positions with
/// `NewlinePositions::from_offsets`.
// TODO: Prefer src/opposite_src nomenclature as this function is called from both sides.
pub fn change_positions(lhs_src: &str, rhs_src: &str) -> Vec<MatchedPos> {
let lhs_nlp = NewlinePositions::from(lhs_src);
let rhs_nlp = NewlinePositions::from(rhs_src);
// NOTE(review): every `if ... { dbg!(...); return false; }` fragment and the
// `Atom { .. }` tuple pattern inside this body returns `bool` and reads names
// (`actual`, `expected`, `lhs_children`, ...) that this function never binds.
// They look like the tail of the removed `syntax_matches` test helper
// interleaved here by the diff view -- confirm against the repository.
if lhs_num_descendants != rhs_num_descendants {
dbg!(lhs_num_descendants, rhs_num_descendants);
return false;
}
// Running byte offsets into `lhs_src` and `rhs_src`.
let mut lhs_offset = 0;
let mut rhs_offset = 0;
if !syntaxes_match(lhs_children, rhs_children) {
return false;
let mut res = vec![];
for (kind, lhs_lines, rhs_lines) in changed_parts(lhs_src, rhs_src) {
match kind {
TextChangeKind::Unchanged => {
for (lhs_line, rhs_line) in lhs_lines.iter().zip(rhs_lines) {
// NOTE(review): the end offset here is `len() - 1`, whereas the word-level
// branch below uses `len()`. Presumably the `- 1` is meant to exclude the
// trailing "\n", but a final line without a newline would lose its last
// byte -- confirm the intended `from_offsets` end convention.
let lhs_pos = lhs_nlp.from_offsets(lhs_offset, lhs_offset + lhs_line.len() - 1);
let rhs_pos = rhs_nlp.from_offsets(rhs_offset, rhs_offset + rhs_line.len() - 1);
res.push(MatchedPos {
kind: MatchKind::Unchanged {
highlight: TokenKind::Atom(AtomKind::Normal),
self_pos: lhs_pos.clone(),
opposite_pos: rhs_pos,
},
pos: lhs_pos[0],
});
lhs_offset += lhs_line.len();
rhs_offset += rhs_line.len();
}
}
(
Atom {
position: lhs_position,
content: lhs_content,
kind: lhs_highlight,
..
},
Atom {
position: rhs_position,
content: rhs_content,
kind: rhs_highlight,
..
},
) => {
if actual.change() != expected.change() {
dbg!(actual.change(), expected.change());
return false;
}
if lhs_position != rhs_position {
dbg!(lhs_position, rhs_position);
return false;
}
TextChangeKind::Novel => {
// Re-join the novel lines on each side so they can be diffed word by word.
let lhs_part = lhs_lines.join("");
let rhs_part = rhs_lines.join("");
if lhs_content != rhs_content {
dbg!(lhs_content, rhs_content);
return false;
}
if lhs_highlight != rhs_highlight {
dbg!(lhs_highlight, rhs_highlight);
return false;
for diff_res in diff::slice(&split_words(&lhs_part), &split_words(&rhs_part)) {
match diff_res {
diff::Result::Left(lhs_word) => {
let lhs_pos =
lhs_nlp.from_offsets(lhs_offset, lhs_offset + lhs_word.len());
res.push(MatchedPos {
// TODO: rename this kind to reflect
// that it's used for both code
// comments and plain text.
kind: MatchKind::ChangedCommentPart {},
pos: lhs_pos[0],
});
lhs_offset += lhs_word.len();
}
diff::Result::Both(lhs_word, rhs_word) => {
let lhs_pos =
lhs_nlp.from_offsets(lhs_offset, lhs_offset + lhs_word.len());
let rhs_pos =
rhs_nlp.from_offsets(rhs_offset, rhs_offset + rhs_word.len());
res.push(MatchedPos {
kind: MatchKind::UnchangedCommentPart {
self_pos: lhs_pos[0],
opposite_pos: rhs_pos,
},
pos: lhs_pos[0],
});
lhs_offset += lhs_word.len();
rhs_offset += rhs_word.len();
}
// Words reported for the right side only add no entry to `res`; only the
// right-hand offset advances past them.
diff::Result::Right(rhs_word) => {
rhs_offset += rhs_word.len();
}
}
}
}
_ => {
return false;
}
}
true
}
res
}
// Unit tests for this module's line-splitting helper.
#[cfg(test)]
mod tests {
use super::*;
use pretty_assertions::assert_eq;
#[test]
// NOTE(review): `test_parse_lines` calls `parse` and `assert_syntaxes` and is
// never closed before `test_split_newlines` begins. It looks like the removed
// test interleaved here by the diff view -- confirm against the repository.
fn test_parse_lines() {
let arena = Arena::new();
assert_syntaxes(
&parse(&arena, "foo\nbar"),
&[
Syntax::new_atom(
&arena,
vec![SingleLineSpan {
line: 0.into(),
start_col: 0,
end_col: 3,
}],
"foo",
AtomKind::Comment,
),
Syntax::new_atom(
&arena,
vec![SingleLineSpan {
line: 1.into(),
start_col: 0,
end_col: 3,
}],
"bar",
AtomKind::Comment,
),
],
);
// Each piece keeps its trailing "\n"; the last piece has none because the
// input does not end with a newline.
fn test_split_newlines() {
let s = "foo\nbar\nbaz";
let res = split_lines_keep_newline(s);
assert_eq!(res, vec!["foo\n", "bar\n", "baz"])
}
}

@ -45,6 +45,7 @@ use walkdir::WalkDir;
use crate::{
dijkstra::mark_syntax,
files::{is_probably_binary, read_or_die},
line_parser as lp,
lines::MaxLine,
syntax::{change_positions, init_info},
tree_sitter_parser as tsp,
@ -277,25 +278,25 @@ fn diff_file(display_path: &str, lhs_path: &Path, rhs_path: &Path) -> DiffResult
};
let ts_lang = guess(path, guess_src).map(tsp::from_language);
let arena = Arena::new();
let (lang_name, lhs, rhs) = match ts_lang {
Some(ts_lang) => (
Some(ts_lang.name.into()),
tsp::parse(&arena, &lhs_src, &ts_lang),
tsp::parse(&arena, &rhs_src, &ts_lang),
),
None => (
None,
line_parser::parse(&arena, &lhs_src),
line_parser::parse(&arena, &rhs_src),
),
};
let (lang_name, lhs_positions, rhs_positions) = match ts_lang {
Some(ts_lang) => {
let arena = Arena::new();
let lhs = tsp::parse(&arena, &lhs_src, &ts_lang);
let rhs = tsp::parse(&arena, &rhs_src, &ts_lang);
init_info(&lhs, &rhs);
mark_syntax(lhs.get(0).copied(), rhs.get(0).copied());
init_info(&lhs, &rhs);
mark_syntax(lhs.get(0).copied(), rhs.get(0).copied());
let lhs_positions = change_positions(&lhs_src, &rhs_src, &lhs);
let rhs_positions = change_positions(&rhs_src, &lhs_src, &rhs);
let lhs_positions = change_positions(&lhs_src, &rhs_src, &lhs);
let rhs_positions = change_positions(&rhs_src, &lhs_src, &rhs);
(Some(ts_lang.name.into()), lhs_positions, rhs_positions)
}
None => {
let lhs_positions = lp::change_positions(&lhs_src, &rhs_src);
let rhs_positions = lp::change_positions(&rhs_src, &lhs_src);
(None, lhs_positions, rhs_positions)
}
};
DiffResult {
path: display_path.into(),

@ -527,7 +527,7 @@ pub struct MatchedPos {
}
// "foo bar" -> vec!["foo", " ", "bar"]
fn split_words(s: &str) -> Vec<String> {
pub fn split_words(s: &str) -> Vec<String> {
lazy_static! {
static ref RE: Regex = Regex::new(r"[a-zA-Z0-9]+|\n|[^a-zA-Z0-9\n]").unwrap();
}