mirror of https://github.com/Wilfred/difftastic/
Adding a line-based textual differ that ignores trees
parent
b1793703fb
commit
027856d707
@ -1,187 +1,185 @@
|
||||
//! A fallback "parser" for plain text.
|
||||
|
||||
use typed_arena::Arena;
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
|
||||
use crate::{
|
||||
positions::SingleLineSpan,
|
||||
syntax::{AtomKind, Syntax},
|
||||
lines::NewlinePositions,
|
||||
syntax::{split_words, AtomKind, MatchKind, MatchedPos, TokenKind},
|
||||
};
|
||||
|
||||
/// Split `s` by lines, and treat each line as an atom.
|
||||
///
|
||||
/// This is a fallback for files that we don't know how to parse.
|
||||
pub fn parse<'a>(arena: &'a Arena<Syntax<'a>>, s: &str) -> Vec<&'a Syntax<'a>> {
|
||||
fn split_lines_keep_newline(s: &str) -> Vec<String> {
|
||||
lazy_static! {
|
||||
static ref NEWLINE_RE: Regex = Regex::new("\n").unwrap();
|
||||
}
|
||||
|
||||
let mut offset = 0;
|
||||
let mut res = vec![];
|
||||
// TODO: This scales poorly to large files (e.g. parser.c
|
||||
// changes). Consider grouping Syntax items into lists when we
|
||||
// encounter blank lines.
|
||||
for (i, line) in s.lines().enumerate() {
|
||||
// Mark each line as a comment atom, so we get word-level diffs.
|
||||
// TODO: this is very hot on large files, such as parser.c,
|
||||
// because we spend ~65% of execution time computing
|
||||
// levenshtein distance.
|
||||
res.push(Syntax::new_atom(
|
||||
arena,
|
||||
vec![SingleLineSpan {
|
||||
line: i.into(),
|
||||
start_col: 0,
|
||||
end_col: line.len(),
|
||||
}],
|
||||
line,
|
||||
AtomKind::Comment, // TODO: don't dim plain lines like other comments
|
||||
));
|
||||
for newline_match in NEWLINE_RE.find_iter(s) {
|
||||
res.push(s[offset..newline_match.end()].into());
|
||||
offset = newline_match.end();
|
||||
}
|
||||
|
||||
if offset < s.len() {
|
||||
res.push(s[offset..].into());
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use Syntax::*;
|
||||
#[derive(Debug)]
|
||||
enum TextChangeKind {
|
||||
Novel,
|
||||
Unchanged,
|
||||
}
|
||||
|
||||
fn merge_novel(
|
||||
lines: &[(TextChangeKind, Vec<String>, Vec<String>)],
|
||||
) -> Vec<(TextChangeKind, Vec<String>, Vec<String>)> {
|
||||
let mut lhs_novel: Vec<String> = vec![];
|
||||
let mut rhs_novel: Vec<String> = vec![];
|
||||
|
||||
let mut res: Vec<(TextChangeKind, Vec<String>, Vec<String>)> = vec![];
|
||||
for (kind, lhs_lines, rhs_lines) in lines {
|
||||
match kind {
|
||||
TextChangeKind::Novel => {
|
||||
lhs_novel.extend(lhs_lines.iter().cloned());
|
||||
rhs_novel.extend(rhs_lines.iter().cloned());
|
||||
}
|
||||
TextChangeKind::Unchanged => {
|
||||
if !lhs_novel.is_empty() || !rhs_novel.is_empty() {
|
||||
res.push((TextChangeKind::Novel, lhs_novel, rhs_novel));
|
||||
lhs_novel = vec![];
|
||||
rhs_novel = vec![];
|
||||
}
|
||||
|
||||
fn assert_syntaxes<'a>(actual: &[&'a Syntax<'a>], expected: &[&'a Syntax<'a>]) {
|
||||
if !syntaxes_match(actual, expected) {
|
||||
dbg!(expected, actual);
|
||||
assert!(false);
|
||||
res.push((
|
||||
TextChangeKind::Unchanged,
|
||||
lhs_lines.clone(),
|
||||
rhs_lines.clone(),
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn syntaxes_match<'a>(actual: &[&'a Syntax<'a>], expected: &[&'a Syntax<'a>]) -> bool {
|
||||
if actual.len() != expected.len() {
|
||||
return false;
|
||||
} else {
|
||||
for (lhs_child, rhs_child) in actual.iter().zip(expected.iter()) {
|
||||
if !syntax_matches(lhs_child, rhs_child) {
|
||||
return false;
|
||||
}
|
||||
if !lhs_novel.is_empty() || !rhs_novel.is_empty() {
|
||||
res.push((TextChangeKind::Novel, lhs_novel, rhs_novel));
|
||||
}
|
||||
res
|
||||
}
|
||||
|
||||
fn changed_parts(src: &str, opposite_src: &str) -> Vec<(TextChangeKind, Vec<String>, Vec<String>)> {
|
||||
let src_lines = split_lines_keep_newline(src);
|
||||
let opposite_src_lines = split_lines_keep_newline(opposite_src);
|
||||
|
||||
let mut res: Vec<(TextChangeKind, Vec<String>, Vec<String>)> = vec![];
|
||||
for diff_res in diff::slice(&src_lines, &opposite_src_lines) {
|
||||
match diff_res {
|
||||
diff::Result::Left(line) => {
|
||||
res.push((TextChangeKind::Novel, vec![line.into()], vec![]));
|
||||
}
|
||||
diff::Result::Both(line, opposite_line) => {
|
||||
res.push((
|
||||
TextChangeKind::Unchanged,
|
||||
vec![line.into()],
|
||||
vec![opposite_line.into()],
|
||||
));
|
||||
}
|
||||
diff::Result::Right(opposite_line) => {
|
||||
res.push((TextChangeKind::Novel, vec![], vec![opposite_line.into()]));
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
/// Compare all the fields in a Syntax value, not just
|
||||
/// those used in its Eq implementation.
|
||||
fn syntax_matches<'a>(actual: &'a Syntax<'a>, expected: &'a Syntax<'a>) -> bool {
|
||||
match (actual, expected) {
|
||||
(
|
||||
List {
|
||||
open_position: lhs_open_position,
|
||||
open_content: lhs_start_content,
|
||||
children: lhs_children,
|
||||
close_content: lhs_end_content,
|
||||
close_position: lhs_close_position,
|
||||
num_descendants: lhs_num_descendants,
|
||||
..
|
||||
},
|
||||
List {
|
||||
open_position: rhs_open_position,
|
||||
open_content: rhs_start_content,
|
||||
children: rhs_children,
|
||||
close_content: rhs_end_content,
|
||||
close_position: rhs_close_position,
|
||||
num_descendants: rhs_num_descendants,
|
||||
..
|
||||
},
|
||||
) => {
|
||||
if actual.change() != expected.change() {
|
||||
dbg!(actual.change(), expected.change());
|
||||
return false;
|
||||
}
|
||||
if lhs_open_position != rhs_open_position {
|
||||
dbg!(lhs_open_position, rhs_open_position);
|
||||
return false;
|
||||
}
|
||||
merge_novel(&res)
|
||||
}
|
||||
|
||||
if lhs_start_content != rhs_start_content {
|
||||
dbg!(lhs_start_content, rhs_start_content);
|
||||
return false;
|
||||
}
|
||||
if lhs_end_content != rhs_end_content {
|
||||
dbg!(lhs_end_content, rhs_end_content);
|
||||
return false;
|
||||
}
|
||||
if lhs_close_position != rhs_close_position {
|
||||
dbg!(lhs_close_position, rhs_close_position);
|
||||
return false;
|
||||
}
|
||||
// TODO: Prefer src/opposite_src nomenclature as this function is called from both sides.
|
||||
pub fn change_positions(lhs_src: &str, rhs_src: &str) -> Vec<MatchedPos> {
|
||||
let lhs_nlp = NewlinePositions::from(lhs_src);
|
||||
let rhs_nlp = NewlinePositions::from(rhs_src);
|
||||
|
||||
if lhs_num_descendants != rhs_num_descendants {
|
||||
dbg!(lhs_num_descendants, rhs_num_descendants);
|
||||
return false;
|
||||
}
|
||||
let mut lhs_offset = 0;
|
||||
let mut rhs_offset = 0;
|
||||
|
||||
if !syntaxes_match(lhs_children, rhs_children) {
|
||||
return false;
|
||||
let mut res = vec![];
|
||||
for (kind, lhs_lines, rhs_lines) in changed_parts(lhs_src, rhs_src) {
|
||||
match kind {
|
||||
TextChangeKind::Unchanged => {
|
||||
for (lhs_line, rhs_line) in lhs_lines.iter().zip(rhs_lines) {
|
||||
let lhs_pos = lhs_nlp.from_offsets(lhs_offset, lhs_offset + lhs_line.len() - 1);
|
||||
let rhs_pos = rhs_nlp.from_offsets(rhs_offset, rhs_offset + rhs_line.len() - 1);
|
||||
|
||||
res.push(MatchedPos {
|
||||
kind: MatchKind::Unchanged {
|
||||
highlight: TokenKind::Atom(AtomKind::Normal),
|
||||
self_pos: lhs_pos.clone(),
|
||||
opposite_pos: rhs_pos,
|
||||
},
|
||||
pos: lhs_pos[0],
|
||||
});
|
||||
|
||||
lhs_offset += lhs_line.len();
|
||||
rhs_offset += rhs_line.len();
|
||||
}
|
||||
}
|
||||
(
|
||||
Atom {
|
||||
position: lhs_position,
|
||||
content: lhs_content,
|
||||
kind: lhs_highlight,
|
||||
..
|
||||
},
|
||||
Atom {
|
||||
position: rhs_position,
|
||||
content: rhs_content,
|
||||
kind: rhs_highlight,
|
||||
..
|
||||
},
|
||||
) => {
|
||||
if actual.change() != expected.change() {
|
||||
dbg!(actual.change(), expected.change());
|
||||
return false;
|
||||
}
|
||||
if lhs_position != rhs_position {
|
||||
dbg!(lhs_position, rhs_position);
|
||||
return false;
|
||||
}
|
||||
TextChangeKind::Novel => {
|
||||
let lhs_part = lhs_lines.join("");
|
||||
let rhs_part = rhs_lines.join("");
|
||||
|
||||
if lhs_content != rhs_content {
|
||||
dbg!(lhs_content, rhs_content);
|
||||
return false;
|
||||
}
|
||||
if lhs_highlight != rhs_highlight {
|
||||
dbg!(lhs_highlight, rhs_highlight);
|
||||
return false;
|
||||
for diff_res in diff::slice(&split_words(&lhs_part), &split_words(&rhs_part)) {
|
||||
match diff_res {
|
||||
diff::Result::Left(lhs_word) => {
|
||||
let lhs_pos =
|
||||
lhs_nlp.from_offsets(lhs_offset, lhs_offset + lhs_word.len());
|
||||
res.push(MatchedPos {
|
||||
// TODO: rename this kind to reflect
|
||||
// that it's used for both code
|
||||
// comments and plain text.
|
||||
kind: MatchKind::ChangedCommentPart {},
|
||||
pos: lhs_pos[0],
|
||||
});
|
||||
|
||||
lhs_offset += lhs_word.len();
|
||||
}
|
||||
diff::Result::Both(lhs_word, rhs_word) => {
|
||||
let lhs_pos =
|
||||
lhs_nlp.from_offsets(lhs_offset, lhs_offset + lhs_word.len());
|
||||
let rhs_pos =
|
||||
rhs_nlp.from_offsets(rhs_offset, rhs_offset + rhs_word.len());
|
||||
|
||||
res.push(MatchedPos {
|
||||
kind: MatchKind::UnchangedCommentPart {
|
||||
self_pos: lhs_pos[0],
|
||||
opposite_pos: rhs_pos,
|
||||
},
|
||||
pos: lhs_pos[0],
|
||||
});
|
||||
|
||||
lhs_offset += lhs_word.len();
|
||||
rhs_offset += rhs_word.len();
|
||||
}
|
||||
diff::Result::Right(rhs_word) => {
|
||||
rhs_offset += rhs_word.len();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
#[test]
|
||||
fn test_parse_lines() {
|
||||
let arena = Arena::new();
|
||||
|
||||
assert_syntaxes(
|
||||
&parse(&arena, "foo\nbar"),
|
||||
&[
|
||||
Syntax::new_atom(
|
||||
&arena,
|
||||
vec![SingleLineSpan {
|
||||
line: 0.into(),
|
||||
start_col: 0,
|
||||
end_col: 3,
|
||||
}],
|
||||
"foo",
|
||||
AtomKind::Comment,
|
||||
),
|
||||
Syntax::new_atom(
|
||||
&arena,
|
||||
vec![SingleLineSpan {
|
||||
line: 1.into(),
|
||||
start_col: 0,
|
||||
end_col: 3,
|
||||
}],
|
||||
"bar",
|
||||
AtomKind::Comment,
|
||||
),
|
||||
],
|
||||
);
|
||||
fn test_split_newlines() {
|
||||
let s = "foo\nbar\nbaz";
|
||||
let res = split_lines_keep_newline(s);
|
||||
assert_eq!(res, vec!["foo\n", "bar\n", "baz"])
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue