//! A graph representation for computing tree diffs.
use bumpalo::Bump;
use rustc_hash::FxHashMap;
use std::{
cell::{Cell, RefCell},
cmp::min,
fmt,
hash::{Hash, Hasher},
};
use strsim::normalized_levenshtein;
use crate::{
diff::{
changes::{insert_deep_unchanged, ChangeKind, ChangeMap},
stack::Stack,
},
parse::syntax::{AtomKind, Syntax, SyntaxId},
};
use Edge::*;
/// A vertex in a directed acyclic graph that represents a diff.
///
/// Each vertex represents two pointers: one to the next unmatched LHS
/// syntax, and one to the next unmatched RHS syntax.
///
/// For example, suppose we have `X A` on the LHS and `A` on the
/// RHS. Our start vertex looks like this.
///
/// ```text
/// LHS: X A RHS: A
/// ^ ^
/// ```
///
/// From this vertex, we could take [`Edge::NovelAtomLHS`], bringing
/// us to this vertex.
///
/// ```text
/// LHS: X A RHS: A
/// ^ ^
/// ```
///
/// Alternatively, we could take the [`Edge::NovelAtomRHS`], bringing us
/// to this vertex.
///
/// ```text
/// LHS: X A RHS: A
/// ^ ^
/// ```
#[derive(Debug, Clone)]
pub struct Vertex<'a, 'b> {
pub neighbours: RefCell)>>>,
pub predecessor: Cell )>>,
pub lhs_syntax: Option<&'a Syntax<'a>>,
pub rhs_syntax: Option<&'a Syntax<'a>>,
parents: Stack>,
lhs_parent_id: Option,
rhs_parent_id: Option,
can_pop_either: bool,
}
impl<'a, 'b> PartialEq for Vertex<'a, 'b> {
fn eq(&self, other: &Self) -> bool {
self.lhs_syntax.map(|node| node.id()) == other.lhs_syntax.map(|node| node.id())
&& self.rhs_syntax.map(|node| node.id()) == other.rhs_syntax.map(|node| node.id())
// Strictly speaking, we should compare the whole
// EnteredDelimiter stack, not just the immediate
// parents. By taking the immediate parent, we have
// vertices with different stacks that are 'equal'.
//
// This makes the graph traversal path dependent: the
// first vertex we see 'wins', and we use it for deciding
// how we can pop.
//
// In practice this seems to work well. The first vertex
// has the lowest cost, so has the most PopBoth
// occurrences, which is the best outcome.
//
// Handling this properly would require considering many
// more vertices to be distinct, exponentially increasing
// the graph size relative to tree depth.
&& self.lhs_parent_id == other.lhs_parent_id
&& self.rhs_parent_id == other.rhs_parent_id
// We do want to distinguish whether we can pop each side
// independently though. Without this, if we find a case
// where we can pop sides together, we don't consider the
// case where we get a better diff by popping each side
// separately.
&& self.can_pop_either == other.can_pop_either
}
}
impl<'a, 'b> Eq for Vertex<'a, 'b> {}
impl<'a, 'b> Hash for Vertex<'a, 'b> {
fn hash(&self, state: &mut H) {
self.lhs_syntax.map(|node| node.id()).hash(state);
self.rhs_syntax.map(|node| node.id()).hash(state);
self.lhs_parent_id.hash(state);
self.rhs_parent_id.hash(state);
self.can_pop_either.hash(state);
}
}
/// Records how we entered a syntax List node, which in turn decides
/// how we are allowed to leave it.
#[derive(Clone, PartialEq)]
enum EnteredDelimiter<'a> {
    /// Lists entered on the LHS and on the RHS separately; either side
    /// may be exited on its own.
    ///
    /// The first stack holds the LHS lists, the second the RHS lists.
    /// Assumes that at least one of the two stacks is non-empty.
    PopEither((Stack<&'a Syntax<'a>>, Stack<&'a Syntax<'a>>)),
    /// A pair of lists that were entered together and therefore must
    /// be exited together. Without this restriction we would treat the
    /// following as having no changes.
    ///
    /// ```text
    /// Old: (a b c)
    /// New: (a b) c
    /// ```
    PopBoth((&'a Syntax<'a>, &'a Syntax<'a>)),
}

impl<'a> fmt::Debug for EnteredDelimiter<'a> {
    /// Print a compact summary: stack sizes for `PopEither`, just the
    /// variant name for `PopBoth`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::PopEither((lhs_stack, rhs_stack)) => write!(
                f,
                "PopEither(lhs count: {}, rhs count: {})",
                lhs_stack.size(),
                rhs_stack.size()
            ),
            Self::PopBoth(_) => f.write_str("PopBoth"),
        }
    }
}
fn push_both_delimiters<'a>(
entered: &Stack>,
lhs_delim: &'a Syntax<'a>,
rhs_delim: &'a Syntax<'a>,
) -> Stack> {
entered.push(EnteredDelimiter::PopBoth((lhs_delim, rhs_delim)))
}
fn can_pop_either_parent(entered: &Stack) -> bool {
matches!(entered.peek(), Some(EnteredDelimiter::PopEither(_)))
}
fn try_pop_both<'a>(
entered: &Stack>,
) -> Option<(&'a Syntax<'a>, &'a Syntax<'a>, Stack>)> {
match entered.peek() {
Some(EnteredDelimiter::PopBoth((lhs_delim, rhs_delim))) => {
Some((lhs_delim, rhs_delim, entered.pop().unwrap()))
}
_ => None,
}
}
fn try_pop_lhs<'a>(
entered: &Stack>,
) -> Option<(&'a Syntax<'a>, Stack>)> {
match entered.peek() {
Some(EnteredDelimiter::PopEither((lhs_delims, rhs_delims))) => match lhs_delims.peek() {
Some(lhs_delim) => {
let mut entered = entered.pop().unwrap();
let new_lhs_delims = lhs_delims.pop().unwrap();
if !new_lhs_delims.is_empty() || !rhs_delims.is_empty() {
entered = entered.push(EnteredDelimiter::PopEither((
new_lhs_delims,
rhs_delims.clone(),
)));
}
Some((lhs_delim, entered))
}
None => None,
},
_ => None,
}
}
fn try_pop_rhs<'a>(
entered: &Stack>,
) -> Option<(&'a Syntax<'a>, Stack>)> {
match entered.peek() {
Some(EnteredDelimiter::PopEither((lhs_delims, rhs_delims))) => match rhs_delims.peek() {
Some(rhs_delim) => {
let mut entered = entered.pop().unwrap();
let new_rhs_delims = rhs_delims.pop().unwrap();
if !lhs_delims.is_empty() || !new_rhs_delims.is_empty() {
entered = entered.push(EnteredDelimiter::PopEither((
lhs_delims.clone(),
new_rhs_delims,
)));
}
Some((rhs_delim, entered))
}
None => None,
},
_ => None,
}
}
fn push_lhs_delimiter<'a>(
entered: &Stack>,
delimiter: &'a Syntax<'a>,
) -> Stack> {
let mut modifying_head = false;
let (mut lhs_delims, rhs_delims) = match entered.peek() {
Some(EnteredDelimiter::PopEither((lhs_delims, rhs_delims))) => {
modifying_head = true;
(lhs_delims.clone(), rhs_delims.clone())
}
_ => (Stack::new(), Stack::new()),
};
lhs_delims = lhs_delims.push(delimiter);
let entered = if modifying_head {
entered.pop().unwrap()
} else {
entered.clone()
};
entered.push(EnteredDelimiter::PopEither((lhs_delims, rhs_delims)))
}
fn push_rhs_delimiter<'a>(
entered: &Stack>,
delimiter: &'a Syntax<'a>,
) -> Stack> {
let mut modifying_head = false;
let (lhs_delims, mut rhs_delims) = match entered.peek() {
Some(EnteredDelimiter::PopEither((lhs_delims, rhs_delims))) => {
modifying_head = true;
(lhs_delims.clone(), rhs_delims.clone())
}
_ => (Stack::new(), Stack::new()),
};
rhs_delims = rhs_delims.push(delimiter);
let entered = if modifying_head {
entered.pop().unwrap()
} else {
entered.clone()
};
entered.push(EnteredDelimiter::PopEither((lhs_delims, rhs_delims)))
}
impl<'a, 'b> Vertex<'a, 'b> {
    /// Is this the final vertex of the graph? That is the case when
    /// no syntax is left on either side and every entered delimiter
    /// has been exited.
    pub fn is_end(&self) -> bool {
        matches!((self.lhs_syntax, self.rhs_syntax), (None, None)) && self.parents.is_empty()
    }

    /// Create a vertex pointing at `lhs_syntax` and `rhs_syntax` that
    /// is not inside any list: no entered delimiters, no parent IDs,
    /// and no cached neighbours or predecessor.
    pub fn new(lhs_syntax: Option<&'a Syntax<'a>>, rhs_syntax: Option<&'a Syntax<'a>>) -> Self {
        Self {
            lhs_syntax,
            rhs_syntax,
            parents: Stack::new(),
            lhs_parent_id: None,
            rhs_parent_id: None,
            can_pop_either: false,
            neighbours: RefCell::new(None),
            predecessor: Cell::new(None),
        }
    }
}
/// An edge in our graph, with an associated [`cost`](Edge::cost).
///
/// A syntax node can always be marked as novel, so a vertex will have
/// at least a NovelFoo edge. Depending on the syntax nodes of the
/// current [`Vertex`], other edges may also be available.
///
/// See [`get_set_neighbours`] for all the edges available for a given
/// `Vertex`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum Edge {
    /// The LHS and RHS nodes are equal.
    UnchangedNode {
        /// Absolute difference between the two nodes' ancestor counts.
        depth_difference: u32,
    },
    /// Both nodes are lists with the same delimiters; their children
    /// may still differ.
    EnterUnchangedDelimiter {
        /// Absolute difference between the two lists' ancestor counts.
        depth_difference: u32,
    },
    /// Both nodes are comments whose text differs.
    ReplacedComment {
        /// Similarity of the two comment texts as a percentage.
        levenshtein_pct: u8,
    },
    /// Step over an atom that only exists on the LHS.
    NovelAtomLHS {
        contiguous: bool,
        probably_punctuation: bool,
    },
    /// Step over an atom that only exists on the RHS.
    NovelAtomRHS {
        contiguous: bool,
        probably_punctuation: bool,
    },
    // TODO: An EnterNovelDelimiterBoth edge might help performance
    // rather doing LHS and RHS separately.
    /// Step into a list whose delimiters only exist on the LHS.
    EnterNovelDelimiterLHS { contiguous: bool },
    /// Step into a list whose delimiters only exist on the RHS.
    EnterNovelDelimiterRHS { contiguous: bool },
    /// Leave a list that was entered on the LHS only.
    ExitDelimiterLHS,
    /// Leave a list that was entered on the RHS only.
    ExitDelimiterRHS,
    /// Leave a pair of lists that were entered together.
    ExitDelimiterBoth,
}

/// Extra cost charged to a novel edge whose `contiguous` flag is false.
const NOT_CONTIGUOUS_PENALTY: u64 = 50;

impl Edge {
    /// The cost of following this edge. Lower is better.
    pub fn cost(self) -> u64 {
        match self {
            // At the end of a list there is only one way to exit, so
            // the exact value is irrelevant. A non-zero number is
            // simply easier to reason about.
            Self::ExitDelimiterBoth => 1,

            // Exiting one side at a time is priced slightly higher.
            // Entering a novel delimiter already costs more than
            // entering a matched one, so this is purely defensive.
            Self::ExitDelimiterLHS | Self::ExitDelimiterRHS => 2,

            // Matching nodes is always the best option.
            Self::UnchangedNode { depth_difference } => (u64::from(depth_difference) + 1).min(40),

            // Matching an outer delimiter is good too.
            Self::EnterUnchangedDelimiter { depth_difference } => {
                100 + u64::from(depth_difference).min(40)
            }

            // Cheaper than marking both comments as novel.
            Self::ReplacedComment { levenshtein_pct } => 150 + u64::from(100 - levenshtein_pct),

            // From here on, a node was added or removed.
            Self::NovelAtomLHS {
                contiguous,
                probably_punctuation,
            }
            | Self::NovelAtomRHS {
                contiguous,
                probably_punctuation,
            } => {
                let gap_penalty = if contiguous {
                    0
                } else {
                    NOT_CONTIGUOUS_PENALTY
                };
                // Novel punctuation is slightly preferable to novel
                // variable names, so give it a small discount.
                let punctuation_discount = if probably_punctuation { 10 } else { 0 };

                300 + gap_penalty - punctuation_discount
            }

            Self::EnterNovelDelimiterLHS { contiguous }
            | Self::EnterNovelDelimiterRHS { contiguous } => {
                if contiguous {
                    300
                } else {
                    // The gap to the contiguous case has to exceed 40.
                    // Otherwise we would pick a badly positioned
                    // unchanged delimiter merely because its depth
                    // difference is better.
                    //
                    // TODO: write a test for this case.
                    300 + NOT_CONTIGUOUS_PENALTY
                }
            }
        }
    }
}
/// Return the arena-allocated vertex that stands for `v`, allocating
/// `v` in `alloc` only when nothing suitable is already in `seen`.
///
/// `seen` maps a vertex (as compared by its `PartialEq` impl) to the
/// allocated vertices that are equal to it but may differ in their
/// full `parents` stack. At most two are kept per key.
fn allocate_if_new<'syn, 'b>(
    v: Vertex<'syn, 'b>,
    alloc: &'b Bump,
    seen: &mut FxHashMap<&Vertex<'syn, 'b>, Vec<&'b Vertex<'syn, 'b>>>,
) -> &'b Vertex<'syn, 'b> {
    if let Some(candidates) = seen.get_mut(&v) {
        // Cap exploration at two parenthesis nestings per syntax node
        // pair: once we have two, reuse the most recent one.
        if candidates.len() >= 2 {
            if let Some(&newest) = candidates.last() {
                return newest;
            }
        }

        // An existing vertex whose delimiter stack matches too is
        // exactly this graph node, so hand it back.
        if let Some(&exact) = candidates
            .iter()
            .find(|candidate| candidate.parents == v.parents)
        {
            return exact;
        }

        // Still below the limit: remember this nesting as well.
        let allocated: &'b Vertex<'syn, 'b> = alloc.alloc(v);
        candidates.push(allocated);
        return allocated;
    }

    // First time we see this syntax node pair.
    let allocated: &'b Vertex<'syn, 'b> = alloc.alloc(v);
    seen.insert(allocated, vec![allocated]);
    allocated
}
/// Does this atom look like punctuation?
///
/// Only `,`, `;` and `.` count. The check is deliberately
/// conservative, because it's hard to accurately recognise
/// punctuation in a language-agnostic way.
fn looks_like_punctuation(content: &str) -> bool {
    matches!(content, "," | ";" | ".")
}
/// Compute the neighbours of `v` if we haven't previously done so,
/// write them to the .neighbours cell inside `v`, and return them.
pub fn get_set_neighbours<'syn, 'b>(
v: &Vertex<'syn, 'b>,
alloc: &'b Bump,
seen: &mut FxHashMap<&Vertex<'syn, 'b>, Vec<&'b Vertex<'syn, 'b>>>,
) -> Vec<(Edge, &'b Vertex<'syn, 'b>)> {
match &*v.neighbours.borrow() {
Some(neighbours) => return neighbours.clone(),
None => {}
}
let mut res: Vec<(Edge, &Vertex)> = vec![];
if v.lhs_syntax.is_none() && v.rhs_syntax.is_none() {
if let Some((lhs_parent, rhs_parent, parents_next)) = try_pop_both(&v.parents) {
// We have exhausted all the nodes on both lists, so we can
// move up to the parent node.
// Continue from sibling of parent.
res.push((
ExitDelimiterBoth,
allocate_if_new(
Vertex {
neighbours: RefCell::new(None),
predecessor: Cell::new(None),
lhs_syntax: lhs_parent.next_sibling(),
rhs_syntax: rhs_parent.next_sibling(),
can_pop_either: can_pop_either_parent(&parents_next),
parents: parents_next,
lhs_parent_id: lhs_parent.parent().map(Syntax::id),
rhs_parent_id: rhs_parent.parent().map(Syntax::id),
},
alloc,
seen,
),
));
}
}
if v.lhs_syntax.is_none() {
if let Some((lhs_parent, parents_next)) = try_pop_lhs(&v.parents) {
// Move to next after LHS parent.
// Continue from sibling of parent.
res.push((
ExitDelimiterLHS,
allocate_if_new(
Vertex {
neighbours: RefCell::new(None),
predecessor: Cell::new(None),
lhs_syntax: lhs_parent.next_sibling(),
rhs_syntax: v.rhs_syntax,
can_pop_either: can_pop_either_parent(&parents_next),
parents: parents_next,
lhs_parent_id: lhs_parent.parent().map(Syntax::id),
rhs_parent_id: v.rhs_parent_id,
},
alloc,
seen,
),
));
}
}
if v.rhs_syntax.is_none() {
if let Some((rhs_parent, parents_next)) = try_pop_rhs(&v.parents) {
// Move to next after RHS parent.
// Continue from sibling of parent.
res.push((
ExitDelimiterRHS,
allocate_if_new(
Vertex {
neighbours: RefCell::new(None),
predecessor: Cell::new(None),
lhs_syntax: v.lhs_syntax,
rhs_syntax: rhs_parent.next_sibling(),
can_pop_either: can_pop_either_parent(&parents_next),
parents: parents_next,
lhs_parent_id: v.lhs_parent_id,
rhs_parent_id: rhs_parent.parent().map(Syntax::id),
},
alloc,
seen,
),
));
}
}
if let (Some(lhs_syntax), Some(rhs_syntax)) = (&v.lhs_syntax, &v.rhs_syntax) {
if lhs_syntax == rhs_syntax {
let depth_difference = (lhs_syntax.num_ancestors() as i32
- rhs_syntax.num_ancestors() as i32)
.unsigned_abs();
// Both nodes are equal, the happy case.
res.push((
UnchangedNode { depth_difference },
allocate_if_new(
Vertex {
neighbours: RefCell::new(None),
predecessor: Cell::new(None),
lhs_syntax: lhs_syntax.next_sibling(),
rhs_syntax: rhs_syntax.next_sibling(),
parents: v.parents.clone(),
lhs_parent_id: v.lhs_parent_id,
rhs_parent_id: v.rhs_parent_id,
can_pop_either: v.can_pop_either,
},
alloc,
seen,
),
));
}
if let (
Syntax::List {
open_content: lhs_open_content,
close_content: lhs_close_content,
children: lhs_children,
..
},
Syntax::List {
open_content: rhs_open_content,
close_content: rhs_close_content,
children: rhs_children,
..
},
) = (lhs_syntax, rhs_syntax)
{
// The list delimiters are equal, but children may not be.
if lhs_open_content == rhs_open_content && lhs_close_content == rhs_close_content {
let lhs_next = lhs_children.get(0).copied();
let rhs_next = rhs_children.get(0).copied();
// TODO: be consistent between parents_next and next_parents.
let parents_next = push_both_delimiters(&v.parents, lhs_syntax, rhs_syntax);
let depth_difference = (lhs_syntax.num_ancestors() as i32
- rhs_syntax.num_ancestors() as i32)
.unsigned_abs();
res.push((
EnterUnchangedDelimiter { depth_difference },
allocate_if_new(
Vertex {
neighbours: RefCell::new(None),
predecessor: Cell::new(None),
lhs_syntax: lhs_next,
rhs_syntax: rhs_next,
parents: parents_next,
lhs_parent_id: Some(lhs_syntax.id()),
rhs_parent_id: Some(rhs_syntax.id()),
can_pop_either: false,
},
alloc,
seen,
),
));
}
}
if let (
Syntax::Atom {
content: lhs_content,
kind: AtomKind::Comment,
..
},
Syntax::Atom {
content: rhs_content,
kind: AtomKind::Comment,
..
},
) = (lhs_syntax, rhs_syntax)
{
// Both sides are comments and their content is reasonably
// similar.
if lhs_content != rhs_content {
let levenshtein_pct =
(normalized_levenshtein(lhs_content, rhs_content) * 100.0).round() as u8;
res.push((
ReplacedComment { levenshtein_pct },
allocate_if_new(
Vertex {
neighbours: RefCell::new(None),
predecessor: Cell::new(None),
lhs_syntax: lhs_syntax.next_sibling(),
rhs_syntax: rhs_syntax.next_sibling(),
parents: v.parents.clone(),
lhs_parent_id: v.lhs_parent_id,
rhs_parent_id: v.rhs_parent_id,
can_pop_either: v.can_pop_either,
},
alloc,
seen,
),
));
}
}
}
if let Some(lhs_syntax) = &v.lhs_syntax {
match lhs_syntax {
// Step over this novel atom.
Syntax::Atom { content, .. } => {
res.push((
NovelAtomLHS {
// TODO: should this apply if prev is a parent
// node rather than a sibling?
contiguous: lhs_syntax.prev_is_contiguous(),
probably_punctuation: looks_like_punctuation(content),
},
allocate_if_new(
Vertex {
neighbours: RefCell::new(None),
predecessor: Cell::new(None),
lhs_syntax: lhs_syntax.next_sibling(),
rhs_syntax: v.rhs_syntax,
parents: v.parents.clone(),
lhs_parent_id: v.lhs_parent_id,
rhs_parent_id: v.rhs_parent_id,
can_pop_either: v.can_pop_either,
},
alloc,
seen,
),
));
}
// Step into this partially/fully novel list.
Syntax::List { children, .. } => {
let lhs_next = children.get(0).copied();
let parents_next = push_lhs_delimiter(&v.parents, lhs_syntax);
res.push((
EnterNovelDelimiterLHS {
contiguous: lhs_syntax.prev_is_contiguous(),
},
allocate_if_new(
Vertex {
neighbours: RefCell::new(None),
predecessor: Cell::new(None),
lhs_syntax: lhs_next,
rhs_syntax: v.rhs_syntax,
parents: parents_next,
lhs_parent_id: Some(lhs_syntax.id()),
rhs_parent_id: v.rhs_parent_id,
can_pop_either: true,
},
alloc,
seen,
),
));
}
}
}
if let Some(rhs_syntax) = &v.rhs_syntax {
match rhs_syntax {
// Step over this novel atom.
Syntax::Atom { content, .. } => {
res.push((
NovelAtomRHS {
contiguous: rhs_syntax.prev_is_contiguous(),
probably_punctuation: looks_like_punctuation(content),
},
allocate_if_new(
Vertex {
neighbours: RefCell::new(None),
predecessor: Cell::new(None),
lhs_syntax: v.lhs_syntax,
rhs_syntax: rhs_syntax.next_sibling(),
parents: v.parents.clone(),
lhs_parent_id: v.lhs_parent_id,
rhs_parent_id: v.rhs_parent_id,
can_pop_either: v.can_pop_either,
},
alloc,
seen,
),
));
}
// Step into this partially/fully novel list.
Syntax::List { children, .. } => {
let rhs_next = children.get(0).copied();
let parents_next = push_rhs_delimiter(&v.parents, rhs_syntax);
res.push((
EnterNovelDelimiterRHS {
contiguous: rhs_syntax.prev_is_contiguous(),
},
allocate_if_new(
Vertex {
neighbours: RefCell::new(None),
predecessor: Cell::new(None),
lhs_syntax: v.lhs_syntax,
rhs_syntax: rhs_next,
parents: parents_next,
lhs_parent_id: v.lhs_parent_id,
rhs_parent_id: Some(rhs_syntax.id()),
can_pop_either: true,
},
alloc,
seen,
),
));
}
}
}
assert!(
!res.is_empty(),
"Must always find some next steps if node is not the end"
);
v.neighbours.replace(Some(res.clone()));
res
}
/// Walk `route` and record in `change_map` what each edge says about
/// the syntax node(s) of the vertex it is paired with.
///
/// # Panics
///
/// Panics if a vertex has no syntax node on a side that its edge
/// refers to.
pub fn populate_change_map<'a, 'b>(
    route: &[(Edge, &'b Vertex<'a, 'b>)],
    change_map: &mut ChangeMap<'a>,
) {
    for &(edge, vertex) in route {
        match edge {
            // Lists are marked when they are entered, so leaving them
            // needs no bookkeeping.
            ExitDelimiterBoth | ExitDelimiterLHS | ExitDelimiterRHS => {}

            // Neither the node nor any of its children changed.
            UnchangedNode { .. } => {
                let (lhs, rhs) = (vertex.lhs_syntax.unwrap(), vertex.rhs_syntax.unwrap());
                insert_deep_unchanged(lhs, rhs, change_map);
                insert_deep_unchanged(rhs, lhs, change_map);
            }

            // The outer delimiters match; the children are handled by
            // later edges and may have changed.
            EnterUnchangedDelimiter { .. } => {
                let (lhs, rhs) = (vertex.lhs_syntax.unwrap(), vertex.rhs_syntax.unwrap());
                change_map.insert(lhs, ChangeKind::Unchanged(rhs));
                change_map.insert(rhs, ChangeKind::Unchanged(lhs));
            }

            // Comments only count as a replacement of one another when
            // they are more than 40% similar. Otherwise both are novel.
            ReplacedComment { levenshtein_pct } => {
                let (lhs, rhs) = (vertex.lhs_syntax.unwrap(), vertex.rhs_syntax.unwrap());
                let (lhs_kind, rhs_kind) = if levenshtein_pct > 40 {
                    (
                        ChangeKind::ReplacedComment(lhs, rhs),
                        ChangeKind::ReplacedComment(rhs, lhs),
                    )
                } else {
                    (ChangeKind::Novel, ChangeKind::Novel)
                };
                change_map.insert(lhs, lhs_kind);
                change_map.insert(rhs, rhs_kind);
            }

            NovelAtomLHS { .. } | EnterNovelDelimiterLHS { .. } => {
                change_map.insert(vertex.lhs_syntax.unwrap(), ChangeKind::Novel);
            }

            NovelAtomRHS { .. } | EnterNovelDelimiterRHS { .. } => {
                change_map.insert(vertex.rhs_syntax.unwrap(), ChangeKind::Novel);
            }
        }
    }
}