Limit the graph size based on the number of predecessors found

Fixes #183
Fixes #306
pull/305/head^2
Wilfred Hughes 2022-07-03 18:23:17 +07:00
parent 338e815e9b
commit f10cfa00cc
4 changed files with 107 additions and 89 deletions

@ -1,5 +1,22 @@
## 0.30 (unreleased)
### Command Line Interface
`--node-limit` has been replaced by `--graph-limit`, and the
corresponding environment variable `DFT_NODE_LIMIT` has been replaced
with `DFT_GRAPH_LIMIT`.
`--graph-limit` makes difftastic give up on structural diffs after
traversing this many graph nodes. `--node-limit` applied a limit based
on an estimate of how big the graph would be, leading to very slow
diffs when the estimate was wrong.
This new setting sets a more accurate limit on difftastic
performance. It also means that difftastic will always try a
structural diff first. This will be slower for files that exceed
`--graph-limit`, but guarantees that files with a small number of
changes will always get a structural diff.
## 0.29.1 (released 13th June 2022)
Fixed a major memory regression in 0.29 when performing large

@ -15,8 +15,15 @@ use rustc_hash::FxHashMap;
type PredecessorInfo<'a, 'b> = (u64, &'b Vertex<'a>);
#[derive(Debug)]
pub struct ExceededGraphLimit {}
/// Return the shortest route from `start` to the end vertex.
fn shortest_vertex_path(start: Vertex, size_hint: usize) -> Vec<Vertex> {
fn shortest_vertex_path(
start: Vertex,
size_hint: usize,
graph_limit: usize,
) -> Result<Vec<Vertex>, ExceededGraphLimit> {
// We want to visit nodes with the shortest distance first, but
// RadixHeapMap is a max-heap. Ensure nodes are wrapped with
// Reverse to flip comparisons.
@ -56,6 +63,9 @@ fn shortest_vertex_path(start: Vertex, size_hint: usize) -> Vec<Vertex> {
}
}
}
if predecessors.len() > graph_limit {
return Err(ExceededGraphLimit {});
}
}
None => panic!("Ran out of graph nodes before reaching end"),
}
@ -77,7 +87,7 @@ fn shortest_vertex_path(start: Vertex, size_hint: usize) -> Vec<Vertex> {
}
vertex_route.reverse();
vertex_route
Ok(vertex_route)
}
fn shortest_path_with_edges<'a>(route: &[Vertex<'a>]) -> Vec<(Edge, Vertex<'a>)> {
@ -101,9 +111,13 @@ fn shortest_path_with_edges<'a>(route: &[Vertex<'a>]) -> Vec<(Edge, Vertex<'a>)>
///
/// The vec returned does not return the very last vertex. This is
/// necessary because a route of N vertices only has N-1 edges.
fn shortest_path(start: Vertex, size_hint: usize) -> Vec<(Edge, Vertex)> {
let vertex_path = shortest_vertex_path(start, size_hint);
shortest_path_with_edges(&vertex_path)
fn shortest_path(
start: Vertex,
size_hint: usize,
graph_limit: usize,
) -> Result<Vec<(Edge, Vertex)>, ExceededGraphLimit> {
let vertex_path = shortest_vertex_path(start, size_hint, graph_limit)?;
Ok(shortest_path_with_edges(&vertex_path))
}
fn edge_between<'a>(before: &Vertex<'a>, after: &Vertex<'a>) -> Edge {
@ -177,7 +191,8 @@ pub fn mark_syntax<'a>(
lhs_syntax: Option<&'a Syntax<'a>>,
rhs_syntax: Option<&'a Syntax<'a>>,
change_map: &mut ChangeMap<'a>,
) {
graph_limit: usize,
) -> Result<(), ExceededGraphLimit> {
let lhs_node_count = node_count(lhs_syntax) as usize;
let rhs_node_count = node_count(rhs_syntax) as usize;
info!(
@ -195,7 +210,7 @@ pub fn mark_syntax<'a>(
let size_hint = lhs_node_count * rhs_node_count;
let start = Vertex::new(lhs_syntax, rhs_syntax);
let route = shortest_path(start, size_hint);
let route = shortest_path(start, size_hint, graph_limit)?;
let print_length = if env::var("DFT_VERBOSE").is_ok() {
50
@ -223,6 +238,7 @@ pub fn mark_syntax<'a>(
);
populate_change_map(&route, change_map);
Ok(())
}
#[cfg(test)]
@ -231,6 +247,7 @@ mod tests {
use crate::{
diff::changes::ChangeKind,
diff::graph::Edge::*,
options::DEFAULT_GRAPH_LIMIT,
positions::SingleLineSpan,
syntax::{init_all_info, AtomKind},
};
@ -264,7 +281,7 @@ mod tests {
init_all_info(&[lhs], &[rhs]);
let start = Vertex::new(Some(lhs), Some(rhs));
let route = shortest_path(start, 0);
let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
let actions = route.iter().map(|(action, _)| *action).collect_vec();
assert_eq!(
@ -304,7 +321,7 @@ mod tests {
init_all_info(&lhs, &rhs);
let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
let route = shortest_path(start, 0);
let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
let actions = route.iter().map(|(action, _)| *action).collect_vec();
assert_eq!(
@ -346,7 +363,7 @@ mod tests {
init_all_info(&lhs, &rhs);
let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
let route = shortest_path(start, 0);
let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
let actions = route.iter().map(|(action, _)| *action).collect_vec();
assert_eq!(
@ -392,7 +409,7 @@ mod tests {
init_all_info(&lhs, &rhs);
let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
let route = shortest_path(start, 0);
let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
let actions = route.iter().map(|(action, _)| *action).collect_vec();
assert_eq!(
@ -431,7 +448,7 @@ mod tests {
init_all_info(&lhs, &rhs);
let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
let route = shortest_path(start, 0);
let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
let actions = route.iter().map(|(action, _)| *action).collect_vec();
assert_eq!(
@ -468,7 +485,7 @@ mod tests {
init_all_info(&lhs, &rhs);
let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
let route = shortest_path(start, 0);
let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
let actions = route.iter().map(|(action, _)| *action).collect_vec();
assert_eq!(
@ -506,7 +523,7 @@ mod tests {
init_all_info(&lhs, &rhs);
let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
let route = shortest_path(start, 0);
let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
let actions = route.iter().map(|(action, _)| *action).collect_vec();
assert_eq!(
@ -540,7 +557,7 @@ mod tests {
init_all_info(&lhs, &rhs);
let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
let route = shortest_path(start, 0);
let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
let actions = route.iter().map(|(action, _)| *action).collect_vec();
assert_eq!(
@ -571,7 +588,7 @@ mod tests {
init_all_info(&lhs, &rhs);
let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
let route = shortest_path(start, 0);
let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
let actions = route.iter().map(|(action, _)| *action).collect_vec();
assert_eq!(
@ -610,7 +627,7 @@ mod tests {
init_all_info(&lhs, &rhs);
let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
let route = shortest_path(start, 0);
let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
let actions = route.iter().map(|(action, _)| *action).collect_vec();
assert_eq!(
@ -632,7 +649,7 @@ mod tests {
init_all_info(&[lhs], &[rhs]);
let mut change_map = ChangeMap::default();
mark_syntax(Some(lhs), Some(rhs), &mut change_map);
mark_syntax(Some(lhs), Some(rhs), &mut change_map, DEFAULT_GRAPH_LIMIT).unwrap();
assert_eq!(change_map.get(lhs), Some(ChangeKind::Unchanged(rhs)));
assert_eq!(change_map.get(rhs), Some(ChangeKind::Unchanged(lhs)));
@ -646,7 +663,7 @@ mod tests {
init_all_info(&[lhs], &[rhs]);
let mut change_map = ChangeMap::default();
mark_syntax(Some(lhs), Some(rhs), &mut change_map);
mark_syntax(Some(lhs), Some(rhs), &mut change_map, DEFAULT_GRAPH_LIMIT).unwrap();
assert_eq!(change_map.get(lhs), Some(ChangeKind::Novel));
assert_eq!(change_map.get(rhs), Some(ChangeKind::Novel));
}

@ -31,6 +31,7 @@ use crate::diff::{dijkstra, unchanged};
use crate::display::hunks::{matched_pos_to_hunks, merge_adjacent};
use crate::parse::syntax;
use diff::changes::ChangeMap;
use diff::dijkstra::ExceededGraphLimit;
use display::context::opposite_positions;
use files::{is_probably_binary, read_files_or_die, read_or_die, relative_paths_in_either};
use log::info;
@ -121,7 +122,7 @@ fn main() {
}
}
Mode::Diff {
node_limit,
graph_limit,
byte_limit,
display_options,
missing_as_empty,
@ -150,7 +151,7 @@ fn main() {
lhs_path,
rhs_path,
&display_options,
node_limit,
graph_limit,
byte_limit,
language_override,
)
@ -165,7 +166,7 @@ fn main() {
rhs_path,
&display_options,
missing_as_empty,
node_limit,
graph_limit,
byte_limit,
language_override,
);
@ -183,7 +184,7 @@ fn diff_file(
rhs_path: &Path,
display_options: &DisplayOptions,
missing_as_empty: bool,
node_limit: u32,
graph_limit: usize,
byte_limit: usize,
language_override: Option<parse::guess_language::Language>,
) -> DiffResult {
@ -194,7 +195,7 @@ fn diff_file(
&lhs_bytes,
&rhs_bytes,
display_options.tab_width,
node_limit,
graph_limit,
byte_limit,
language_override,
)
@ -206,7 +207,7 @@ fn diff_file_content(
lhs_bytes: &[u8],
rhs_bytes: &[u8],
tab_width: usize,
node_limit: u32,
graph_limit: usize,
byte_limit: usize,
language_override: Option<parse::guess_language::Language>,
) -> DiffResult {
@ -293,36 +294,39 @@ fn diff_file_content(
unchanged::mark_unchanged(&lhs, &rhs, &mut change_map)
};
let possibly_changed_max = max_num_nodes(&possibly_changed);
if possibly_changed_max > node_limit {
info!(
"Found {} nodes, exceeding the limit {}",
possibly_changed_max, node_limit
);
let mut exceeded_graph_limit = false;
for (lhs_section_nodes, rhs_section_nodes) in possibly_changed {
init_next_prev(&lhs_section_nodes);
init_next_prev(&rhs_section_nodes);
match mark_syntax(
lhs_section_nodes.get(0).copied(),
rhs_section_nodes.get(0).copied(),
&mut change_map,
graph_limit,
) {
Ok(()) => {}
Err(ExceededGraphLimit {}) => {
exceeded_graph_limit = true;
break;
}
}
let language = language.unwrap();
fix_all_sliders(language, &lhs_section_nodes, &mut change_map);
fix_all_sliders(language, &rhs_section_nodes, &mut change_map);
}
if exceeded_graph_limit {
let lhs_positions = line_parser::change_positions(&lhs_src, &rhs_src);
let rhs_positions = line_parser::change_positions(&rhs_src, &lhs_src);
(
Some("Text (exceeded DFT_NODE_LIMIT)".into()),
Some("Text (exceeded DFT_GRAPH_LIMIT)".into()),
lhs_positions,
rhs_positions,
)
} else {
for (lhs_section_nodes, rhs_section_nodes) in possibly_changed {
init_next_prev(&lhs_section_nodes);
init_next_prev(&rhs_section_nodes);
mark_syntax(
lhs_section_nodes.get(0).copied(),
rhs_section_nodes.get(0).copied(),
&mut change_map,
);
let language = language.unwrap();
fix_all_sliders(language, &lhs_section_nodes, &mut change_map);
fix_all_sliders(language, &rhs_section_nodes, &mut change_map);
}
let lhs_positions = syntax::change_positions(&lhs, &change_map);
let rhs_positions = syntax::change_positions(&rhs, &change_map);
(Some(ts_lang.name.into()), lhs_positions, rhs_positions)
@ -356,7 +360,7 @@ fn diff_directories<'a>(
lhs_dir: &'a Path,
rhs_dir: &'a Path,
display_options: &DisplayOptions,
node_limit: u32,
graph_limit: usize,
byte_limit: usize,
language_override: Option<parse::guess_language::Language>,
) -> impl ParallelIterator<Item = DiffResult> + 'a {
@ -380,7 +384,7 @@ fn diff_directories<'a>(
&rhs_path,
&display_options,
true,
node_limit,
graph_limit,
byte_limit,
language_override,
)
@ -495,33 +499,10 @@ fn print_diff_result(display_options: &DisplayOptions, summary: &DiffResult) {
}
}
/// What is the total number of nodes in `roots`?
fn num_nodes(roots: &[&syntax::Syntax]) -> u32 {
roots
.iter()
.map(|n| {
1 + match n {
syntax::Syntax::List {
num_descendants, ..
} => *num_descendants,
syntax::Syntax::Atom { .. } => 0,
}
})
.sum()
}
fn max_num_nodes(roots_vec: &[(Vec<&syntax::Syntax>, Vec<&syntax::Syntax>)]) -> u32 {
roots_vec
.iter()
.map(|(lhs, rhs)| num_nodes(lhs) + num_nodes(rhs))
.max()
.unwrap_or(0)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::options::{DEFAULT_BYTE_LIMIT, DEFAULT_NODE_LIMIT, DEFAULT_TAB_WIDTH};
use crate::options::{DEFAULT_BYTE_LIMIT, DEFAULT_GRAPH_LIMIT, DEFAULT_TAB_WIDTH};
#[test]
fn test_diff_identical_content() {
@ -532,7 +513,7 @@ mod tests {
s.as_bytes(),
s.as_bytes(),
DEFAULT_TAB_WIDTH,
DEFAULT_NODE_LIMIT,
DEFAULT_GRAPH_LIMIT,
DEFAULT_BYTE_LIMIT,
None,
);

@ -8,8 +8,11 @@ use const_format::formatcp;
use crate::{display::style::BackgroundColor, parse::guess_language};
pub const DEFAULT_NODE_LIMIT: u32 = 30_000;
pub const DEFAULT_BYTE_LIMIT: usize = 1_000_000;
// Chosen experimentally: this is sufficiently many for all the sample
// files (the highest is slow_before/after.rs at 1.3M nodes), but
// small enough to terminate in ~5 seconds like the test file in #306.
pub const DEFAULT_GRAPH_LIMIT: usize = 3_000_000;
pub const DEFAULT_TAB_WIDTH: usize = 8;
const USAGE: &str = concat!(env!("CARGO_BIN_NAME"), " [OPTIONS] OLD-PATH NEW-PATH");
@ -134,22 +137,22 @@ fn app() -> clap::Command<'static> {
// TODO: support DFT_LANGUAGE for consistency
)
.arg(
Arg::new("node-limit").long("node-limit")
Arg::new("byte-limit").long("byte-limit")
.takes_value(true)
.value_name("LIMIT")
.help(concat!("Use a text diff if the number of syntax nodes exceeds this number."))
.default_value(formatcp!("{}", DEFAULT_NODE_LIMIT))
.env("DFT_NODE_LIMIT")
.validator(|s| s.parse::<u32>())
.help(concat!("Use a text diff if either input file exceeds this size."))
.default_value(formatcp!("{}", DEFAULT_BYTE_LIMIT))
.env("DFT_BYTE_LIMIT")
.validator(|s| s.parse::<usize>())
.required(false),
)
.arg(
Arg::new("byte-limit").long("byte-limit")
Arg::new("graph-limit").long("graph-limit")
.takes_value(true)
.value_name("LIMIT")
.help(concat!("Use a text diff if either input file exceeds this size."))
.default_value(formatcp!("{}", DEFAULT_BYTE_LIMIT))
.env("DFT_BYTE_LIMIT")
.help(concat!("Use a text diff if the structural graph exceed this number of nodes in memory."))
.default_value(formatcp!("{}", DEFAULT_GRAPH_LIMIT))
.env("DFT_GRAPH_LIMIT")
.validator(|s| s.parse::<usize>())
.required(false),
)
@ -172,7 +175,7 @@ pub enum DisplayMode {
pub enum Mode {
Diff {
node_limit: u32,
graph_limit: usize,
byte_limit: usize,
display_options: DisplayOptions,
missing_as_empty: bool,
@ -324,10 +327,10 @@ pub fn parse_args() -> Mode {
let syntax_highlight = matches.value_of("syntax-highlight") == Some("on");
let node_limit = matches
.value_of("node-limit")
let graph_limit = matches
.value_of("graph-limit")
.expect("Always present as we've given clap a default")
.parse::<u32>()
.parse::<usize>()
.expect("Value already validated by clap");
let byte_limit = matches
@ -359,7 +362,7 @@ pub fn parse_args() -> Mode {
};
Mode::Diff {
node_limit,
graph_limit,
byte_limit,
display_options,
missing_as_empty,