From f10cfa00cc146c06efb78f1b27c6a295a5dfa7a5 Mon Sep 17 00:00:00 2001 From: Wilfred Hughes Date: Sun, 3 Jul 2022 18:23:17 -0700 Subject: [PATCH] Limit the graph size based on the number of predecessors found Fixes #183 Fixes #306 --- CHANGELOG.md | 17 +++++++++ src/diff/dijkstra.rs | 55 +++++++++++++++++--------- src/main.rs | 91 ++++++++++++++++++-------------------------- src/options.rs | 33 ++++++++-------- 4 files changed, 107 insertions(+), 89 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7767219c6..6706bcc6f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,22 @@ ## 0.30 (unreleased) +### Command Line Interface + +`--node-limit` has been replaced by `--graph-limit`, and the +corresponding environment variable `DFT_NODE_LIMIT` has been replaced +with `DFT_GRAPH_LIMIT`. + +`--graph-limit` makes difftastic give up on structural diffs after +traversing this many graph nodes. `--node-limit` applied a limit based +on an estimate of how big the graph would be, leading to very slow +diffs when the estimate was wrong. + +This new setting sets a more accurate limit on difftastic +performance. It also means that difftastic will always try a +structural diff first. This will be slower for files that exceed +`--graph-limit`, but guarantees that files with a small number of +changes will always get a structural diff. + ## 0.29.1 (released 13th June 2022) Fixed a major memory regression in 0.29 when performing large diff --git a/src/diff/dijkstra.rs b/src/diff/dijkstra.rs index a1694f8bf..ff6222f7d 100644 --- a/src/diff/dijkstra.rs +++ b/src/diff/dijkstra.rs @@ -15,8 +15,15 @@ use rustc_hash::FxHashMap; type PredecessorInfo<'a, 'b> = (u64, &'b Vertex<'a>); +#[derive(Debug)] +pub struct ExceededGraphLimit {} + /// Return the shortest route from `start` to the end vertex. -fn shortest_vertex_path(start: Vertex, size_hint: usize) -> Vec { +fn shortest_vertex_path( + start: Vertex, + size_hint: usize, + graph_limit: usize, +) -> Result, ExceededGraphLimit> { // We want to visit nodes with the shortest distance first, but // RadixHeapMap is a max-heap. Ensure nodes are wrapped with // Reverse to flip comparisons. @@ -56,6 +63,9 @@ fn shortest_vertex_path(start: Vertex, size_hint: usize) -> Vec { } } } + if predecessors.len() > graph_limit { + return Err(ExceededGraphLimit {}); + } } None => panic!("Ran out of graph nodes before reaching end"), } @@ -77,7 +87,7 @@ fn shortest_vertex_path(start: Vertex, size_hint: usize) -> Vec { } vertex_route.reverse(); - vertex_route + Ok(vertex_route) } fn shortest_path_with_edges<'a>(route: &[Vertex<'a>]) -> Vec<(Edge, Vertex<'a>)> { @@ -101,9 +111,13 @@ fn shortest_path_with_edges<'a>(route: &[Vertex<'a>]) -> Vec<(Edge, Vertex<'a>)> /// /// The vec returned does not return the very last vertex. This is /// necessary because a route of N vertices only has N-1 edges. -fn shortest_path(start: Vertex, size_hint: usize) -> Vec<(Edge, Vertex)> { - let vertex_path = shortest_vertex_path(start, size_hint); - shortest_path_with_edges(&vertex_path) +fn shortest_path( + start: Vertex, + size_hint: usize, + graph_limit: usize, +) -> Result, ExceededGraphLimit> { + let vertex_path = shortest_vertex_path(start, size_hint, graph_limit)?; + Ok(shortest_path_with_edges(&vertex_path)) } fn edge_between<'a>(before: &Vertex<'a>, after: &Vertex<'a>) -> Edge { @@ -177,7 +191,8 @@ pub fn mark_syntax<'a>( lhs_syntax: Option<&'a Syntax<'a>>, rhs_syntax: Option<&'a Syntax<'a>>, change_map: &mut ChangeMap<'a>, -) { + graph_limit: usize, +) -> Result<(), ExceededGraphLimit> { let lhs_node_count = node_count(lhs_syntax) as usize; let rhs_node_count = node_count(rhs_syntax) as usize; info!( @@ -195,7 +210,7 @@ pub fn mark_syntax<'a>( let size_hint = lhs_node_count * rhs_node_count; let start = Vertex::new(lhs_syntax, rhs_syntax); - let route = shortest_path(start, size_hint); + let route = shortest_path(start, size_hint, graph_limit)?; let print_length = if env::var("DFT_VERBOSE").is_ok() { 50 @@ -223,6 +238,7 @@ pub fn mark_syntax<'a>( ); populate_change_map(&route, change_map); + Ok(()) } #[cfg(test)] @@ -231,6 +247,7 @@ mod tests { use crate::{ diff::changes::ChangeKind, diff::graph::Edge::*, + options::DEFAULT_GRAPH_LIMIT, positions::SingleLineSpan, syntax::{init_all_info, AtomKind}, }; @@ -264,7 +281,7 @@ mod tests { init_all_info(&[lhs], &[rhs]); let start = Vertex::new(Some(lhs), Some(rhs)); - let route = shortest_path(start, 0); + let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap(); let actions = route.iter().map(|(action, _)| *action).collect_vec(); assert_eq!( @@ -304,7 +321,7 @@ mod tests { init_all_info(&lhs, &rhs); let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied()); - let route = shortest_path(start, 0); + let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap(); let actions = route.iter().map(|(action, _)| *action).collect_vec(); assert_eq!( @@ -346,7 +363,7 @@ mod tests { init_all_info(&lhs, &rhs); let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied()); - let route = shortest_path(start, 0); + let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap(); let actions = route.iter().map(|(action, _)| *action).collect_vec(); assert_eq!( @@ -392,7 +409,7 @@ mod tests { init_all_info(&lhs, &rhs); let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied()); - let route = shortest_path(start, 0); + let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap(); let actions = route.iter().map(|(action, _)| *action).collect_vec(); assert_eq!( @@ -431,7 +448,7 @@ mod tests { init_all_info(&lhs, &rhs); let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied()); - let route = shortest_path(start, 0); + let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap(); let actions = route.iter().map(|(action, _)| *action).collect_vec(); assert_eq!( @@ -468,7 +485,7 @@ mod tests { init_all_info(&lhs, &rhs); let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied()); - let route = shortest_path(start, 0); + let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap(); let actions = route.iter().map(|(action, _)| *action).collect_vec(); assert_eq!( @@ -506,7 +523,7 @@ mod tests { init_all_info(&lhs, &rhs); let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied()); - let route = shortest_path(start, 0); + let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap(); let actions = route.iter().map(|(action, _)| *action).collect_vec(); assert_eq!( @@ -540,7 +557,7 @@ mod tests { init_all_info(&lhs, &rhs); let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied()); - let route = shortest_path(start, 0); + let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap(); let actions = route.iter().map(|(action, _)| *action).collect_vec(); assert_eq!( @@ -571,7 +588,7 @@ mod tests { init_all_info(&lhs, &rhs); let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied()); - let route = shortest_path(start, 0); + let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap(); let actions = route.iter().map(|(action, _)| *action).collect_vec(); assert_eq!( @@ -610,7 +627,7 @@ mod tests { init_all_info(&lhs, &rhs); let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied()); - let route = shortest_path(start, 0); + let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap(); let actions = route.iter().map(|(action, _)| *action).collect_vec(); assert_eq!( @@ -632,7 +649,7 @@ mod tests { init_all_info(&[lhs], &[rhs]); let mut change_map = ChangeMap::default(); - mark_syntax(Some(lhs), Some(rhs), &mut change_map); + mark_syntax(Some(lhs), Some(rhs), &mut change_map, DEFAULT_GRAPH_LIMIT).unwrap(); assert_eq!(change_map.get(lhs), Some(ChangeKind::Unchanged(rhs))); assert_eq!(change_map.get(rhs), Some(ChangeKind::Unchanged(lhs))); @@ -646,7 +663,7 @@ mod tests { init_all_info(&[lhs], &[rhs]); let mut change_map = ChangeMap::default(); - mark_syntax(Some(lhs), Some(rhs), &mut change_map); + mark_syntax(Some(lhs), Some(rhs), &mut change_map, DEFAULT_GRAPH_LIMIT).unwrap(); assert_eq!(change_map.get(lhs), Some(ChangeKind::Novel)); assert_eq!(change_map.get(rhs), Some(ChangeKind::Novel)); } diff --git a/src/main.rs b/src/main.rs index 8c98d34b3..dbb6ba625 100644 --- a/src/main.rs +++ b/src/main.rs @@ -31,6 +31,7 @@ use crate::diff::{dijkstra, unchanged}; use crate::display::hunks::{matched_pos_to_hunks, merge_adjacent}; use crate::parse::syntax; use diff::changes::ChangeMap; +use diff::dijkstra::ExceededGraphLimit; use display::context::opposite_positions; use files::{is_probably_binary, read_files_or_die, read_or_die, relative_paths_in_either}; use log::info; @@ -121,7 +122,7 @@ fn main() { } } Mode::Diff { - node_limit, + graph_limit, byte_limit, display_options, missing_as_empty, @@ -150,7 +151,7 @@ fn main() { lhs_path, rhs_path, &display_options, - node_limit, + graph_limit, byte_limit, language_override, ) @@ -165,7 +166,7 @@ fn main() { rhs_path, &display_options, missing_as_empty, - node_limit, + graph_limit, byte_limit, language_override, ); @@ -183,7 +184,7 @@ fn diff_file( rhs_path: &Path, display_options: &DisplayOptions, missing_as_empty: bool, - node_limit: u32, + graph_limit: usize, byte_limit: usize, language_override: Option, ) -> DiffResult { @@ -194,7 +195,7 @@ fn diff_file( &lhs_bytes, &rhs_bytes, display_options.tab_width, - node_limit, + graph_limit, byte_limit, language_override, ) @@ -206,7 +207,7 @@ fn diff_file_content( lhs_bytes: &[u8], rhs_bytes: &[u8], tab_width: usize, - node_limit: u32, + graph_limit: usize, byte_limit: usize, language_override: Option, ) -> DiffResult { @@ -293,36 +294,39 @@ fn diff_file_content( unchanged::mark_unchanged(&lhs, &rhs, &mut change_map) }; - let possibly_changed_max = max_num_nodes(&possibly_changed); - if possibly_changed_max > node_limit { - info!( - "Found {} nodes, exceeding the limit {}", - possibly_changed_max, node_limit - ); + let mut exceeded_graph_limit = false; + + for (lhs_section_nodes, rhs_section_nodes) in possibly_changed { + init_next_prev(&lhs_section_nodes); + init_next_prev(&rhs_section_nodes); + + match mark_syntax( + lhs_section_nodes.get(0).copied(), + rhs_section_nodes.get(0).copied(), + &mut change_map, + graph_limit, + ) { + Ok(()) => {} + Err(ExceededGraphLimit {}) => { + exceeded_graph_limit = true; + break; + } + } + let language = language.unwrap(); + fix_all_sliders(language, &lhs_section_nodes, &mut change_map); + fix_all_sliders(language, &rhs_section_nodes, &mut change_map); + } + + if exceeded_graph_limit { let lhs_positions = line_parser::change_positions(&lhs_src, &rhs_src); let rhs_positions = line_parser::change_positions(&rhs_src, &lhs_src); ( - Some("Text (exceeded DFT_NODE_LIMIT)".into()), + Some("Text (exceeded DFT_GRAPH_LIMIT)".into()), lhs_positions, rhs_positions, ) } else { - for (lhs_section_nodes, rhs_section_nodes) in possibly_changed { - init_next_prev(&lhs_section_nodes); - init_next_prev(&rhs_section_nodes); - - mark_syntax( - lhs_section_nodes.get(0).copied(), - rhs_section_nodes.get(0).copied(), - &mut change_map, - ); - - let language = language.unwrap(); - fix_all_sliders(language, &lhs_section_nodes, &mut change_map); - fix_all_sliders(language, &rhs_section_nodes, &mut change_map); - } - let lhs_positions = syntax::change_positions(&lhs, &change_map); let rhs_positions = syntax::change_positions(&rhs, &change_map); (Some(ts_lang.name.into()), lhs_positions, rhs_positions) @@ -356,7 +360,7 @@ fn diff_directories<'a>( lhs_dir: &'a Path, rhs_dir: &'a Path, display_options: &DisplayOptions, - node_limit: u32, + graph_limit: usize, byte_limit: usize, language_override: Option, ) -> impl ParallelIterator + 'a { @@ -380,7 +384,7 @@ fn diff_directories<'a>( &rhs_path, &display_options, true, - node_limit, + graph_limit, byte_limit, language_override, ) @@ -495,33 +499,10 @@ fn print_diff_result(display_options: &DisplayOptions, summary: &DiffResult) { } } -/// What is the total number of nodes in `roots`? -fn num_nodes(roots: &[&syntax::Syntax]) -> u32 { - roots - .iter() - .map(|n| { - 1 + match n { - syntax::Syntax::List { - num_descendants, .. - } => *num_descendants, - syntax::Syntax::Atom { .. } => 0, - } - }) - .sum() -} - -fn max_num_nodes(roots_vec: &[(Vec<&syntax::Syntax>, Vec<&syntax::Syntax>)]) -> u32 { - roots_vec - .iter() - .map(|(lhs, rhs)| num_nodes(lhs) + num_nodes(rhs)) - .max() - .unwrap_or(0) -} - #[cfg(test)] mod tests { use super::*; - use crate::options::{DEFAULT_BYTE_LIMIT, DEFAULT_NODE_LIMIT, DEFAULT_TAB_WIDTH}; + use crate::options::{DEFAULT_BYTE_LIMIT, DEFAULT_GRAPH_LIMIT, DEFAULT_TAB_WIDTH}; #[test] fn test_diff_identical_content() { @@ -532,7 +513,7 @@ mod tests { s.as_bytes(), s.as_bytes(), DEFAULT_TAB_WIDTH, - DEFAULT_NODE_LIMIT, + DEFAULT_GRAPH_LIMIT, DEFAULT_BYTE_LIMIT, None, ); diff --git a/src/options.rs b/src/options.rs index c59e087f0..eb6cb0439 100644 --- a/src/options.rs +++ b/src/options.rs @@ -8,8 +8,11 @@ use const_format::formatcp; use crate::{display::style::BackgroundColor, parse::guess_language}; -pub const DEFAULT_NODE_LIMIT: u32 = 30_000; pub const DEFAULT_BYTE_LIMIT: usize = 1_000_000; +// Chosen experimentally: this is sufficiently many for all the sample +// files (the highest is slow_before/after.rs at 1.3M nodes), but +// small enough to terminate in ~5 seconds like the test file in #306. +pub const DEFAULT_GRAPH_LIMIT: usize = 3_000_000; pub const DEFAULT_TAB_WIDTH: usize = 8; const USAGE: &str = concat!(env!("CARGO_BIN_NAME"), " [OPTIONS] OLD-PATH NEW-PATH"); @@ -134,22 +137,22 @@ fn app() -> clap::Command<'static> { // TODO: support DFT_LANGUAGE for consistency ) .arg( - Arg::new("node-limit").long("node-limit") + Arg::new("byte-limit").long("byte-limit") .takes_value(true) .value_name("LIMIT") - .help(concat!("Use a text diff if the number of syntax nodes exceeds this number.")) - .default_value(formatcp!("{}", DEFAULT_NODE_LIMIT)) - .env("DFT_NODE_LIMIT") - .validator(|s| s.parse::()) + .help(concat!("Use a text diff if either input file exceeds this size.")) + .default_value(formatcp!("{}", DEFAULT_BYTE_LIMIT)) + .env("DFT_BYTE_LIMIT") + .validator(|s| s.parse::()) .required(false), ) .arg( - Arg::new("byte-limit").long("byte-limit") + Arg::new("graph-limit").long("graph-limit") .takes_value(true) .value_name("LIMIT") - .help(concat!("Use a text diff if either input file exceeds this size.")) - .default_value(formatcp!("{}", DEFAULT_BYTE_LIMIT)) - .env("DFT_BYTE_LIMIT") + .help(concat!("Use a text diff if the structural graph exceed this number of nodes in memory.")) + .default_value(formatcp!("{}", DEFAULT_GRAPH_LIMIT)) + .env("DFT_GRAPH_LIMIT") .validator(|s| s.parse::()) .required(false), ) @@ -172,7 +175,7 @@ pub enum DisplayMode { pub enum Mode { Diff { - node_limit: u32, + graph_limit: usize, byte_limit: usize, display_options: DisplayOptions, missing_as_empty: bool, @@ -324,10 +327,10 @@ pub fn parse_args() -> Mode { let syntax_highlight = matches.value_of("syntax-highlight") == Some("on"); - let node_limit = matches - .value_of("node-limit") + let graph_limit = matches + .value_of("graph-limit") .expect("Always present as we've given clap a default") - .parse::() + .parse::() .expect("Value already validated by clap"); let byte_limit = matches @@ -359,7 +362,7 @@ pub fn parse_args() -> Mode { }; Mode::Diff { - node_limit, + graph_limit, byte_limit, display_options, missing_as_empty,