Limit the graph size based on the number of predecessors found

Fixes #183 Fixes #306
2022-07-03 18:23:17 +07:00 · 2022-07-03 18:23:17 +07:00 · f10cfa00cc
parent 338e815e9b
commit f10cfa00cc
4 changed files with 107 additions and 89 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,22 @@
 ## 0.30 (unreleased)

+### Command Line Interface
+
+`--node-limit` has been replaced by `--graph-limit`, and the
+corresponding environment variable `DFT_NODE_LIMIT` has been replaced
+with `DFT_GRAPH_LIMIT`.
+
+`--graph-limit` makes difftastic give up on structural diffs after
+traversing this many graph nodes. `--node-limit` applied a limit based
+on an estimate of how big the graph would be, leading to very slow
+diffs when the estimate was wrong.
+
+This new setting sets a more accurate limit on difftastic
+performance. It also means that difftastic will always try a
+structural diff first. This will be slower for files that exceed
+`--graph-limit`, but guarantees that files with a small number of
+changes will always get a structural diff.
+
 ## 0.29.1 (released 13th June 2022)

 Fixed a major memory regression in 0.29 when performing large
--- a/src/diff/dijkstra.rs
+++ b/src/diff/dijkstra.rs
@ -15,8 +15,15 @@ use rustc_hash::FxHashMap;

 type PredecessorInfo<'a, 'b> = (u64, &'b Vertex<'a>);

+#[derive(Debug)]
+pub struct ExceededGraphLimit {}
+
 /// Return the shortest route from `start` to the end vertex.
-fn shortest_vertex_path(start: Vertex, size_hint: usize) -> Vec<Vertex> {
+fn shortest_vertex_path(
+    start: Vertex,
+    size_hint: usize,
+    graph_limit: usize,
+) -> Result<Vec<Vertex>, ExceededGraphLimit> {
    // We want to visit nodes with the shortest distance first, but
    // RadixHeapMap is a max-heap. Ensure nodes are wrapped with
    // Reverse to flip comparisons.
@ -56,6 +63,9 @@ fn shortest_vertex_path(start: Vertex, size_hint: usize) -> Vec<Vertex> {
                        }
                    }
                }
+                if predecessors.len() > graph_limit {
+                    return Err(ExceededGraphLimit {});
+                }
            }
            None => panic!("Ran out of graph nodes before reaching end"),
        }
@ -77,7 +87,7 @@ fn shortest_vertex_path(start: Vertex, size_hint: usize) -> Vec<Vertex> {
    }

    vertex_route.reverse();
-    vertex_route
+    Ok(vertex_route)
 }

 fn shortest_path_with_edges<'a>(route: &[Vertex<'a>]) -> Vec<(Edge, Vertex<'a>)> {
@ -101,9 +111,13 @@ fn shortest_path_with_edges<'a>(route: &[Vertex<'a>]) -> Vec<(Edge, Vertex<'a>)>
 ///
 /// The vec returned does not return the very last vertex. This is
 /// necessary because a route of N vertices only has N-1 edges.
-fn shortest_path(start: Vertex, size_hint: usize) -> Vec<(Edge, Vertex)> {
-    let vertex_path = shortest_vertex_path(start, size_hint);
-    shortest_path_with_edges(&vertex_path)
+fn shortest_path(
+    start: Vertex,
+    size_hint: usize,
+    graph_limit: usize,
+) -> Result<Vec<(Edge, Vertex)>, ExceededGraphLimit> {
+    let vertex_path = shortest_vertex_path(start, size_hint, graph_limit)?;
+    Ok(shortest_path_with_edges(&vertex_path))
 }

 fn edge_between<'a>(before: &Vertex<'a>, after: &Vertex<'a>) -> Edge {
@ -177,7 +191,8 @@ pub fn mark_syntax<'a>(
    lhs_syntax: Option<&'a Syntax<'a>>,
    rhs_syntax: Option<&'a Syntax<'a>>,
    change_map: &mut ChangeMap<'a>,
-) {
+    graph_limit: usize,
+) -> Result<(), ExceededGraphLimit> {
    let lhs_node_count = node_count(lhs_syntax) as usize;
    let rhs_node_count = node_count(rhs_syntax) as usize;
    info!(
@ -195,7 +210,7 @@ pub fn mark_syntax<'a>(
    let size_hint = lhs_node_count * rhs_node_count;

    let start = Vertex::new(lhs_syntax, rhs_syntax);
-    let route = shortest_path(start, size_hint);
+    let route = shortest_path(start, size_hint, graph_limit)?;

    let print_length = if env::var("DFT_VERBOSE").is_ok() {
        50
@ -223,6 +238,7 @@ pub fn mark_syntax<'a>(
    );

    populate_change_map(&route, change_map);
+    Ok(())
 }

 #[cfg(test)]
@ -231,6 +247,7 @@ mod tests {
    use crate::{
        diff::changes::ChangeKind,
        diff::graph::Edge::*,
+        options::DEFAULT_GRAPH_LIMIT,
        positions::SingleLineSpan,
        syntax::{init_all_info, AtomKind},
    };
@ -264,7 +281,7 @@ mod tests {
        init_all_info(&[lhs], &[rhs]);

        let start = Vertex::new(Some(lhs), Some(rhs));
-        let route = shortest_path(start, 0);
+        let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();

        let actions = route.iter().map(|(action, _)| *action).collect_vec();
        assert_eq!(
@ -304,7 +321,7 @@ mod tests {
        init_all_info(&lhs, &rhs);

        let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
-        let route = shortest_path(start, 0);
+        let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();

        let actions = route.iter().map(|(action, _)| *action).collect_vec();
        assert_eq!(
@ -346,7 +363,7 @@ mod tests {
        init_all_info(&lhs, &rhs);

        let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
-        let route = shortest_path(start, 0);
+        let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();

        let actions = route.iter().map(|(action, _)| *action).collect_vec();
        assert_eq!(
@ -392,7 +409,7 @@ mod tests {
        init_all_info(&lhs, &rhs);

        let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
-        let route = shortest_path(start, 0);
+        let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();

        let actions = route.iter().map(|(action, _)| *action).collect_vec();
        assert_eq!(
@ -431,7 +448,7 @@ mod tests {
        init_all_info(&lhs, &rhs);

        let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
-        let route = shortest_path(start, 0);
+        let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();

        let actions = route.iter().map(|(action, _)| *action).collect_vec();
        assert_eq!(
@ -468,7 +485,7 @@ mod tests {
        init_all_info(&lhs, &rhs);

        let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
-        let route = shortest_path(start, 0);
+        let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();

        let actions = route.iter().map(|(action, _)| *action).collect_vec();
        assert_eq!(
@ -506,7 +523,7 @@ mod tests {
        init_all_info(&lhs, &rhs);

        let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
-        let route = shortest_path(start, 0);
+        let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();

        let actions = route.iter().map(|(action, _)| *action).collect_vec();
        assert_eq!(
@ -540,7 +557,7 @@ mod tests {
        init_all_info(&lhs, &rhs);

        let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
-        let route = shortest_path(start, 0);
+        let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();

        let actions = route.iter().map(|(action, _)| *action).collect_vec();
        assert_eq!(
@ -571,7 +588,7 @@ mod tests {
        init_all_info(&lhs, &rhs);

        let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
-        let route = shortest_path(start, 0);
+        let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();

        let actions = route.iter().map(|(action, _)| *action).collect_vec();
        assert_eq!(
@ -610,7 +627,7 @@ mod tests {
        init_all_info(&lhs, &rhs);

        let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
-        let route = shortest_path(start, 0);
+        let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();

        let actions = route.iter().map(|(action, _)| *action).collect_vec();
        assert_eq!(
@ -632,7 +649,7 @@ mod tests {
        init_all_info(&[lhs], &[rhs]);

        let mut change_map = ChangeMap::default();
-        mark_syntax(Some(lhs), Some(rhs), &mut change_map);
+        mark_syntax(Some(lhs), Some(rhs), &mut change_map, DEFAULT_GRAPH_LIMIT).unwrap();

        assert_eq!(change_map.get(lhs), Some(ChangeKind::Unchanged(rhs)));
        assert_eq!(change_map.get(rhs), Some(ChangeKind::Unchanged(lhs)));
@ -646,7 +663,7 @@ mod tests {
        init_all_info(&[lhs], &[rhs]);

        let mut change_map = ChangeMap::default();
-        mark_syntax(Some(lhs), Some(rhs), &mut change_map);
+        mark_syntax(Some(lhs), Some(rhs), &mut change_map, DEFAULT_GRAPH_LIMIT).unwrap();
        assert_eq!(change_map.get(lhs), Some(ChangeKind::Novel));
        assert_eq!(change_map.get(rhs), Some(ChangeKind::Novel));
    }
--- a/src/main.rs
+++ b/src/main.rs
@ -31,6 +31,7 @@ use crate::diff::{dijkstra, unchanged};
 use crate::display::hunks::{matched_pos_to_hunks, merge_adjacent};
 use crate::parse::syntax;
 use diff::changes::ChangeMap;
+use diff::dijkstra::ExceededGraphLimit;
 use display::context::opposite_positions;
 use files::{is_probably_binary, read_files_or_die, read_or_die, relative_paths_in_either};
 use log::info;
@ -121,7 +122,7 @@ fn main() {
            }
        }
        Mode::Diff {
-            node_limit,
+            graph_limit,
            byte_limit,
            display_options,
            missing_as_empty,
@ -150,7 +151,7 @@ fn main() {
                    lhs_path,
                    rhs_path,
                    &display_options,
-                    node_limit,
+                    graph_limit,
                    byte_limit,
                    language_override,
                )
@ -165,7 +166,7 @@ fn main() {
                    rhs_path,
                    &display_options,
                    missing_as_empty,
-                    node_limit,
+                    graph_limit,
                    byte_limit,
                    language_override,
                );
@ -183,7 +184,7 @@ fn diff_file(
    rhs_path: &Path,
    display_options: &DisplayOptions,
    missing_as_empty: bool,
-    node_limit: u32,
+    graph_limit: usize,
    byte_limit: usize,
    language_override: Option<parse::guess_language::Language>,
 ) -> DiffResult {
@ -194,7 +195,7 @@ fn diff_file(
        &lhs_bytes,
        &rhs_bytes,
        display_options.tab_width,
-        node_limit,
+        graph_limit,
        byte_limit,
        language_override,
    )
@ -206,7 +207,7 @@ fn diff_file_content(
    lhs_bytes: &[u8],
    rhs_bytes: &[u8],
    tab_width: usize,
-    node_limit: u32,
+    graph_limit: usize,
    byte_limit: usize,
    language_override: Option<parse::guess_language::Language>,
 ) -> DiffResult {
@ -293,36 +294,39 @@ fn diff_file_content(
                unchanged::mark_unchanged(&lhs, &rhs, &mut change_map)
            };

-            let possibly_changed_max = max_num_nodes(&possibly_changed);
-            if possibly_changed_max > node_limit {
-                info!(
-                    "Found {} nodes, exceeding the limit {}",
-                    possibly_changed_max, node_limit
-                );
+            let mut exceeded_graph_limit = false;
+
+            for (lhs_section_nodes, rhs_section_nodes) in possibly_changed {
+                init_next_prev(&lhs_section_nodes);
+                init_next_prev(&rhs_section_nodes);
+
+                match mark_syntax(
+                    lhs_section_nodes.get(0).copied(),
+                    rhs_section_nodes.get(0).copied(),
+                    &mut change_map,
+                    graph_limit,
+                ) {
+                    Ok(()) => {}
+                    Err(ExceededGraphLimit {}) => {
+                        exceeded_graph_limit = true;
+                        break;
+                    }
+                }

+                let language = language.unwrap();
+                fix_all_sliders(language, &lhs_section_nodes, &mut change_map);
+                fix_all_sliders(language, &rhs_section_nodes, &mut change_map);
+            }
+
+            if exceeded_graph_limit {
                let lhs_positions = line_parser::change_positions(&lhs_src, &rhs_src);
                let rhs_positions = line_parser::change_positions(&rhs_src, &lhs_src);
                (
-                    Some("Text (exceeded DFT_NODE_LIMIT)".into()),
+                    Some("Text (exceeded DFT_GRAPH_LIMIT)".into()),
                    lhs_positions,
                    rhs_positions,
                )
            } else {
-                for (lhs_section_nodes, rhs_section_nodes) in possibly_changed {
-                    init_next_prev(&lhs_section_nodes);
-                    init_next_prev(&rhs_section_nodes);
-
-                    mark_syntax(
-                        lhs_section_nodes.get(0).copied(),
-                        rhs_section_nodes.get(0).copied(),
-                        &mut change_map,
-                    );
-
-                    let language = language.unwrap();
-                    fix_all_sliders(language, &lhs_section_nodes, &mut change_map);
-                    fix_all_sliders(language, &rhs_section_nodes, &mut change_map);
-                }
-
                let lhs_positions = syntax::change_positions(&lhs, &change_map);
                let rhs_positions = syntax::change_positions(&rhs, &change_map);
                (Some(ts_lang.name.into()), lhs_positions, rhs_positions)
@ -356,7 +360,7 @@ fn diff_directories<'a>(
    lhs_dir: &'a Path,
    rhs_dir: &'a Path,
    display_options: &DisplayOptions,
-    node_limit: u32,
+    graph_limit: usize,
    byte_limit: usize,
    language_override: Option<parse::guess_language::Language>,
 ) -> impl ParallelIterator<Item = DiffResult> + 'a {
@ -380,7 +384,7 @@ fn diff_directories<'a>(
            &rhs_path,
            &display_options,
            true,
-            node_limit,
+            graph_limit,
            byte_limit,
            language_override,
        )
@ -495,33 +499,10 @@ fn print_diff_result(display_options: &DisplayOptions, summary: &DiffResult) {
    }
 }

-/// What is the total number of nodes in `roots`?
-fn num_nodes(roots: &[&syntax::Syntax]) -> u32 {
-    roots
-        .iter()
-        .map(|n| {
-            1 + match n {
-                syntax::Syntax::List {
-                    num_descendants, ..
-                } => *num_descendants,
-                syntax::Syntax::Atom { .. } => 0,
-            }
-        })
-        .sum()
-}
-
-fn max_num_nodes(roots_vec: &[(Vec<&syntax::Syntax>, Vec<&syntax::Syntax>)]) -> u32 {
-    roots_vec
-        .iter()
-        .map(|(lhs, rhs)| num_nodes(lhs) + num_nodes(rhs))
-        .max()
-        .unwrap_or(0)
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
-    use crate::options::{DEFAULT_BYTE_LIMIT, DEFAULT_NODE_LIMIT, DEFAULT_TAB_WIDTH};
+    use crate::options::{DEFAULT_BYTE_LIMIT, DEFAULT_GRAPH_LIMIT, DEFAULT_TAB_WIDTH};

    #[test]
    fn test_diff_identical_content() {
@ -532,7 +513,7 @@ mod tests {
            s.as_bytes(),
            s.as_bytes(),
            DEFAULT_TAB_WIDTH,
-            DEFAULT_NODE_LIMIT,
+            DEFAULT_GRAPH_LIMIT,
            DEFAULT_BYTE_LIMIT,
            None,
        );
--- a/src/options.rs
+++ b/src/options.rs
@ -8,8 +8,11 @@ use const_format::formatcp;

 use crate::{display::style::BackgroundColor, parse::guess_language};

-pub const DEFAULT_NODE_LIMIT: u32 = 30_000;
 pub const DEFAULT_BYTE_LIMIT: usize = 1_000_000;
+// Chosen experimentally: this is sufficiently many for all the sample
+// files (the highest is slow_before/after.rs at 1.3M nodes), but
+// small enough to terminate in ~5 seconds like the test file in #306.
+pub const DEFAULT_GRAPH_LIMIT: usize = 3_000_000;
 pub const DEFAULT_TAB_WIDTH: usize = 8;

 const USAGE: &str = concat!(env!("CARGO_BIN_NAME"), " [OPTIONS] OLD-PATH NEW-PATH");
@ -134,22 +137,22 @@ fn app() -> clap::Command<'static> {
                // TODO: support DFT_LANGUAGE for consistency
        )
        .arg(
-            Arg::new("node-limit").long("node-limit")
+            Arg::new("byte-limit").long("byte-limit")
                .takes_value(true)
                .value_name("LIMIT")
-                .help(concat!("Use a text diff if the number of syntax nodes exceeds this number."))
-                .default_value(formatcp!("{}", DEFAULT_NODE_LIMIT))
-                .env("DFT_NODE_LIMIT")
-                .validator(|s| s.parse::<u32>())
+                .help(concat!("Use a text diff if either input file exceeds this size."))
+                .default_value(formatcp!("{}", DEFAULT_BYTE_LIMIT))
+                .env("DFT_BYTE_LIMIT")
+                .validator(|s| s.parse::<usize>())
                .required(false),
        )
        .arg(
-            Arg::new("byte-limit").long("byte-limit")
+            Arg::new("graph-limit").long("graph-limit")
                .takes_value(true)
                .value_name("LIMIT")
-                .help(concat!("Use a text diff if either input file exceeds this size."))
-                .default_value(formatcp!("{}", DEFAULT_BYTE_LIMIT))
-                .env("DFT_BYTE_LIMIT")
+                .help(concat!("Use a text diff if the structural graph exceed this number of nodes in memory."))
+                .default_value(formatcp!("{}", DEFAULT_GRAPH_LIMIT))
+                .env("DFT_GRAPH_LIMIT")
                .validator(|s| s.parse::<usize>())
                .required(false),
        )
@ -172,7 +175,7 @@ pub enum DisplayMode {

 pub enum Mode {
    Diff {
-        node_limit: u32,
+        graph_limit: usize,
        byte_limit: usize,
        display_options: DisplayOptions,
        missing_as_empty: bool,
@ -324,10 +327,10 @@ pub fn parse_args() -> Mode {

    let syntax_highlight = matches.value_of("syntax-highlight") == Some("on");

-    let node_limit = matches
-        .value_of("node-limit")
+    let graph_limit = matches
+        .value_of("graph-limit")
        .expect("Always present as we've given clap a default")
-        .parse::<u32>()
+        .parse::<usize>()
        .expect("Value already validated by clap");

    let byte_limit = matches
@ -359,7 +362,7 @@ pub fn parse_args() -> Mode {
    };

    Mode::Diff {
-        node_limit,
+        graph_limit,
        byte_limit,
        display_options,
        missing_as_empty,