From f10cfa00cc146c06efb78f1b27c6a295a5dfa7a5 Mon Sep 17 00:00:00 2001
From: Wilfred Hughes <me@wilfred.me.uk>
Date: Sun, 3 Jul 2022 18:23:17 -0700
Subject: [PATCH] Limit the graph size based on the number of predecessors
 found

Fixes #183
Fixes #306
---
 CHANGELOG.md         | 17 +++++++++
 src/diff/dijkstra.rs | 55 +++++++++++++++++---------
 src/main.rs          | 91 ++++++++++++++++++--------------------------
 src/options.rs       | 33 ++++++++--------
 4 files changed, 107 insertions(+), 89 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7767219c6..6706bcc6f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,22 @@
 ## 0.30 (unreleased)
 
+### Command Line Interface
+
+`--node-limit` has been replaced by `--graph-limit`, and the
+corresponding environment variable `DFT_NODE_LIMIT` has been replaced
+with `DFT_GRAPH_LIMIT`.
+
+`--graph-limit` makes difftastic give up on structural diffs after
+traversing this many graph nodes. `--node-limit` applied a limit based
+on an estimate of how big the graph would be, leading to very slow
+diffs when the estimate was wrong.
+
+This new setting sets a more accurate limit on difftastic
+performance. It also means that difftastic will always try a
+structural diff first. This will be slower for files that exceed
+`--graph-limit`, but guarantees that files with a small number of
+changes will always get a structural diff.
+
 ## 0.29.1 (released 13th June 2022)
 
 Fixed a major memory regression in 0.29 when performing large
diff --git a/src/diff/dijkstra.rs b/src/diff/dijkstra.rs
index a1694f8bf..ff6222f7d 100644
--- a/src/diff/dijkstra.rs
+++ b/src/diff/dijkstra.rs
@@ -15,8 +15,15 @@ use rustc_hash::FxHashMap;
 
 type PredecessorInfo<'a, 'b> = (u64, &'b Vertex<'a>);
 
+#[derive(Debug)]
+pub struct ExceededGraphLimit {}
+
 /// Return the shortest route from `start` to the end vertex.
-fn shortest_vertex_path(start: Vertex, size_hint: usize) -> Vec<Vertex> {
+fn shortest_vertex_path(
+    start: Vertex,
+    size_hint: usize,
+    graph_limit: usize,
+) -> Result<Vec<Vertex>, ExceededGraphLimit> {
     // We want to visit nodes with the shortest distance first, but
     // RadixHeapMap is a max-heap. Ensure nodes are wrapped with
     // Reverse to flip comparisons.
@@ -56,6 +63,9 @@ fn shortest_vertex_path(start: Vertex, size_hint: usize) -> Vec<Vertex> {
                         }
                     }
                 }
+                if predecessors.len() > graph_limit {
+                    return Err(ExceededGraphLimit {});
+                }
             }
             None => panic!("Ran out of graph nodes before reaching end"),
         }
@@ -77,7 +87,7 @@ fn shortest_vertex_path(start: Vertex, size_hint: usize) -> Vec<Vertex> {
     }
 
     vertex_route.reverse();
-    vertex_route
+    Ok(vertex_route)
 }
 
 fn shortest_path_with_edges<'a>(route: &[Vertex<'a>]) -> Vec<(Edge, Vertex<'a>)> {
@@ -101,9 +111,13 @@ fn shortest_path_with_edges<'a>(route: &[Vertex<'a>]) -> Vec<(Edge, Vertex<'a>)>
 ///
 /// The vec returned does not return the very last vertex. This is
 /// necessary because a route of N vertices only has N-1 edges.
-fn shortest_path(start: Vertex, size_hint: usize) -> Vec<(Edge, Vertex)> {
-    let vertex_path = shortest_vertex_path(start, size_hint);
-    shortest_path_with_edges(&vertex_path)
+fn shortest_path(
+    start: Vertex,
+    size_hint: usize,
+    graph_limit: usize,
+) -> Result<Vec<(Edge, Vertex)>, ExceededGraphLimit> {
+    let vertex_path = shortest_vertex_path(start, size_hint, graph_limit)?;
+    Ok(shortest_path_with_edges(&vertex_path))
 }
 
 fn edge_between<'a>(before: &Vertex<'a>, after: &Vertex<'a>) -> Edge {
@@ -177,7 +191,8 @@ pub fn mark_syntax<'a>(
     lhs_syntax: Option<&'a Syntax<'a>>,
     rhs_syntax: Option<&'a Syntax<'a>>,
     change_map: &mut ChangeMap<'a>,
-) {
+    graph_limit: usize,
+) -> Result<(), ExceededGraphLimit> {
     let lhs_node_count = node_count(lhs_syntax) as usize;
     let rhs_node_count = node_count(rhs_syntax) as usize;
     info!(
@@ -195,7 +210,7 @@ pub fn mark_syntax<'a>(
     let size_hint = lhs_node_count * rhs_node_count;
 
     let start = Vertex::new(lhs_syntax, rhs_syntax);
-    let route = shortest_path(start, size_hint);
+    let route = shortest_path(start, size_hint, graph_limit)?;
 
     let print_length = if env::var("DFT_VERBOSE").is_ok() {
         50
@@ -223,6 +238,7 @@ pub fn mark_syntax<'a>(
     );
 
     populate_change_map(&route, change_map);
+    Ok(())
 }
 
 #[cfg(test)]
@@ -231,6 +247,7 @@ mod tests {
     use crate::{
         diff::changes::ChangeKind,
         diff::graph::Edge::*,
+        options::DEFAULT_GRAPH_LIMIT,
         positions::SingleLineSpan,
         syntax::{init_all_info, AtomKind},
     };
@@ -264,7 +281,7 @@ mod tests {
         init_all_info(&[lhs], &[rhs]);
 
         let start = Vertex::new(Some(lhs), Some(rhs));
-        let route = shortest_path(start, 0);
+        let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
 
         let actions = route.iter().map(|(action, _)| *action).collect_vec();
         assert_eq!(
@@ -304,7 +321,7 @@ mod tests {
         init_all_info(&lhs, &rhs);
 
         let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
-        let route = shortest_path(start, 0);
+        let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
 
         let actions = route.iter().map(|(action, _)| *action).collect_vec();
         assert_eq!(
@@ -346,7 +363,7 @@ mod tests {
         init_all_info(&lhs, &rhs);
 
         let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
-        let route = shortest_path(start, 0);
+        let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
 
         let actions = route.iter().map(|(action, _)| *action).collect_vec();
         assert_eq!(
@@ -392,7 +409,7 @@ mod tests {
         init_all_info(&lhs, &rhs);
 
         let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
-        let route = shortest_path(start, 0);
+        let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
 
         let actions = route.iter().map(|(action, _)| *action).collect_vec();
         assert_eq!(
@@ -431,7 +448,7 @@ mod tests {
         init_all_info(&lhs, &rhs);
 
         let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
-        let route = shortest_path(start, 0);
+        let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
 
         let actions = route.iter().map(|(action, _)| *action).collect_vec();
         assert_eq!(
@@ -468,7 +485,7 @@ mod tests {
         init_all_info(&lhs, &rhs);
 
         let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
-        let route = shortest_path(start, 0);
+        let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
 
         let actions = route.iter().map(|(action, _)| *action).collect_vec();
         assert_eq!(
@@ -506,7 +523,7 @@ mod tests {
         init_all_info(&lhs, &rhs);
 
         let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
-        let route = shortest_path(start, 0);
+        let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
 
         let actions = route.iter().map(|(action, _)| *action).collect_vec();
         assert_eq!(
@@ -540,7 +557,7 @@ mod tests {
         init_all_info(&lhs, &rhs);
 
         let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
-        let route = shortest_path(start, 0);
+        let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
 
         let actions = route.iter().map(|(action, _)| *action).collect_vec();
         assert_eq!(
@@ -571,7 +588,7 @@ mod tests {
         init_all_info(&lhs, &rhs);
 
         let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
-        let route = shortest_path(start, 0);
+        let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
 
         let actions = route.iter().map(|(action, _)| *action).collect_vec();
         assert_eq!(
@@ -610,7 +627,7 @@ mod tests {
         init_all_info(&lhs, &rhs);
 
         let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
-        let route = shortest_path(start, 0);
+        let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
 
         let actions = route.iter().map(|(action, _)| *action).collect_vec();
         assert_eq!(
@@ -632,7 +649,7 @@ mod tests {
         init_all_info(&[lhs], &[rhs]);
 
         let mut change_map = ChangeMap::default();
-        mark_syntax(Some(lhs), Some(rhs), &mut change_map);
+        mark_syntax(Some(lhs), Some(rhs), &mut change_map, DEFAULT_GRAPH_LIMIT).unwrap();
 
         assert_eq!(change_map.get(lhs), Some(ChangeKind::Unchanged(rhs)));
         assert_eq!(change_map.get(rhs), Some(ChangeKind::Unchanged(lhs)));
@@ -646,7 +663,7 @@ mod tests {
         init_all_info(&[lhs], &[rhs]);
 
         let mut change_map = ChangeMap::default();
-        mark_syntax(Some(lhs), Some(rhs), &mut change_map);
+        mark_syntax(Some(lhs), Some(rhs), &mut change_map, DEFAULT_GRAPH_LIMIT).unwrap();
         assert_eq!(change_map.get(lhs), Some(ChangeKind::Novel));
         assert_eq!(change_map.get(rhs), Some(ChangeKind::Novel));
     }
diff --git a/src/main.rs b/src/main.rs
index 8c98d34b3..dbb6ba625 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -31,6 +31,7 @@ use crate::diff::{dijkstra, unchanged};
 use crate::display::hunks::{matched_pos_to_hunks, merge_adjacent};
 use crate::parse::syntax;
 use diff::changes::ChangeMap;
+use diff::dijkstra::ExceededGraphLimit;
 use display::context::opposite_positions;
 use files::{is_probably_binary, read_files_or_die, read_or_die, relative_paths_in_either};
 use log::info;
@@ -121,7 +122,7 @@ fn main() {
             }
         }
         Mode::Diff {
-            node_limit,
+            graph_limit,
             byte_limit,
             display_options,
             missing_as_empty,
@@ -150,7 +151,7 @@ fn main() {
                     lhs_path,
                     rhs_path,
                     &display_options,
-                    node_limit,
+                    graph_limit,
                     byte_limit,
                     language_override,
                 )
@@ -165,7 +166,7 @@ fn main() {
                     rhs_path,
                     &display_options,
                     missing_as_empty,
-                    node_limit,
+                    graph_limit,
                     byte_limit,
                     language_override,
                 );
@@ -183,7 +184,7 @@ fn diff_file(
     rhs_path: &Path,
     display_options: &DisplayOptions,
     missing_as_empty: bool,
-    node_limit: u32,
+    graph_limit: usize,
     byte_limit: usize,
     language_override: Option<parse::guess_language::Language>,
 ) -> DiffResult {
@@ -194,7 +195,7 @@ fn diff_file(
         &lhs_bytes,
         &rhs_bytes,
         display_options.tab_width,
-        node_limit,
+        graph_limit,
         byte_limit,
         language_override,
     )
@@ -206,7 +207,7 @@ fn diff_file_content(
     lhs_bytes: &[u8],
     rhs_bytes: &[u8],
     tab_width: usize,
-    node_limit: u32,
+    graph_limit: usize,
     byte_limit: usize,
     language_override: Option<parse::guess_language::Language>,
 ) -> DiffResult {
@@ -293,36 +294,39 @@ fn diff_file_content(
                 unchanged::mark_unchanged(&lhs, &rhs, &mut change_map)
             };
 
-            let possibly_changed_max = max_num_nodes(&possibly_changed);
-            if possibly_changed_max > node_limit {
-                info!(
-                    "Found {} nodes, exceeding the limit {}",
-                    possibly_changed_max, node_limit
-                );
+            let mut exceeded_graph_limit = false;
+
+            for (lhs_section_nodes, rhs_section_nodes) in possibly_changed {
+                init_next_prev(&lhs_section_nodes);
+                init_next_prev(&rhs_section_nodes);
+
+                match mark_syntax(
+                    lhs_section_nodes.get(0).copied(),
+                    rhs_section_nodes.get(0).copied(),
+                    &mut change_map,
+                    graph_limit,
+                ) {
+                    Ok(()) => {}
+                    Err(ExceededGraphLimit {}) => {
+                        exceeded_graph_limit = true;
+                        break;
+                    }
+                }
 
+                let language = language.unwrap();
+                fix_all_sliders(language, &lhs_section_nodes, &mut change_map);
+                fix_all_sliders(language, &rhs_section_nodes, &mut change_map);
+            }
+
+            if exceeded_graph_limit {
                 let lhs_positions = line_parser::change_positions(&lhs_src, &rhs_src);
                 let rhs_positions = line_parser::change_positions(&rhs_src, &lhs_src);
                 (
-                    Some("Text (exceeded DFT_NODE_LIMIT)".into()),
+                    Some("Text (exceeded DFT_GRAPH_LIMIT)".into()),
                     lhs_positions,
                     rhs_positions,
                 )
             } else {
-                for (lhs_section_nodes, rhs_section_nodes) in possibly_changed {
-                    init_next_prev(&lhs_section_nodes);
-                    init_next_prev(&rhs_section_nodes);
-
-                    mark_syntax(
-                        lhs_section_nodes.get(0).copied(),
-                        rhs_section_nodes.get(0).copied(),
-                        &mut change_map,
-                    );
-
-                    let language = language.unwrap();
-                    fix_all_sliders(language, &lhs_section_nodes, &mut change_map);
-                    fix_all_sliders(language, &rhs_section_nodes, &mut change_map);
-                }
-
                 let lhs_positions = syntax::change_positions(&lhs, &change_map);
                 let rhs_positions = syntax::change_positions(&rhs, &change_map);
                 (Some(ts_lang.name.into()), lhs_positions, rhs_positions)
@@ -356,7 +360,7 @@ fn diff_directories<'a>(
     lhs_dir: &'a Path,
     rhs_dir: &'a Path,
     display_options: &DisplayOptions,
-    node_limit: u32,
+    graph_limit: usize,
     byte_limit: usize,
     language_override: Option<parse::guess_language::Language>,
 ) -> impl ParallelIterator<Item = DiffResult> + 'a {
@@ -380,7 +384,7 @@ fn diff_directories<'a>(
             &rhs_path,
             &display_options,
             true,
-            node_limit,
+            graph_limit,
             byte_limit,
             language_override,
         )
@@ -495,33 +499,10 @@ fn print_diff_result(display_options: &DisplayOptions, summary: &DiffResult) {
     }
 }
 
-/// What is the total number of nodes in `roots`?
-fn num_nodes(roots: &[&syntax::Syntax]) -> u32 {
-    roots
-        .iter()
-        .map(|n| {
-            1 + match n {
-                syntax::Syntax::List {
-                    num_descendants, ..
-                } => *num_descendants,
-                syntax::Syntax::Atom { .. } => 0,
-            }
-        })
-        .sum()
-}
-
-fn max_num_nodes(roots_vec: &[(Vec<&syntax::Syntax>, Vec<&syntax::Syntax>)]) -> u32 {
-    roots_vec
-        .iter()
-        .map(|(lhs, rhs)| num_nodes(lhs) + num_nodes(rhs))
-        .max()
-        .unwrap_or(0)
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::options::{DEFAULT_BYTE_LIMIT, DEFAULT_NODE_LIMIT, DEFAULT_TAB_WIDTH};
+    use crate::options::{DEFAULT_BYTE_LIMIT, DEFAULT_GRAPH_LIMIT, DEFAULT_TAB_WIDTH};
 
     #[test]
     fn test_diff_identical_content() {
@@ -532,7 +513,7 @@ mod tests {
             s.as_bytes(),
             s.as_bytes(),
             DEFAULT_TAB_WIDTH,
-            DEFAULT_NODE_LIMIT,
+            DEFAULT_GRAPH_LIMIT,
             DEFAULT_BYTE_LIMIT,
             None,
         );
diff --git a/src/options.rs b/src/options.rs
index c59e087f0..eb6cb0439 100644
--- a/src/options.rs
+++ b/src/options.rs
@@ -8,8 +8,11 @@ use const_format::formatcp;
 
 use crate::{display::style::BackgroundColor, parse::guess_language};
 
-pub const DEFAULT_NODE_LIMIT: u32 = 30_000;
 pub const DEFAULT_BYTE_LIMIT: usize = 1_000_000;
+// Chosen experimentally: this is sufficiently many for all the sample
+// files (the highest is slow_before/after.rs at 1.3M nodes), but
+// small enough to terminate in ~5 seconds like the test file in #306.
+pub const DEFAULT_GRAPH_LIMIT: usize = 3_000_000;
 pub const DEFAULT_TAB_WIDTH: usize = 8;
 
 const USAGE: &str = concat!(env!("CARGO_BIN_NAME"), " [OPTIONS] OLD-PATH NEW-PATH");
@@ -134,22 +137,22 @@ fn app() -> clap::Command<'static> {
                 // TODO: support DFT_LANGUAGE for consistency
         )
         .arg(
-            Arg::new("node-limit").long("node-limit")
+            Arg::new("byte-limit").long("byte-limit")
                 .takes_value(true)
                 .value_name("LIMIT")
-                .help(concat!("Use a text diff if the number of syntax nodes exceeds this number."))
-                .default_value(formatcp!("{}", DEFAULT_NODE_LIMIT))
-                .env("DFT_NODE_LIMIT")
-                .validator(|s| s.parse::<u32>())
+                .help(concat!("Use a text diff if either input file exceeds this size."))
+                .default_value(formatcp!("{}", DEFAULT_BYTE_LIMIT))
+                .env("DFT_BYTE_LIMIT")
+                .validator(|s| s.parse::<usize>())
                 .required(false),
         )
         .arg(
-            Arg::new("byte-limit").long("byte-limit")
+            Arg::new("graph-limit").long("graph-limit")
                 .takes_value(true)
                 .value_name("LIMIT")
-                .help(concat!("Use a text diff if either input file exceeds this size."))
-                .default_value(formatcp!("{}", DEFAULT_BYTE_LIMIT))
-                .env("DFT_BYTE_LIMIT")
+                .help(concat!("Use a text diff if the structural graph exceed this number of nodes in memory."))
+                .default_value(formatcp!("{}", DEFAULT_GRAPH_LIMIT))
+                .env("DFT_GRAPH_LIMIT")
                 .validator(|s| s.parse::<usize>())
                 .required(false),
         )
@@ -172,7 +175,7 @@ pub enum DisplayMode {
 
 pub enum Mode {
     Diff {
-        node_limit: u32,
+        graph_limit: usize,
         byte_limit: usize,
         display_options: DisplayOptions,
         missing_as_empty: bool,
@@ -324,10 +327,10 @@ pub fn parse_args() -> Mode {
 
     let syntax_highlight = matches.value_of("syntax-highlight") == Some("on");
 
-    let node_limit = matches
-        .value_of("node-limit")
+    let graph_limit = matches
+        .value_of("graph-limit")
         .expect("Always present as we've given clap a default")
-        .parse::<u32>()
+        .parse::<usize>()
         .expect("Value already validated by clap");
 
     let byte_limit = matches
@@ -359,7 +362,7 @@ pub fn parse_args() -> Mode {
     };
 
     Mode::Diff {
-        node_limit,
+        graph_limit,
         byte_limit,
         display_options,
         missing_as_empty,