Merge branch 'master' into add-html-parser

pull/305/head
Wilfred Hughes 2022-07-03 21:40:57 +07:00 committed by GitHub
commit d96ccc884f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 138 additions and 109 deletions

@ -4,6 +4,23 @@
Added support for HTML.
### Command Line Interface
`--node-limit` has been replaced by `--graph-limit`, and the
corresponding environment variable `DFT_NODE_LIMIT` has been replaced
with `DFT_GRAPH_LIMIT`.
`--graph-limit` makes difftastic give up on structural diffs after
traversing this many graph nodes. `--node-limit` applied a limit based
on an estimate of how big the graph would be, leading to very slow
diffs when the estimate was wrong.
This new setting sets a more accurate limit on difftastic
performance. It also means that difftastic will always try a
structural diff first. This will be slower for files that exceed
`--graph-limit`, but guarantees that files with a small number of
changes will always get a structural diff.
## 0.29.1 (released 13th June 2022)
Fixed a major memory regression in 0.29 when performing large

@ -15,8 +15,15 @@ use rustc_hash::FxHashMap;
type PredecessorInfo<'a, 'b> = (u64, &'b Vertex<'a>);
#[derive(Debug)]
pub struct ExceededGraphLimit {}
/// Return the shortest route from `start` to the end vertex.
fn shortest_vertex_path(start: Vertex, size_hint: usize) -> Vec<Vertex> {
fn shortest_vertex_path(
start: Vertex,
size_hint: usize,
graph_limit: usize,
) -> Result<Vec<Vertex>, ExceededGraphLimit> {
// We want to visit nodes with the shortest distance first, but
// RadixHeapMap is a max-heap. Ensure nodes are wrapped with
// Reverse to flip comparisons.
@ -56,6 +63,9 @@ fn shortest_vertex_path(start: Vertex, size_hint: usize) -> Vec<Vertex> {
}
}
}
if predecessors.len() > graph_limit {
return Err(ExceededGraphLimit {});
}
}
None => panic!("Ran out of graph nodes before reaching end"),
}
@ -77,7 +87,7 @@ fn shortest_vertex_path(start: Vertex, size_hint: usize) -> Vec<Vertex> {
}
vertex_route.reverse();
vertex_route
Ok(vertex_route)
}
fn shortest_path_with_edges<'a>(route: &[Vertex<'a>]) -> Vec<(Edge, Vertex<'a>)> {
@ -101,9 +111,13 @@ fn shortest_path_with_edges<'a>(route: &[Vertex<'a>]) -> Vec<(Edge, Vertex<'a>)>
///
/// The vec returned does not return the very last vertex. This is
/// necessary because a route of N vertices only has N-1 edges.
fn shortest_path(start: Vertex, size_hint: usize) -> Vec<(Edge, Vertex)> {
let vertex_path = shortest_vertex_path(start, size_hint);
shortest_path_with_edges(&vertex_path)
fn shortest_path(
start: Vertex,
size_hint: usize,
graph_limit: usize,
) -> Result<Vec<(Edge, Vertex)>, ExceededGraphLimit> {
let vertex_path = shortest_vertex_path(start, size_hint, graph_limit)?;
Ok(shortest_path_with_edges(&vertex_path))
}
fn edge_between<'a>(before: &Vertex<'a>, after: &Vertex<'a>) -> Edge {
@ -113,14 +127,29 @@ fn edge_between<'a>(before: &Vertex<'a>, after: &Vertex<'a>) -> Edge {
let vertex_arena = Bump::new();
neighbours(before, &mut neighbour_buf, &vertex_arena);
let mut shortest_edge: Option<Edge> = None;
for neighbour in &mut neighbour_buf {
if let Some((edge, next)) = neighbour.take() {
// If there are multiple edges that can take us to `next`,
// prefer the shortest.
if next == after {
return edge;
let is_shorter = match shortest_edge {
Some(prev_edge) => edge.cost() < prev_edge.cost(),
None => true,
};
if is_shorter {
shortest_edge = Some(edge);
}
}
}
}
if let Some(edge) = shortest_edge {
return edge;
}
panic!(
"Expected a route between the two vertices {:#?} and {:#?}",
before, after
@ -162,7 +191,8 @@ pub fn mark_syntax<'a>(
lhs_syntax: Option<&'a Syntax<'a>>,
rhs_syntax: Option<&'a Syntax<'a>>,
change_map: &mut ChangeMap<'a>,
) {
graph_limit: usize,
) -> Result<(), ExceededGraphLimit> {
let lhs_node_count = node_count(lhs_syntax) as usize;
let rhs_node_count = node_count(rhs_syntax) as usize;
info!(
@ -180,7 +210,7 @@ pub fn mark_syntax<'a>(
let size_hint = lhs_node_count * rhs_node_count;
let start = Vertex::new(lhs_syntax, rhs_syntax);
let route = shortest_path(start, size_hint);
let route = shortest_path(start, size_hint, graph_limit)?;
let print_length = if env::var("DFT_VERBOSE").is_ok() {
50
@ -208,6 +238,7 @@ pub fn mark_syntax<'a>(
);
populate_change_map(&route, change_map);
Ok(())
}
#[cfg(test)]
@ -216,6 +247,7 @@ mod tests {
use crate::{
diff::changes::ChangeKind,
diff::graph::Edge::*,
options::DEFAULT_GRAPH_LIMIT,
positions::SingleLineSpan,
syntax::{init_all_info, AtomKind},
};
@ -249,7 +281,7 @@ mod tests {
init_all_info(&[lhs], &[rhs]);
let start = Vertex::new(Some(lhs), Some(rhs));
let route = shortest_path(start, 0);
let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
let actions = route.iter().map(|(action, _)| *action).collect_vec();
assert_eq!(
@ -289,7 +321,7 @@ mod tests {
init_all_info(&lhs, &rhs);
let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
let route = shortest_path(start, 0);
let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
let actions = route.iter().map(|(action, _)| *action).collect_vec();
assert_eq!(
@ -331,7 +363,7 @@ mod tests {
init_all_info(&lhs, &rhs);
let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
let route = shortest_path(start, 0);
let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
let actions = route.iter().map(|(action, _)| *action).collect_vec();
assert_eq!(
@ -377,7 +409,7 @@ mod tests {
init_all_info(&lhs, &rhs);
let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
let route = shortest_path(start, 0);
let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
let actions = route.iter().map(|(action, _)| *action).collect_vec();
assert_eq!(
@ -416,7 +448,7 @@ mod tests {
init_all_info(&lhs, &rhs);
let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
let route = shortest_path(start, 0);
let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
let actions = route.iter().map(|(action, _)| *action).collect_vec();
assert_eq!(
@ -453,7 +485,7 @@ mod tests {
init_all_info(&lhs, &rhs);
let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
let route = shortest_path(start, 0);
let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
let actions = route.iter().map(|(action, _)| *action).collect_vec();
assert_eq!(
@ -491,7 +523,7 @@ mod tests {
init_all_info(&lhs, &rhs);
let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
let route = shortest_path(start, 0);
let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
let actions = route.iter().map(|(action, _)| *action).collect_vec();
assert_eq!(
@ -525,7 +557,7 @@ mod tests {
init_all_info(&lhs, &rhs);
let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
let route = shortest_path(start, 0);
let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
let actions = route.iter().map(|(action, _)| *action).collect_vec();
assert_eq!(
@ -556,7 +588,7 @@ mod tests {
init_all_info(&lhs, &rhs);
let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
let route = shortest_path(start, 0);
let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
let actions = route.iter().map(|(action, _)| *action).collect_vec();
assert_eq!(
@ -595,7 +627,7 @@ mod tests {
init_all_info(&lhs, &rhs);
let start = Vertex::new(lhs.get(0).copied(), rhs.get(0).copied());
let route = shortest_path(start, 0);
let route = shortest_path(start, 0, DEFAULT_GRAPH_LIMIT).unwrap();
let actions = route.iter().map(|(action, _)| *action).collect_vec();
assert_eq!(
@ -617,7 +649,7 @@ mod tests {
init_all_info(&[lhs], &[rhs]);
let mut change_map = ChangeMap::default();
mark_syntax(Some(lhs), Some(rhs), &mut change_map);
mark_syntax(Some(lhs), Some(rhs), &mut change_map, DEFAULT_GRAPH_LIMIT).unwrap();
assert_eq!(change_map.get(lhs), Some(ChangeKind::Unchanged(rhs)));
assert_eq!(change_map.get(rhs), Some(ChangeKind::Unchanged(lhs)));
@ -631,7 +663,7 @@ mod tests {
init_all_info(&[lhs], &[rhs]);
let mut change_map = ChangeMap::default();
mark_syntax(Some(lhs), Some(rhs), &mut change_map);
mark_syntax(Some(lhs), Some(rhs), &mut change_map, DEFAULT_GRAPH_LIMIT).unwrap();
assert_eq!(change_map.get(lhs), Some(ChangeKind::Novel));
assert_eq!(change_map.get(rhs), Some(ChangeKind::Novel));
}

@ -408,7 +408,7 @@ pub fn neighbours<'a, 'b>(
if lhs_syntax == rhs_syntax {
let depth_difference = (lhs_syntax.num_ancestors() as i32
- rhs_syntax.num_ancestors() as i32)
.abs() as u32;
.unsigned_abs();
// Both nodes are equal, the happy case.
buf[i] = Some((
@ -450,7 +450,7 @@ pub fn neighbours<'a, 'b>(
let depth_difference = (lhs_syntax.num_ancestors() as i32
- rhs_syntax.num_ancestors() as i32)
.abs() as u32;
.unsigned_abs();
buf[i] = Some((
EnterUnchangedDelimiter { depth_difference },

@ -486,7 +486,7 @@ pub fn print(
lhs_lines[lhs_line_num.as_usize()],
source_dims.lhs_content_width,
display_options.use_color,
lhs_highlights.get(&lhs_line_num).unwrap_or(&vec![]),
lhs_highlights.get(lhs_line_num).unwrap_or(&vec![]),
Side::Left,
),
None => vec![" ".repeat(source_dims.lhs_content_width)],
@ -496,7 +496,7 @@ pub fn print(
rhs_lines[rhs_line_num.as_usize()],
source_dims.rhs_content_width,
display_options.use_color,
rhs_highlights.get(&rhs_line_num).unwrap_or(&vec![]),
rhs_highlights.get(rhs_line_num).unwrap_or(&vec![]),
Side::Right,
),
None => vec!["".into()],
@ -520,7 +520,7 @@ pub fn print(
display_options.use_color,
);
if let Some(line_num) = lhs_line_num {
if lhs_lines_with_novel.contains(&line_num) {
if lhs_lines_with_novel.contains(line_num) {
s = if display_options.background_color.is_dark() {
s.bright_red().to_string()
} else {
@ -541,7 +541,7 @@ pub fn print(
display_options.use_color,
);
if let Some(line_num) = rhs_line_num {
if rhs_lines_with_novel.contains(&line_num) {
if rhs_lines_with_novel.contains(line_num) {
s = if display_options.background_color.is_dark() {
s.bright_green().to_string()
} else {

@ -286,15 +286,15 @@ pub fn color_positions(
}
MatchKind::Novel { highlight, .. } => {
style = novel_style(style, is_lhs, background);
if syntax_highlight {
if matches!(
if syntax_highlight
&& matches!(
highlight,
TokenKind::Delimiter
| TokenKind::Atom(AtomKind::Keyword)
| TokenKind::Atom(AtomKind::Type)
) {
style = style.bold();
}
)
{
style = style.bold();
}
if matches!(highlight, TokenKind::Atom(AtomKind::Comment)) {
style = style.italic();
@ -302,18 +302,14 @@ pub fn color_positions(
}
MatchKind::NovelWord { highlight } => {
style = novel_style(style, is_lhs, background).bold();
if syntax_highlight {
if matches!(highlight, TokenKind::Atom(AtomKind::Comment)) {
style = style.italic();
}
if syntax_highlight && matches!(highlight, TokenKind::Atom(AtomKind::Comment)) {
style = style.italic();
}
}
MatchKind::NovelLinePart { highlight, .. } => {
style = novel_style(style, is_lhs, background);
if syntax_highlight {
if matches!(highlight, TokenKind::Atom(AtomKind::Comment)) {
style = style.italic();
}
if syntax_highlight && matches!(highlight, TokenKind::Atom(AtomKind::Comment)) {
style = style.italic();
}
}
};

@ -31,6 +31,7 @@ use crate::diff::{dijkstra, unchanged};
use crate::display::hunks::{matched_pos_to_hunks, merge_adjacent};
use crate::parse::syntax;
use diff::changes::ChangeMap;
use diff::dijkstra::ExceededGraphLimit;
use display::context::opposite_positions;
use files::{is_probably_binary, read_files_or_die, read_or_die, relative_paths_in_either};
use log::info;
@ -121,7 +122,7 @@ fn main() {
}
}
Mode::Diff {
node_limit,
graph_limit,
byte_limit,
display_options,
missing_as_empty,
@ -150,7 +151,7 @@ fn main() {
lhs_path,
rhs_path,
&display_options,
node_limit,
graph_limit,
byte_limit,
language_override,
)
@ -165,7 +166,7 @@ fn main() {
rhs_path,
&display_options,
missing_as_empty,
node_limit,
graph_limit,
byte_limit,
language_override,
);
@ -183,7 +184,7 @@ fn diff_file(
rhs_path: &Path,
display_options: &DisplayOptions,
missing_as_empty: bool,
node_limit: u32,
graph_limit: usize,
byte_limit: usize,
language_override: Option<parse::guess_language::Language>,
) -> DiffResult {
@ -194,7 +195,7 @@ fn diff_file(
&lhs_bytes,
&rhs_bytes,
display_options.tab_width,
node_limit,
graph_limit,
byte_limit,
language_override,
)
@ -206,7 +207,7 @@ fn diff_file_content(
lhs_bytes: &[u8],
rhs_bytes: &[u8],
tab_width: usize,
node_limit: u32,
graph_limit: usize,
byte_limit: usize,
language_override: Option<parse::guess_language::Language>,
) -> DiffResult {
@ -293,36 +294,39 @@ fn diff_file_content(
unchanged::mark_unchanged(&lhs, &rhs, &mut change_map)
};
let possibly_changed_max = max_num_nodes(&possibly_changed);
if possibly_changed_max > node_limit {
info!(
"Found {} nodes, exceeding the limit {}",
possibly_changed_max, node_limit
);
let mut exceeded_graph_limit = false;
for (lhs_section_nodes, rhs_section_nodes) in possibly_changed {
init_next_prev(&lhs_section_nodes);
init_next_prev(&rhs_section_nodes);
match mark_syntax(
lhs_section_nodes.get(0).copied(),
rhs_section_nodes.get(0).copied(),
&mut change_map,
graph_limit,
) {
Ok(()) => {}
Err(ExceededGraphLimit {}) => {
exceeded_graph_limit = true;
break;
}
}
let language = language.unwrap();
fix_all_sliders(language, &lhs_section_nodes, &mut change_map);
fix_all_sliders(language, &rhs_section_nodes, &mut change_map);
}
if exceeded_graph_limit {
let lhs_positions = line_parser::change_positions(&lhs_src, &rhs_src);
let rhs_positions = line_parser::change_positions(&rhs_src, &lhs_src);
(
Some("Text (exceeded DFT_NODE_LIMIT)".into()),
Some("Text (exceeded DFT_GRAPH_LIMIT)".into()),
lhs_positions,
rhs_positions,
)
} else {
for (lhs_section_nodes, rhs_section_nodes) in possibly_changed {
init_next_prev(&lhs_section_nodes);
init_next_prev(&rhs_section_nodes);
mark_syntax(
lhs_section_nodes.get(0).copied(),
rhs_section_nodes.get(0).copied(),
&mut change_map,
);
let language = language.unwrap();
fix_all_sliders(language, &lhs_section_nodes, &mut change_map);
fix_all_sliders(language, &rhs_section_nodes, &mut change_map);
}
let lhs_positions = syntax::change_positions(&lhs, &change_map);
let rhs_positions = syntax::change_positions(&rhs, &change_map);
(Some(ts_lang.name.into()), lhs_positions, rhs_positions)
@ -356,7 +360,7 @@ fn diff_directories<'a>(
lhs_dir: &'a Path,
rhs_dir: &'a Path,
display_options: &DisplayOptions,
node_limit: u32,
graph_limit: usize,
byte_limit: usize,
language_override: Option<parse::guess_language::Language>,
) -> impl ParallelIterator<Item = DiffResult> + 'a {
@ -380,7 +384,7 @@ fn diff_directories<'a>(
&rhs_path,
&display_options,
true,
node_limit,
graph_limit,
byte_limit,
language_override,
)
@ -495,33 +499,10 @@ fn print_diff_result(display_options: &DisplayOptions, summary: &DiffResult) {
}
}
/// What is the total number of nodes in `roots`?
fn num_nodes(roots: &[&syntax::Syntax]) -> u32 {
roots
.iter()
.map(|n| {
1 + match n {
syntax::Syntax::List {
num_descendants, ..
} => *num_descendants,
syntax::Syntax::Atom { .. } => 0,
}
})
.sum()
}
fn max_num_nodes(roots_vec: &[(Vec<&syntax::Syntax>, Vec<&syntax::Syntax>)]) -> u32 {
roots_vec
.iter()
.map(|(lhs, rhs)| num_nodes(lhs) + num_nodes(rhs))
.max()
.unwrap_or(0)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::options::{DEFAULT_BYTE_LIMIT, DEFAULT_NODE_LIMIT, DEFAULT_TAB_WIDTH};
use crate::options::{DEFAULT_BYTE_LIMIT, DEFAULT_GRAPH_LIMIT, DEFAULT_TAB_WIDTH};
#[test]
fn test_diff_identical_content() {
@ -532,7 +513,7 @@ mod tests {
s.as_bytes(),
s.as_bytes(),
DEFAULT_TAB_WIDTH,
DEFAULT_NODE_LIMIT,
DEFAULT_GRAPH_LIMIT,
DEFAULT_BYTE_LIMIT,
None,
);

@ -8,8 +8,11 @@ use const_format::formatcp;
use crate::{display::style::BackgroundColor, parse::guess_language};
pub const DEFAULT_NODE_LIMIT: u32 = 30_000;
pub const DEFAULT_BYTE_LIMIT: usize = 1_000_000;
// Chosen experimentally: this is sufficiently many for all the sample
// files (the highest is slow_before/after.rs at 1.3M nodes), but
// small enough to terminate in ~5 seconds like the test file in #306.
pub const DEFAULT_GRAPH_LIMIT: usize = 3_000_000;
pub const DEFAULT_TAB_WIDTH: usize = 8;
const USAGE: &str = concat!(env!("CARGO_BIN_NAME"), " [OPTIONS] OLD-PATH NEW-PATH");
@ -134,22 +137,22 @@ fn app() -> clap::Command<'static> {
// TODO: support DFT_LANGUAGE for consistency
)
.arg(
Arg::new("node-limit").long("node-limit")
Arg::new("byte-limit").long("byte-limit")
.takes_value(true)
.value_name("LIMIT")
.help(concat!("Use a text diff if the number of syntax nodes exceeds this number."))
.default_value(formatcp!("{}", DEFAULT_NODE_LIMIT))
.env("DFT_NODE_LIMIT")
.validator(|s| s.parse::<u32>())
.help(concat!("Use a text diff if either input file exceeds this size."))
.default_value(formatcp!("{}", DEFAULT_BYTE_LIMIT))
.env("DFT_BYTE_LIMIT")
.validator(|s| s.parse::<usize>())
.required(false),
)
.arg(
Arg::new("byte-limit").long("byte-limit")
Arg::new("graph-limit").long("graph-limit")
.takes_value(true)
.value_name("LIMIT")
.help(concat!("Use a text diff if either input file exceeds this size."))
.default_value(formatcp!("{}", DEFAULT_BYTE_LIMIT))
.env("DFT_BYTE_LIMIT")
.help(concat!("Use a text diff if the structural graph exceed this number of nodes in memory."))
.default_value(formatcp!("{}", DEFAULT_GRAPH_LIMIT))
.env("DFT_GRAPH_LIMIT")
.validator(|s| s.parse::<usize>())
.required(false),
)
@ -172,7 +175,7 @@ pub enum DisplayMode {
pub enum Mode {
Diff {
node_limit: u32,
graph_limit: usize,
byte_limit: usize,
display_options: DisplayOptions,
missing_as_empty: bool,
@ -324,10 +327,10 @@ pub fn parse_args() -> Mode {
let syntax_highlight = matches.value_of("syntax-highlight") == Some("on");
let node_limit = matches
.value_of("node-limit")
let graph_limit = matches
.value_of("graph-limit")
.expect("Always present as we've given clap a default")
.parse::<u32>()
.parse::<usize>()
.expect("Value already validated by clap");
let byte_limit = matches
@ -359,7 +362,7 @@ pub fn parse_args() -> Mode {
};
Mode::Diff {
node_limit,
graph_limit,
byte_limit,
display_options,
missing_as_empty,