Fix directory diffing when files were only present on one side

This particularly helps usage with mercurial when files are added or
removed.

Fixes #272
pull/210/head
Wilfred Hughes 2022-04-27 21:46:46 +07:00
parent dd31fd3a2b
commit f98f2a8aca
6 changed files with 98 additions and 22 deletions

@ -16,10 +16,16 @@ Fixed crash in inline mode.
Difftastic now diffs files in parallel when diffing whole directories,
increasing performance.
Directory diffing now correctly handles files that are only in one of
the directories.
## Command Line Interface
Fixed handling of paths that aren't valid UTF-8.
`--missing-as-empty` now only applies when diffing files, and has no
effect when diffing directories.
## 0.27 (released 18th April 2022)
### Parsing

@ -0,0 +1,4 @@
fn main() {
// Say hello
println!("Hello World!");
}

@ -0,0 +1,7 @@
#include <stdio.h>
int main() {
// Foo bar
printf("Hello, World!");
return 0;
}

@ -1,6 +1,13 @@
//! File reading utilities.
use std::{fs, io::ErrorKind::*, path::Path};
use std::{
fs,
io::ErrorKind::*,
path::{Path, PathBuf},
};
use rustc_hash::FxHashSet;
use walkdir::WalkDir;
pub fn read_files_or_die(
lhs_path: &Path,
@ -73,6 +80,70 @@ pub fn is_probably_binary(bytes: &[u8]) -> bool {
num_replaced > 20
}
/// All the files in `dir`, including subdirectories.
fn relative_file_paths_in_dir(dir: &Path) -> Vec<PathBuf> {
WalkDir::new(dir)
.into_iter()
.filter_map(Result::ok)
.map(|entry| entry.into_path())
.filter(|path| !path.is_dir())
.map(|path| path.strip_prefix(dir).unwrap().to_path_buf())
.collect()
}
/// Walk `lhs_dir` and `rhs_dir`, and return relative paths of files
/// that occur in at least one directory.
///
/// Attempts to preserve the ordering of files in both directories.
pub fn relative_paths_in_either(lhs_dir: &Path, rhs_dir: &Path) -> Vec<PathBuf> {
let lhs_paths = relative_file_paths_in_dir(lhs_dir);
let rhs_paths = relative_file_paths_in_dir(rhs_dir);
let mut seen = FxHashSet::default();
let mut res: Vec<PathBuf> = vec![];
let mut i = 0;
let mut j = 0;
loop {
match (lhs_paths.get(i), rhs_paths.get(j)) {
(Some(lhs_path), Some(rhs_path)) if lhs_path == rhs_path => {
if !seen.contains(lhs_path) {
// It should be impossible to get duplicates, but
// be defensive.
res.push(lhs_path.to_owned());
seen.insert(lhs_path);
}
i += 1;
j += 1;
}
(Some(lhs_path), Some(rhs_path)) => {
if seen.contains(lhs_path) {
i += 1;
} else if seen.contains(rhs_path) {
j += 1;
} else {
res.push(lhs_path.to_owned());
res.push(rhs_path.to_owned());
seen.insert(lhs_path);
seen.insert(rhs_path);
i += 1;
j += 1;
}
}
_ => break,
}
}
res.extend(lhs_paths.into_iter().skip(i));
res.extend(rhs_paths.into_iter().skip(j));
res
}
#[cfg(test)]
mod tests {
use super::*;

@ -41,7 +41,7 @@ extern crate log;
use crate::hunks::{matched_pos_to_hunks, merge_adjacent};
use changes::ChangeMap;
use context::opposite_positions;
use files::read_files_or_die;
use files::{is_probably_binary, read_files_or_die, read_or_die, relative_paths_in_either};
use guess_language::guess;
use log::info;
use mimalloc::MiMalloc;
@ -61,14 +61,9 @@ use style::BackgroundColor;
use summary::{DiffResult, FileContent};
use syntax::init_next_prev;
use typed_arena::Arena;
use walkdir::WalkDir;
use crate::{
dijkstra::mark_syntax,
files::{is_probably_binary, read_or_die},
lines::MaxLine,
syntax::init_all_info,
tree_sitter_parser as tsp,
dijkstra::mark_syntax, lines::MaxLine, syntax::init_all_info, tree_sitter_parser as tsp,
};
extern crate pretty_env_logger;
@ -169,7 +164,6 @@ fn main() {
diff_directories(
lhs_path,
rhs_path,
missing_as_empty,
node_limit,
byte_limit,
language_override,
@ -377,7 +371,6 @@ fn diff_file_content(
fn diff_directories<'a>(
lhs_dir: &'a Path,
rhs_dir: &'a Path,
missing_as_empty: bool,
node_limit: u32,
byte_limit: usize,
language_override: Option<guess_language::Language>,
@ -385,24 +378,19 @@ fn diff_directories<'a>(
// We greedily list all files in the directory, and then diff them
// in parallel. This is assuming that diffing is slower than
// enumerating files, so it benefits more from parallelism.
let lhs_paths: Vec<_> = WalkDir::new(lhs_dir)
.into_iter()
.filter_map(Result::ok)
.map(|entry| entry.into_path())
.filter(|lhs_path| !lhs_path.is_dir())
.collect();
let paths = relative_paths_in_either(lhs_dir, rhs_dir);
lhs_paths.into_par_iter().map(move |lhs_path| {
info!("LHS path is {:?} inside {:?}", lhs_path, lhs_dir);
paths.into_par_iter().map(move |rel_path| {
info!("Relative path is {:?} inside {:?}", rel_path, lhs_dir);
let rel_path = lhs_path.strip_prefix(lhs_dir).unwrap();
let rhs_path = Path::new(rhs_dir).join(rel_path);
let lhs_path = Path::new(lhs_dir).join(&rel_path);
let rhs_path = Path::new(rhs_dir).join(&rel_path);
diff_file(
&rel_path.to_string_lossy(),
&lhs_path,
&rhs_path,
missing_as_empty,
true,
node_limit,
byte_limit,
language_override,

@ -90,7 +90,7 @@ fn app() -> clap::Command<'static> {
)
.arg(
Arg::new("missing-as-empty").long("missing-as-empty")
.help("Treat paths that don't exist as equivalent to an empty file.")
.help("Treat paths that don't exist as equivalent to an empty file. Only applies when diffing files, not directories.")
)
.arg(
Arg::new("language").long("language")