Allow users to override binary detection using globs

Closes #841
pull/856/head
Wilfred Hughes 2025-07-02 19:13:36 +07:00
parent 6aa5eb2d24
commit 62752b6ab6
5 changed files with 113 additions and 3 deletions

@ -7,6 +7,9 @@ text. Windows-1252 was added in 0.63 and some binary files
(e.g. Brotli compressed files) were incorrectly treated as this
encoding.
Added the `--override-binary` option, which takes a glob and forces
files whose path matches it to be treated as binary rather than text.
## 0.64 (released 16th June 2025)
### Parsing

@ -138,7 +138,24 @@ pub(crate) enum ProbableFileKind {
}
/// Do these bytes look like a binary (non-textual) format?
pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind {
pub(crate) fn guess_content(
bytes: &[u8],
path: &FileArgument,
binary_overrides: &[glob::Pattern],
) -> ProbableFileKind {
if let FileArgument::NamedPath(path) = path {
let path = path.to_string_lossy();
for pattern in binary_overrides {
if pattern.matches(&path) {
info!(
"Input file is treated as binary due to explicit override glob {}",
pattern
);
return ProbableFileKind::Binary;
}
}
}
// If the bytes are entirely valid UTF-8, treat them as a string.
if let Ok(valid_utf8_string) = std::str::from_utf8(bytes) {
info!("Input file is valid UTF-8");
@ -341,6 +358,10 @@ pub(crate) fn relative_paths_in_either(lhs_dir: &Path, rhs_dir: &Path) -> Vec<Pa
mod tests {
use super::*;
fn guess_content(bytes: &[u8]) -> ProbableFileKind {
super::guess_content(bytes, &FileArgument::Stdin, &[])
}
#[test]
fn test_plaintext_is_text() {
let s = "hello world";

@ -241,6 +241,7 @@ fn main() {
display_options,
set_exit_code,
language_overrides,
binary_overrides,
} => {
let diff_result = diff_conflicts_file(
&display_path,
@ -248,6 +249,7 @@ fn main() {
&display_options,
&diff_options,
&language_overrides,
&binary_overrides,
);
print_diff_result(&display_options, &diff_result);
@ -264,6 +266,7 @@ fn main() {
display_options,
set_exit_code,
language_overrides,
binary_overrides,
lhs_path,
rhs_path,
lhs_permissions,
@ -299,6 +302,7 @@ fn main() {
&display_options,
&diff_options,
&language_overrides,
&binary_overrides,
);
if matches!(display_options.display_mode, DisplayMode::Json) {
@ -353,6 +357,7 @@ fn main() {
&diff_options,
false,
&language_overrides,
&binary_overrides,
);
if diff_result.has_reportable_change() {
encountered_changes = true;
@ -391,9 +396,16 @@ fn diff_file(
diff_options: &DiffOptions,
missing_as_empty: bool,
overrides: &[(LanguageOverride, Vec<glob::Pattern>)],
binary_overrides: &[glob::Pattern],
) -> DiffResult {
let (lhs_bytes, rhs_bytes) = read_files_or_die(lhs_path, rhs_path, missing_as_empty);
let (mut lhs_src, mut rhs_src) = match (guess_content(&lhs_bytes), guess_content(&rhs_bytes)) {
// Override here? Separate option or part of existing --override arg?
let (mut lhs_src, mut rhs_src) = match (
guess_content(&lhs_bytes, &lhs_path, binary_overrides),
guess_content(&rhs_bytes, &rhs_path, binary_overrides),
) {
(ProbableFileKind::Binary, _) | (_, ProbableFileKind::Binary) => {
return DiffResult {
extra_info: renamed,
@ -469,9 +481,10 @@ fn diff_conflicts_file(
display_options: &DisplayOptions,
diff_options: &DiffOptions,
overrides: &[(LanguageOverride, Vec<glob::Pattern>)],
binary_overrides: &[glob::Pattern],
) -> DiffResult {
let bytes = read_file_or_die(path);
let mut src = match guess_content(&bytes) {
let mut src = match guess_content(&bytes, path, binary_overrides) {
ProbableFileKind::Text(src) => src,
ProbableFileKind::Binary => {
eprintln!("error: Expected a text file with conflict markers, got a binary file.");
@ -788,10 +801,12 @@ fn diff_directories<'a>(
display_options: &DisplayOptions,
diff_options: &DiffOptions,
overrides: &[(LanguageOverride, Vec<glob::Pattern>)],
binary_overrides: &[glob::Pattern],
) -> impl ParallelIterator<Item = DiffResult> + 'a {
let diff_options = diff_options.clone();
let display_options = display_options.clone();
let overrides: Vec<_> = overrides.into();
let binary_overrides: Vec<_> = binary_overrides.into();
// We greedily list all files in the directory, and then diff them
// in parallel. This is assuming that diffing is slower than
@ -815,6 +830,7 @@ fn diff_directories<'a>(
&diff_options,
true,
&overrides,
&binary_overrides,
)
})
}

@ -271,6 +271,25 @@ $ export DFT_OVERRIDE_2='*.js:javascript jsx'
When multiple overrides are specified, the first matching override wins."))
.env("DFT_OVERRIDE")
)
.arg(
Arg::new("override-binary").long("override-binary")
.value_name("GLOB")
.action(ArgAction::Append)
.help(concat!("Treat file names matching this glob as binary files, overriding normal binary detection. For example:
$ ", env!("CARGO_BIN_NAME"), " --override-binary='*.gz' old.gz new.gz
This argument may be given more than once. For example:
$ ", env!("CARGO_BIN_NAME"), " --override-binary='*.gz' --override-binary='foo.pickle' old.gz new.gz
To configure multiple overrides using environment variables, difftastic also accepts DFT_OVERRIDE_BINARY_1 up to DFT_OVERRIDE_BINARY_9.
$ export DFT_OVERRIDE_BINARY='*.gz'
$ export DFT_OVERRIDE_BINARY_1='*.bz2'
$ export DFT_OVERRIDE_BINARY_2='foo.pickle'"))
.env("DFT_OVERRIDE_BINARY")
)
.arg(
Arg::new("list-languages").long("list-languages")
.action(ArgAction::SetTrue)
@ -466,6 +485,7 @@ pub(crate) enum Mode {
display_options: DisplayOptions,
set_exit_code: bool,
language_overrides: Vec<(LanguageOverride, Vec<glob::Pattern>)>,
binary_overrides: Vec<glob::Pattern>,
/// The path where we can read the LHS file. This is often a
/// temporary file generated by source control.
lhs_path: FileArgument,
@ -484,6 +504,7 @@ pub(crate) enum Mode {
display_options: DisplayOptions,
set_exit_code: bool,
language_overrides: Vec<(LanguageOverride, Vec<glob::Pattern>)>,
binary_overrides: Vec<glob::Pattern>,
path: FileArgument,
/// The path that we show to the user.
display_path: String,
@ -629,6 +650,30 @@ fn parse_overrides_or_die(raw_overrides: &[String]) -> Vec<(LanguageOverride, Ve
combined_overrides
}
/// Compile each raw glob string into a `glob::Pattern`.
///
/// Every string that fails to parse is reported on stderr together with
/// the parser's message. All inputs are examined before deciding, so the
/// user sees every bad glob at once; if at least one was invalid the
/// process exits with `EXIT_BAD_ARGUMENTS` instead of returning.
fn parse_binary_overrides_or_die(glob_strs: &[String]) -> Vec<glob::Pattern> {
    let mut saw_invalid = false;

    let patterns: Vec<glob::Pattern> = glob_strs
        .iter()
        .filter_map(|raw| match glob::Pattern::new(raw) {
            Ok(compiled) => Some(compiled),
            Err(err) => {
                eprintln!("Invalid glob syntax '{}'", raw);
                eprintln!("Glob parsing error: {}", err.msg);
                saw_invalid = true;
                None
            }
        })
        .collect();

    if saw_invalid {
        std::process::exit(EXIT_BAD_ARGUMENTS);
    }

    patterns
}
/// Parse CLI arguments passed to the binary.
pub(crate) fn parse_args() -> Mode {
let matches = app().get_matches();
@ -649,6 +694,18 @@ pub(crate) fn parse_args() -> Mode {
let ignore_comments = matches.get_flag("ignore-comments");
let mut raw_binary_overrides: Vec<String> = vec![];
if let Some(binary_overrides) = matches.get_many("override-binary") {
raw_binary_overrides = binary_overrides.cloned().collect();
}
for i in 1..=9 {
if let Ok(value) = env::var(format!("DFT_OVERRIDE_BINARY_{}", i)) {
raw_binary_overrides.push(value);
}
}
let binary_overrides = parse_binary_overrides_or_die(&raw_binary_overrides);
let mut raw_overrides: Vec<String> = vec![];
if let Some(overrides) = matches.get_many("override") {
raw_overrides = overrides.cloned().collect();
@ -859,6 +916,7 @@ pub(crate) fn parse_args() -> Mode {
display_options,
set_exit_code,
language_overrides,
binary_overrides,
};
}
_ => {
@ -892,6 +950,7 @@ pub(crate) fn parse_args() -> Mode {
display_options,
set_exit_code,
language_overrides,
binary_overrides,
lhs_path,
rhs_path,
lhs_permissions,

@ -61,6 +61,17 @@ fn binary_changed() {
cmd.assert().stdout(predicate_fn);
}
#[test]
fn binary_override() {
    // Both inputs end in `.js`, so the `*.js` override glob applies to
    // them and the output is expected to report a binary change.
    let mut cmd = get_base_command();
    cmd.arg("--override-binary=*.js");
    cmd.arg("sample_files/simple_1.js");
    cmd.arg("sample_files/simple_2.js");

    cmd.assert()
        .stdout(predicate::str::contains("Binary contents changed"));
}
#[test]
fn has_changes_default_exit_code() {
let mut cmd = get_base_command();