Allow users to override binary detection using globs

Closes #841
pull/856/head
Wilfred Hughes 2025-07-02 19:13:36 +07:00
parent 6aa5eb2d24
commit 62752b6ab6
5 changed files with 113 additions and 3 deletions

@ -7,6 +7,9 @@ text. Windows-1252 was added in 0.63 and some binary files
(e.g. Brotli compressed files) were incorrectly treated as this
encoding.
Added the `--override-binary` option to force files to be treated as
binary rather than text.
## 0.64 (released 16th June 2025)
### Parsing

@ -138,7 +138,24 @@ pub(crate) enum ProbableFileKind {
} }
/// Do these bytes look like a binary (non-textual) format? /// Do these bytes look like a binary (non-textual) format?
pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind { pub(crate) fn guess_content(
bytes: &[u8],
path: &FileArgument,
binary_overrides: &[glob::Pattern],
) -> ProbableFileKind {
if let FileArgument::NamedPath(path) = path {
let path = path.to_string_lossy();
for pattern in binary_overrides {
if pattern.matches(&path) {
info!(
"Input file is treated as binary due to explicit override glob {}",
pattern
);
return ProbableFileKind::Binary;
}
}
}
// If the bytes are entirely valid UTF-8, treat them as a string. // If the bytes are entirely valid UTF-8, treat them as a string.
if let Ok(valid_utf8_string) = std::str::from_utf8(bytes) { if let Ok(valid_utf8_string) = std::str::from_utf8(bytes) {
info!("Input file is valid UTF-8"); info!("Input file is valid UTF-8");
@ -341,6 +358,10 @@ pub(crate) fn relative_paths_in_either(lhs_dir: &Path, rhs_dir: &Path) -> Vec<Pa
mod tests { mod tests {
use super::*; use super::*;
fn guess_content(bytes: &[u8]) -> ProbableFileKind {
super::guess_content(bytes, &FileArgument::Stdin, &[])
}
#[test] #[test]
fn test_plaintext_is_text() { fn test_plaintext_is_text() {
let s = "hello world"; let s = "hello world";

@ -241,6 +241,7 @@ fn main() {
display_options, display_options,
set_exit_code, set_exit_code,
language_overrides, language_overrides,
binary_overrides,
} => { } => {
let diff_result = diff_conflicts_file( let diff_result = diff_conflicts_file(
&display_path, &display_path,
@ -248,6 +249,7 @@ fn main() {
&display_options, &display_options,
&diff_options, &diff_options,
&language_overrides, &language_overrides,
&binary_overrides,
); );
print_diff_result(&display_options, &diff_result); print_diff_result(&display_options, &diff_result);
@ -264,6 +266,7 @@ fn main() {
display_options, display_options,
set_exit_code, set_exit_code,
language_overrides, language_overrides,
binary_overrides,
lhs_path, lhs_path,
rhs_path, rhs_path,
lhs_permissions, lhs_permissions,
@ -299,6 +302,7 @@ fn main() {
&display_options, &display_options,
&diff_options, &diff_options,
&language_overrides, &language_overrides,
&binary_overrides,
); );
if matches!(display_options.display_mode, DisplayMode::Json) { if matches!(display_options.display_mode, DisplayMode::Json) {
@ -353,6 +357,7 @@ fn main() {
&diff_options, &diff_options,
false, false,
&language_overrides, &language_overrides,
&binary_overrides,
); );
if diff_result.has_reportable_change() { if diff_result.has_reportable_change() {
encountered_changes = true; encountered_changes = true;
@ -391,9 +396,16 @@ fn diff_file(
diff_options: &DiffOptions, diff_options: &DiffOptions,
missing_as_empty: bool, missing_as_empty: bool,
overrides: &[(LanguageOverride, Vec<glob::Pattern>)], overrides: &[(LanguageOverride, Vec<glob::Pattern>)],
binary_overrides: &[glob::Pattern],
) -> DiffResult { ) -> DiffResult {
let (lhs_bytes, rhs_bytes) = read_files_or_die(lhs_path, rhs_path, missing_as_empty); let (lhs_bytes, rhs_bytes) = read_files_or_die(lhs_path, rhs_path, missing_as_empty);
let (mut lhs_src, mut rhs_src) = match (guess_content(&lhs_bytes), guess_content(&rhs_bytes)) {
// Override here? Separate option or part of existing --override arg?
let (mut lhs_src, mut rhs_src) = match (
guess_content(&lhs_bytes, &lhs_path, binary_overrides),
guess_content(&rhs_bytes, &rhs_path, binary_overrides),
) {
(ProbableFileKind::Binary, _) | (_, ProbableFileKind::Binary) => { (ProbableFileKind::Binary, _) | (_, ProbableFileKind::Binary) => {
return DiffResult { return DiffResult {
extra_info: renamed, extra_info: renamed,
@ -469,9 +481,10 @@ fn diff_conflicts_file(
display_options: &DisplayOptions, display_options: &DisplayOptions,
diff_options: &DiffOptions, diff_options: &DiffOptions,
overrides: &[(LanguageOverride, Vec<glob::Pattern>)], overrides: &[(LanguageOverride, Vec<glob::Pattern>)],
binary_overrides: &[glob::Pattern],
) -> DiffResult { ) -> DiffResult {
let bytes = read_file_or_die(path); let bytes = read_file_or_die(path);
let mut src = match guess_content(&bytes) { let mut src = match guess_content(&bytes, path, binary_overrides) {
ProbableFileKind::Text(src) => src, ProbableFileKind::Text(src) => src,
ProbableFileKind::Binary => { ProbableFileKind::Binary => {
eprintln!("error: Expected a text file with conflict markers, got a binary file."); eprintln!("error: Expected a text file with conflict markers, got a binary file.");
@ -788,10 +801,12 @@ fn diff_directories<'a>(
display_options: &DisplayOptions, display_options: &DisplayOptions,
diff_options: &DiffOptions, diff_options: &DiffOptions,
overrides: &[(LanguageOverride, Vec<glob::Pattern>)], overrides: &[(LanguageOverride, Vec<glob::Pattern>)],
binary_overrides: &[glob::Pattern],
) -> impl ParallelIterator<Item = DiffResult> + 'a { ) -> impl ParallelIterator<Item = DiffResult> + 'a {
let diff_options = diff_options.clone(); let diff_options = diff_options.clone();
let display_options = display_options.clone(); let display_options = display_options.clone();
let overrides: Vec<_> = overrides.into(); let overrides: Vec<_> = overrides.into();
let binary_overrides: Vec<_> = binary_overrides.into();
// We greedily list all files in the directory, and then diff them // We greedily list all files in the directory, and then diff them
// in parallel. This is assuming that diffing is slower than // in parallel. This is assuming that diffing is slower than
@ -815,6 +830,7 @@ fn diff_directories<'a>(
&diff_options, &diff_options,
true, true,
&overrides, &overrides,
&binary_overrides,
) )
}) })
} }

@ -271,6 +271,25 @@ $ export DFT_OVERRIDE_2='*.js:javascript jsx'
When multiple overrides are specified, the first matching override wins.")) When multiple overrides are specified, the first matching override wins."))
.env("DFT_OVERRIDE") .env("DFT_OVERRIDE")
) )
.arg(
Arg::new("override-binary").long("override-binary")
.value_name("GLOB")
.action(ArgAction::Append)
.help(concat!("Treat file names matching this glob as binary files, overriding normal binary detection. For example:
$ ", env!("CARGO_BIN_NAME"), " --override-binary='*.gz' old.gz new.gz
This argument may be given more than once. For example:
$ ", env!("CARGO_BIN_NAME"), " --override-binary='*.gz' --override-binary='foo.pickle' old.gz new.gz
To configure multiple overrides using environment variables, difftastic also accepts DFT_OVERRIDE_BINARY_1 up to DFT_OVERRIDE_BINARY_9.
$ export DFT_OVERRIDE_BINARY='*.gz'
$ export DFT_OVERRIDE_BINARY_1='*.bz2'
$ export DFT_OVERRIDE_BINARY_2='foo.pickle'"))
.env("DFT_OVERRIDE_BINARY")
)
.arg( .arg(
Arg::new("list-languages").long("list-languages") Arg::new("list-languages").long("list-languages")
.action(ArgAction::SetTrue) .action(ArgAction::SetTrue)
@ -466,6 +485,7 @@ pub(crate) enum Mode {
display_options: DisplayOptions, display_options: DisplayOptions,
set_exit_code: bool, set_exit_code: bool,
language_overrides: Vec<(LanguageOverride, Vec<glob::Pattern>)>, language_overrides: Vec<(LanguageOverride, Vec<glob::Pattern>)>,
binary_overrides: Vec<glob::Pattern>,
/// The path where we can read the LHS file. This is often a /// The path where we can read the LHS file. This is often a
/// temporary file generated by source control. /// temporary file generated by source control.
lhs_path: FileArgument, lhs_path: FileArgument,
@ -484,6 +504,7 @@ pub(crate) enum Mode {
display_options: DisplayOptions, display_options: DisplayOptions,
set_exit_code: bool, set_exit_code: bool,
language_overrides: Vec<(LanguageOverride, Vec<glob::Pattern>)>, language_overrides: Vec<(LanguageOverride, Vec<glob::Pattern>)>,
binary_overrides: Vec<glob::Pattern>,
path: FileArgument, path: FileArgument,
/// The path that we show to the user. /// The path that we show to the user.
display_path: String, display_path: String,
@ -629,6 +650,30 @@ fn parse_overrides_or_die(raw_overrides: &[String]) -> Vec<(LanguageOverride, Ve
combined_overrides combined_overrides
} }
/// Compile every raw glob string into a `glob::Pattern`.
///
/// Each string that fails to parse is reported on stderr (the offending
/// glob followed by the parser's message). All inputs are examined before
/// giving up, so every bad glob is reported in a single run; if any were
/// invalid the process then exits with `EXIT_BAD_ARGUMENTS`.
///
/// Returns the compiled patterns in the same order as `glob_strs`.
fn parse_binary_overrides_or_die(glob_strs: &[String]) -> Vec<glob::Pattern> {
    let mut saw_bad_glob = false;

    let patterns: Vec<glob::Pattern> = glob_strs
        .iter()
        .filter_map(|raw| match glob::Pattern::new(raw) {
            Ok(compiled) => Some(compiled),
            Err(err) => {
                eprintln!("Invalid glob syntax '{}'", raw);
                eprintln!("Glob parsing error: {}", err.msg);
                // Keep going so that later invalid globs are reported too.
                saw_bad_glob = true;
                None
            }
        })
        .collect();

    if saw_bad_glob {
        std::process::exit(EXIT_BAD_ARGUMENTS);
    }

    patterns
}
/// Parse CLI arguments passed to the binary. /// Parse CLI arguments passed to the binary.
pub(crate) fn parse_args() -> Mode { pub(crate) fn parse_args() -> Mode {
let matches = app().get_matches(); let matches = app().get_matches();
@ -649,6 +694,18 @@ pub(crate) fn parse_args() -> Mode {
let ignore_comments = matches.get_flag("ignore-comments"); let ignore_comments = matches.get_flag("ignore-comments");
let mut raw_binary_overrides: Vec<String> = vec![];
if let Some(binary_overrides) = matches.get_many("override-binary") {
raw_binary_overrides = binary_overrides.cloned().collect();
}
for i in 1..=9 {
if let Ok(value) = env::var(format!("DFT_OVERRIDE_BINARY_{}", i)) {
raw_binary_overrides.push(value);
}
}
let binary_overrides = parse_binary_overrides_or_die(&raw_binary_overrides);
let mut raw_overrides: Vec<String> = vec![]; let mut raw_overrides: Vec<String> = vec![];
if let Some(overrides) = matches.get_many("override") { if let Some(overrides) = matches.get_many("override") {
raw_overrides = overrides.cloned().collect(); raw_overrides = overrides.cloned().collect();
@ -859,6 +916,7 @@ pub(crate) fn parse_args() -> Mode {
display_options, display_options,
set_exit_code, set_exit_code,
language_overrides, language_overrides,
binary_overrides,
}; };
} }
_ => { _ => {
@ -892,6 +950,7 @@ pub(crate) fn parse_args() -> Mode {
display_options, display_options,
set_exit_code, set_exit_code,
language_overrides, language_overrides,
binary_overrides,
lhs_path, lhs_path,
rhs_path, rhs_path,
lhs_permissions, lhs_permissions,

@ -61,6 +61,17 @@ fn binary_changed() {
cmd.assert().stdout(predicate_fn); cmd.assert().stdout(predicate_fn);
} }
/// Passing `--override-binary` with a glob that matches the inputs should
/// make the tool report them as binary, even though they are `.js` files.
#[test]
fn binary_override() {
    let reports_binary_change = predicate::str::contains("Binary contents changed");

    let mut cmd = get_base_command();
    cmd.args([
        "--override-binary=*.js",
        "sample_files/simple_1.js",
        "sample_files/simple_2.js",
    ]);

    cmd.assert().stdout(reports_binary_change);
}
#[test] #[test]
fn has_changes_default_exit_code() { fn has_changes_default_exit_code() {
let mut cmd = get_base_command(); let mut cmd = get_base_command();