diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b4e9cb34..740e4fdcf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ text. Windows-1252 was added in 0.63 and some binary files (e.g. Brotli compressed files) were incorrectly treated as this encoding. +Added the `--override-binary` option to force files to be treated as +binary rather than text. + ## 0.64 (released 16th June 2025) ### Parsing diff --git a/src/files.rs b/src/files.rs index 1fd1105be..29224081b 100644 --- a/src/files.rs +++ b/src/files.rs @@ -138,7 +138,24 @@ pub(crate) enum ProbableFileKind { } /// Do these bytes look like a binary (non-textual) format? -pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind { +pub(crate) fn guess_content( + bytes: &[u8], + path: &FileArgument, + binary_overrides: &[glob::Pattern], +) -> ProbableFileKind { + if let FileArgument::NamedPath(path) = path { + let path = path.to_string_lossy(); + for pattern in binary_overrides { + if pattern.matches(&path) { + info!( + "Input file is treated as binary due to explicit override glob {}", + pattern + ); + return ProbableFileKind::Binary; + } + } + } + // If the bytes are entirely valid UTF-8, treat them as a string. if let Ok(valid_utf8_string) = std::str::from_utf8(bytes) { info!("Input file is valid UTF-8"); @@ -341,6 +358,10 @@ pub(crate) fn relative_paths_in_either(lhs_dir: &Path, rhs_dir: &Path) -> Vec ProbableFileKind { + super::guess_content(bytes, &FileArgument::Stdin, &[]) + } + #[test] fn test_plaintext_is_text() { let s = "hello world"; diff --git a/src/main.rs b/src/main.rs index ae7271847..2d2e28008 100644 --- a/src/main.rs +++ b/src/main.rs @@ -241,6 +241,7 @@ fn main() { display_options, set_exit_code, language_overrides, + binary_overrides, } => { let diff_result = diff_conflicts_file( &display_path, @@ -248,6 +249,7 @@ fn main() { &display_options, &diff_options, &language_overrides, + &binary_overrides, ); print_diff_result(&display_options, &diff_result); @@ -264,6 +266,7 @@ fn main() { display_options, set_exit_code, language_overrides, + binary_overrides, lhs_path, rhs_path, lhs_permissions, @@ -299,6 +302,7 @@ fn main() { &display_options, &diff_options, &language_overrides, + &binary_overrides, ); if matches!(display_options.display_mode, DisplayMode::Json) { @@ -353,6 +357,7 @@ fn main() { &diff_options, false, &language_overrides, + &binary_overrides, ); if diff_result.has_reportable_change() { encountered_changes = true; @@ -391,9 +396,16 @@ fn diff_file( diff_options: &DiffOptions, missing_as_empty: bool, overrides: &[(LanguageOverride, Vec)], + binary_overrides: &[glob::Pattern], ) -> DiffResult { let (lhs_bytes, rhs_bytes) = read_files_or_die(lhs_path, rhs_path, missing_as_empty); - let (mut lhs_src, mut rhs_src) = match (guess_content(&lhs_bytes), guess_content(&rhs_bytes)) { + + // Override here? Separate option or part of existing --override arg? + + let (mut lhs_src, mut rhs_src) = match ( + guess_content(&lhs_bytes, &lhs_path, binary_overrides), + guess_content(&rhs_bytes, &rhs_path, binary_overrides), + ) { (ProbableFileKind::Binary, _) | (_, ProbableFileKind::Binary) => { return DiffResult { extra_info: renamed, @@ -469,9 +481,10 @@ fn diff_conflicts_file( display_options: &DisplayOptions, diff_options: &DiffOptions, overrides: &[(LanguageOverride, Vec)], + binary_overrides: &[glob::Pattern], ) -> DiffResult { let bytes = read_file_or_die(path); - let mut src = match guess_content(&bytes) { + let mut src = match guess_content(&bytes, path, binary_overrides) { ProbableFileKind::Text(src) => src, ProbableFileKind::Binary => { eprintln!("error: Expected a text file with conflict markers, got a binary file."); @@ -788,10 +801,12 @@ fn diff_directories<'a>( display_options: &DisplayOptions, diff_options: &DiffOptions, overrides: &[(LanguageOverride, Vec)], + binary_overrides: &[glob::Pattern], ) -> impl ParallelIterator + 'a { let diff_options = diff_options.clone(); let display_options = display_options.clone(); let overrides: Vec<_> = overrides.into(); + let binary_overrides: Vec<_> = binary_overrides.into(); // We greedily list all files in the directory, and then diff them // in parallel. This is assuming that diffing is slower than @@ -815,6 +830,7 @@ fn diff_directories<'a>( &diff_options, true, &overrides, + &binary_overrides, ) }) } diff --git a/src/options.rs b/src/options.rs index a7729c53b..d7e553600 100644 --- a/src/options.rs +++ b/src/options.rs @@ -271,6 +271,25 @@ $ export DFT_OVERRIDE_2='*.js:javascript jsx' When multiple overrides are specified, the first matching override wins.")) .env("DFT_OVERRIDE") ) + .arg( + Arg::new("override-binary").long("override-binary") + .value_name("GLOB") + .action(ArgAction::Append) + .help(concat!("Treat file names matching this glob as binary files, overriding normal binary detection. For example: + +$ ", env!("CARGO_BIN_NAME"), " --override-binary='*.gz' old.gz new.gz + +This argument may be given more than once. For example: + +$ ", env!("CARGO_BIN_NAME"), " --override-binary='*.gz' --override-binary='foo.pickle' old.gz new.gz + +To configure multiple overrides using environment variables, difftastic also accepts DFT_OVERRIDE_BINARY_1 up to DFT_OVERRIDE_BINARY_9. + +$ export DFT_OVERRIDE_BINARY='*.gz' +$ export DFT_OVERRIDE_BINARY_1='*.bz2' +$ export DFT_OVERRIDE_BINARY_2='foo.pickle'")) + .env("DFT_OVERRIDE_BINARY") + ) .arg( Arg::new("list-languages").long("list-languages") .action(ArgAction::SetTrue) @@ -466,6 +485,7 @@ pub(crate) enum Mode { display_options: DisplayOptions, set_exit_code: bool, language_overrides: Vec<(LanguageOverride, Vec)>, + binary_overrides: Vec, /// The path where we can read the LHS file. This is often a /// temporary file generated by source control. lhs_path: FileArgument, @@ -484,6 +504,7 @@ pub(crate) enum Mode { display_options: DisplayOptions, set_exit_code: bool, language_overrides: Vec<(LanguageOverride, Vec)>, + binary_overrides: Vec, path: FileArgument, /// The path that we show to the user. display_path: String, @@ -629,6 +650,30 @@ fn parse_overrides_or_die(raw_overrides: &[String]) -> Vec<(LanguageOverride, Ve combined_overrides } +fn parse_binary_overrides_or_die(glob_strs: &[String]) -> Vec { + let mut overrides: Vec = vec![]; + let mut invalid_syntax = false; + + for glob_str in glob_strs { + match glob::Pattern::new(glob_str) { + Ok(pattern) => { + overrides.push(pattern); + } + Err(e) => { + eprintln!("Invalid glob syntax '{}'", glob_str); + eprintln!("Glob parsing error: {}", e.msg); + invalid_syntax = true; + } + } + } + + if invalid_syntax { + std::process::exit(EXIT_BAD_ARGUMENTS); + } + + overrides +} + /// Parse CLI arguments passed to the binary. pub(crate) fn parse_args() -> Mode { let matches = app().get_matches(); @@ -649,6 +694,18 @@ pub(crate) fn parse_args() -> Mode { let ignore_comments = matches.get_flag("ignore-comments"); + let mut raw_binary_overrides: Vec = vec![]; + if let Some(binary_overrides) = matches.get_many("override-binary") { + raw_binary_overrides = binary_overrides.cloned().collect(); + } + for i in 1..=9 { + if let Ok(value) = env::var(format!("DFT_OVERRIDE_BINARY_{}", i)) { + raw_binary_overrides.push(value); + } + } + + let binary_overrides = parse_binary_overrides_or_die(&raw_binary_overrides); + let mut raw_overrides: Vec = vec![]; if let Some(overrides) = matches.get_many("override") { raw_overrides = overrides.cloned().collect(); @@ -859,6 +916,7 @@ pub(crate) fn parse_args() -> Mode { display_options, set_exit_code, language_overrides, + binary_overrides, }; } _ => { @@ -892,6 +950,7 @@ pub(crate) fn parse_args() -> Mode { display_options, set_exit_code, language_overrides, + binary_overrides, lhs_path, rhs_path, lhs_permissions, diff --git a/tests/cli.rs b/tests/cli.rs index 2d2a10b87..ebbe3aa0f 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -61,6 +61,17 @@ fn binary_changed() { cmd.assert().stdout(predicate_fn); } +#[test] +fn binary_override() { + let mut cmd = get_base_command(); + + cmd.arg("--override-binary=*.js") + .arg("sample_files/simple_1.js") + .arg("sample_files/simple_2.js"); + let predicate_fn = predicate::str::contains("Binary contents changed"); + cmd.assert().stdout(predicate_fn); +} + #[test] fn has_changes_default_exit_code() { let mut cmd = get_base_command();