From c6da85759c0c3acb0c57abf62e7ffae4621d878a Mon Sep 17 00:00:00 2001
From: Wilfred Hughes
Date: Wed, 6 Mar 2024 23:07:59 -0800
Subject: [PATCH] Consider null bytes in text file detection

---
 CHANGELOG.md |  5 +++++
 src/files.rs | 16 ++++++++++++++--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ca1f195c7..b99c08a41 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 ## 0.57 (unreleased)
 
+### Parsing
+
+Text file detection is now stricter, considering null bytes as a hint
+that files are binaries.
+
 ### Diffing
 
 Scheme now uses the same slider heuristics as other lisps, preferring
diff --git a/src/files.rs b/src/files.rs
index 4b0f5e23c..0fa3cc25a 100644
--- a/src/files.rs
+++ b/src/files.rs
@@ -197,7 +197,7 @@ pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind {
     let num_utf8_invalid = utf8_string
         .chars()
         .take(5000)
-        .filter(|c| *c == std::char::REPLACEMENT_CHARACTER)
+        .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
         .count();
     if num_utf8_invalid <= 10 {
         info!(
@@ -213,7 +213,7 @@ pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind {
     let num_utf16_invalid = utf16_string
         .chars()
         .take(5000)
-        .filter(|c| *c == std::char::REPLACEMENT_CHARACTER)
+        .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
         .count();
     if num_utf16_invalid <= 5 {
         info!(
@@ -327,6 +327,18 @@ mod tests {
         assert_eq!(guess_content(&bytes), ProbableFileKind::Binary);
     }
 
+    #[test]
+    fn test_dex_is_binary() {
+        let bytes = vec![
+            0x34, 0x8a, 0x4b, 0x8f, 0x77, 0xa4, 0x4e, 0xb1, 0x31, 0x2d, 0x5f, 0xfb, 0x10, 0x08,
+            0xa8, 0x6b, 0x58, 0x06, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x78, 0x56, 0x34, 0x12,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xac, 0x05, 0x00, 0x00, 0x23, 0x00,
+            0x00, 0x00, 0x70, 0x00, 0x00, 0x00,
+        ];
+
+        assert_eq!(guess_content(&bytes), ProbableFileKind::Binary);
+    }
+
     #[test]
     fn test_png_bytes_are_binary() {
         let bytes = vec![