From c6da85759c0c3acb0c57abf62e7ffae4621d878a Mon Sep 17 00:00:00 2001
From: Wilfred Hughes <me@wilfred.me.uk>
Date: Wed, 6 Mar 2024 23:07:59 -0800
Subject: [PATCH] Consider null bytes in text file detection

---
 CHANGELOG.md |  5 +++++
 src/files.rs | 16 ++++++++++++++--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ca1f195c7..b99c08a41 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 ## 0.57 (unreleased)
 
+### Parsing
+
+Text file detection is now stricter, considering null bytes as a hint
+that files are binaries.
+
 ### Diffing
 
 Scheme now uses the same slider heuristics as other lisps, preferring
diff --git a/src/files.rs b/src/files.rs
index 4b0f5e23c..0fa3cc25a 100644
--- a/src/files.rs
+++ b/src/files.rs
@@ -197,7 +197,7 @@ pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind {
     let num_utf8_invalid = utf8_string
         .chars()
         .take(5000)
-        .filter(|c| *c == std::char::REPLACEMENT_CHARACTER)
+        .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
         .count();
     if num_utf8_invalid <= 10 {
         info!(
@@ -213,7 +213,7 @@ pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind {
     let num_utf16_invalid = utf16_string
         .chars()
         .take(5000)
-        .filter(|c| *c == std::char::REPLACEMENT_CHARACTER)
+        .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
         .count();
     if num_utf16_invalid <= 5 {
         info!(
@@ -327,6 +327,18 @@ mod tests {
         assert_eq!(guess_content(&bytes), ProbableFileKind::Binary);
     }
 
+    #[test]
+    fn test_dex_is_binary() {
+        let bytes = vec![
+            0x34, 0x8a, 0x4b, 0x8f, 0x77, 0xa4, 0x4e, 0xb1, 0x31, 0x2d, 0x5f, 0xfb, 0x10, 0x08,
+            0xa8, 0x6b, 0x58, 0x06, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x78, 0x56, 0x34, 0x12,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xac, 0x05, 0x00, 0x00, 0x23, 0x00,
+            0x00, 0x00, 0x70, 0x00, 0x00, 0x00,
+        ];
+        // Null-heavy sample (DEX, per the test name): must be classified as binary, not text.
+        assert_eq!(guess_content(&bytes), ProbableFileKind::Binary);
+    }
+
     #[test]
     fn test_png_bytes_are_binary() {
         let bytes = vec![