Improve file detection on compressed files

Fixes #835
2025-05-21 00:24:28 +07:00 · 2025-05-21 00:24:28 +07:00 · dbfc68ec6a
parent 1341ec68e5
commit dbfc68ec6a
2 changed files with 12 additions and 1 deletion
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -5,6 +5,9 @@
 Updated to the latest tree-sitter parser for Erlang, F#, Pascal and
 Swift.

+File detection is now stricter about invalid UTF-8, and recognises
+more compressed file formats as binary (e.g. zstd or bzip2).
+
 ### Build

 CI on GitHub now uses Ubuntu 22.04 for Linux builds (previously Ubuntu
--- a/src/files.rs
+++ b/src/files.rs
@ -163,8 +163,16 @@ pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind {
        // application/* is a mix of stuff, application/json is fine
        // but application/zip is binary that often decodes as valid
        // UTF-16.
+        //
+        // See
+        // <https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/MIME_types/Common_types>
+        // for a list of MIME types.
+        "application/x-bzip" => return ProbableFileKind::Binary,
+        "application/x-bzip2" => return ProbableFileKind::Binary,
+        "application/x-7zip-compressed" => return ProbableFileKind::Binary,
        "application/gzip" => return ProbableFileKind::Binary,
        "application/zip" => return ProbableFileKind::Binary,
+        "application/zstd" => return ProbableFileKind::Binary,
        // Treat all image content as binary.
        v if v.starts_with("image/") => return ProbableFileKind::Binary,
        // Treat all audio content as binary.
@ -199,7 +207,7 @@ pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind {
        .take(5000)
        .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
        .count();
-    if num_utf8_invalid <= 10 {
+    if num_utf8_invalid <= 2 {
        info!(
            "Input file is mostly valid UTF-8 (invalid characters: {})",
            num_utf8_invalid