Check more bytes when detecting encoding

I've observed PDF files whose headers are large enough that the files were detected as text, which wasn't helpful. Also improve logging to report how many invalid characters were found.
2023-07-21 08:34:41 +07:00 · 2023-07-21 08:34:41 +07:00 · 4e9637c861
parent 1c0b3153df
commit 4e9637c861
2 changed files with 14 additions and 4 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,10 @@ Updated grammar for Java.
 Improved parsing of qualified constructors in Haskell.
+Difftastic is now stricter about valid UTF-8 and UTF-16, considering
+more of the file's bytes during filetype detection. This fixes cases
+where e.g. PDF was sometimes incorrectly considered as UTF-8.
 ### Diffing
 Improved handling of delimiters ("nested sliders") in languages that
--- a/src/files.rs
+++ b/src/files.rs
@@ -193,11 +193,14 @@ pub fn guess_content(bytes: &[u8]) -> ProbableFileKind {
    let utf8_string = String::from_utf8_lossy(bytes).to_string();
    let num_utf8_invalid = utf8_string
        .chars()
-        .take(1000)
+        .take(5000)
        .filter(|c| *c == std::char::REPLACEMENT_CHARACTER)
        .count();
    if num_utf8_invalid <= 10 {
-        info!("Input file is mostly valid UTF-8");
+        info!(
+            "Input file is mostly valid UTF-8 (invalid characters: {})",
+            num_utf8_invalid
+        );
        return ProbableFileKind::Text(utf8_string);
    }
@@ -206,11 +209,14 @@ pub fn guess_content(bytes: &[u8]) -> ProbableFileKind {
    let utf16_string = String::from_utf16_lossy(&u16_values);
    let num_utf16_invalid = utf16_string
        .chars()
-        .take(1000)
+        .take(5000)
        .filter(|c| *c == std::char::REPLACEMENT_CHARACTER)
        .count();
    if num_utf16_invalid <= 5 {
-        info!("Input file is mostly valid UTF-16");
+        info!(
+            "Input file is mostly valid UTF-16 (invalid characters: {})",
+            num_utf16_invalid
+        );
        return ProbableFileKind::Text(utf16_string);
    }