Make Windows-1252 detection stricter, consistent with other encodings

This should help with #841.
2025-06-20 23:20:48 +07:00 · 2025-06-20 23:20:48 +07:00 · b14cc7c96f
parent 43eb109472
commit b14cc7c96f
2 changed files with 14 additions and 2 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,12 @@
 ## 0.65 (unreleased)

+### Parsing
+
+File detection is now stricter with Windows-1252 (Latin 1) encoded
+text. Windows-1252 support was added in 0.63, and some binary files
+(e.g. Brotli-compressed files) were incorrectly detected as text in
+this encoding.
+
 ## 0.64 (released 16th June 2025)

 ### Parsing
--- a/src/files.rs
+++ b/src/files.rs
@ -188,13 +188,14 @@ pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind {
    // to be valid UTF-16. Decoding these as UTF-16 leads to garbage
    // ("mojibake").
    //
-    // To avoid this, we only try UTF-16 after we'vedone MIME type
+    // To avoid this, we only try UTF-16 after we've done MIME type
    // checks for binary, and we conservatively require an explicit
    // byte order mark.
    let u16_values = u16_from_bytes(bytes);
    let utf16_str_result = String::from_utf16(&u16_values);
    match utf16_str_result {
        Ok(valid_utf16_string) if has_utf16_byte_order_mark(bytes) => {
+            info!("Input file is valid UTF-16 with a byte order mark");
            return ProbableFileKind::Text(valid_utf16_string);
        }
        _ => {}
@ -238,9 +239,13 @@ pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind {
        let num_null = utf16_string
            .chars()
            .take(5000)
-            .filter(|c| *c == '\0')
+            .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
            .count();
        if num_null <= 1 {
+            info!(
+                "Input file is mostly valid Latin 1 (invalid characters: {})",
+                num_null
+            );
            return ProbableFileKind::Text(latin1_str.to_string());
        }
    }