diff --git a/CHANGELOG.md b/CHANGELOG.md index 43cfd2c97..6ec281a49 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,10 +4,8 @@ Updated Clojure, Common Lisp and Zig parsers. -File detection is now stricter with Windows-1252 (Latin 1) encoded -text. Windows-1252 was added in 0.63 and some binary files -(e.g. Brotli compressed files) were incorrectly treated as this -encoding. +Text encoding detection is now stricter, fixing more cases where +binary files were treated as text. Added the `--override-binary` option to force files to be treated as binary rather than text. diff --git a/src/files.rs b/src/files.rs index 3d23fd954..629cbae56 100644 --- a/src/files.rs +++ b/src/files.rs @@ -223,7 +223,7 @@ pub(crate) fn guess_content( let utf8_string = String::from_utf8_lossy(bytes).to_string(); let num_utf8_invalid = utf8_string .chars() - .take(5000) + .take(50000) .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0') .count(); if num_utf8_invalid <= 2 { @@ -239,7 +239,7 @@ pub(crate) fn guess_content( let utf16_string = String::from_utf16_lossy(&u16_values); let num_utf16_invalid = utf16_string .chars() - .take(5000) + .take(50000) .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0') .count(); if num_utf16_invalid <= 1 { @@ -250,13 +250,13 @@ pub(crate) fn guess_content( return ProbableFileKind::Text(utf16_string); } - // If the input bytes are valid Windows-1252 (an extension of + // If the input bytes are mostly valid Windows-1252 (an extension of // ISO-8859-1 aka Latin 1), treat them as such. let (latin1_str, _encoding, saw_malformed) = encoding_rs::WINDOWS_1252.decode(bytes); if !saw_malformed { let num_null = latin1_str .chars() - .take(5000) + .take(50000) .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0') .count(); if num_null <= 1 {