Be stricter about slightly-invalid text decoding

pull/864/head
Wilfred Hughes 2025-07-10 09:26:01 +07:00
parent 0010944ef5
commit 73a293ffc9
2 changed files with 6 additions and 8 deletions

@ -4,10 +4,8 @@
Updated Clojure, Common Lisp and Zig parsers.
File detection is now stricter with Windows-1252 (an extension of
Latin 1) encoded text. Windows-1252 support was added in 0.63, and some
binary files (e.g. Brotli-compressed files) were incorrectly treated as
this encoding.
Text encoding detection is now stricter, fixing more cases where
binary files were treated as text.
Added the `--override-binary` option to force files to be treated as
binary rather than text.

@ -223,7 +223,7 @@ pub(crate) fn guess_content(
let utf8_string = String::from_utf8_lossy(bytes).to_string();
let num_utf8_invalid = utf8_string
.chars()
.take(5000)
.take(50000)
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
.count();
if num_utf8_invalid <= 2 {
@ -239,7 +239,7 @@ pub(crate) fn guess_content(
let utf16_string = String::from_utf16_lossy(&u16_values);
let num_utf16_invalid = utf16_string
.chars()
.take(5000)
.take(50000)
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
.count();
if num_utf16_invalid <= 1 {
@ -250,13 +250,13 @@ pub(crate) fn guess_content(
return ProbableFileKind::Text(utf16_string);
}
// If the input bytes are valid Windows-1252 (an extension of
// If the input bytes are mostly valid Windows-1252 (an extension of
// ISO-8859-1 aka Latin 1), treat them as such.
let (latin1_str, _encoding, saw_malformed) = encoding_rs::WINDOWS_1252.decode(bytes);
if !saw_malformed {
let num_null = latin1_str
.chars()
.take(5000)
.take(50000)
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
.count();
if num_null <= 1 {