Be stricter about slightly-invalid text decoding

pull/864/head
Wilfred Hughes 2025-07-10 09:26:01 +07:00
parent 0010944ef5
commit 73a293ffc9
2 changed files with 6 additions and 8 deletions

@@ -4,10 +4,8 @@
Updated Clojure, Common Lisp and Zig parsers. Updated Clojure, Common Lisp and Zig parsers.
File detection is now stricter with Windows-1252 (Latin 1) encoded Text encoding detection is now stricter, fixing more cases where
text. Windows-1252 was added in 0.63 and some binary files binary files were treated as text.
(e.g. Brotli compressed files) were incorrectly treated as this
encoding.
Added the `--override-binary` option to force files to be treated as Added the `--override-binary` option to force files to be treated as
binary rather than text. binary rather than text.

@@ -223,7 +223,7 @@ pub(crate) fn guess_content(
let utf8_string = String::from_utf8_lossy(bytes).to_string(); let utf8_string = String::from_utf8_lossy(bytes).to_string();
let num_utf8_invalid = utf8_string let num_utf8_invalid = utf8_string
.chars() .chars()
.take(5000) .take(50000)
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0') .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
.count(); .count();
if num_utf8_invalid <= 2 { if num_utf8_invalid <= 2 {
@@ -239,7 +239,7 @@ pub(crate) fn guess_content(
let utf16_string = String::from_utf16_lossy(&u16_values); let utf16_string = String::from_utf16_lossy(&u16_values);
let num_utf16_invalid = utf16_string let num_utf16_invalid = utf16_string
.chars() .chars()
.take(5000) .take(50000)
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0') .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
.count(); .count();
if num_utf16_invalid <= 1 { if num_utf16_invalid <= 1 {
@@ -250,13 +250,13 @@ pub(crate) fn guess_content(
return ProbableFileKind::Text(utf16_string); return ProbableFileKind::Text(utf16_string);
} }
// If the input bytes are valid Windows-1252 (an extension of // If the input bytes are mostly valid Windows-1252 (an extension of
// ISO-8859-1 aka Latin 1), treat them as such. // ISO-8859-1 aka Latin 1), treat them as such.
let (latin1_str, _encoding, saw_malformed) = encoding_rs::WINDOWS_1252.decode(bytes); let (latin1_str, _encoding, saw_malformed) = encoding_rs::WINDOWS_1252.decode(bytes);
if !saw_malformed { if !saw_malformed {
let num_null = latin1_str let num_null = latin1_str
.chars() .chars()
.take(5000) .take(50000)
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0') .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
.count(); .count();
if num_null <= 1 { if num_null <= 1 {