Make Windows-1252 detection stricter, consistent with other encodings

This should help with #841.
pull/603/merge
Wilfred Hughes 2025-06-20 23:20:48 +07:00
parent 43eb109472
commit b14cc7c96f
2 changed files with 14 additions and 2 deletions

@ -1,5 +1,12 @@
## 0.65 (unreleased)
### Parsing
File detection is now stricter with Windows-1252 (Latin 1) encoded
text. Support for Windows-1252 was added in 0.63, and some binary
files (e.g. Brotli-compressed files) were incorrectly treated as
this encoding.
## 0.64 (released 16th June 2025)
### Parsing

@ -188,13 +188,14 @@ pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind {
// to be valid UTF-16. Decoding these as UTF-16 leads to garbage
// ("mojibake").
//
// To avoid this, we only try UTF-16 after we'vedone MIME type
// To avoid this, we only try UTF-16 after we've done MIME type
// checks for binary, and we conservatively require an explicit
// byte order mark.
let u16_values = u16_from_bytes(bytes);
let utf16_str_result = String::from_utf16(&u16_values);
match utf16_str_result {
Ok(valid_utf16_string) if has_utf16_byte_order_mark(bytes) => {
info!("Input file is valid UTF-16 with a byte order mark");
return ProbableFileKind::Text(valid_utf16_string);
}
_ => {}
@ -238,9 +239,13 @@ pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind {
let num_null = utf16_string
.chars()
.take(5000)
.filter(|c| *c == '\0')
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
.count();
if num_null <= 1 {
info!(
"Input file is mostly valid Latin 1 (invalid characters: {})",
num_null
);
return ProbableFileKind::Text(latin1_str.to_string());
}
}