Check more bytes when detecting encoding

I've observed PDF files whose headers are large enough that the files
were detected as text, which wasn't helpful.

Also improve logging to report how many invalid bytes were found.
pull/594/head
Wilfred Hughes 2023-07-21 08:34:41 +07:00
parent 1c0b3153df
commit 4e9637c861
2 changed files with 14 additions and 4 deletions

@ -8,6 +8,10 @@ Updated grammar for Java.
Improved parsing of qualified constructors in Haskell.
Difftastic is now stricter about valid UTF-8 and UTF-16, considering
more of the file's bytes during filetype detection. This fixes cases
where e.g. PDF files were sometimes incorrectly detected as UTF-8 text.
### Diffing
Improved handling of delimiters ("nested sliders") in languages that

@ -193,11 +193,14 @@ pub fn guess_content(bytes: &[u8]) -> ProbableFileKind {
let utf8_string = String::from_utf8_lossy(bytes).to_string(); let utf8_string = String::from_utf8_lossy(bytes).to_string();
let num_utf8_invalid = utf8_string let num_utf8_invalid = utf8_string
.chars() .chars()
.take(1000) .take(5000)
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER) .filter(|c| *c == std::char::REPLACEMENT_CHARACTER)
.count(); .count();
if num_utf8_invalid <= 10 { if num_utf8_invalid <= 10 {
info!("Input file is mostly valid UTF-8"); info!(
"Input file is mostly valid UTF-8 (invalid characters: {})",
num_utf8_invalid
);
return ProbableFileKind::Text(utf8_string); return ProbableFileKind::Text(utf8_string);
} }
@ -206,11 +209,14 @@ pub fn guess_content(bytes: &[u8]) -> ProbableFileKind {
let utf16_string = String::from_utf16_lossy(&u16_values); let utf16_string = String::from_utf16_lossy(&u16_values);
let num_utf16_invalid = utf16_string let num_utf16_invalid = utf16_string
.chars() .chars()
.take(1000) .take(5000)
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER) .filter(|c| *c == std::char::REPLACEMENT_CHARACTER)
.count(); .count();
if num_utf16_invalid <= 5 { if num_utf16_invalid <= 5 {
info!("Input file is mostly valid UTF-16"); info!(
"Input file is mostly valid UTF-16 (invalid characters: {})",
num_utf16_invalid
);
return ProbableFileKind::Text(utf16_string); return ProbableFileKind::Text(utf16_string);
} }