Be stricter about text encoding validity

A threshold of 20 invalid characters was too lenient for .zip files, where
we were only seeing 21 invalid characters in UTF-8 and 6 in UTF-16.

Improves #358 further
pull/369/head
Wilfred Hughes 2022-09-10 15:43:11 +07:00
parent fe30b7d86b
commit 888894d0f0
1 changed file with 2 additions and 2 deletions

@@ -139,7 +139,7 @@ pub fn guess_content(bytes: &[u8]) -> ProbableFileKind {
.take(1000)
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
.count();
-if num_utf8_invalid <= 20 {
+if num_utf8_invalid <= 10 {
return ProbableFileKind::Text(utf8_string);
}
@@ -149,7 +149,7 @@ pub fn guess_content(bytes: &[u8]) -> ProbableFileKind {
.take(1000)
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
.count();
-if num_utf16_invalid <= 20 {
+if num_utf16_invalid <= 5 {
return ProbableFileKind::Text(utf16_string);
}