From 084a72b558f1813fb7fe34035a2c2c256079c08a Mon Sep 17 00:00:00 2001
From: Wilfred Hughes
Date: Sat, 12 Jul 2025 00:35:17 +0100
Subject: [PATCH] Stop supporting invalid UTF-16 when decoding files

---
 src/files.rs | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/src/files.rs b/src/files.rs
index e77a8e118..9534ab9f8 100644
--- a/src/files.rs
+++ b/src/files.rs
@@ -220,7 +220,10 @@ pub(crate) fn guess_content(
         _ => {}
     }
 
-    // If the input bytes are *almost* valid UTF-8, treat them as UTF-8.
+    // If the input bytes are *almost* valid UTF-8, treat them as
+    // UTF-8. This is helpful when the user has written a small number
+    // of bad bytes to a file. Users would still like to be able to
+    // diff these files.
     let utf8_string = String::from_utf8_lossy(bytes).to_string();
     let num_utf8_invalid = utf8_string
         .chars()
@@ -235,21 +238,8 @@ pub(crate) fn guess_content(
         return ProbableFileKind::Text(utf8_string);
     }
 
-    // If the input bytes are *almost* valid UTF-16, treat them as
-    // UTF-16.
-    let utf16_string = String::from_utf16_lossy(&u16_values);
-    let num_utf16_invalid = utf16_string
-        .chars()
-        .take(50000)
-        .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
-        .count();
-    if num_utf16_invalid <= 1 {
-        info!(
-            "Input file is mostly valid UTF-16 (invalid characters: {})",
-            num_utf16_invalid
-        );
-        return ProbableFileKind::Text(utf16_string);
-    }
+    // Deliberately don't check for mostly-valid UTF-16 due to the
+    // high UTF-16 false positive rate on binary files.
 
     // If the input bytes are mostly valid Windows-1252 (an extension of
     // ISO-8859-1 aka Latin 1), treat them as such.