Attempt to detect and decode UTF-16 files too

Closes #345
2022-08-28 15:37:09 +07:00 · 2022-08-28 15:37:09 +07:00 · b1b3756fa7
parent 8b5642ef51
commit b1b3756fa7
4 changed files with 84 additions and 34 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,10 @@
 ## 0.35 (unreleased)

+### Parsing
+
+Difftastic will now autodetect files in UTF-16BE and
+UTF-16LE. Previously it required files to be UTF-8.
+
 ## 0.34 (released 27th August 2022)

 ### Build
--- a/sample_files/compare.expected
+++ b/sample_files/compare.expected
@ -181,6 +181,9 @@ sample_files/toml_before.toml sample_files/toml_after.toml
 sample_files/typing_before.ml sample_files/typing_after.ml
 3941fd44b0bf744da834a0b3eda1ba76  -

+sample_files/utf16_before.py sample_files/utf16_after.py
+23ae372384bdddc7dd8745c22fab580d  -
+
 sample_files/whitespace_before.tsx sample_files/whitespace_after.tsx
 c4151c5a44b11e04fd11c2594597ed33  -

--- a/src/files.rs
+++ b/src/files.rs
@ -75,40 +75,81 @@ pub fn read_or_die(path: &Path) -> Vec<u8> {
    }
 }

+fn utf16_from_bytes_lossy(bytes: &[u8]) -> String {
+    let is_big_endian = match &bytes {
+        [0xfe, 0xff, ..] => true,
+        [0xff, 0xfe, ..] => false,
+        _ => false, // assume little endian if no BOM is present.
+    };
+
+    // https://stackoverflow.com/a/57172592
+    let u16_values: Vec<u16> = bytes
+        .chunks_exact(2)
+        .into_iter()
+        .map(|a| {
+            if is_big_endian {
+                u16::from_be_bytes([a[0], a[1]])
+            } else {
+                u16::from_le_bytes([a[0], a[1]])
+            }
+        })
+        .collect();
+    String::from_utf16_lossy(u16_values.as_slice())
+}
+
+pub enum ProbableFileKind {
+    Text(String),
+    Binary,
+}
+
 /// Do these bytes look like a binary (non-textual) format?
-pub fn is_probably_binary(bytes: &[u8]) -> bool {
+pub fn guess_content(bytes: &[u8]) -> ProbableFileKind {
    // Only consider the first 1,000 bytes, as tree_magic_mini
    // considers the entire file, which is very slow on large files.
-    let mut bytes = bytes;
-    if bytes.len() > 1000 {
-        bytes = &bytes[..1000];
+    let mut magic_bytes = bytes;
+    if magic_bytes.len() > 1000 {
+        magic_bytes = &magic_bytes[..1000];
    }

-    let mime = tree_magic_mini::from_u8(bytes);
+    let mime = tree_magic_mini::from_u8(magic_bytes);
    match mime {
        // Treat pdf as binary.
-        "application/pdf" => return true,
+        "application/pdf" => return ProbableFileKind::Binary,
        // Treat all image content as binary.
-        v if v.starts_with("image/") => return true,
+        v if v.starts_with("image/") => return ProbableFileKind::Binary,
        // Treat all audio content as binary.
-        v if v.starts_with("audio/") => return true,
+        v if v.starts_with("audio/") => return ProbableFileKind::Binary,
        // Treat all video content as binary.
-        v if v.starts_with("video/") => return true,
+        v if v.starts_with("video/") => return ProbableFileKind::Binary,
        // Treat all font content as binary.
-        v if v.starts_with("font/") => return true,
+        v if v.starts_with("font/") => return ProbableFileKind::Binary,
        _ => {}
    }

    // If more than 20 of the first 1,000 characters are null bytes or
    // invalid UTF-8, we assume it's binary.
-    let num_replaced = String::from_utf8_lossy(bytes)
-        .to_string()
+    let utf8_string = String::from_utf8_lossy(bytes).to_string();
+
+    let num_utf8_invalid = utf8_string
        .chars()
        .take(1000)
        .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
        .count();
+    if num_utf8_invalid <= 20 {
+        return ProbableFileKind::Text(utf8_string);
+    }
+
+    let utf16_string = utf16_from_bytes_lossy(bytes);
+    let num_utf16_invalid = utf16_string
+        .chars()
+        .take(1000)
+        .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
+        .count();
+    if num_utf16_invalid <= 20 {
+        return ProbableFileKind::Text(utf16_string);
+    }

-    num_replaced > 20
+    ProbableFileKind::Binary
 }

 /// All the files in `dir`, including subdirectories.
@ -180,13 +221,13 @@ mod tests {
    use super::*;

    #[test]
-    fn test_text_is_not_binary() {
+    fn test_plaintext_is_text() {
        let s = "hello world";
-        assert!(!is_probably_binary(s.as_bytes()));
+        assert!(matches!(guess_content(s.as_bytes()), ProbableFileKind::Text(_)));
    }
    #[test]
    fn test_null_bytes_are_binary() {
        let s = "\0".repeat(1000);
-        assert!(is_probably_binary(s.as_bytes()));
+        assert!(matches!(guess_content(s.as_bytes()), ProbableFileKind::Binary));
    }
 }
--- a/src/main.rs
+++ b/src/main.rs
@ -36,7 +36,9 @@ use crate::parse::syntax;
 use diff::changes::ChangeMap;
 use diff::dijkstra::ExceededGraphLimit;
 use display::context::opposite_positions;
-use files::{is_probably_binary, read_files_or_die, read_or_die, relative_paths_in_either};
+use files::{
+    guess_content, read_files_or_die, read_or_die, relative_paths_in_either, ProbableFileKind,
+};
 use log::info;
 use mimalloc::MiMalloc;
 use parse::guess_language::guess;
@ -214,26 +216,25 @@ fn diff_file_content(
    byte_limit: usize,
    language_override: Option<parse::guess_language::Language>,
 ) -> DiffResult {
-    if is_probably_binary(lhs_bytes) || is_probably_binary(rhs_bytes) {
-        return DiffResult {
-            lhs_display_path: lhs_display_path.into(),
-            rhs_display_path: rhs_display_path.into(),
-            language: None,
-            lhs_src: FileContent::Binary(lhs_bytes.to_vec()),
-            rhs_src: FileContent::Binary(rhs_bytes.to_vec()),
-            lhs_positions: vec![],
-            rhs_positions: vec![],
-        };
-    }
+    let (mut lhs_src, mut rhs_src) = match (guess_content(lhs_bytes), guess_content(rhs_bytes)) {
+        (ProbableFileKind::Binary, _) | (_, ProbableFileKind::Binary) => {
+            return DiffResult {
+                lhs_display_path: lhs_display_path.into(),
+                rhs_display_path: rhs_display_path.into(),
+                language: None,
+                lhs_src: FileContent::Binary(lhs_bytes.to_vec()),
+                rhs_src: FileContent::Binary(rhs_bytes.to_vec()),
+                lhs_positions: vec![],
+                rhs_positions: vec![],
+            };
+        }
+        (ProbableFileKind::Text(lhs_src), ProbableFileKind::Text(rhs_src)) => (lhs_src, rhs_src),
+    };

    // TODO: don't replace tab characters inside string literals.
    let tab_as_spaces = " ".repeat(tab_width);
-    let mut lhs_src = String::from_utf8_lossy(lhs_bytes)
-        .to_string()
-        .replace('\t', &tab_as_spaces);
-    let mut rhs_src = String::from_utf8_lossy(rhs_bytes)
-        .to_string()
-        .replace('\t', &tab_as_spaces);
+    lhs_src = lhs_src.replace('\t', &tab_as_spaces);
+    rhs_src = rhs_src.replace('\t', &tab_as_spaces);

    // Ignore the trailing newline, if present.
    // TODO: highlight if this has changes (#144).