//! File reading utilities. use std::io::Read; use std::{ fs, io::ErrorKind::*, path::{Path, PathBuf}, }; use ignore::WalkBuilder; use crate::exit_codes::EXIT_BAD_ARGUMENTS; use crate::hash::DftHashSet; use crate::options::FileArgument; pub(crate) fn read_file_or_die(path: &FileArgument) -> Vec { match read_file_arg(path) { Ok(src) => src, Err(e) => { eprint_read_error(path, &e); std::process::exit(EXIT_BAD_ARGUMENTS); } } } pub(crate) fn read_files_or_die( lhs_path: &FileArgument, rhs_path: &FileArgument, missing_as_empty: bool, ) -> (Vec, Vec) { let lhs_res = read_file_arg(lhs_path); let rhs_res = read_file_arg(rhs_path); match (lhs_res, rhs_res) { // Both files exist, the happy case. (Ok(lhs_src), Ok(rhs_src)) => (lhs_src, rhs_src), // Proceed if we've been given two paths and only one // exists. This is important for mercurial diffs when a file // has been removed. (Ok(lhs_src), Err(e)) if missing_as_empty && e.kind() == NotFound => (lhs_src, vec![]), (Err(e), Ok(rhs_src)) if missing_as_empty && e.kind() == NotFound => (vec![], rhs_src), (lhs_res, rhs_res) => { // Something else went wrong. Print both errors // encountered. if let Err(e) = lhs_res { eprint_read_error(lhs_path, &e); } if let Err(e) = rhs_res { eprint_read_error(rhs_path, &e); } std::process::exit(EXIT_BAD_ARGUMENTS); } } } /// Read a path provided in a CLI argument, handling /dev/null and - /// correctly. fn read_file_arg(file_arg: &FileArgument) -> std::io::Result> { match file_arg { FileArgument::NamedPath(path) => fs::read(path), FileArgument::Stdin => { let stdin = std::io::stdin(); let mut handle = stdin.lock(); let mut bytes = vec![]; handle.read_to_end(&mut bytes)?; Ok(bytes) } FileArgument::DevNull => { // Treat /dev/null as an empty file, even on platforms like // Windows where this path doesn't exist. Git uses /dev/null // regardless of the platform. Ok(vec![]) } } } /// Write a human-friendly description of `e` to stderr. 
fn eprint_read_error(file_arg: &FileArgument, e: &std::io::Error) {
    match e.kind() {
        std::io::ErrorKind::NotFound => {
            eprintln!("No such file: {}", file_arg);
        }
        std::io::ErrorKind::PermissionDenied => {
            eprintln!("Permission denied when reading file: {}", file_arg);
        }
        // For any other error kind, check whether the user passed a
        // directory, as that allows a much clearer message.
        _ => match file_arg {
            FileArgument::NamedPath(path) if path.is_dir() => {
                eprintln!("Expected a file, got a directory: {}", path.display());
            }
            _ => eprintln!("Could not read file: {} (error {:?})", file_arg, e.kind()),
        },
    };
}

/// Read the bytes of the file at `path`.
///
/// If reading fails, a description of the error is written to stderr
/// and the process exits with `EXIT_BAD_ARGUMENTS`.
pub(crate) fn read_or_die(path: &Path) -> Vec<u8> {
    match fs::read(path) {
        Ok(src) => src,
        Err(e) => {
            eprint_read_error(&FileArgument::NamedPath(path.to_path_buf()), &e);
            std::process::exit(EXIT_BAD_ARGUMENTS);
        }
    }
}

/// Do `bytes` start with a UTF-16 byte order mark (of either
/// endianness)?
fn has_utf16_byte_order_mark(bytes: &[u8]) -> bool {
    matches!(bytes, [0xfe, 0xff, ..] | [0xff, 0xfe, ..])
}

/// Group bytes into u16 values for conversion to UTF-16, respecting
/// the byte order mark if present.
///
/// The byte order mark itself is included in the output, and a
/// trailing odd byte is dropped.
fn u16_from_bytes(bytes: &[u8]) -> Vec<u16> {
    // Only an explicit big endian BOM selects big endian. Assume
    // little endian if the BOM is little endian or not present.
    let is_big_endian = matches!(bytes, [0xfe, 0xff, ..]);

    // https://stackoverflow.com/a/57172592
    bytes
        .chunks_exact(2)
        .map(|a| {
            if is_big_endian {
                u16::from_be_bytes([a[0], a[1]])
            } else {
                u16::from_le_bytes([a[0], a[1]])
            }
        })
        .collect()
}

/// The result of guessing whether some bytes are text or binary.
#[derive(Debug, Eq, PartialEq)]
pub(crate) enum ProbableFileKind {
    /// The bytes look like text; holds the decoded content.
    Text(String),
    /// The bytes do not look like text in any supported encoding.
    Binary,
}

/// Do these bytes look like a binary (non-textual) format?
///
/// The checks are applied in order: perfectly valid UTF-8, MIME type
/// detection of known binary formats, valid UTF-16 with a byte order
/// mark, mostly-valid UTF-8, mostly-valid UTF-16, and finally
/// Windows-1252. If none of them accepts the input, it is considered
/// binary.
pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind {
    // If the bytes are entirely valid UTF-8, treat them as a string.
    if let Ok(valid_utf8_string) = std::str::from_utf8(bytes) {
        return ProbableFileKind::Text(valid_utf8_string.to_owned());
    }

    // Only consider the first 1,000 bytes, as tree_magic_mini
    // considers the entire file, which is very slow on large files.
    let magic_bytes = &bytes[..bytes.len().min(1000)];

    let mime = tree_magic_mini::from_u8(magic_bytes);
    info!("MIME type detected: {}", mime);

    // Use MIME type detection to guess whether a file is binary. This
    // has false positives and false negatives, so only check the MIME
    // type after allowing perfect text files (see issue #433).
    match mime {
        // Treat pdf as binary.
        "application/pdf" => return ProbableFileKind::Binary,
        // application/* is a mix of stuff, application/json is fine
        // but application/zip is binary that often decodes as valid
        // UTF-16.
        "application/gzip" => return ProbableFileKind::Binary,
        "application/zip" => return ProbableFileKind::Binary,
        // Treat all image content as binary.
        v if v.starts_with("image/") => return ProbableFileKind::Binary,
        // Treat all audio content as binary.
        v if v.starts_with("audio/") => return ProbableFileKind::Binary,
        // Treat all video content as binary.
        v if v.starts_with("video/") => return ProbableFileKind::Binary,
        // Treat all font content as binary.
        v if v.starts_with("font/") => return ProbableFileKind::Binary,
        _ => {}
    }

    // Note that many binary files and mostly-valid UTF-8 files happen
    // to be valid UTF-16. Decoding these as UTF-16 leads to garbage
    // ("mojibake").
    //
    // To avoid this, we only try UTF-16 after we've done MIME type
    // checks for binary, and we conservatively require an explicit
    // byte order mark.
    let u16_values = u16_from_bytes(bytes);
    let utf16_str_result = String::from_utf16(&u16_values);
    match utf16_str_result {
        Ok(valid_utf16_string) if has_utf16_byte_order_mark(bytes) => {
            return ProbableFileKind::Text(valid_utf16_string);
        }
        _ => {}
    }

    // If the input bytes are *almost* valid UTF-8, treat them as UTF-8.
    let utf8_string = String::from_utf8_lossy(bytes).to_string();
    let num_utf8_invalid = utf8_string
        .chars()
        .take(5000)
        .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
        .count();
    if num_utf8_invalid <= 10 {
        info!(
            "Input file is mostly valid UTF-8 (invalid characters: {})",
            num_utf8_invalid
        );
        return ProbableFileKind::Text(utf8_string);
    }

    // If the input bytes are *almost* valid UTF-16, treat them as
    // UTF-16.
    let utf16_string = String::from_utf16_lossy(&u16_values);
    let num_utf16_invalid = utf16_string
        .chars()
        .take(5000)
        .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
        .count();
    if num_utf16_invalid <= 1 {
        info!(
            "Input file is mostly valid UTF-16 (invalid characters: {})",
            num_utf16_invalid
        );
        return ProbableFileKind::Text(utf16_string);
    }

    // If the input bytes are valid Windows-1252 (an extension of
    // ISO-8859-1 aka Latin 1), treat them as such.
    let (latin1_str, _encoding, saw_malformed) = encoding_rs::WINDOWS_1252.decode(bytes);
    if !saw_malformed {
        // Count NUL characters in the Windows-1252 decoding itself,
        // since that is the text we would return. Counting them in the
        // UTF-16 decoding misses NUL bytes that are not part of an
        // aligned 0x0000 pair.
        let num_null = latin1_str
            .chars()
            .take(5000)
            .filter(|c| *c == '\0')
            .count();
        if num_null <= 1 {
            return ProbableFileKind::Text(latin1_str.to_string());
        }
    }

    ProbableFileKind::Binary
}

/// All the files in `dir`, including subdirectories.
///
/// The returned paths are relative to `dir`.
fn relative_file_paths_in_dir(dir: &Path) -> Vec<PathBuf> {
    // Walk all the files in `dir`. Apart from the settings below, the
    // walker keeps the default filtering of `WalkBuilder`.
    let walker = WalkBuilder::new(dir)
        // Include files whose name starts with a dot.
        .hidden(false)
        // Exclude the .git directory.
        .filter_entry(|e| {
            !(e.file_type().map(|ft| ft.is_dir()).unwrap_or(false) && e.file_name() == ".git")
        })
        .build();

    walker
        // Entries that could not be read are silently skipped.
        .filter_map(Result::ok)
        .map(|entry| entry.path().to_path_buf())
        .filter(|path| !path.is_dir())
        .map(|path| {
            path.strip_prefix(dir)
                .expect("paths yielded by walking `dir` should start with `dir`")
                .to_path_buf()
        })
        .collect()
}

/// Walk `lhs_dir` and `rhs_dir`, and return relative paths of files
/// that occur in at least one directory.
///
/// Attempts to preserve the ordering of files in both directories.
pub(crate) fn relative_paths_in_either(lhs_dir: &Path, rhs_dir: &Path) -> Vec { let lhs_paths = relative_file_paths_in_dir(lhs_dir); let rhs_paths = relative_file_paths_in_dir(rhs_dir); let mut seen = DftHashSet::default(); let mut paths: Vec = vec![]; let mut i = 0; let mut j = 0; loop { match (lhs_paths.get(i), rhs_paths.get(j)) { (Some(lhs_path), Some(rhs_path)) if lhs_path == rhs_path => { if !seen.contains(lhs_path) { // It should be impossible to get duplicates, but // be defensive. paths.push(lhs_path.clone()); seen.insert(lhs_path); } i += 1; j += 1; } (Some(lhs_path), Some(rhs_path)) => { if seen.contains(lhs_path) { i += 1; } else if seen.contains(rhs_path) { j += 1; } else { paths.push(lhs_path.clone()); paths.push(rhs_path.clone()); seen.insert(lhs_path); seen.insert(rhs_path); i += 1; j += 1; } } _ => break, } } paths.extend( lhs_paths[i..] .iter() .filter(|&path| !seen.contains(path)) .cloned(), ); paths.extend( rhs_paths[j..] .iter() .filter(|&path| !seen.contains(path)) .cloned(), ); paths } #[cfg(test)] mod tests { use super::*; #[test] fn test_plaintext_is_text() { let s = "hello world"; assert!(matches!( guess_content(s.as_bytes()), ProbableFileKind::Text(_) )); } #[test] // This test only works if the MIME database is present, so make // it opt-in. 
#[ignore] fn test_gzip_is_binary() { // Bytes for "echo "[]" | gzip -c | hexdump -C" let bytes = vec![ 0x1f, 0x8b, 0x08, 0x00, 0x3a, 0xb0, 0x91, 0x63, 0x00, 0x03, 0x8b, 0x8e, 0xe5, 0x02, 0x00, 0x44, 0xd2, 0x68, 0x70, 0x03, 0x00, 0x00, 0x00, ]; assert_eq!(guess_content(&bytes), ProbableFileKind::Binary); } #[test] fn test_dex_is_binary() { let bytes = vec![ 0x34, 0x8a, 0x4b, 0x8f, 0x77, 0xa4, 0x4e, 0xb1, 0x31, 0x2d, 0x5f, 0xfb, 0x10, 0x08, 0xa8, 0x6b, 0x58, 0x06, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xac, 0x05, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, ]; assert_eq!(guess_content(&bytes), ProbableFileKind::Binary); } #[test] fn test_png_bytes_are_binary() { let bytes = vec![ 0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 0x00, 0x00, 0x00, 0x0d, 0x49, 0x48, 0x44, 0x52, 0x00, 0x00, 0x03, 0x76, 0x00, 0x00, 0x01, 0xed, 0x08, 0x06, 0x00, 0x00, 0x01, 0x22, 0x94, 0xdc, 0xb2, 0x00, 0x00, 0x1b, 0x5a, 0x7a, 0x54, 0x58, 0x74, 0x52, 0x61, 0x77, 0x20, 0x70, 0x72, 0x6f, 0x66, 0x69, 0x6c, 0x65, 0x20, 0x74, 0x79, 0x70, 0x65, 0x20, 0x65, 0x78, 0x69, 0x66, 0x00, 0x00, 0x78, 0xda, 0xad, 0x9b, 0x69, 0x76, 0x1c, 0xb7, 0x92, 0x85, 0xff, 0x63, 0x15, 0xbd, 0x04, 0x4c, 0x81, 0x61, 0x39, 0x18, 0xcf, 0x79, 0x3b, 0xe8, 0xe5, 0xf7, 0x77, 0xb3, 0x48, 0x8a, 0x92, 0xa8, 0xc1, 0xcf, 0x2d, 0xda, 0x2a, 0xba, 0x2a, 0x2b, 0x13, 0x19, 0xc3, 0x1d, 0x02, 0x69, 0x77, 0xfe, 0xf7, 0x3f, 0xd7, 0xfd, 0x0f, 0x7f, 0x4a, 0x6f, 0xc5, 0x65, 0xab, 0xad, 0xf4, 0x52, 0x3c, 0x7f, 0x72, 0xcf, 0x3d, 0x0e, 0x7e, 0x69, 0xfe, 0xdb, 0x9f, 0xf3, 0xf6, 0xda, 0xbd, 0xbf, 0xfa, 0x3d, 0xf3, 0x6f, 0x78, 0x7b, 0x2f, 0x7c, 0x1c, 0x15, 0xdc, 0x77, 0x1f, 0xbc, 0xff, 0x16, 0xf8, 0xf9, 0xf4, 0x7e, 0x7b, 0x3f, 0x59, 0xe4, 0x9d, 0xc4, 0x6b, 0x7a, 0x7b, 0xff, 0xfd, 0x5b, 0x49, 0x27, 0x4a, 0x9f, 0x4e, 0x54, 0x3e, 0x5e, 0xc3, 0x57, 0xef, 0x07, 0xfb, 0xe1, 0xfd, 0xf4, 0x71, 0xf9, 0xf8, 0xdd, 0x8a, 0x5a, 0xf9, 0xb8, 0x72, 0xfc, 0xfc, 0xfe, 0x3c, 0x61, 
0xf8, 0xcf, 0x7f, 0xda, 0xb7, 0x7f, 0xef, 0xdd, 0xed, 0x72, 0xcf, 0x4e, 0x47, 0xe7, 0x91, 0x0b, 0xf1, 0x29, 0x6f, 0x37, 0xf5, 0x7e, 0x8b, 0xcf, 0x6f, 0x1c, 0x38, 0x09, 0x4c, 0x7a, 0xbe, 0x56, 0xf8, 0xa9, 0xfc, 0x6b, 0xfc, 0x5e, 0x9f, 0x9f, 0xce, 0x4f, 0xf3, 0xc3, 0x2f, 0x17, 0xb2, 0xdf, 0x7e, 0xf9, 0xc9, 0xcf, 0x0a, 0x3d, 0xc4, 0x90, 0xfc, 0x0d, 0x39, 0xec, 0x30, 0xc2, 0x0d, 0xe7, 0x79, 0x5d, 0x61, 0xb1, 0xc4, 0x1c, 0x4f, 0xac, 0xbc, 0xc6, 0xb8, 0x62, 0x7a, 0xde, 0x6b, 0xa9, 0xc6, 0x1e, 0x57, 0xf2, 0x29, 0xa4, 0xec, 0x52, 0x4e, 0x39, 0xdc, 0x58, 0x53, 0x4f, 0x3b, 0xb5, 0x14, 0xd3, 0x8a, 0x27, 0x25, 0xde, 0x8d, 0x1f, 0x6b, 0x09, 0xcf, 0x75, 0xfb, 0x73, 0xb9, 0x15, 0x1a, 0x17, 0xde, 0x81, 0x23, 0x63, 0xe0, 0x64, 0x81, 0x6f, 0x3c, 0x3f, 0xee, 0xfd, 0x97, 0x7f, 0xfb, 0xf3, 0xe5, 0x89, 0xee, 0x5d, 0x0a, 0x51, 0x50, 0x30, 0x49, 0x7d, 0x78, 0x25, 0x38, 0x46, 0xa5, 0x21, 0x28, 0x8a, 0x49, 0x7f, 0x73, 0x14, 0x09, 0x09, 0xf7, 0xbd, 0x8e, 0xec, 0x09, 0xf0, 0xfb, 0xcf, 0x8f, 0x7f, 0x48, 0x2c, 0x27, 0xc9, 0x1c, 0xa5, 0x30, 0x37, 0x6e, 0x70, 0xf8, 0xf9, 0x3a, 0xc5, 0xb4, 0xf0, 0xad, 0xb6, 0x92, 0x7b, 0x12, 0x9d, 0x38, 0xd0, 0x78, 0xcd, 0xaf, 0x2f, 0xd7, 0xfd, 0x76, 0x02, 0x42, 0xc4, 0xb5, 0x8d, 0xc5, 0x84, 0x44, 0x06, 0x7c, 0x09, 0xc9, 0x42, 0x09, 0xbe, 0xc6, 0x58, 0x43, 0x20, 0x8e, 0x8d, 0xfc, 0x0c, 0x56, 0x1e, 0x93, 0xcb, 0x71, 0x92, 0x82, 0x60, 0x16, 0x37, ]; assert_eq!(guess_content(&bytes), ProbableFileKind::Binary); } }