mirror of https://github.com/Wilfred/difftastic/
438 lines
16 KiB
Rust
438 lines
16 KiB
Rust
//! File reading utilities.
|
|
|
|
use std::io::Read;
|
|
use std::{
|
|
fs,
|
|
io::ErrorKind::*,
|
|
path::{Path, PathBuf},
|
|
};
|
|
|
|
use ignore::WalkBuilder;
|
|
|
|
use crate::exit_codes::EXIT_BAD_ARGUMENTS;
|
|
use crate::hash::DftHashSet;
|
|
use crate::options::FileArgument;
|
|
|
|
pub(crate) fn read_file_or_die(path: &FileArgument) -> Vec<u8> {
|
|
match read_file_arg(path) {
|
|
Ok(src) => src,
|
|
Err(e) => {
|
|
eprint_read_error(path, &e);
|
|
std::process::exit(EXIT_BAD_ARGUMENTS);
|
|
}
|
|
}
|
|
}
|
|
|
|
pub(crate) fn read_files_or_die(
|
|
lhs_path: &FileArgument,
|
|
rhs_path: &FileArgument,
|
|
missing_as_empty: bool,
|
|
) -> (Vec<u8>, Vec<u8>) {
|
|
let lhs_res = read_file_arg(lhs_path);
|
|
let rhs_res = read_file_arg(rhs_path);
|
|
|
|
match (lhs_res, rhs_res) {
|
|
// Both files exist, the happy case.
|
|
(Ok(lhs_src), Ok(rhs_src)) => (lhs_src, rhs_src),
|
|
|
|
// Proceed if we've been given two paths and only one
|
|
// exists. This is important for mercurial diffs when a file
|
|
// has been removed.
|
|
(Ok(lhs_src), Err(e)) if missing_as_empty && e.kind() == NotFound => (lhs_src, vec![]),
|
|
(Err(e), Ok(rhs_src)) if missing_as_empty && e.kind() == NotFound => (vec![], rhs_src),
|
|
|
|
(lhs_res, rhs_res) => {
|
|
// Something else went wrong. Print both errors
|
|
// encountered.
|
|
if let Err(e) = lhs_res {
|
|
eprint_read_error(lhs_path, &e);
|
|
}
|
|
if let Err(e) = rhs_res {
|
|
eprint_read_error(rhs_path, &e);
|
|
}
|
|
std::process::exit(EXIT_BAD_ARGUMENTS);
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Read a path provided in a CLI argument, handling /dev/null and -
|
|
/// correctly.
|
|
fn read_file_arg(file_arg: &FileArgument) -> std::io::Result<Vec<u8>> {
|
|
match file_arg {
|
|
FileArgument::NamedPath(path) => fs::read(path),
|
|
FileArgument::Stdin => {
|
|
let stdin = std::io::stdin();
|
|
let mut handle = stdin.lock();
|
|
|
|
let mut bytes = vec![];
|
|
handle.read_to_end(&mut bytes)?;
|
|
Ok(bytes)
|
|
}
|
|
FileArgument::DevNull => {
|
|
// Treat /dev/null as an empty file, even on platforms like
|
|
// Windows where this path doesn't exist. Git uses /dev/null
|
|
// regardless of the platform.
|
|
Ok(vec![])
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Write a human-friendly description of `e` to stderr.
|
|
fn eprint_read_error(file_arg: &FileArgument, e: &std::io::Error) {
|
|
match e.kind() {
|
|
std::io::ErrorKind::NotFound => {
|
|
eprintln!("No such file: {}", file_arg);
|
|
}
|
|
std::io::ErrorKind::PermissionDenied => {
|
|
eprintln!("Permission denied when reading file: {}", file_arg);
|
|
}
|
|
_ => match file_arg {
|
|
FileArgument::NamedPath(path) if path.is_dir() => {
|
|
eprintln!("Expected a file, got a directory: {}", path.display());
|
|
}
|
|
_ => eprintln!("Could not read file: {} (error {:?})", file_arg, e.kind()),
|
|
},
|
|
};
|
|
}
|
|
|
|
pub(crate) fn read_or_die(path: &Path) -> Vec<u8> {
|
|
match fs::read(path) {
|
|
Ok(src) => src,
|
|
Err(e) => {
|
|
eprint_read_error(&FileArgument::NamedPath(path.to_path_buf()), &e);
|
|
std::process::exit(EXIT_BAD_ARGUMENTS);
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Does `bytes` begin with a UTF-16 byte order mark, in either byte
/// order?
fn has_utf16_byte_order_mark(bytes: &[u8]) -> bool {
    const BIG_ENDIAN_BOM: [u8; 2] = [0xfe, 0xff];
    const LITTLE_ENDIAN_BOM: [u8; 2] = [0xff, 0xfe];

    bytes.starts_with(&BIG_ENDIAN_BOM) || bytes.starts_with(&LITTLE_ENDIAN_BOM)
}
|
|
|
|
/// Pair up `bytes` into u16 code units ready for UTF-16 decoding.
///
/// A leading `FE FF` byte order mark selects big-endian decoding;
/// otherwise (including when there is no BOM at all) the bytes are
/// read as little-endian. The BOM itself is kept as the first code
/// unit, and a trailing odd byte is dropped.
fn u16_from_bytes(bytes: &[u8]) -> Vec<u16> {
    // Choose the decoder once, up front, rather than per pair.
    let decode_pair: fn([u8; 2]) -> u16 = if bytes.starts_with(&[0xfe, 0xff]) {
        u16::from_be_bytes
    } else {
        // Either an explicit little-endian BOM, or no BOM, in which
        // case we assume little endian.
        u16::from_le_bytes
    };

    // https://stackoverflow.com/a/57172592
    bytes
        .chunks_exact(2)
        .map(|pair| decode_pair([pair[0], pair[1]]))
        .collect()
}
|
|
|
|
/// The outcome of sniffing a file's bytes: either text that could be
/// decoded, or binary content.
#[derive(Debug, PartialEq, Eq)]
pub(crate) enum ProbableFileKind {
    /// The bytes look textual; holds the decoded contents.
    Text(String),
    /// The bytes do not look like text.
    Binary,
}
|
|
|
|
/// Do these bytes look like a binary (non-textual) format?
|
|
pub(crate) fn guess_content(
|
|
bytes: &[u8],
|
|
path: &FileArgument,
|
|
binary_overrides: &[glob::Pattern],
|
|
) -> ProbableFileKind {
|
|
if let FileArgument::NamedPath(path) = path {
|
|
let path = path.to_string_lossy();
|
|
for pattern in binary_overrides {
|
|
if pattern.matches(&path) {
|
|
info!(
|
|
"Input file is treated as binary due to explicit override glob {}",
|
|
pattern
|
|
);
|
|
return ProbableFileKind::Binary;
|
|
}
|
|
}
|
|
}
|
|
|
|
// If the bytes are entirely valid UTF-8, treat them as a string.
|
|
if let Ok(valid_utf8_string) = std::str::from_utf8(bytes) {
|
|
info!("Input file is valid UTF-8");
|
|
return ProbableFileKind::Text(valid_utf8_string.to_owned());
|
|
}
|
|
|
|
// Only consider the first 1,000 bytes, as tree_magic_mini
|
|
// considers the entire file, which is very slow on large files.
|
|
let mut magic_bytes = bytes;
|
|
if magic_bytes.len() > 1000 {
|
|
magic_bytes = &magic_bytes[..1000];
|
|
}
|
|
|
|
let mime = tree_magic_mini::from_u8(magic_bytes);
|
|
info!("MIME type detected: {}", mime);
|
|
|
|
// Use MIME type detection to guess whether a file is binary. This
|
|
// has false positives and false negatives, so only check the MIME
|
|
// type after allowing perfect text files (see issue #433).
|
|
match mime {
|
|
// Treat pdf as binary.
|
|
"application/pdf" => return ProbableFileKind::Binary,
|
|
// application/* is a mix of stuff, application/json is fine
|
|
// but application/zip is binary that often decodes as valid
|
|
// UTF-16.
|
|
//
|
|
// See
|
|
// <https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/MIME_types/Common_types>
|
|
// for a list of MIME types.
|
|
"application/x-bzip" => return ProbableFileKind::Binary,
|
|
"application/x-bzip2" => return ProbableFileKind::Binary,
|
|
"application/x-7zip-compressed" => return ProbableFileKind::Binary,
|
|
"application/gzip" => return ProbableFileKind::Binary,
|
|
"application/zip" => return ProbableFileKind::Binary,
|
|
"application/zstd" => return ProbableFileKind::Binary,
|
|
// Treat all image content as binary.
|
|
v if v.starts_with("image/") => return ProbableFileKind::Binary,
|
|
// Treat all audio content as binary.
|
|
v if v.starts_with("audio/") => return ProbableFileKind::Binary,
|
|
// Treat all video content as binary.
|
|
v if v.starts_with("video/") => return ProbableFileKind::Binary,
|
|
// Treat all font content as binary.
|
|
v if v.starts_with("font/") => return ProbableFileKind::Binary,
|
|
_ => {}
|
|
}
|
|
|
|
// Note that many binary files and mostly-valid UTF-8 files happen
|
|
// to be valid UTF-16. Decoding these as UTF-16 leads to garbage
|
|
// ("mojibake").
|
|
//
|
|
// To avoid this, we only try UTF-16 after we've done MIME type
|
|
// checks for binary, and we conservatively require an explicit
|
|
// byte order mark.
|
|
let u16_values = u16_from_bytes(bytes);
|
|
let utf16_str_result = String::from_utf16(&u16_values);
|
|
match utf16_str_result {
|
|
Ok(valid_utf16_string) if has_utf16_byte_order_mark(bytes) => {
|
|
info!("Input file is valid UTF-16 with a byte order mark");
|
|
return ProbableFileKind::Text(valid_utf16_string);
|
|
}
|
|
_ => {}
|
|
}
|
|
|
|
// If the input bytes are *almost* valid UTF-8, treat them as UTF-8.
|
|
let utf8_string = String::from_utf8_lossy(bytes).to_string();
|
|
let num_utf8_invalid = utf8_string
|
|
.chars()
|
|
.take(5000)
|
|
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
|
|
.count();
|
|
if num_utf8_invalid <= 2 {
|
|
info!(
|
|
"Input file is mostly valid UTF-8 (invalid characters: {})",
|
|
num_utf8_invalid
|
|
);
|
|
return ProbableFileKind::Text(utf8_string);
|
|
}
|
|
|
|
// If the input bytes are *almost* valid UTF-16, treat them as
|
|
// UTF-16.
|
|
let utf16_string = String::from_utf16_lossy(&u16_values);
|
|
let num_utf16_invalid = utf16_string
|
|
.chars()
|
|
.take(5000)
|
|
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
|
|
.count();
|
|
if num_utf16_invalid <= 1 {
|
|
info!(
|
|
"Input file is mostly valid UTF-16 (invalid characters: {})",
|
|
num_utf16_invalid
|
|
);
|
|
return ProbableFileKind::Text(utf16_string);
|
|
}
|
|
|
|
// If the input bytes are valid Windows-1252 (an extension of
|
|
// ISO-8859-1 aka Latin 1), treat them as such.
|
|
let (latin1_str, _encoding, saw_malformed) = encoding_rs::WINDOWS_1252.decode(bytes);
|
|
if !saw_malformed {
|
|
let num_null = utf16_string
|
|
.chars()
|
|
.take(5000)
|
|
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
|
|
.count();
|
|
if num_null <= 1 {
|
|
info!(
|
|
"Input file is mostly valid Latin 1 (invalid characters: {})",
|
|
num_null
|
|
);
|
|
return ProbableFileKind::Text(latin1_str.to_string());
|
|
}
|
|
}
|
|
|
|
ProbableFileKind::Binary
|
|
}
|
|
|
|
/// All the files in `dir`, including subdirectories.
|
|
fn relative_file_paths_in_dir(dir: &Path) -> Vec<PathBuf> {
|
|
// Walk all the files in `dir`, excluding those mentioned in .git.
|
|
let walker = WalkBuilder::new(dir)
|
|
// Include files whose name starts with a dot.
|
|
.hidden(false)
|
|
// Exclude the .git directory.
|
|
.filter_entry(|e| {
|
|
!(e.file_type().map(|ft| ft.is_dir()).unwrap_or(false) && e.file_name() == ".git")
|
|
})
|
|
.build();
|
|
|
|
walker
|
|
.filter_map(Result::ok)
|
|
.map(|entry| Path::new(entry.path()).to_owned())
|
|
.filter(|path| !path.is_dir())
|
|
.map(|path| path.strip_prefix(dir).unwrap().to_path_buf())
|
|
.collect()
|
|
}
|
|
|
|
/// Walk `lhs_dir` and `rhs_dir`, and return relative paths of files
|
|
/// that occur in at least one directory.
|
|
///
|
|
/// Attempts to preserve the ordering of files in both directories.
|
|
pub(crate) fn relative_paths_in_either(lhs_dir: &Path, rhs_dir: &Path) -> Vec<PathBuf> {
|
|
let lhs_paths = relative_file_paths_in_dir(lhs_dir);
|
|
let rhs_paths = relative_file_paths_in_dir(rhs_dir);
|
|
|
|
let mut seen = DftHashSet::default();
|
|
let mut paths: Vec<PathBuf> = vec![];
|
|
|
|
let mut i = 0;
|
|
let mut j = 0;
|
|
|
|
loop {
|
|
match (lhs_paths.get(i), rhs_paths.get(j)) {
|
|
(Some(lhs_path), Some(rhs_path)) if lhs_path == rhs_path => {
|
|
if !seen.contains(lhs_path) {
|
|
// It should be impossible to get duplicates, but
|
|
// be defensive.
|
|
paths.push(lhs_path.clone());
|
|
seen.insert(lhs_path);
|
|
}
|
|
|
|
i += 1;
|
|
j += 1;
|
|
}
|
|
(Some(lhs_path), Some(rhs_path)) => {
|
|
if seen.contains(lhs_path) {
|
|
i += 1;
|
|
} else if seen.contains(rhs_path) {
|
|
j += 1;
|
|
} else {
|
|
paths.push(lhs_path.clone());
|
|
paths.push(rhs_path.clone());
|
|
|
|
seen.insert(lhs_path);
|
|
seen.insert(rhs_path);
|
|
|
|
i += 1;
|
|
j += 1;
|
|
}
|
|
}
|
|
_ => break,
|
|
}
|
|
}
|
|
|
|
paths.extend(
|
|
lhs_paths[i..]
|
|
.iter()
|
|
.filter(|&path| !seen.contains(path))
|
|
.cloned(),
|
|
);
|
|
paths.extend(
|
|
rhs_paths[j..]
|
|
.iter()
|
|
.filter(|&path| !seen.contains(path))
|
|
.cloned(),
|
|
);
|
|
|
|
paths
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Classify `bytes` as if they had been read from stdin, with no
    /// binary override globs.
    fn guess_content(bytes: &[u8]) -> ProbableFileKind {
        super::guess_content(bytes, &FileArgument::Stdin, &[])
    }

    #[test]
    fn test_plaintext_is_text() {
        let kind = guess_content("hello world".as_bytes());
        assert!(matches!(kind, ProbableFileKind::Text(_)));
    }

    #[test]
    // This test only works if the MIME database is present, so make
    // it opt-in.
    #[ignore]
    fn test_gzip_is_binary() {
        // Bytes for "echo "[]" | gzip -c | hexdump -C"
        let gzip_bytes: &[u8] = &[
            0x1f, 0x8b, 0x08, 0x00, 0x3a, 0xb0, 0x91, 0x63, 0x00, 0x03, 0x8b, 0x8e, 0xe5, 0x02,
            0x00, 0x44, 0xd2, 0x68, 0x70, 0x03, 0x00, 0x00, 0x00,
        ];

        assert_eq!(guess_content(gzip_bytes), ProbableFileKind::Binary);
    }

    #[test]
    fn test_dex_is_binary() {
        let dex_bytes: &[u8] = &[
            0x34, 0x8a, 0x4b, 0x8f, 0x77, 0xa4, 0x4e, 0xb1, 0x31, 0x2d, 0x5f, 0xfb, 0x10, 0x08,
            0xa8, 0x6b, 0x58, 0x06, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x78, 0x56, 0x34, 0x12,
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xac, 0x05, 0x00, 0x00, 0x23, 0x00,
            0x00, 0x00, 0x70, 0x00, 0x00, 0x00,
        ];

        assert_eq!(guess_content(dex_bytes), ProbableFileKind::Binary);
    }

    #[test]
    fn test_png_bytes_are_binary() {
        let png_bytes: &[u8] = &[
            0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 0x00, 0x00, 0x00, 0x0d, 0x49, 0x48,
            0x44, 0x52, 0x00, 0x00, 0x03, 0x76, 0x00, 0x00, 0x01, 0xed, 0x08, 0x06, 0x00, 0x00,
            0x01, 0x22, 0x94, 0xdc, 0xb2, 0x00, 0x00, 0x1b, 0x5a, 0x7a, 0x54, 0x58, 0x74, 0x52,
            0x61, 0x77, 0x20, 0x70, 0x72, 0x6f, 0x66, 0x69, 0x6c, 0x65, 0x20, 0x74, 0x79, 0x70,
            0x65, 0x20, 0x65, 0x78, 0x69, 0x66, 0x00, 0x00, 0x78, 0xda, 0xad, 0x9b, 0x69, 0x76,
            0x1c, 0xb7, 0x92, 0x85, 0xff, 0x63, 0x15, 0xbd, 0x04, 0x4c, 0x81, 0x61, 0x39, 0x18,
            0xcf, 0x79, 0x3b, 0xe8, 0xe5, 0xf7, 0x77, 0xb3, 0x48, 0x8a, 0x92, 0xa8, 0xc1, 0xcf,
            0x2d, 0xda, 0x2a, 0xba, 0x2a, 0x2b, 0x13, 0x19, 0xc3, 0x1d, 0x02, 0x69, 0x77, 0xfe,
            0xf7, 0x3f, 0xd7, 0xfd, 0x0f, 0x7f, 0x4a, 0x6f, 0xc5, 0x65, 0xab, 0xad, 0xf4, 0x52,
            0x3c, 0x7f, 0x72, 0xcf, 0x3d, 0x0e, 0x7e, 0x69, 0xfe, 0xdb, 0x9f, 0xf3, 0xf6, 0xda,
            0xbd, 0xbf, 0xfa, 0x3d, 0xf3, 0x6f, 0x78, 0x7b, 0x2f, 0x7c, 0x1c, 0x15, 0xdc, 0x77,
            0x1f, 0xbc, 0xff, 0x16, 0xf8, 0xf9, 0xf4, 0x7e, 0x7b, 0x3f, 0x59, 0xe4, 0x9d, 0xc4,
            0x6b, 0x7a, 0x7b, 0xff, 0xfd, 0x5b, 0x49, 0x27, 0x4a, 0x9f, 0x4e, 0x54, 0x3e, 0x5e,
            0xc3, 0x57, 0xef, 0x07, 0xfb, 0xe1, 0xfd, 0xf4, 0x71, 0xf9, 0xf8, 0xdd, 0x8a, 0x5a,
            0xf9, 0xb8, 0x72, 0xfc, 0xfc, 0xfe, 0x3c, 0x61, 0xf8, 0xcf, 0x7f, 0xda, 0xb7, 0x7f,
            0xef, 0xdd, 0xed, 0x72, 0xcf, 0x4e, 0x47, 0xe7, 0x91, 0x0b, 0xf1, 0x29, 0x6f, 0x37,
            0xf5, 0x7e, 0x8b, 0xcf, 0x6f, 0x1c, 0x38, 0x09, 0x4c, 0x7a, 0xbe, 0x56, 0xf8, 0xa9,
            0xfc, 0x6b, 0xfc, 0x5e, 0x9f, 0x9f, 0xce, 0x4f, 0xf3, 0xc3, 0x2f, 0x17, 0xb2, 0xdf,
            0x7e, 0xf9, 0xc9, 0xcf, 0x0a, 0x3d, 0xc4, 0x90, 0xfc, 0x0d, 0x39, 0xec, 0x30, 0xc2,
            0x0d, 0xe7, 0x79, 0x5d, 0x61, 0xb1, 0xc4, 0x1c, 0x4f, 0xac, 0xbc, 0xc6, 0xb8, 0x62,
            0x7a, 0xde, 0x6b, 0xa9, 0xc6, 0x1e, 0x57, 0xf2, 0x29, 0xa4, 0xec, 0x52, 0x4e, 0x39,
            0xdc, 0x58, 0x53, 0x4f, 0x3b, 0xb5, 0x14, 0xd3, 0x8a, 0x27, 0x25, 0xde, 0x8d, 0x1f,
            0x6b, 0x09, 0xcf, 0x75, 0xfb, 0x73, 0xb9, 0x15, 0x1a, 0x17, 0xde, 0x81, 0x23, 0x63,
            0xe0, 0x64, 0x81, 0x6f, 0x3c, 0x3f, 0xee, 0xfd, 0x97, 0x7f, 0xfb, 0xf3, 0xe5, 0x89,
            0xee, 0x5d, 0x0a, 0x51, 0x50, 0x30, 0x49, 0x7d, 0x78, 0x25, 0x38, 0x46, 0xa5, 0x21,
            0x28, 0x8a, 0x49, 0x7f, 0x73, 0x14, 0x09, 0x09, 0xf7, 0xbd, 0x8e, 0xec, 0x09, 0xf0,
            0xfb, 0xcf, 0x8f, 0x7f, 0x48, 0x2c, 0x27, 0xc9, 0x1c, 0xa5, 0x30, 0x37, 0x6e, 0x70,
            0xf8, 0xf9, 0x3a, 0xc5, 0xb4, 0xf0, 0xad, 0xb6, 0x92, 0x7b, 0x12, 0x9d, 0x38, 0xd0,
            0x78, 0xcd, 0xaf, 0x2f, 0xd7, 0xfd, 0x76, 0x02, 0x42, 0xc4, 0xb5, 0x8d, 0xc5, 0x84,
            0x44, 0x06, 0x7c, 0x09, 0xc9, 0x42, 0x09, 0xbe, 0xc6, 0x58, 0x43, 0x20, 0x8e, 0x8d,
            0xfc, 0x0c, 0x56, 0x1e, 0x93, 0xcb, 0x71, 0x92, 0x82, 0x60, 0x16, 0x37,
        ];

        assert_eq!(guess_content(png_bytes), ProbableFileKind::Binary);
    }
}
|