Merge pull request #281 from Xuanwo/binary-content

feat: Improve binary content guess
pull/290/head
Wilfred Hughes 2022-05-18 23:50:01 +07:00 committed by GitHub
commit 64d65ad08a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 81 additions and 0 deletions

65
Cargo.lock generated

@ -52,6 +52,12 @@ version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "bytecount"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72feb31ffc86498dacdbd0fcebb56138e7177a8cc5cea4516031d15ae85a742e"
[[package]]
name = "cc"
version = "1.0.73"
@ -185,6 +191,7 @@ dependencies = [
"strsim",
"term_size",
"tree-sitter",
"tree_magic_mini",
"typed-arena",
"walkdir",
"wu-diff",
@ -209,6 +216,18 @@ dependencies = [
"termcolor",
]
[[package]]
name = "fixedbitset"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "279fb028e20b3c4c320317955b77c5e0c9701f05a1d309905d6fc702cdc5053e"
[[package]]
name = "fnv"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "hashbrown"
version = "0.11.2"
@ -306,6 +325,22 @@ dependencies = [
"libmimalloc-sys",
]
[[package]]
name = "minimal-lexical"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
[[package]]
name = "nom"
version = "7.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8903e5a29a317527874d0402f867152a3d21c908bb0b933e416c65e301d4c36"
dependencies = [
"memchr",
"minimal-lexical",
]
[[package]]
name = "num_cpus"
version = "1.13.1"
@ -316,6 +351,12 @@ dependencies = [
"libc",
]
[[package]]
name = "once_cell"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9"
[[package]]
name = "os_str_bytes"
version = "6.0.0"
@ -340,6 +381,16 @@ version = "3.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e72e30578e0d0993c8ae20823dd9cff2bc5517d2f586a8aef462a581e8a03eb"
[[package]]
name = "petgraph"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a13a2fa9d0b63e5f22328828741e523766fff0ee9e779316902290dff3f824f"
dependencies = [
"fixedbitset",
"indexmap",
]
[[package]]
name = "pretty_assertions"
version = "1.0.0"
@ -534,6 +585,20 @@ dependencies = [
"regex",
]
[[package]]
name = "tree_magic_mini"
version = "3.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91adfd0607cacf6e4babdb870e9bec4037c1c4b151cfd279ccefc5e0c7feaa6d"
dependencies = [
"bytecount",
"fnv",
"lazy_static",
"nom",
"once_cell",
"petgraph",
]
[[package]]
name = "typed-arena"
version = "2.0.1"

@ -42,6 +42,7 @@ owo-colors = "3.3.0"
rpds = "0.10.0"
wu-diff = "0.1.2"
rayon = "1.5.2"
tree_magic_mini = "3.0.3"
[dev-dependencies]
pretty_assertions = "1.0.0"

@ -69,6 +69,21 @@ pub fn read_or_die(path: &Path) -> Vec<u8> {
/// Do these bytes look like a binary (non-textual) format?
pub fn is_probably_binary(bytes: &[u8]) -> bool {
let mime = tree_magic_mini::from_u8(bytes);
match mime {
// Treat PDF as binary.
"application/pdf" => return true,
// Treat all image content as binary.
v if v.starts_with("image/") => return true,
// Treat all audio content as binary.
v if v.starts_with("audio/") => return true,
// Treat all video content as binary.
v if v.starts_with("video/") => return true,
// Treat all font content as binary.
v if v.starts_with("font/") => return true,
_ => {}
}
// If more than 20 of the first 1,000 characters are null bytes or
// invalid UTF-8, we assume it's binary.
let num_replaced = String::from_utf8_lossy(bytes)