diff --git a/Cargo.lock b/Cargo.lock index 1b39732d8..ddb24275f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -52,6 +52,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bytecount" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72feb31ffc86498dacdbd0fcebb56138e7177a8cc5cea4516031d15ae85a742e" + [[package]] name = "cc" version = "1.0.73" @@ -185,6 +191,7 @@ dependencies = [ "strsim", "term_size", "tree-sitter", + "tree_magic_mini", "typed-arena", "walkdir", "wu-diff", @@ -209,6 +216,18 @@ dependencies = [ "termcolor", ] +[[package]] +name = "fixedbitset" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "279fb028e20b3c4c320317955b77c5e0c9701f05a1d309905d6fc702cdc5053e" + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "hashbrown" version = "0.11.2" @@ -306,6 +325,22 @@ dependencies = [ "libmimalloc-sys", ] +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "nom" +version = "7.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8903e5a29a317527874d0402f867152a3d21c908bb0b933e416c65e301d4c36" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "num_cpus" version = "1.13.1" @@ -316,6 +351,12 @@ dependencies = [ "libc", ] +[[package]] +name = "once_cell" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9" + [[package]] name = "os_str_bytes" version = "6.0.0" @@ -340,6 +381,16 @@ version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e72e30578e0d0993c8ae20823dd9cff2bc5517d2f586a8aef462a581e8a03eb" +[[package]] +name = "petgraph" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a13a2fa9d0b63e5f22328828741e523766fff0ee9e779316902290dff3f824f" +dependencies = [ + "fixedbitset", + "indexmap", +] + [[package]] name = "pretty_assertions" version = "1.0.0" @@ -534,6 +585,20 @@ dependencies = [ "regex", ] +[[package]] +name = "tree_magic_mini" +version = "3.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91adfd0607cacf6e4babdb870e9bec4037c1c4b151cfd279ccefc5e0c7feaa6d" +dependencies = [ + "bytecount", + "fnv", + "lazy_static", + "nom", + "once_cell", + "petgraph", +] + [[package]] name = "typed-arena" version = "2.0.1" diff --git a/Cargo.toml b/Cargo.toml index b851a8f87..9a6c9b9d3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,6 +42,7 @@ owo-colors = "3.3.0" rpds = "0.10.0" wu-diff = "0.1.2" rayon = "1.5.2" +tree_magic_mini = "3.0.3" [dev-dependencies] pretty_assertions = "1.0.0" diff --git a/src/files.rs b/src/files.rs index 0324ee52d..3d62d8dbb 100644 --- a/src/files.rs +++ b/src/files.rs @@ -69,6 +69,21 @@ pub fn read_or_die(path: &Path) -> Vec { /// Do these bytes look like a binary (non-textual) format? pub fn is_probably_binary(bytes: &[u8]) -> bool { + let mime = tree_magic_mini::from_u8(bytes); + match mime { + // Treat pdf as binary. + "application/pdf" => return true, + // Treat all image content as binary. + v if v.starts_with("image/") => return true, + // Treat all audio content as binary. + v if v.starts_with("audio/") => return true, + // Treat all video content as binary. + v if v.starts_with("video/") => return true, + // Treat all font content as binary. + v if v.starts_with("font/") => return true, + _ => {} + } + // If more than 20 of the first 1,000 characters are null bytes or // invalid UTF-8, we assume it's binary. let num_replaced = String::from_utf8_lossy(bytes)