mirror of https://github.com/Wilfred/difftastic/
636 lines
17 KiB
Rust
636 lines
17 KiB
Rust
//! Guess which programming language a file is written in.
|
|
//!
|
|
//! This is heavily based on GitHub's
|
|
//! [linguist](https://github.com/github/linguist/blob/master/docs/how-linguist-works.md),
|
|
//! particularly its
|
|
//! [languages.yml](https://github.com/github/linguist/blob/master/lib/linguist/languages.yml).
|
|
//!
|
|
//! Difftastic does not reuse languages.yml directly. Linguist has a
|
|
//! larger set of language detection strategies.
|
|
|
|
use std::{borrow::Borrow, path::Path};
|
|
|
|
use lazy_static::lazy_static;
|
|
use regex::Regex;
|
|
use strum::{EnumIter, IntoEnumIterator};
|
|
|
|
/// Languages supported by difftastic. Each language here has a
|
|
/// corresponding tree-sitter parser.
|
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, EnumIter)]
|
|
pub enum Language {
|
|
Ada,
|
|
Apex,
|
|
Bash,
|
|
C,
|
|
Clojure,
|
|
CMake,
|
|
CommonLisp,
|
|
CPlusPlus,
|
|
CSharp,
|
|
Css,
|
|
Dart,
|
|
Elixir,
|
|
Elm,
|
|
Elvish,
|
|
EmacsLisp,
|
|
Erlang,
|
|
Gleam,
|
|
Go,
|
|
Hack,
|
|
Hare,
|
|
Haskell,
|
|
Hcl,
|
|
Html,
|
|
Janet,
|
|
Java,
|
|
JavaScript,
|
|
JavascriptJsx,
|
|
Json,
|
|
Julia,
|
|
Kotlin,
|
|
LaTeX,
|
|
Lua,
|
|
Make,
|
|
Newick,
|
|
Nix,
|
|
OCaml,
|
|
OCamlInterface,
|
|
Pascal,
|
|
Perl,
|
|
Php,
|
|
Python,
|
|
Qml,
|
|
R,
|
|
Racket,
|
|
Ruby,
|
|
Rust,
|
|
Scala,
|
|
Solidity,
|
|
Sql,
|
|
Swift,
|
|
Toml,
|
|
TypeScript,
|
|
TypeScriptTsx,
|
|
Xml,
|
|
Yaml,
|
|
Zig,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Copy, PartialEq)]
|
|
pub enum LanguageOverride {
|
|
Language(Language),
|
|
PlainText,
|
|
}
|
|
|
|
/// If there is a language called `name` (comparing case
|
|
/// insensitively), return it. Treat `"text"` as an additional option.
|
|
pub fn language_override_from_name(name: &str) -> Option<LanguageOverride> {
|
|
let name = name.trim().to_lowercase();
|
|
|
|
if name == "text" {
|
|
return Some(LanguageOverride::PlainText);
|
|
}
|
|
|
|
for language in Language::iter() {
|
|
let lang_name = language_name(language);
|
|
if lang_name.to_lowercase() == name {
|
|
return Some(LanguageOverride::Language(language));
|
|
}
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
/// The language name shown to the user.
|
|
pub fn language_name(language: Language) -> &'static str {
|
|
match language {
|
|
Ada => "Ada",
|
|
Apex => "Apex",
|
|
Bash => "Bash",
|
|
C => "C",
|
|
Clojure => "Clojure",
|
|
CMake => "CMake",
|
|
CommonLisp => "Common Lisp",
|
|
CPlusPlus => "C++",
|
|
CSharp => "C#",
|
|
Css => "CSS",
|
|
Dart => "Dart",
|
|
Elixir => "Elixir",
|
|
Elm => "Elm",
|
|
Elvish => "Elvish",
|
|
EmacsLisp => "Emacs Lisp",
|
|
Erlang => "Erlang",
|
|
Gleam => "Gleam",
|
|
Go => "Go",
|
|
Hack => "Hack",
|
|
Hare => "Hare",
|
|
Haskell => "Haskell",
|
|
Hcl => "HCL",
|
|
Html => "HTML",
|
|
Janet => "Janet",
|
|
Java => "Java",
|
|
JavaScript => "JavaScript",
|
|
JavascriptJsx => "JavaScript JSX",
|
|
Json => "JSON",
|
|
Julia => "Julia",
|
|
Kotlin => "Kotlin",
|
|
LaTeX => "LaTeX",
|
|
Lua => "Lua",
|
|
Make => "Make",
|
|
Newick => "Newick",
|
|
Nix => "Nix",
|
|
OCaml => "OCaml",
|
|
OCamlInterface => "OCaml Interface",
|
|
Pascal => "Pascal",
|
|
Perl => "Perl",
|
|
Php => "PHP",
|
|
Python => "Python",
|
|
Qml => "QML",
|
|
R => "R",
|
|
Racket => "Racket",
|
|
Ruby => "Ruby",
|
|
Rust => "Rust",
|
|
Scala => "Scala",
|
|
Solidity => "Solidity",
|
|
Sql => "SQL",
|
|
Swift => "Swift",
|
|
Toml => "TOML",
|
|
TypeScript => "TypeScript",
|
|
TypeScriptTsx => "TypeScript TSX",
|
|
Xml => "XML",
|
|
Yaml => "YAML",
|
|
Zig => "Zig",
|
|
}
|
|
}
|
|
|
|
use Language::*;
|
|
|
|
/// File globs that identify languages based on the file path.
|
|
pub fn language_globs(language: Language) -> Vec<glob::Pattern> {
|
|
let glob_strs: &'static [&'static str] = match language {
|
|
Ada => &["*.ada", "*.adb", "*.ads"],
|
|
Bash => &[
|
|
"*.bash",
|
|
"*.bats",
|
|
"*.cgi",
|
|
"*.command",
|
|
"*.env",
|
|
"*.fcgi",
|
|
"*.ksh",
|
|
"*.sh",
|
|
"*.sh.in",
|
|
"*.tmux",
|
|
"*.tool",
|
|
"*.zsh",
|
|
".bash_aliases",
|
|
".bash_history",
|
|
".bash_logout",
|
|
".bash_profile",
|
|
".bashrc",
|
|
".cshrc",
|
|
".env",
|
|
".env.example",
|
|
".flaskenv",
|
|
".kshrc",
|
|
".login",
|
|
".profile",
|
|
".zlogin",
|
|
".zlogout",
|
|
".zprofile",
|
|
".zshenv",
|
|
".zshrc",
|
|
"9fs",
|
|
"PKGBUILD",
|
|
"bash_aliases",
|
|
"bash_logout",
|
|
"bash_profile",
|
|
"bashrc",
|
|
"cshrc",
|
|
"gradlew",
|
|
"kshrc",
|
|
"login",
|
|
"man",
|
|
"profile",
|
|
"zlogin",
|
|
"zlogout",
|
|
"zprofile",
|
|
"zshenv",
|
|
"zshrc",
|
|
],
|
|
Apex => &["*.cls", "*.apexc", "*.trigger"],
|
|
C => &["*.c"],
|
|
Clojure => &[
|
|
"*.bb", "*.boot", "*.clj", "*.cljc", "*.clje", "*.cljs", "*.cljx", "*.edn", "*.joke",
|
|
"*.joker",
|
|
],
|
|
CMake => &["*.cmake", "*.cmake.in", "CMakeLists.txt"],
|
|
CommonLisp => &["*.lisp", "*.lsp", "*.asd"],
|
|
// Treat .h as C++ rather than C. This is an arbitrary choice, but
|
|
// C++ is more widely used than C according to
|
|
// https://madnight.github.io/githut/
|
|
// Also, treating CUDA as C++
|
|
CPlusPlus => &[
|
|
"*.cc", "*.cpp", "*.h", "*.hh", "*.hpp", "*.ino", "*.cxx", "*.cu",
|
|
],
|
|
CSharp => &["*.cs"],
|
|
Css => &["*.css"],
|
|
Dart => &["*.dart"],
|
|
Elm => &["*.elm"],
|
|
EmacsLisp => &["*.el", ".emacs", "_emacs", "Cask"],
|
|
Elixir => &["*.ex", "*.exs"],
|
|
Elvish => &["*.elv"],
|
|
Erlang => &[
|
|
"*.erl",
|
|
"*.app.src",
|
|
"*.es",
|
|
"*.escript",
|
|
"*.hrl",
|
|
"*.xrl",
|
|
"*.yrl",
|
|
"Emakefile",
|
|
],
|
|
Gleam => &["*.gleam"],
|
|
Go => &["*.go"],
|
|
Hack => &["*.hack", "*.hck", "*.hhi"],
|
|
Hare => &["*.ha"],
|
|
Haskell => &["*.hs"],
|
|
Hcl => &["*.hcl", "*.nomad", "*.tf", "*.tfvars", "*.workflow"],
|
|
Html => &["*.html", "*.htm", "*.xhtml"],
|
|
Janet => &["*.janet", "*.jdn"],
|
|
Java => &["*.java"],
|
|
JavaScript => &["*.cjs", "*.js", "*.mjs"],
|
|
Json => &[
|
|
"*.json",
|
|
"*.avsc",
|
|
"*.geojson",
|
|
"*.gltf",
|
|
"*.har",
|
|
"*.ice",
|
|
"*.JSON-tmLanguage",
|
|
"*.jsonl",
|
|
"*.mcmeta",
|
|
"*.tfstate",
|
|
"*.tfstate.backup",
|
|
"*.topojson",
|
|
"*.webapp",
|
|
"*.webmanifest",
|
|
".arcconfig",
|
|
".auto-changelog",
|
|
".c8rc",
|
|
".htmlhintrc",
|
|
".imgbotconfig",
|
|
".nycrc",
|
|
".tern-config",
|
|
".tern-project",
|
|
".watchmanconfig",
|
|
"Pipfile.lock",
|
|
"composer.lock",
|
|
"mcmod.info",
|
|
],
|
|
JavascriptJsx => &["*.jsx"],
|
|
Julia => &["*.jl"],
|
|
Kotlin => &["*.kt", "*.ktm", "*.kts"],
|
|
LaTeX => &["*.aux", "*.cls", "*.sty", "*.tex"],
|
|
Lua => &["*.lua"],
|
|
Make => &[
|
|
"*.mak",
|
|
"*.d",
|
|
"*.make",
|
|
"*.makefile",
|
|
"*.mk",
|
|
"*.mkfile",
|
|
"BSDmakefile",
|
|
"GNUmakefile",
|
|
"Kbuild",
|
|
"Makefile",
|
|
"Makefile.am",
|
|
"Makefile.boot",
|
|
"Makefile.frag",
|
|
"Makefile.in",
|
|
"Makefile.inc",
|
|
"Makefile.wat",
|
|
"makefile",
|
|
"makefile.sco",
|
|
"mkfile",
|
|
],
|
|
Newick => &["*.nhx", "*.nwk", "*.nh"],
|
|
Nix => &["*.nix"],
|
|
OCaml => &["*.ml"],
|
|
OCamlInterface => &["*.mli"],
|
|
Pascal => &["*.pas", "*.dfm", "*.dpr", "*.lpr", "*.pascal"],
|
|
Perl => &["*.pm", "*.pl"],
|
|
Php => &["*.php"],
|
|
Python => &["*.py", "*.py3", "*.pyi", "*.bzl", "TARGETS", "BUCK", "DEPS"],
|
|
Qml => &["*.qml"],
|
|
R => &["*.R", "*.r", "*.rd", "*.rsx", ".Rprofile", "expr-dist"],
|
|
Racket => &["*.rkt"],
|
|
Ruby => &[
|
|
"*.rb",
|
|
"*.builder",
|
|
"*.spec",
|
|
"*.rake",
|
|
"Gemfile",
|
|
"Rakefile",
|
|
],
|
|
Rust => &["*.rs"],
|
|
Scala => &["*.scala", "*.sbt", "*.sc"],
|
|
Solidity => &["*.sol"],
|
|
Sql => &["*.sql", "*.pgsql"],
|
|
Swift => &["*.swift"],
|
|
Toml => &[
|
|
"*.toml",
|
|
"Cargo.lock",
|
|
"Gopkg.lock",
|
|
"Pipfile",
|
|
"poetry.lock",
|
|
],
|
|
TypeScript => &["*.ts"],
|
|
TypeScriptTsx => &["*.tsx"],
|
|
Xml => &[
|
|
"*.ant",
|
|
"*.csproj",
|
|
"*.plist",
|
|
"*.resx",
|
|
"*.svg",
|
|
"*.ui",
|
|
"*.vbproj",
|
|
"*.xaml",
|
|
"*.xml",
|
|
"*.xsl",
|
|
"*.xslt",
|
|
"App.config",
|
|
"nuget.config",
|
|
"packages.config",
|
|
".classpath",
|
|
".cproject",
|
|
".project",
|
|
],
|
|
Yaml => &["*.yaml", "*.yml"],
|
|
Zig => &["*.zig"],
|
|
};
|
|
|
|
glob_strs
|
|
.iter()
|
|
.map(|name| {
|
|
glob::Pattern::new(name).expect("Glob in difftastic source should be well-formed")
|
|
})
|
|
.collect()
|
|
}
|
|
|
|
/// Does this look like a Hack file that uses a `.php` extension?
///
/// True only when the extension of `path` is exactly `php` and `src`
/// begins with the `<?hh` opening tag.
fn looks_like_hacklang(path: &Path, src: &str) -> bool {
    let has_php_extension = path
        .extension()
        .map_or(false, |extension| extension == "php");

    has_php_extension && src.starts_with("<?hh")
}
|
|
|
|
pub fn guess(
|
|
path: &Path,
|
|
src: &str,
|
|
overrides: &[(LanguageOverride, Vec<glob::Pattern>)],
|
|
) -> Option<Language> {
|
|
if let Some(file_name) = path.file_name() {
|
|
let file_name = file_name.to_string_lossy();
|
|
for (lang_override, patterns) in overrides {
|
|
for pattern in patterns {
|
|
if pattern.matches(&file_name) {
|
|
match lang_override {
|
|
LanguageOverride::Language(lang) => return Some(*lang),
|
|
LanguageOverride::PlainText => {
|
|
return None;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if let Some(lang) = from_emacs_mode_header(src) {
|
|
return Some(lang);
|
|
}
|
|
if let Some(lang) = from_shebang(src) {
|
|
return Some(lang);
|
|
}
|
|
if looks_like_hacklang(path, src) {
|
|
return Some(Language::Hack);
|
|
}
|
|
if let Some(lang) = from_glob(path) {
|
|
return Some(lang);
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
/// Try to guess the language based on an Emacs mode comment at the
|
|
/// beginning of the file.
|
|
///
|
|
/// <https://www.gnu.org/software/emacs/manual/html_node/emacs/Choosing-Modes.html>
|
|
/// <https://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html>
|
|
fn from_emacs_mode_header(src: &str) -> Option<Language> {
|
|
lazy_static! {
|
|
static ref MODE_RE: Regex = Regex::new(r"-\*-.*mode:([^;]+?);.*-\*-").unwrap();
|
|
static ref SHORTHAND_RE: Regex = Regex::new(r"-\*-(.+)-\*-").unwrap();
|
|
}
|
|
|
|
// Emacs allows the mode header to occur on the second line if the
|
|
// first line is a shebang.
|
|
for line in src.lines().take(2) {
|
|
let mode_name: String = match (MODE_RE.captures(line), SHORTHAND_RE.captures(line)) {
|
|
(Some(cap), _) | (_, Some(cap)) => cap[1].into(),
|
|
_ => "".into(),
|
|
};
|
|
let lang = match mode_name.to_ascii_lowercase().trim() {
|
|
"ada" => Some(Ada),
|
|
"c" => Some(C),
|
|
"clojure" => Some(Clojure),
|
|
"csharp" => Some(CSharp),
|
|
"css" => Some(Css),
|
|
"dart" => Some(Dart),
|
|
"c++" => Some(CPlusPlus),
|
|
"elixir" => Some(Elixir),
|
|
"elm" => Some(Elm),
|
|
"elvish" => Some(Elvish),
|
|
"emacs-lisp" => Some(EmacsLisp),
|
|
"gleam" => Some(Gleam),
|
|
"go" => Some(Go),
|
|
"haskell" => Some(Haskell),
|
|
"hcl" => Some(Hcl),
|
|
"html" => Some(Html),
|
|
"janet" => Some(Janet),
|
|
"java" => Some(Java),
|
|
"js" | "js2" => Some(JavaScript),
|
|
"lisp" => Some(CommonLisp),
|
|
"nxml" => Some(Xml),
|
|
"perl" => Some(Perl),
|
|
"python" => Some(Python),
|
|
"racket" => Some(Racket),
|
|
"rjsx" => Some(JavascriptJsx),
|
|
"ruby" => Some(Ruby),
|
|
"rust" => Some(Rust),
|
|
"scala" => Some(Scala),
|
|
"sh" => Some(Bash),
|
|
"solidity" => Some(Solidity),
|
|
"sql" => Some(Sql),
|
|
"swift" => Some(Swift),
|
|
"toml" => Some(Toml),
|
|
"tuareg" => Some(OCaml),
|
|
"typescript" => Some(TypeScript),
|
|
"yaml" => Some(Yaml),
|
|
"zig" => Some(Zig),
|
|
_ => None,
|
|
};
|
|
if lang.is_some() {
|
|
return lang;
|
|
}
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
/// Try to guess the language based on a shebang present in the source.
|
|
fn from_shebang(src: &str) -> Option<Language> {
|
|
lazy_static! {
|
|
static ref RE: Regex = Regex::new(r"#!(?:/usr/bin/env )?([^ ]+)").unwrap();
|
|
}
|
|
if let Some(first_line) = src.lines().next() {
|
|
if let Some(cap) = RE.captures(first_line) {
|
|
let interpreter_path = Path::new(&cap[1]);
|
|
if let Some(name) = interpreter_path.file_name() {
|
|
match name.to_string_lossy().borrow() {
|
|
"ash" | "bash" | "dash" | "ksh" | "mksh" | "pdksh" | "rc" | "sh" | "zsh" => {
|
|
return Some(Bash)
|
|
}
|
|
"tcc" => return Some(C),
|
|
"lisp" | "sbc" | "ccl" | "clisp" | "ecl" => return Some(CommonLisp),
|
|
"elixir" => return Some(Elixir),
|
|
"elvish" => return Some(Elvish),
|
|
"escript" => return Some(Erlang),
|
|
"hhvm" => return Some(Hack),
|
|
"runghc" | "runhaskell" | "runhugs" => return Some(Haskell),
|
|
"chakra" | "d8" | "gjs" | "js" | "node" | "nodejs" | "qjs" | "rhino" | "v8"
|
|
| "v8-shell" => return Some(JavaScript),
|
|
"ocaml" | "ocamlrun" | "ocamlscript" => return Some(OCaml),
|
|
"perl" => return Some(Perl),
|
|
"python" | "python2" | "python3" => return Some(Python),
|
|
"Rscript" => return Some(R),
|
|
"ruby" | "macruby" | "rake" | "jruby" | "rbx" => return Some(Ruby),
|
|
"swift" => return Some(Swift),
|
|
"deno" | "ts-node" => return Some(TypeScript),
|
|
_ => {}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Hack can use <?hh in files with a .php extension.
|
|
if first_line.starts_with("<?hh") {
|
|
return Some(Hack);
|
|
}
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
fn from_glob(path: &Path) -> Option<Language> {
|
|
match path.file_name() {
|
|
Some(name) => {
|
|
let name = name.to_string_lossy().into_owned();
|
|
for language in Language::iter() {
|
|
for glob in language_globs(language) {
|
|
if glob.matches(&name) {
|
|
return Some(language);
|
|
}
|
|
}
|
|
}
|
|
|
|
None
|
|
}
|
|
None => None,
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use pretty_assertions::assert_eq;

    use super::*;

    /// Run `guess` on `path` and `src` with no overrides.
    fn guess_without_overrides(path: &str, src: &str) -> Option<Language> {
        guess(Path::new(path), src, &[])
    }

    #[test]
    fn test_guess_by_extension() {
        assert_eq!(guess_without_overrides("foo.el", ""), Some(EmacsLisp));
    }

    #[test]
    fn test_guess_by_whole_name() {
        assert_eq!(guess_without_overrides("foo/.bashrc", ""), Some(Bash));
    }

    #[test]
    fn test_guess_by_shebang() {
        assert_eq!(guess_without_overrides("foo", "#!/bin/bash"), Some(Bash));
    }

    #[test]
    fn test_guess_by_env_shebang() {
        assert_eq!(
            guess_without_overrides("foo", "#!/usr/bin/env python"),
            Some(Python)
        );
    }

    #[test]
    fn test_guess_by_emacs_mode() {
        let src = "; -*- mode: Lisp; eval: (auto-fill-mode 1); -*-";
        assert_eq!(guess_without_overrides("foo", src), Some(CommonLisp));
    }

    #[test]
    fn test_guess_by_emacs_mode_second_line() {
        // The mode header on the second line takes precedence over the
        // shebang on the first line.
        let src = "#!/bin/bash\n; -*- mode: Lisp; -*-";
        assert_eq!(guess_without_overrides("foo", src), Some(CommonLisp));
    }

    #[test]
    fn test_guess_by_emacs_mode_shorthand() {
        assert_eq!(
            guess_without_overrides("foo", "(* -*- tuareg -*- *)"),
            Some(OCaml)
        );
    }

    #[test]
    fn test_guess_by_emacs_mode_shorthand_no_spaces() {
        assert_eq!(
            guess_without_overrides("foo", "# -*-python-*-"),
            Some(Python)
        );
    }

    #[test]
    fn test_guess_unknown() {
        assert_eq!(guess_without_overrides("jfkdlsjfkdsljfkdsljf", ""), None);
    }

    #[test]
    fn test_guess_override() {
        // Without the override, "foo.el" would be detected as Emacs Lisp.
        let overrides = [(
            LanguageOverride::Language(Css),
            vec![glob::Pattern::new("*.el").unwrap()],
        )];

        assert_eq!(guess(Path::new("foo.el"), "", &overrides), Some(Css));
    }
}
|