mirror of https://github.com/Wilfred/difftastic/
1284 lines
44 KiB
Rust
1284 lines
44 KiB
Rust
//! Load and configure parsers written with tree-sitter.
|
|
|
|
use std::collections::HashSet;
|
|
|
|
use crate::parse::guess_language as guess;
|
|
use tree_sitter as ts;
|
|
use typed_arena::Arena;
|
|
|
|
use crate::{
|
|
lines::NewlinePositions,
|
|
parse::syntax::{AtomKind, Syntax},
|
|
};
|
|
|
|
/// Configuration for a tree-sitter parser.
|
|
pub struct TreeSitterConfig {
|
|
/// The tree-sitter language parser.
|
|
pub language: ts::Language,
|
|
|
|
/// Tree-sitter nodes that we treat as indivisible atoms.
|
|
///
|
|
/// This is particularly useful for strings, as some grammars use
|
|
/// several nodes for a single string literal. We don't want to
|
|
/// say e.g. the closing string delimiter moved, as it's confusing
|
|
/// and not well-balanced syntax.
|
|
///
|
|
/// This is also useful for when tree-sitter nodes don't include
|
|
/// all the children in the source. This is known limitation of
|
|
/// tree-sitter, and occurs more often for complex string syntax.
|
|
/// <https://github.com/tree-sitter/tree-sitter/issues/1156>
|
|
atom_nodes: HashSet<&'static str>,
|
|
|
|
/// We want to consider delimiter tokens as part of lists, not
|
|
/// standalone atoms. Tree-sitter includes delimiter tokens, so
|
|
/// mark which token pairs we consider to be delimiters.
|
|
delimiter_tokens: Vec<(&'static str, &'static str)>,
|
|
|
|
/// Tree-sitter query used for syntax highlighting this
|
|
/// language.
|
|
highlight_query: ts::Query,
|
|
}
|
|
|
|
extern "C" {
|
|
fn tree_sitter_bash() -> ts::Language;
|
|
fn tree_sitter_c() -> ts::Language;
|
|
fn tree_sitter_c_sharp() -> ts::Language;
|
|
fn tree_sitter_clojure() -> ts::Language;
|
|
fn tree_sitter_cmake() -> ts::Language;
|
|
fn tree_sitter_cpp() -> ts::Language;
|
|
fn tree_sitter_commonlisp() -> ts::Language;
|
|
fn tree_sitter_css() -> ts::Language;
|
|
fn tree_sitter_dart() -> ts::Language;
|
|
fn tree_sitter_elisp() -> ts::Language;
|
|
fn tree_sitter_elixir() -> ts::Language;
|
|
fn tree_sitter_elm() -> ts::Language;
|
|
fn tree_sitter_elvish() -> ts::Language;
|
|
fn tree_sitter_gleam() -> ts::Language;
|
|
fn tree_sitter_go() -> ts::Language;
|
|
fn tree_sitter_hare() -> ts::Language;
|
|
fn tree_sitter_hack() -> ts::Language;
|
|
fn tree_sitter_haskell() -> ts::Language;
|
|
fn tree_sitter_hcl() -> ts::Language;
|
|
fn tree_sitter_html() -> ts::Language;
|
|
fn tree_sitter_janet_simple() -> ts::Language;
|
|
fn tree_sitter_java() -> ts::Language;
|
|
fn tree_sitter_javascript() -> ts::Language;
|
|
fn tree_sitter_json() -> ts::Language;
|
|
fn tree_sitter_julia() -> ts::Language;
|
|
fn tree_sitter_kotlin() -> ts::Language;
|
|
fn tree_sitter_lua() -> ts::Language;
|
|
fn tree_sitter_make() -> ts::Language;
|
|
fn tree_sitter_nix() -> ts::Language;
|
|
fn tree_sitter_ocaml() -> ts::Language;
|
|
fn tree_sitter_ocaml_interface() -> ts::Language;
|
|
fn tree_sitter_pascal() -> ts::Language;
|
|
fn tree_sitter_php() -> ts::Language;
|
|
fn tree_sitter_perl() -> ts::Language;
|
|
fn tree_sitter_python() -> ts::Language;
|
|
fn tree_sitter_qmljs() -> ts::Language;
|
|
fn tree_sitter_ruby() -> ts::Language;
|
|
fn tree_sitter_rust() -> ts::Language;
|
|
fn tree_sitter_scala() -> ts::Language;
|
|
fn tree_sitter_sql() -> ts::Language;
|
|
fn tree_sitter_swift() -> ts::Language;
|
|
fn tree_sitter_toml() -> ts::Language;
|
|
fn tree_sitter_tsx() -> ts::Language;
|
|
fn tree_sitter_typescript() -> ts::Language;
|
|
fn tree_sitter_yaml() -> ts::Language;
|
|
fn tree_sitter_zig() -> ts::Language;
|
|
}
|
|
|
|
// TODO: begin/end and object/end.

/// Atom node kinds used by both the `OCaml` and `OCamlInterface`
/// configurations in `from_language`.
const OCAML_ATOM_NODES: [&str; 6] = [
    "character",
    "string",
    "quoted_string",
    "tag",
    "type_variable",
    "attribute_id",
];
|
|
|
|
pub fn from_language(language: guess::Language) -> TreeSitterConfig {
|
|
use guess::Language::*;
|
|
match language {
|
|
Bash => {
|
|
let language = unsafe { tree_sitter_bash() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["string", "raw_string", "heredoc_body"]
|
|
.into_iter()
|
|
.collect(),
|
|
delimiter_tokens: vec![("(", ")"), ("{", "}"), ("[", "]")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/bash.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
C => {
|
|
let language = unsafe { tree_sitter_c() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["string_literal", "char_literal"].into_iter().collect(),
|
|
delimiter_tokens: vec![("(", ")"), ("{", "}"), ("[", "]")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/c.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
CPlusPlus => {
|
|
let language = unsafe { tree_sitter_cpp() };
|
|
TreeSitterConfig {
|
|
language,
|
|
// The C++ grammar extends the C grammar, so the node
|
|
// names are generally the same.
|
|
atom_nodes: vec!["string_literal", "char_literal"].into_iter().collect(),
|
|
delimiter_tokens: vec![("(", ")"), ("{", "}"), ("[", "]"), ("<", ">")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
concat!(
|
|
include_str!("../../vendor/highlights/c.scm"),
|
|
include_str!("../../vendor/highlights/cpp.scm")
|
|
),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Clojure => {
|
|
let language = unsafe { tree_sitter_clojure() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec![].into_iter().collect(),
|
|
delimiter_tokens: vec![("{", "}"), ("(", ")"), ("[", "]")]
|
|
.into_iter()
|
|
.collect(),
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/clojure.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
CMake => {
|
|
let language = unsafe { tree_sitter_cmake() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["argument"].into_iter().collect(),
|
|
delimiter_tokens: vec![("(", ")")].into_iter().collect(),
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/cmake.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
CommonLisp => {
|
|
let language = unsafe { tree_sitter_commonlisp() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["str_lit", "char_lit"].into_iter().collect(),
|
|
delimiter_tokens: vec![("(", ")")],
|
|
highlight_query: ts::Query::new(language, "").unwrap(),
|
|
}
|
|
}
|
|
CSharp => {
|
|
let language = unsafe { tree_sitter_c_sharp() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec![
|
|
"string_literal",
|
|
"verbatim_string_literal",
|
|
"character_literal",
|
|
]
|
|
.into_iter()
|
|
.collect(),
|
|
delimiter_tokens: vec![("{", "}"), ("(", ")")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/c-sharp.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Css => {
|
|
let language = unsafe { tree_sitter_css() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["integer_value", "float_value"].into_iter().collect(),
|
|
delimiter_tokens: vec![("{", "}"), ("(", ")")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/css.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Dart => {
|
|
let language = unsafe { tree_sitter_dart() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["string_literal", "script_tag"].into_iter().collect(),
|
|
delimiter_tokens: vec![("{", "}"), ("(", ")"), ("[", "]"), ("<", ">")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/dart.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
EmacsLisp => {
|
|
let language = unsafe { tree_sitter_elisp() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec![].into_iter().collect(),
|
|
delimiter_tokens: vec![("{", "}"), ("(", ")"), ("[", "]")]
|
|
.into_iter()
|
|
.collect(),
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/elisp.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Elixir => {
|
|
let language = unsafe { tree_sitter_elixir() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["string", "heredoc"].into_iter().collect(),
|
|
delimiter_tokens: vec![("(", ")"), ("{", "}"), ("do", "end")]
|
|
.into_iter()
|
|
.collect(),
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/elixir.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Elm => {
|
|
let language = unsafe { tree_sitter_elm() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["string_constant_expr"].into_iter().collect(),
|
|
delimiter_tokens: vec![("{", "}"), ("[", "]"), ("(", ")")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/elm.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Elvish => {
|
|
let language = unsafe { tree_sitter_elvish() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: [].into(),
|
|
delimiter_tokens: vec![("{", "}"), ("(", ")"), ("[", "]"), ("|", "|")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/elvish.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Gleam => {
|
|
let language = unsafe { tree_sitter_gleam() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: ["string"].into(),
|
|
delimiter_tokens: vec![("(", ")"), ("[", "]"), ("{", "}")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/gleam.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Go => {
|
|
let language = unsafe { tree_sitter_go() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["interpreted_string_literal", "raw_string_literal"]
|
|
.into_iter()
|
|
.collect(),
|
|
delimiter_tokens: vec![("{", "}"), ("[", "]"), ("(", ")")]
|
|
.into_iter()
|
|
.collect(),
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/go.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Hack => {
|
|
let language = unsafe { tree_sitter_hack() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["prefixed_string", "heredoc"].into_iter().collect(),
|
|
delimiter_tokens: vec![("[", "]"), ("(", ")"), ("<", ">"), ("{", "}")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/hack.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Hare => {
|
|
let language = unsafe { tree_sitter_hare() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["string_constant", "rune_constant"].into_iter().collect(),
|
|
delimiter_tokens: vec![("[", "]"), ("(", ")"), ("{", "}")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/hare.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Haskell => {
|
|
let language = unsafe { tree_sitter_haskell() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec![].into_iter().collect(),
|
|
delimiter_tokens: vec![("[", "]"), ("(", ")")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/haskell.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Hcl => {
|
|
let language = unsafe { tree_sitter_hcl() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["string_lit", "heredoc_template"].into_iter().collect(),
|
|
delimiter_tokens: vec![
|
|
("[", "]"),
|
|
("(", ")"),
|
|
("{", "}"),
|
|
("%{", "}"),
|
|
("%{~", "~}"),
|
|
("${", "}"),
|
|
],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/hcl.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Html => {
|
|
let language = unsafe { tree_sitter_html() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec![
|
|
"quoted_attribute_value",
|
|
"comment",
|
|
"raw_text",
|
|
"tag_name",
|
|
"text",
|
|
]
|
|
.into_iter()
|
|
.collect(),
|
|
delimiter_tokens: vec![("<", ">"), ("<!", ">"), ("<!--", "-->")]
|
|
.into_iter()
|
|
.collect(),
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/html.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Janet => {
|
|
let language = unsafe { tree_sitter_janet_simple() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec![].into_iter().collect(),
|
|
delimiter_tokens: vec![
|
|
("@{", "}"),
|
|
("@(", ")"),
|
|
("@[", "]"),
|
|
("{", "}"),
|
|
("(", ")"),
|
|
("[", "]"),
|
|
]
|
|
.into_iter()
|
|
.collect(),
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/janet_simple.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Java => {
|
|
let language = unsafe { tree_sitter_java() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec![].into_iter().collect(),
|
|
delimiter_tokens: vec![("(", ")"), ("{", "}"), ("[", "]")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/java.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
JavaScript | Jsx => {
|
|
let language = unsafe { tree_sitter_javascript() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["string", "template_string", "regex"]
|
|
.into_iter()
|
|
.collect(),
|
|
delimiter_tokens: vec![
|
|
("[", "]"),
|
|
("(", ")"),
|
|
("{", "}"),
|
|
// We may see a standalone < token in an expression
|
|
// like 1 < 2, but we should never see both a < and a
|
|
// > at the same level in JSX.
|
|
("<", ">"),
|
|
],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/javascript.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Json => {
|
|
let language = unsafe { tree_sitter_json() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["string"].into_iter().collect(),
|
|
delimiter_tokens: vec![("{", "}"), ("[", "]")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/json.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Julia => {
|
|
let language = unsafe { tree_sitter_julia() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec![
|
|
"string_literal",
|
|
"prefixed_string_literal",
|
|
"command_literal",
|
|
"character_literal",
|
|
]
|
|
.into_iter()
|
|
.collect(),
|
|
delimiter_tokens: vec![("{", "}"), ("[", "]"), ("(", ")")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/julia.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Kotlin => {
|
|
let language = unsafe { tree_sitter_kotlin() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["line_string_literal", "character_literal"]
|
|
.into_iter()
|
|
.collect(),
|
|
delimiter_tokens: vec![("(", ")"), ("{", "}"), ("[", "]"), ("<", ">")]
|
|
.into_iter()
|
|
.collect(),
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/kotlin.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Lua => {
|
|
let language = unsafe { tree_sitter_lua() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["string"].into_iter().collect(),
|
|
delimiter_tokens: vec![("(", ")"), ("{", "}"), ("[", "]")]
|
|
.into_iter()
|
|
.collect(),
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/lua.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Make => {
|
|
let language = unsafe { tree_sitter_make() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["shell_text"].into_iter().collect(),
|
|
delimiter_tokens: vec![("(", ")")].into_iter().collect(),
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/make.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Nix => {
|
|
let language = unsafe { tree_sitter_nix() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["string_expression", "indented_string_expression"]
|
|
.into_iter()
|
|
.collect(),
|
|
delimiter_tokens: vec![("{", "}"), ("[", "]")].into_iter().collect(),
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/nix.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
OCaml => {
|
|
let language = unsafe { tree_sitter_ocaml() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: OCAML_ATOM_NODES.iter().copied().collect(),
|
|
delimiter_tokens: vec![("(", ")"), ("[", "]"), ("{", "}")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/ocaml.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
OCamlInterface => {
|
|
let language = unsafe { tree_sitter_ocaml_interface() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: OCAML_ATOM_NODES.iter().copied().collect(),
|
|
delimiter_tokens: vec![("(", ")"), ("[", "]"), ("{", "}")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/ocaml.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Pascal => {
|
|
let language = unsafe { tree_sitter_pascal() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec![].into_iter().collect(),
|
|
delimiter_tokens: vec![("(", ")"), ("[", "]")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/pascal.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Perl => {
|
|
let language = unsafe { tree_sitter_perl() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec![
|
|
"string_single_quoted",
|
|
"string_double_quoted",
|
|
"comments",
|
|
"command_qx_quoted",
|
|
"patter_matcher_m",
|
|
"regex_pattern_qr",
|
|
"transliteration_tr_or_y",
|
|
"substitution_pattern_s",
|
|
]
|
|
.into_iter()
|
|
.collect(),
|
|
delimiter_tokens: vec![("(", ")"), ("{", "}"), ("[", "]")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/perl.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Php => {
|
|
let language = unsafe { tree_sitter_php() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["string", "encapsed_string"].into_iter().collect(),
|
|
delimiter_tokens: vec![("(", ")"), ("[", "]"), ("{", "}")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/php.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Python => {
|
|
let language = unsafe { tree_sitter_python() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["string"].into_iter().collect(),
|
|
delimiter_tokens: vec![("(", ")"), ("[", "]"), ("{", "}")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/python.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Qml => {
|
|
let language = unsafe { tree_sitter_qmljs() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["string", "template_string", "regex"]
|
|
.into_iter()
|
|
.collect(),
|
|
delimiter_tokens: vec![("{", "}"), ("(", ")"), ("[", "]"), ("<", ">")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
concat!(
|
|
include_str!("../../vendor/highlights/javascript.scm"),
|
|
include_str!("../../vendor/highlights/typescript.scm"),
|
|
include_str!("../../vendor/highlights/qmljs.scm"),
|
|
),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Ruby => {
|
|
let language = unsafe { tree_sitter_ruby() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["string", "heredoc_body", "regex"]
|
|
.into_iter()
|
|
.collect(),
|
|
delimiter_tokens: vec![
|
|
("{", "}"),
|
|
("(", ")"),
|
|
("[", "]"),
|
|
("|", "|"),
|
|
("def", "end"),
|
|
("begin", "end"),
|
|
("class", "end"),
|
|
],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/ruby.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Rust => {
|
|
let language = unsafe { tree_sitter_rust() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["char_literal", "string_literal"].into_iter().collect(),
|
|
delimiter_tokens: vec![("{", "}"), ("(", ")"), ("[", "]"), ("|", "|"), ("<", ">")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/rust.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Scala => {
|
|
let language = unsafe { tree_sitter_scala() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["string", "template_string"].into_iter().collect(),
|
|
delimiter_tokens: vec![("{", "}"), ("(", ")"), ("[", "]")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/scala.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Sql => {
|
|
let language = unsafe { tree_sitter_sql() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["string", "identifier"].into_iter().collect(),
|
|
delimiter_tokens: vec![("(", ")")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/sql.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Swift => {
|
|
let language = unsafe { tree_sitter_swift() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: ["line_string_literal"].into(),
|
|
delimiter_tokens: vec![("{", "}"), ("(", ")"), ("[", "]"), ("<", ">")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/swift.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Toml => {
|
|
let language = unsafe { tree_sitter_toml() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["string"].into_iter().collect(),
|
|
delimiter_tokens: vec![("{", "}"), ("[", "]")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/toml.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Tsx => {
|
|
let language = unsafe { tree_sitter_tsx() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["string", "template_string"].into_iter().collect(),
|
|
delimiter_tokens: vec![("{", "}"), ("(", ")"), ("[", "]"), ("<", ">")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
concat!(
|
|
include_str!("../../vendor/highlights/javascript.scm"),
|
|
include_str!("../../vendor/highlights/typescript.scm"),
|
|
),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
TypeScript => {
|
|
let language = unsafe { tree_sitter_typescript() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["string", "template_string", "regex"]
|
|
.into_iter()
|
|
.collect(),
|
|
delimiter_tokens: vec![("{", "}"), ("(", ")"), ("[", "]"), ("<", ">")],
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
concat!(
|
|
include_str!("../../vendor/highlights/javascript.scm"),
|
|
include_str!("../../vendor/highlights/typescript.scm"),
|
|
),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Yaml => {
|
|
let language = unsafe { tree_sitter_yaml() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec![
|
|
"string_scalar",
|
|
"double_quote_scalar",
|
|
"single_quote_scalar",
|
|
"block_scalar",
|
|
]
|
|
.into_iter()
|
|
.collect(),
|
|
delimiter_tokens: (vec![("{", "}"), ("(", ")"), ("[", "]")]),
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/yaml.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
Zig => {
|
|
let language = unsafe { tree_sitter_zig() };
|
|
TreeSitterConfig {
|
|
language,
|
|
atom_nodes: vec!["STRINGLITERALSINGLE", "BUILTINIDENTIFIER"]
|
|
.into_iter()
|
|
.collect(),
|
|
delimiter_tokens: (vec![("{", "}"), ("[", "]"), ("(", ")")])
|
|
.into_iter()
|
|
.collect(),
|
|
highlight_query: ts::Query::new(
|
|
language,
|
|
include_str!("../../vendor/highlights/zig.scm"),
|
|
)
|
|
.unwrap(),
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Parse `src` with tree-sitter.
|
|
pub fn parse_to_tree(src: &str, config: &TreeSitterConfig) -> tree_sitter::Tree {
|
|
let mut parser = ts::Parser::new();
|
|
parser
|
|
.set_language(config.language)
|
|
.expect("Incompatible tree-sitter version");
|
|
|
|
parser.parse(src, None).unwrap()
|
|
}
|
|
|
|
/// Calculate which tree-sitter node IDs should have which syntax
|
|
/// highlighting.
|
|
fn tree_highlights(
|
|
tree: &tree_sitter::Tree,
|
|
src: &str,
|
|
config: &TreeSitterConfig,
|
|
) -> HighlightedNodeIds {
|
|
let mut keyword_ish_capture_ids: Vec<u32> = vec![];
|
|
let mut string_capture_ids = vec![];
|
|
let mut type_capture_ids = vec![];
|
|
let mut comment_capture_ids = vec![];
|
|
|
|
// Query names are often written with namespacing, so
|
|
// highlights.scm might contain @constant or the more specific
|
|
// @constant.builtin.
|
|
//
|
|
// We support e.g. arbitrary @constant.foo so we get the benefit
|
|
// of all the relevant highlighting queries.
|
|
let cn = config.highlight_query.capture_names();
|
|
for (idx, name) in cn.iter().enumerate() {
|
|
if name == "type"
|
|
|| name.starts_with("type.")
|
|
|| name.starts_with("storage.type.")
|
|
|| name.starts_with("keyword.type.")
|
|
|| name == "tag"
|
|
{
|
|
// TODO: this doesn't capture (type_ref) in Elm as that
|
|
// applies to the parent node.
|
|
type_capture_ids.push(idx as u32);
|
|
} else if name == "keyword"
|
|
|| name.starts_with("keyword.")
|
|
|| name == "constant"
|
|
|| name.starts_with("constant.")
|
|
|| name == "operator"
|
|
|| name == "repeat"
|
|
|| name == "conditional"
|
|
|| name == "boolean"
|
|
{
|
|
keyword_ish_capture_ids.push(idx as u32);
|
|
}
|
|
|
|
if name == "string"
|
|
|| name.starts_with("string.")
|
|
|| name == "character"
|
|
|| name.starts_with("character.")
|
|
{
|
|
string_capture_ids.push(idx as u32);
|
|
}
|
|
|
|
// Rust uses 'label' for lifetimes, and highglighting
|
|
// lifetimes consistently with types seems reasonable.
|
|
if name == "label" {
|
|
type_capture_ids.push(idx as u32);
|
|
}
|
|
|
|
if name == "comment" {
|
|
comment_capture_ids.push(idx as u32);
|
|
}
|
|
}
|
|
|
|
let mut qc = ts::QueryCursor::new();
|
|
let q_matches = qc.matches(&config.highlight_query, tree.root_node(), src.as_bytes());
|
|
|
|
let mut comment_ids = HashSet::new();
|
|
let mut keyword_ids = HashSet::new();
|
|
let mut string_ids = HashSet::new();
|
|
let mut type_ids = HashSet::new();
|
|
for m in q_matches {
|
|
for c in m.captures {
|
|
if comment_capture_ids.contains(&c.index) {
|
|
comment_ids.insert(c.node.id());
|
|
} else if keyword_ish_capture_ids.contains(&c.index) {
|
|
keyword_ids.insert(c.node.id());
|
|
} else if string_capture_ids.contains(&c.index) {
|
|
string_ids.insert(c.node.id());
|
|
} else if type_capture_ids.contains(&c.index) {
|
|
type_ids.insert(c.node.id());
|
|
}
|
|
}
|
|
}
|
|
|
|
HighlightedNodeIds {
|
|
comment_ids,
|
|
keyword_ids,
|
|
string_ids,
|
|
type_ids,
|
|
}
|
|
}
|
|
|
|
pub fn print_tree(src: &str, tree: &tree_sitter::Tree) {
|
|
let mut cursor = tree.walk();
|
|
print_cursor(src, &mut cursor, 0);
|
|
}
|
|
|
|
fn print_cursor(src: &str, cursor: &mut ts::TreeCursor, depth: usize) {
|
|
loop {
|
|
let node = cursor.node();
|
|
node.end_position();
|
|
|
|
let formatted_node = format!(
|
|
"{} {} - {}",
|
|
node.kind().replace('\n', "\\n"),
|
|
node.start_position(),
|
|
node.end_position()
|
|
);
|
|
|
|
if node.child_count() == 0 {
|
|
let node_src = &src[node.start_byte()..node.end_byte()];
|
|
println!("{}{} {:?}", " ".repeat(depth), formatted_node, node_src);
|
|
} else {
|
|
println!("{}{}", " ".repeat(depth), formatted_node,);
|
|
}
|
|
|
|
if cursor.goto_first_child() {
|
|
print_cursor(src, cursor, depth + 1);
|
|
cursor.goto_parent();
|
|
}
|
|
|
|
if !cursor.goto_next_sibling() {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Parse `src` with tree-sitter and convert to difftastic Syntax.
|
|
pub fn parse<'a>(
|
|
arena: &'a Arena<Syntax<'a>>,
|
|
src: &str,
|
|
config: &TreeSitterConfig,
|
|
) -> Vec<&'a Syntax<'a>> {
|
|
// Don't return anything on an empty input. Most parsers return a
|
|
// zero-width top-level AST node on empty files, which is
|
|
// confusing and not useful for diffing.
|
|
if src.trim().is_empty() {
|
|
return vec![];
|
|
}
|
|
|
|
let tree = parse_to_tree(src, config);
|
|
let highlights = tree_highlights(&tree, src, config);
|
|
|
|
let nl_pos = NewlinePositions::from(src);
|
|
let mut cursor = tree.walk();
|
|
|
|
// The tree always has a single root, whereas we want nodes for
|
|
// each top level syntax item.
|
|
cursor.goto_first_child();
|
|
|
|
all_syntaxes_from_cursor(arena, src, &nl_pos, &mut cursor, config, &highlights)
|
|
}
|
|
|
|
fn child_tokens<'a>(src: &'a str, cursor: &mut ts::TreeCursor) -> Vec<Option<&'a str>> {
|
|
let mut tokens = vec![];
|
|
|
|
cursor.goto_first_child();
|
|
loop {
|
|
let node = cursor.node();
|
|
|
|
// We're only interested in tree-sitter nodes that are plain tokens,
|
|
// not lists or comments.
|
|
if node.child_count() > 1 || node.is_extra() {
|
|
tokens.push(None);
|
|
} else {
|
|
tokens.push(Some(&src[node.start_byte()..node.end_byte()]));
|
|
}
|
|
|
|
if !cursor.goto_next_sibling() {
|
|
break;
|
|
}
|
|
}
|
|
cursor.goto_parent();
|
|
|
|
tokens
|
|
}
|
|
|
|
/// Are any of the children of the node at `cursor` delimiters? Return
|
|
/// their indexes if so.
|
|
fn find_delim_positions(
|
|
src: &str,
|
|
cursor: &mut ts::TreeCursor,
|
|
lang_delims: &[(&str, &str)],
|
|
) -> Option<(usize, usize)> {
|
|
let tokens = child_tokens(src, cursor);
|
|
|
|
for (i, token) in tokens.iter().enumerate() {
|
|
for (open_delim, close_delim) in lang_delims {
|
|
if *token == Some(open_delim) {
|
|
for (j, token) in tokens.iter().skip(i + 1).enumerate() {
|
|
if *token == Some(close_delim) {
|
|
return Some((i, i + 1 + j));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
/// The IDs of the tree-sitter nodes that the highlight query captured,
/// grouped by highlighting category.
#[derive(Debug, Default)]
pub struct HighlightedNodeIds {
    /// Nodes captured under a keyword-like name.
    keyword_ids: HashSet<usize>,
    /// Nodes captured as comments.
    comment_ids: HashSet<usize>,
    /// Nodes captured as strings or characters.
    string_ids: HashSet<usize>,
    /// Nodes captured as types.
    type_ids: HashSet<usize>,
}
|
|
|
|
/// Convert all the tree-sitter nodes at this level to difftastic
|
|
/// syntax nodes.
|
|
///
|
|
/// `cursor` should be pointing at the first tree-sitter node in a level.
|
|
fn all_syntaxes_from_cursor<'a>(
|
|
arena: &'a Arena<Syntax<'a>>,
|
|
src: &str,
|
|
nl_pos: &NewlinePositions,
|
|
cursor: &mut ts::TreeCursor,
|
|
config: &TreeSitterConfig,
|
|
highlights: &HighlightedNodeIds,
|
|
) -> Vec<&'a Syntax<'a>> {
|
|
let mut result: Vec<&Syntax> = vec![];
|
|
|
|
loop {
|
|
result.extend(syntax_from_cursor(
|
|
arena, src, nl_pos, cursor, config, highlights,
|
|
));
|
|
|
|
if !cursor.goto_next_sibling() {
|
|
break;
|
|
}
|
|
}
|
|
|
|
result
|
|
}
|
|
|
|
/// Convert the tree-sitter node at `cursor` to a difftastic syntax
|
|
/// node.
|
|
fn syntax_from_cursor<'a>(
|
|
arena: &'a Arena<Syntax<'a>>,
|
|
src: &str,
|
|
nl_pos: &NewlinePositions,
|
|
cursor: &mut ts::TreeCursor,
|
|
config: &TreeSitterConfig,
|
|
highlights: &HighlightedNodeIds,
|
|
) -> Vec<&'a Syntax<'a>> {
|
|
let node = cursor.node();
|
|
|
|
if node.is_error() {
|
|
let position = nl_pos.from_offsets(node.start_byte(), node.end_byte());
|
|
let content = &src[node.start_byte()..node.end_byte()];
|
|
|
|
debug!(
|
|
"Tree-sitter syntax error at {:?}: {}",
|
|
position.get(0),
|
|
content
|
|
);
|
|
}
|
|
|
|
if config.atom_nodes.contains(node.kind()) {
|
|
// Treat nodes like string literals as atoms, regardless
|
|
// of whether they have children.
|
|
atom_from_cursor(arena, src, nl_pos, cursor, highlights)
|
|
} else if node.child_count() > 0 {
|
|
vec![list_from_cursor(
|
|
arena, src, nl_pos, cursor, config, highlights,
|
|
)]
|
|
} else {
|
|
atom_from_cursor(arena, src, nl_pos, cursor, highlights)
|
|
}
|
|
}
|
|
|
|
/// Convert the tree-sitter node at `cursor` to a difftastic list
|
|
/// node.
|
|
fn list_from_cursor<'a>(
|
|
arena: &'a Arena<Syntax<'a>>,
|
|
src: &str,
|
|
nl_pos: &NewlinePositions,
|
|
cursor: &mut ts::TreeCursor,
|
|
config: &TreeSitterConfig,
|
|
highlights: &HighlightedNodeIds,
|
|
) -> &'a Syntax<'a> {
|
|
let root_node = cursor.node();
|
|
|
|
// We may not have an enclosing delimiter for this list. Use "" as
|
|
// the delimiter text and the start/end of this node as the
|
|
// delimiter positions.
|
|
let outer_open_content = "";
|
|
let outer_open_position = nl_pos.from_offsets(root_node.start_byte(), root_node.start_byte());
|
|
let outer_close_content = "";
|
|
let outer_close_position = nl_pos.from_offsets(root_node.end_byte(), root_node.end_byte());
|
|
|
|
let (i, j) = match find_delim_positions(src, cursor, &config.delimiter_tokens) {
|
|
Some((i, j)) => (i as isize, j as isize),
|
|
None => (-1, root_node.child_count() as isize),
|
|
};
|
|
|
|
let mut inner_open_content = outer_open_content;
|
|
let mut inner_open_position = outer_open_position.clone();
|
|
let mut inner_close_content = outer_close_content;
|
|
let mut inner_close_position = outer_close_position.clone();
|
|
|
|
// Tree-sitter trees include the delimiter tokens, so `(x)` is
|
|
// parsed as:
|
|
//
|
|
// "(" "x" ")"
|
|
//
|
|
// However, there's no guarantee that the first token is a
|
|
// delimiter. For example, the C parser treats `foo[0]` as:
|
|
//
|
|
// "foo" "[" "0" "]"
|
|
//
|
|
// Store the syntax nodes before, between and after the
|
|
// delimiters, so we can construct lists.
|
|
let mut before_delim = vec![];
|
|
let mut between_delim = vec![];
|
|
let mut after_delim = vec![];
|
|
|
|
let mut node_i = 0;
|
|
cursor.goto_first_child();
|
|
loop {
|
|
let node = cursor.node();
|
|
if node_i < i {
|
|
before_delim.extend(syntax_from_cursor(
|
|
arena, src, nl_pos, cursor, config, highlights,
|
|
));
|
|
} else if node_i == i {
|
|
inner_open_content = &src[node.start_byte()..node.end_byte()];
|
|
inner_open_position = nl_pos.from_offsets(node.start_byte(), node.end_byte());
|
|
} else if node_i < j {
|
|
between_delim.extend(syntax_from_cursor(
|
|
arena, src, nl_pos, cursor, config, highlights,
|
|
));
|
|
} else if node_i == j {
|
|
inner_close_content = &src[node.start_byte()..node.end_byte()];
|
|
inner_close_position = nl_pos.from_offsets(node.start_byte(), node.end_byte());
|
|
} else if node_i > j {
|
|
after_delim.extend(syntax_from_cursor(
|
|
arena, src, nl_pos, cursor, config, highlights,
|
|
));
|
|
}
|
|
|
|
if !cursor.goto_next_sibling() {
|
|
break;
|
|
}
|
|
node_i += 1;
|
|
}
|
|
cursor.goto_parent();
|
|
|
|
let inner_list = Syntax::new_list(
|
|
arena,
|
|
inner_open_content,
|
|
inner_open_position,
|
|
between_delim,
|
|
inner_close_content,
|
|
inner_close_position,
|
|
);
|
|
|
|
if before_delim.is_empty() && after_delim.is_empty() {
|
|
// The common case "(" "x" ")", so we don't need the outer list.
|
|
inner_list
|
|
} else {
|
|
// Wrap the inner list in an additional list that includes the
|
|
// syntax nodes before and after the delimiters.
|
|
//
|
|
// "foo" "[" "0" "]" // tree-sitter nodes
|
|
//
|
|
// (List "foo" (List "0")) // difftastic syntax nodes
|
|
let mut children = before_delim;
|
|
children.push(inner_list);
|
|
children.append(&mut after_delim);
|
|
|
|
Syntax::new_list(
|
|
arena,
|
|
outer_open_content,
|
|
outer_open_position,
|
|
children,
|
|
outer_close_content,
|
|
outer_close_position,
|
|
)
|
|
}
|
|
}
|
|
|
|
/// Convert the tree-sitter node at `cursor` to a difftastic atom.
|
|
fn atom_from_cursor<'a>(
|
|
arena: &'a Arena<Syntax<'a>>,
|
|
src: &str,
|
|
nl_pos: &NewlinePositions,
|
|
cursor: &mut ts::TreeCursor,
|
|
highlights: &HighlightedNodeIds,
|
|
) -> Vec<&'a Syntax<'a>> {
|
|
let node = cursor.node();
|
|
let position = nl_pos.from_offsets(node.start_byte(), node.end_byte());
|
|
let mut content = &src[node.start_byte()..node.end_byte()];
|
|
|
|
// The C and C++ grammars have a '\n' node with the
|
|
// preprocessor. This isn't useful for difftastic, because it's
|
|
// not visible, but leads us to highlight unchanged lines that
|
|
// happen to have preceding newline node.
|
|
if node.kind() == "\n" {
|
|
return vec![];
|
|
}
|
|
|
|
// JSX trims whitespace at the beginning and end of text nodes.
|
|
// TODO: match the exact trimming behaviour used in React.
|
|
//
|
|
// https://reactjs.org/blog/2014/02/20/react-v0.9.html#jsx-whitespace
|
|
// https://github.com/facebook/react/pull/480
|
|
if node.kind() == "jsx_text" {
|
|
content = content.trim();
|
|
}
|
|
|
|
// 'extra' nodes in tree-sitter are comments. Most parsers use
|
|
// 'comment' as their comment node name, but if they don't we can
|
|
// still detect comments by looking at their syntax highlighting.
|
|
let highlight = if node.is_extra()
|
|
|| node.kind() == "comment"
|
|
|| highlights.comment_ids.contains(&node.id())
|
|
{
|
|
AtomKind::Comment
|
|
} else if highlights.keyword_ids.contains(&node.id()) {
|
|
AtomKind::Keyword
|
|
} else if highlights.string_ids.contains(&node.id()) {
|
|
AtomKind::String
|
|
} else if highlights.type_ids.contains(&node.id()) {
|
|
AtomKind::Type
|
|
} else {
|
|
AtomKind::Normal
|
|
};
|
|
|
|
vec![Syntax::new_atom(arena, position, content, highlight)]
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: parsing a tiny CSS snippet should complete
    /// without panicking. Having at least one test here also makes
    /// sure coverage is measured for this file.
    /// <https://github.com/mozilla/grcov/issues/617>
    #[test]
    fn test_parse() {
        let syntax_arena = Arena::new();
        let config = from_language(guess::Language::Css);
        parse(&syntax_arena, ".foo {}", &config);
    }

    /// Parsing an empty source string should yield no syntax nodes.
    #[test]
    fn test_parse_empty_file() {
        let syntax_arena = Arena::new();
        let elisp_config = from_language(guess::Language::EmacsLisp);
        let parsed = parse(&syntax_arena, "", &elisp_config);

        let no_nodes: Vec<&Syntax> = Vec::new();
        assert_eq!(parsed, no_nodes);
    }
}
|