From d131ae1d3557d12fc53e3c90309dbfca97bfa31a Mon Sep 17 00:00:00 2001 From: Benjamin Manns Date: Fri, 1 Jul 2022 12:13:04 -0400 Subject: [PATCH] Add HTML parser --- CHANGELOG.md | 4 ++ build.rs | 5 +++ manual/src/languages_supported.md | 1 + sample_files/compare.expected | 6 +++ sample_files/html_after.html | 58 ++++++++++++++++++++++++++++ sample_files/html_before.html | 49 +++++++++++++++++++++++ sample_files/html_simple_after.html | 9 +++++ sample_files/html_simple_before.html | 9 +++++ src/diff/sliders.rs | 2 +- src/parse/guess_language.rs | 3 ++ src/parse/tree_sitter_parser.rs | 19 +++++++++ vendor/highlights/html.scm | 1 + vendor/tree-sitter-html-src | 1 + 13 files changed, 166 insertions(+), 1 deletion(-) create mode 100644 sample_files/html_after.html create mode 100644 sample_files/html_before.html create mode 100644 sample_files/html_simple_after.html create mode 100644 sample_files/html_simple_before.html create mode 120000 vendor/highlights/html.scm create mode 120000 vendor/tree-sitter-html-src diff --git a/CHANGELOG.md b/CHANGELOG.md index 7767219c6..5b48941bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ ## 0.30 (unreleased) +### Parsing + +Added support for HTML. 
+ ## 0.29.1 (released 13th June 2022) Fixed a major memory regression in 0.29 when performing large diff --git a/build.rs b/build.rs index 249f27b69..4a508a0c1 100644 --- a/build.rs +++ b/build.rs @@ -137,6 +137,11 @@ fn main() { src_dir: "vendor/tree-sitter-hcl-src", extra_files: vec!["scanner.cc"], }, + TreeSitterParser { + name: "tree-sitter-html", + src_dir: "vendor/tree-sitter-html-src", + extra_files: vec!["scanner.cc"], + }, TreeSitterParser { name: "tree-sitter-janet-simple", src_dir: "vendor/tree-sitter-janet-simple-src", diff --git a/manual/src/languages_supported.md b/manual/src/languages_supported.md index a596059af..6ce42ad35 100644 --- a/manual/src/languages_supported.md +++ b/manual/src/languages_supported.md @@ -40,6 +40,7 @@ Difftastic also supports the following structured text formats. |----------|-----------------------------------------------------------------------------------| | CSS | [tree-sitter/tree-sitter-css](https://github.com/tree-sitter/tree-sitter-css) | | HCL | [MichaHoffmann/tree-sitter-hcl](https://github.com/MichaHoffmann/tree-sitter-hcl) | +| HTML | [tree-sitter/tree-sitter-html](https://github.com/tree-sitter/tree-sitter-html) | | JSON | [tree-sitter/tree-sitter-json](https://github.com/tree-sitter/tree-sitter-json) | | TOML | [ikatyang/tree-sitter-toml](https://github.com/ikatyang/tree-sitter-toml) | | YAML | [ikatyang/tree-sitter-yaml](https://github.com/ikatyang/tree-sitter-yaml) | diff --git a/sample_files/compare.expected b/sample_files/compare.expected index d865a1078..fa7107b44 100644 --- a/sample_files/compare.expected +++ b/sample_files/compare.expected @@ -52,6 +52,12 @@ bce74573e003cc6b729a63a4bc34c4af - sample_files/helpful-unit-test-before.el sample_files/helpful-unit-test-after.el 79597af48ff80bcf9f5d02d20c51606d - +sample_files/html_before.html sample_files/html_after.html +949b14014822274f3578636275c8e6d6 - + +sample_files/html_simple_before.html sample_files/html_simple_after.html 
+13b374996a2b449f79638b2ddcf0c5d8 - + sample_files/identical_before.scala sample_files/identical_after.scala 9c7319f61833e46a0a8cb6c01cc997c9 - diff --git a/sample_files/html_after.html b/sample_files/html_after.html new file mode 100644 index 000000000..99b339277 --- /dev/null +++ b/sample_files/html_after.html @@ -0,0 +1,58 @@ + + + + Example Domain + + + + + + + + + + +
+

Example Domain

+

+ This domain is for use in illustrative examples in documents. You may + use this domain in literature without prior coordination or asking for + permission. +

+

+ More information... +

+
+ + + diff --git a/sample_files/html_before.html b/sample_files/html_before.html new file mode 100644 index 000000000..fb5944006 --- /dev/null +++ b/sample_files/html_before.html @@ -0,0 +1,49 @@ + + + + Example Domain + + + + + + + + + + +
+

Example Domain

+

This domain is for use in illustrative examples in documents. You may use this + domain in literature without prior coordination or asking for permission.

+

More information...

+
+ + + diff --git a/sample_files/html_simple_after.html b/sample_files/html_simple_after.html new file mode 100644 index 000000000..30844e01f --- /dev/null +++ b/sample_files/html_simple_after.html @@ -0,0 +1,9 @@ + + + Hi + + +

Bar

+

Story about bar.

+ + diff --git a/sample_files/html_simple_before.html b/sample_files/html_simple_before.html new file mode 100644 index 000000000..cabc15bb6 --- /dev/null +++ b/sample_files/html_simple_before.html @@ -0,0 +1,9 @@ + + + Hi! + + +

Foo

+

Story about foo.

+ + diff --git a/src/diff/sliders.rs b/src/diff/sliders.rs index e81123200..e8227b0be 100644 --- a/src/diff/sliders.rs +++ b/src/diff/sliders.rs @@ -64,7 +64,7 @@ fn prefer_outer_delimiter(language: guess_language::Language) -> bool { // languages have syntax like `foo(bar)` or `foo[bar]` where // the inner delimiter is more relevant. Bash | C | CPlusPlus | CSharp | Css | Dart | Elixir | Elm | Elvish | Gleam | Go - | Haskell | Java | JavaScript | Jsx | Kotlin | Lua | Nix | OCaml | OCamlInterface + | Haskell | Html | Java | JavaScript | Jsx | Kotlin | Lua | Nix | OCaml | OCamlInterface | Perl | Php | Python | Ruby | Rust | Scala | Swift | Tsx | TypeScript | Yaml | Zig => { false } diff --git a/src/parse/guess_language.rs b/src/parse/guess_language.rs index 45b2b1da5..b1696a934 100644 --- a/src/parse/guess_language.rs +++ b/src/parse/guess_language.rs @@ -34,6 +34,7 @@ pub enum Language { Go, Haskell, Hcl, + Html, Janet, Java, JavaScript, @@ -113,6 +114,7 @@ fn from_emacs_mode_header(src: &str) -> Option { "go" => Some(Go), "haskell" => Some(Haskell), "hcl" => Some(Hcl), + "html" => Some(Html), "janet" => Some(Janet), "java" => Some(Java), "js" | "js2" => Some(JavaScript), @@ -221,6 +223,7 @@ pub fn from_extension(extension: &OsStr) -> Option { "go" => Some(Go), "hs" => Some(Haskell), "hcl" | "nomad" | "tf" | "tfvars" | "worfklow" => Some(Hcl), + "html" | "htm" | "xhtml" => Some(Html), "janet" | "jdn" => Some(Janet), "java" => Some(Java), "cjs" | "js" | "mjs" => Some(JavaScript), diff --git a/src/parse/tree_sitter_parser.rs b/src/parse/tree_sitter_parser.rs index a0254c157..10149a125 100644 --- a/src/parse/tree_sitter_parser.rs +++ b/src/parse/tree_sitter_parser.rs @@ -59,6 +59,7 @@ extern "C" { fn tree_sitter_go() -> ts::Language; fn tree_sitter_haskell() -> ts::Language; fn tree_sitter_hcl() -> ts::Language; + fn tree_sitter_html() -> ts::Language; fn tree_sitter_janet_simple() -> ts::Language; fn tree_sitter_java() -> ts::Language; fn tree_sitter_javascript() -> 
ts::Language; @@ -345,6 +346,24 @@ pub fn from_language(language: guess::Language) -> TreeSitterConfig { .unwrap(), } } + Html => { + let language = unsafe { tree_sitter_html() }; + TreeSitterConfig { + name: "HTML", + language, + atom_nodes: vec!["attribute_value", "comment", "raw_text", "tag_name", "text"] + .into_iter() + .collect(), + delimiter_tokens: vec![("<", ">"), ("<!", ">"), ("<!--", "-->")] + .into_iter() + .collect(), + highlight_query: ts::Query::new( + language, + include_str!("../../vendor/highlights/html.scm"), + ) + .unwrap(), + } + } Janet => { let language = unsafe { tree_sitter_janet_simple() }; TreeSitterConfig { diff --git a/vendor/highlights/html.scm b/vendor/highlights/html.scm new file mode 120000 index 000000000..0ea534ed5 --- /dev/null +++ b/vendor/highlights/html.scm @@ -0,0 +1 @@ +../tree-sitter-html/queries/highlights.scm \ No newline at end of file diff --git a/vendor/tree-sitter-html-src b/vendor/tree-sitter-html-src new file mode 120000 index 000000000..72f2053cf --- /dev/null +++ b/vendor/tree-sitter-html-src @@ -0,0 +1 @@ +tree-sitter-html/src \ No newline at end of file