Add HTML parser

pull/305/head
Benjamin Manns 2022-07-01 12:13:04 +07:00
parent edce245beb
commit d131ae1d35
13 changed files with 166 additions and 1 deletions

@ -1,5 +1,9 @@
## 0.30 (unreleased)
### Parsing
Added support for HTML.
## 0.29.1 (released 13th June 2022)
Fixed a major memory regression in 0.29 when performing large

@ -137,6 +137,11 @@ fn main() {
src_dir: "vendor/tree-sitter-hcl-src",
extra_files: vec!["scanner.cc"],
},
TreeSitterParser {
name: "tree-sitter-html",
src_dir: "vendor/tree-sitter-html-src",
extra_files: vec!["scanner.cc"],
},
TreeSitterParser {
name: "tree-sitter-janet-simple",
src_dir: "vendor/tree-sitter-janet-simple-src",

@ -40,6 +40,7 @@ Difftastic also supports the following structured text formats.
|----------|-----------------------------------------------------------------------------------|
| CSS | [tree-sitter/tree-sitter-css](https://github.com/tree-sitter/tree-sitter-css) |
| HCL | [MichaHoffmann/tree-sitter-hcl](https://github.com/MichaHoffmann/tree-sitter-hcl) |
| HTML | [tree-sitter/tree-sitter-html](https://github.com/tree-sitter/tree-sitter-html) |
| JSON | [tree-sitter/tree-sitter-json](https://github.com/tree-sitter/tree-sitter-json) |
| TOML | [ikatyang/tree-sitter-toml](https://github.com/ikatyang/tree-sitter-toml) |
| YAML | [ikatyang/tree-sitter-yaml](https://github.com/ikatyang/tree-sitter-yaml) |

@ -52,6 +52,12 @@ bce74573e003cc6b729a63a4bc34c4af -
sample_files/helpful-unit-test-before.el sample_files/helpful-unit-test-after.el
79597af48ff80bcf9f5d02d20c51606d -
sample_files/html_before.html sample_files/html_after.html
949b14014822274f3578636275c8e6d6 -
sample_files/html_simple_before.html sample_files/html_simple_after.html
13b374996a2b449f79638b2ddcf0c5d8 -
sample_files/identical_before.scala sample_files/identical_after.scala
9c7319f61833e46a0a8cb6c01cc997c9 -

@ -0,0 +1,58 @@
<!DOCTYPE html>
<html>
<head>
<title>Example Domain</title>
<!-- demo for difftastic -->
<meta charset="utf-8" />
<meta http-equiv="Content-type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<style type="text/css">
body {
background-color: #fdfdff;
margin: 10;
padding: 10;
font-family: Helvetica, Arial, sans-serif;
}
#main {
width: 600px;
margin: 5em auto;
padding: 2em;
background-color: #f0f0f2;
border-radius: 0.5em;
box-shadow: 2px 3px 7px 2px rgba(0, 0, 0, 0.02);
}
a:link,
a:visited {
color: #38488f;
text-decoration: none;
}
@media (max-width: 700px) {
div {
margin: 0 auto;
width: auto;
}
}
</style>
</head>
<body>
<div id="main">
<h1>Example Domain</h1>
<p>
This domain is for use in illustrative examples in documents. You may
use this domain in literature without prior coordination or asking for
permission.
</p>
<p>
<a href="https://www.iana.org/domains/example?utm_src=example-dot-org"
>More information...</a
>
</p>
</div>
<script>
alert("goodbye!");
</script>
</body>
</html>

@ -0,0 +1,49 @@
<!doctype html>
<html>
<head>
<title>Example Domain</title>
<!-- demo for tree-sitter -->
<meta charset="utf-8" />
<meta http-equiv="Content-type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<style type="text/css">
body {
background-color: #f0f0f2;
margin: 0;
padding: 0;
font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
}
div {
width: 600px;
margin: 5em auto;
padding: 2em;
background-color: #fdfdff;
border-radius: 0.5em;
box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
}
a:link, a:visited {
color: #38488f;
text-decoration: none;
}
@media (max-width: 700px) {
div {
margin: 0 auto;
width: auto;
}
}
</style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples in documents. You may use this
domain in literature without prior coordination or asking for permission.</p>
<p><a href="https://www.iana.org/domains/example">More information...</a></p>
</div>
<script>alert('welcome!');</script>
</body>
</html>

@ -0,0 +1,9 @@
<html>
<head>
<title>Hi</title>
</head>
<body class="bar">
<h1 id="title">Bar</h1>
<p>Story about <strong>bar</strong>.</p>
</body>
</html>

@ -0,0 +1,9 @@
<html>
<head>
<title>Hi!</title>
</head>
<body class="foo">
<h1>Foo</h1>
<p>Story about foo.</p>
</body>
</html>

@ -64,7 +64,7 @@ fn prefer_outer_delimiter(language: guess_language::Language) -> bool {
// languages have syntax like `foo(bar)` or `foo[bar]` where
// the inner delimiter is more relevant.
Bash | C | CPlusPlus | CSharp | Css | Dart | Elixir | Elm | Elvish | Gleam | Go
| Haskell | Java | JavaScript | Jsx | Kotlin | Lua | Nix | OCaml | OCamlInterface
| Haskell | Html | Java | JavaScript | Jsx | Kotlin | Lua | Nix | OCaml | OCamlInterface
| Perl | Php | Python | Ruby | Rust | Scala | Swift | Tsx | TypeScript | Yaml | Zig => {
false
}

@ -34,6 +34,7 @@ pub enum Language {
Go,
Haskell,
Hcl,
Html,
Janet,
Java,
JavaScript,
@ -113,6 +114,7 @@ fn from_emacs_mode_header(src: &str) -> Option<Language> {
"go" => Some(Go),
"haskell" => Some(Haskell),
"hcl" => Some(Hcl),
"html" => Some(Html),
"janet" => Some(Janet),
"java" => Some(Java),
"js" | "js2" => Some(JavaScript),
@ -221,6 +223,7 @@ pub fn from_extension(extension: &OsStr) -> Option<Language> {
"go" => Some(Go),
"hs" => Some(Haskell),
// "workflow" covers legacy GitHub Actions workflow files, which use HCL syntax.
"hcl" | "nomad" | "tf" | "tfvars" | "workflow" => Some(Hcl),
"html" | "htm" | "xhtml" => Some(Html),
"janet" | "jdn" => Some(Janet),
"java" => Some(Java),
"cjs" | "js" | "mjs" => Some(JavaScript),

@ -59,6 +59,7 @@ extern "C" {
fn tree_sitter_go() -> ts::Language;
fn tree_sitter_haskell() -> ts::Language;
fn tree_sitter_hcl() -> ts::Language;
fn tree_sitter_html() -> ts::Language;
fn tree_sitter_janet_simple() -> ts::Language;
fn tree_sitter_java() -> ts::Language;
fn tree_sitter_javascript() -> ts::Language;
@ -345,6 +346,24 @@ pub fn from_language(language: guess::Language) -> TreeSitterConfig {
.unwrap(),
}
}
// Tree-sitter configuration for HTML.
Html => {
// NOTE(review): `tree_sitter_html` is declared in this file's `extern "C"` block, which is
// why the call needs `unsafe`. Presumably sound because the grammar is compiled and linked
// by the build script — confirm.
let language = unsafe { tree_sitter_html() };
TreeSitterConfig {
name: "HTML",
language,
// Grammar node kinds registered as atoms (presumably handled as indivisible units by
// whatever consumes `atom_nodes` — confirm).
atom_nodes: vec!["attribute_value", "comment", "raw_text", "tag_name", "text"]
.into_iter()
.collect(),
// Open/close token pairs: element tags, `<!` declarations such as the doctype, and
// comments.
delimiter_tokens: vec![("<", ">"), ("<!", ">"), ("<!--", "-->")]
.into_iter()
.collect(),
// The highlight query text is embedded at compile time via `include_str!`; the `unwrap`
// below panics if `Query::new` rejects it for this grammar.
highlight_query: ts::Query::new(
language,
include_str!("../../vendor/highlights/html.scm"),
)
.unwrap(),
}
}
Janet => {
let language = unsafe { tree_sitter_janet_simple() };
TreeSitterConfig {

@ -0,0 +1 @@
../tree-sitter-html/queries/highlights.scm

@ -0,0 +1 @@
tree-sitter-html/src