Merge pull request #190 from sogaiu/janet-subtree

Add Janet support
pull/188/head^2
Wilfred Hughes 2022-03-29 19:32:44 +07:00 committed by GitHub
commit 48cf4ece6d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
25 changed files with 6729 additions and 0 deletions

@ -114,6 +114,11 @@ fn main() {
src_dir: "vendor/tree-sitter-haskell-src",
extra_files: vec!["scanner.cc"],
},
TreeSitterParser {
name: "tree-sitter-janet-simple",
src_dir: "vendor/tree-sitter-janet-simple-src",
extra_files: vec!["scanner.c"],
},
TreeSitterParser {
name: "tree-sitter-java",
src_dir: "vendor/tree-sitter-java-src",

@ -46,6 +46,9 @@ sample_files/identical_before.scala sample_files/identical_after.scala
sample_files/if_before.py sample_files/if_after.py
e633761742973b2ef3b2ed078cedbdf2 -
sample_files/janet_before.janet sample_files/janet_after.janet
677604a16ef62f6b6252d76d76e86265 -
sample_files/java_before.java sample_files/java_after.java
7fda19b66481b0658fc476b2a0e2657b -

@ -0,0 +1,314 @@
# the "GNU Emacs Lisp Reference Manual" has very useful info
# in the code below section names will be mentioned, like:
# see "Special Read Syntax"
# bl - begin line
# bc - begin column
# el - end line
# ec - end column
(defn make-attrs
  "Pair `items`, in order, with the keys :bl :bc :el :ec (begin line,
  begin column, end line, end column) and return the resulting table."
  [& items]
  (def attr-keys [:bl :bc :el :ec])
  (zipcoll attr-keys items))
# Build a PEG form for a leaf ("atom") node.
#
# node-type - keyword stored as the first element of the resulting node
# peg-form  - PEG that matches the text of the atom
#
# The generated form records (line) and (column) before and after
# `peg-form` and captures the whole matched text.  On a match the node is
# the tuple [node-type attrs text]: attrs is `make-attrs` applied to every
# capture except the last, and text is the last capture.
(defn atom-node
[node-type peg-form]
~(cmt (capture (sequence (line) (column)
,peg-form
(line) (column)))
,|[node-type (make-attrs ;(slice $& 0 -2)) (last $&)]))
# Build a PEG form for a reader-macro node: a sigil followed by one form.
#
# node-type - keyword stored as the first element of the resulting node
# sigil     - PEG matching the macro prefix (callers in this file pass
#             literal strings)
#
# After the sigil any number of :non-form items may precede the single
# required :form.  The resulting node is [node-type attrs & children]:
# attrs is `make-attrs` applied to the first two captures (start position)
# and the two captures just before the final one (end position); children
# are the captures in between.  The final capture (presumably the whole
# matched text produced by `capture`) is not kept.
(defn reader-macro-node
[node-type sigil]
~(cmt (capture (sequence (line) (column)
,sigil
(any :non-form)
:form
(line) (column)))
,|[node-type (make-attrs ;(slice $& 0 2) ;(slice $& -4 -2))
;(slice $& 2 -4)]))
# Build a PEG form for a delimited collection node.
#
# node-type   - keyword stored as the first element of the resulting node
# open-delim  - PEG matching the opening delimiter
# close-delim - PEG matching the closing delimiter
#
# Between the delimiters any number of :input items are accepted.  If the
# closing delimiter is not found, the `error` branch raises a message of
# the form "line: L column: C missing <close-delim> for <node-type>" using
# the position at which the delimiter was expected.
#
# On success the node is [node-type attrs & children], assembled from the
# captures the same way `reader-macro-node` does: first two captures and
# the two before the final one feed `make-attrs`, the rest in between are
# the children.
(defn collection-node
[node-type open-delim close-delim]
~(cmt
(capture
(sequence
(line) (column)
,open-delim
(any :input)
(choice ,close-delim
(error
(replace (sequence (line) (column))
,|(string/format
"line: %p column: %p missing %p for %p"
$0 $1 close-delim node-type))))
(line) (column)))
,|[node-type (make-attrs ;(slice $& 0 2) ;(slice $& -4 -2))
;(slice $& 2 -4)]))
# PEG grammar for Emacs Lisp source text in which every node carries its
# location.  Nodes are built by `atom-node`, `reader-macro-node` and
# `collection-node`; `:main` additionally captures the line / column at the
# very start and at the very end of the input, so the first node of a
# match result sits at index 2.
#
# PEG `choice` is ordered: the first alternative that matches wins.  The
# alternatives of :float are therefore listed most-specific first --
# :float-dec matches a prefix of everything :float-both, :float-inf and
# :float-nan match, so it must come after them or those rules can never
# be selected (e.g. "1.5e3" would be split into "1.5" and "e3").
(def loc-grammar
  ~{:main (sequence (line) (column)
                    (some :input)
                    (line) (column))
    #
    :input (choice :non-form
                   :form)
    #
    :non-form (choice :whitespace
                      :comment)
    #
    :whitespace ,(atom-node :whitespace
                            '(choice (some (set " \f\t\v"))
                                     (choice "\r\n"
                                             "\r"
                                             "\n")))
    #
    :comment ,(atom-node :comment
                         '(sequence ";"
                                    (any (if-not (set "\r\n") 1))))
    #
    :form (choice # reader macros
                  :backquote
                  :function
                  :quote
                  :unquote-splice
                  :unquote
                  # collections
                  :list
                  :vector
                  :char-table
                  :sub-char-table
                  :hash-table
                  :record
                  :bytecode
                  :string-text-props
                  # atoms
                  # XXX: might need assertions at end of things before
                  #      symbols.  see the partial job in :integer-10 below
                  :float
                  :integer
                  :char
                  :string
                  :symbol)
    # see "Backquote"
    :backquote ,(reader-macro-node :backquote "`")
    # see "Anonymous Functions"
    :function ,(reader-macro-node :function "#'")
    # see "Quoting"
    :quote ,(reader-macro-node :quote "'")
    # see "Backquote"
    :unquote-splice ,(reader-macro-node :unquote-splice ",@")
    # see "Backquote"
    :unquote ,(reader-macro-node :unquote ",")
    #
    # see "Cons Cell Type"
    :list ,(collection-node :list "(" ")")
    # see "Vectors"
    :vector ,(collection-node :vector "[" "]")
    # see "Char-Table Type"
    :char-table ,(collection-node :char-table "#^[" "]")
    # see "Char-Table Type"
    :sub-char-table ,(collection-node :sub-char-table "#^^[" "]")
    # see "Byte-Code Objects"
    :bytecode ,(collection-node :bytecode "#[" "]")
    # see "Hash Tables"
    :hash-table ,(collection-node :hash-table "#s(hash-table" ")")
    # see "Records"
    :record ,(collection-node :record "#s(" ")")
    # see "Text Props and Strings"
    :string-text-props
    ,(collection-node :string-text-props "#(" ")")
    #
    # see "Float Basics"
    # ordered most-specific first (see the note above the grammar)
    :float ,(atom-node :float
                       '(choice :float-inf
                                :float-nan
                                :float-both
                                :float-dec
                                :float-exp))
    #
    :float-dec (sequence (opt (choice "+" "-"))
                         :d*
                         "."
                         :d+)
    #
    :float-exp (sequence (opt (choice "+" "-"))
                         :d*
                         (choice "e" "E")
                         :d+)
    #
    :float-both (sequence (opt (choice "+" "-"))
                          :d*
                          "."
                          :d+
                          (choice "e" "E")
                          :d+)
    #
    :float-inf (sequence (opt "-")
                         "1.0"
                         (choice "e" "E")
                         "+INF")
    #
    :float-nan (sequence (opt "-")
                         "0.0"
                         (choice "e" "E")
                         "+NaN")
    # see "Integer Basics"
    :integer ,(atom-node :integer
                         '(choice :integer-10
                                  :integer-base))
    #
    :integer-10 (sequence (opt (choice "+" "-"))
                          :d+
                          (opt ".")
                          # XXX: hack?
                          (not (set "+-")))
    #
    :integer-base (sequence "#"
                            (choice "b"
                                    "o"
                                    "x"
                                    # XXX: found in xml.el, but docs...(?)
                                    "X"
                                    (sequence :d+ "r"))
                            # XXX: docs contradict this(?), but works...
                            (opt (choice "+" "-"))
                            (some (choice :a :d)))
    # see "Basic Char Syntax"
    :char ,(atom-node :char
                      '(sequence "?"
                                 (choice :char-octal
                                         :char-hex
                                         :char-uni-name
                                         #:char-uni-val
                                         :char-uni-val-low
                                         :char-uni-val-up
                                         :char-meta-octal
                                         :char-key
                                         :char-basic)))
    # see "General Escape Syntax"
    :char-octal (sequence "\\" (3 (range "07")))
    :char-hex (sequence "\\x" :h+)
    :char-uni-name (sequence "\\N{" (thru "}"))
    #:char-uni-val (sequence "\\N{U+" :h+ "}")
    :char-uni-val-low (sequence "\\u" (4 :h))
    :char-uni-val-up (sequence "\\U" (8 :h))
    # see "Meta-Char Syntax"
    :char-meta-octal (sequence "\\M-" :char-octal)
    # see "Ctl-Char Syntax"
    # see "Other Char Bits"
    :char-key
    (sequence (some (sequence "\\"
                              (choice (sequence (set "ACHMSs") "-")
                                      "^")))
              # XXX: not strictly correct?
              (choice :char-octal
                      :char-hex
                      :char-uni-name
                      #:char-uni-val
                      :char-uni-val-low
                      :char-uni-val-up
                      :char-meta-octal
                      :char-basic))
    # XXX: not strictly correct, but perhaps it's ok?
    :char-basic (choice (sequence "\\" 1)
                        1)
    # see "Syntax for Strings"
    # XXX: escaped newline and escaped space in "Syntax for Strings"?
    :string
    ,(atom-node :string
                '(sequence "\""
                           (any (choice :escape
                                        (if-not "\"" 1)))
                           "\""))
    # XXX: is this complete?
    :escape (sequence "\\" (set "0abdefnrstvx\"\\"))
    # see "Symbol Type"
    # XXX: review about whitespace in symbol names
    :symbol
    ,(atom-node :symbol
                '(choice (sequence :sym-char-head
                                   (any :sym-char-rest))
                         # XXX: some below not really symbols
                         # see "Circular Objects"
                         (sequence "#" :d+ "=")
                         (sequence "#" :d+ "#")
                         # see "Special Read Syntax"
                         #(sequence "#" :d+)
                         # see "Documentation Strings and Compilation"
                         "#$"
                         # see "Symbol Type"
                         "##"))
    #
    :sym-char-head (choice :sym-char-esc
                           # don't start with
                           #(if-not (set " \"#'(),.;?[]`") 1)) # allow .
                           (if-not (set " \"#'(),;?[]`") 1))
    #
    :sym-char-rest (choice :sym-char-esc
                           # . and ? are allowed "inside"
                           (if-not (set " \"#'(),;[]`\n") 1))
    # need to be escaped
    :sym-char-esc (sequence "\\" (set " \"#'(),;?[]`"))
    })
# Usage examples.  Each expression is followed by `# =>` and the value it
# is expected to produce.  Index 2 of a match result is the first node,
# because :main captures the starting line and column before it.
(comment
(get (peg/match loc-grammar " ") 2)
# =>
'(:whitespace @{:bc 1 :bl 1 :ec 2 :el 1} " ")
(get (peg/match loc-grammar "; hi there") 2)
# =>
'(:comment @{:bc 1 :bl 1 :ec 11 :el 1} "; hi there")
(get (peg/match loc-grammar "8.3") 2)
# =>
'(:float @{:bc 1 :bl 1 :ec 4 :el 1} "8.3")
(get (peg/match loc-grammar "printf") 2)
# =>
'(:symbol @{:bc 1 :bl 1 :ec 7 :el 1} "printf")
(get (peg/match loc-grammar ":smile") 2)
# =>
'(:symbol @{:bc 1 :bl 1 :ec 7 :el 1} ":smile")
(get (peg/match loc-grammar `"fun"`) 2)
# =>
'(:string @{:bc 1 :bl 1 :ec 6 :el 1} "\"fun\"")
(get (peg/match loc-grammar "[8]") 2)
# =>
'(:vector @{:bc 1 :bl 1
:ec 4 :el 1}
(:integer @{:bc 2 :bl 1
:ec 3 :el 1} "8"))
(get (peg/match loc-grammar "(1+ 1)") 2)
# =>
'(:list @{:bc 1 :bl 1
:ec 7 :el 1}
(:symbol @{:bc 2 :bl 1
:ec 4 :el 1} "1+")
(:whitespace @{:bc 4 :bl 1
:ec 5 :el 1} " ")
(:integer @{:bc 5 :bl 1
:ec 6 :el 1} "1"))
(get (peg/match loc-grammar "`x") 2)
# =>
'(:backquote @{:bc 1 :bl 1
:ec 3 :el 1}
(:symbol @{:bc 2 :bl 1
:ec 3 :el 1} "x"))
# an unterminated collection raises the error built in `collection-node`
(try
(peg/match loc-grammar "(+ 1")
([e]
e))
# =>
`line: 1 column: 5 missing ")" for :list`
)

@ -0,0 +1,320 @@
# the "GNU Emacs Lisp Reference Manual" has very useful info
# in the code below section names will be mentioned, like:
# see "Special Read Syntax"
# bl - begin line
# bc - begin column
# el - end line
# ec - end column
(defn make-attrs
  "Pair `items`, in order, with the keys :bl :bc :el :ec (begin line,
  begin column, end line, end column) and return the resulting table."
  [& items]
  (def attr-keys [:bl :bc :el :ec])
  (zipcoll attr-keys items))
# Build a PEG form for a leaf ("atom") node.
#
# node-type - keyword stored as the first element of the resulting node
# peg-form  - PEG that matches the text of the atom
#
# The generated form records (line) and (column) before and after
# `peg-form` and captures the whole matched text.  On a match the node is
# the tuple [node-type attrs text]: attrs is `make-attrs` applied to every
# capture except the last, and text is the last capture.
(defn atom-node
[node-type peg-form]
~(cmt (capture (sequence (line) (column)
,peg-form
(line) (column)))
,|[node-type (make-attrs ;(slice $& 0 -2)) (last $&)]))
# Build a PEG form for a reader-macro node: a sigil followed by one form.
#
# node-type - keyword stored as the first element of the resulting node
# sigil     - PEG matching the macro prefix (callers in this file pass
#             literal strings)
#
# After the sigil any number of :non-form items may precede the single
# required :form.  The resulting node is [node-type attrs & children]:
# attrs is `make-attrs` applied to the first two captures (start position)
# and the two captures just before the final one (end position); children
# are the captures in between.  The final capture (presumably the whole
# matched text produced by `capture`) is not kept.
(defn reader-macro-node
[node-type sigil]
~(cmt (capture (sequence (line) (column)
,sigil
(any :non-form)
:form
(line) (column)))
,|[node-type (make-attrs ;(slice $& 0 2) ;(slice $& -4 -2))
;(slice $& 2 -4)]))
# Build a PEG form for a delimited collection node.
#
# node-type   - keyword stored as the first element of the resulting node
# open-delim  - PEG matching the opening delimiter
# close-delim - PEG matching the closing delimiter
#
# Between the delimiters any number of :input items are accepted.  If the
# closing delimiter is not found, the `error` branch raises a message of
# the form "line: L column: C missing <close-delim> for <node-type>" using
# the position at which the delimiter was expected.
#
# On success the node is [node-type attrs & children], assembled from the
# captures the same way `reader-macro-node` does: first two captures and
# the two before the final one feed `make-attrs`, the rest in between are
# the children.
(defn collection-node
[node-type open-delim close-delim]
~(cmt
(capture
(sequence
(line) (column)
,open-delim
(any :input)
(choice ,close-delim
(error
(replace (sequence (line) (column))
,|(string/format
"line: %p column: %p missing %p for %p"
$0 $1 close-delim node-type))))
(line) (column)))
,|[node-type (make-attrs ;(slice $& 0 2) ;(slice $& -4 -2))
;(slice $& 2 -4)]))
# PEG grammar for Emacs Lisp source text in which every node carries its
# location.  Nodes are built by `atom-node`, `reader-macro-node` and
# `collection-node`; `:main` additionally captures the line / column at the
# very start and at the very end of the input, so the first node of a
# match result sits at index 2.
#
# PEG `choice` is ordered: the first alternative that matches wins.  The
# alternatives of :float are therefore listed most-specific first --
# :float-dec matches a prefix of everything :float-both, :float-inf and
# :float-nan match, so it must come after them or those rules can never
# be selected (e.g. "1.5e3" would be split into "1.5" and "e3").
#
# The commented-out rules show what the helper calls directly above them
# expand to.
(def loc-grammar
  ~{:main (sequence (line) (column)
                    (some :input)
                    (line) (column))
    #
    :input (choice :non-form
                   :form)
    #
    :non-form (choice :whitespace
                      :comment)
    #
    :whitespace ,(atom-node :whitespace
                            '(choice (some (set " \f\t\v"))
                                     (choice "\r\n"
                                             "\r"
                                             "\n")))
    # :whitespace
    # (cmt (capture (sequence (line) (column)
    #                         (choice (some (set " \f\t\v"))
    #                                 (choice "\r\n"
    #                                         "\r"
    #                                         "\n"))
    #                         (line) (column)))
    #      ,|[:whitespace (make-attrs ;(slice $& 0 -2)) (last $&)])
    #
    :comment ,(atom-node :comment
                         '(sequence ";"
                                    (any (if-not (set "\r\n") 1))))
    #
    :form (choice # reader macros
                  :backquote
                  :function
                  :quote
                  :unquote-splice
                  :unquote
                  # collections
                  :list
                  :vector
                  :char-table
                  :sub-char-table
                  :hash-table
                  :record
                  :bytecode
                  :string-text-props
                  # atoms
                  # XXX: might need assertions at end of things before
                  #      symbols.  see the partial job in :integer-10 below
                  :float
                  :integer
                  :char
                  :string
                  :symbol)
    # see "Backquote"
    :backquote ,(reader-macro-node :backquote "`")
    # :backquote
    # (cmt (capture (sequence (line) (column)
    #                         "`"
    #                         (any :non-form)
    #                         :form
    #                         (line) (column)))
    #      ,|[:backquote (make-attrs ;(slice $& 0 2) ;(slice $& -4 -2))
    #         ;(slice $& 2 -4)])
    # see "Anonymous Functions"
    :function ,(reader-macro-node :function "#'")
    # see "Quoting"
    :quote ,(reader-macro-node :quote "'")
    # see "Backquote"
    :unquote-splice ,(reader-macro-node :unquote-splice ",@")
    # see "Backquote"
    :unquote ,(reader-macro-node :unquote ",")
    #
    # see "Cons Cell Type"
    :list ,(collection-node :list "(" ")")
    # :list
    # (cmt
    #   (capture
    #     (sequence
    #       (line) (column)
    #       "("
    #       (any :input)
    #       (choice ")"
    #               (error
    #                 (replace (sequence (line) (column))
    #                          ,|(string/format
    #                              "line: %p column: %p missing %p for %p"
    #                              $0 $1 ")" :list))))
    #       (line) (column)))
    #   ,|[:list (make-attrs ;(slice $& 0 2) ;(slice $& -4 -2))
    #      ;(slice $& 2 -4)])
    # see "Vectors"
    :vector ,(collection-node :vector "[" "]")
    # see "Char-Table Type"
    :char-table ,(collection-node :char-table "#^[" "]")
    # see "Char-Table Type"
    :sub-char-table ,(collection-node :sub-char-table "#^^[" "]")
    # see "Byte-Code Objects"
    :bytecode ,(collection-node :bytecode "#[" "]")
    # see "Hash Tables"
    :hash-table ,(collection-node :hash-table "#s(hash-table" ")")
    # see "Records"
    :record ,(collection-node :record "#s(" ")")
    # see "Text Props and Strings"
    :string-text-props
    ,(collection-node :string-text-props "#(" ")")
    #
    # see "Float Basics"
    # ordered most-specific first (see the note above the grammar)
    :float ,(atom-node :float
                       '(choice :float-inf
                                :float-nan
                                :float-both
                                :float-dec
                                :float-exp))
    #
    :float-dec (sequence (opt (choice "+" "-"))
                         :d*
                         "."
                         :d+)
    #
    :float-exp (sequence (opt (choice "+" "-"))
                         :d*
                         (choice "e" "E")
                         :d+)
    #
    :float-both (sequence (opt (choice "+" "-"))
                          :d*
                          "."
                          :d+
                          (choice "e" "E")
                          :d+)
    #
    :float-inf (sequence (opt "-")
                         "1.0"
                         (choice "e" "E")
                         "+INF")
    #
    :float-nan (sequence (opt "-")
                         "0.0"
                         (choice "e" "E")
                         "+NaN")
    # see "Integer Basics"
    :integer ,(atom-node :integer
                         '(choice :integer-10
                                  :integer-base))
    #
    :integer-10 (sequence (opt (choice "+" "-"))
                          :d+
                          (opt ".")
                          # XXX: hack?
                          (not (set "+-")))
    #
    :integer-base (sequence "#"
                            (choice "b"
                                    "o"
                                    "x"
                                    # XXX: found in xml.el, but docs...(?)
                                    "X"
                                    (sequence :d+ "r"))
                            # XXX: docs contradict this(?), but works...
                            (opt (choice "+" "-"))
                            (some (choice :a :d)))
    # see "Basic Char Syntax"
    :char ,(atom-node :char
                      '(sequence "?"
                                 (choice :char-octal
                                         :char-hex
                                         :char-uni-name
                                         #:char-uni-val
                                         :char-uni-val-low
                                         :char-uni-val-up
                                         :char-meta-octal
                                         :char-key
                                         :char-basic)))
    # see "General Escape Syntax"
    :char-octal (sequence "\\" (3 (range "07")))
    :char-hex (sequence "\\x" :h+)
    :char-uni-name (sequence "\\N{" (thru "}"))
    #:char-uni-val (sequence "\\N{U+" :h+ "}")
    :char-uni-val-low (sequence "\\u" (4 :h))
    :char-uni-val-up (sequence "\\U" (8 :h))
    # see "Meta-Char Syntax"
    :char-meta-octal (sequence "\\M-" :char-octal)
    # see "Ctl-Char Syntax"
    # see "Other Char Bits"
    :char-key
    (sequence (some (sequence "\\"
                              (choice (sequence (set "ACHMSs") "-")
                                      "^")))
              # XXX: not strictly correct?
              (choice :char-octal
                      :char-hex
                      :char-uni-name
                      #:char-uni-val
                      :char-uni-val-low
                      :char-uni-val-up
                      :char-meta-octal
                      :char-basic))
    # XXX: not strictly correct, but perhaps it's ok?
    :char-basic (choice (sequence "\\" 1)
                        1)
    # see "Syntax for Strings"
    # XXX: escaped newline and escaped space in "Syntax for Strings"?
    :string
    ,(atom-node :string
                '(sequence "\""
                           (any (choice :escape
                                        (if-not "\"" 1)))
                           "\""))
    # XXX: is this complete?
    :escape (sequence "\\" (set "0abdefnrstvx\"\\"))
    # see "Symbol Type"
    # XXX: review about whitespace in symbol names
    :symbol
    ,(atom-node :symbol
                '(choice (sequence :sym-char-head
                                   (any :sym-char-rest))
                         # XXX: some below not really symbols
                         # see "Circular Objects"
                         (sequence "#" :d+ "=")
                         (sequence "#" :d+ "#")
                         # see "Special Read Syntax"
                         #(sequence "#" :d+)
                         # see "Documentation Strings and Compilation"
                         "#$"
                         # see "Symbol Type"
                         "##"))
    #
    :sym-char-head (choice :sym-char-esc
                           # don't start with
                           #(if-not (set " \"#'(),.;?[]`") 1)) # allow .
                           (if-not (set " \"#'(),;?[]`") 1))
    #
    :sym-char-rest (choice :sym-char-esc
                           # . and ? are allowed "inside"
                           (if-not (set " \"#'(),;[]`\n") 1))
    # need to be escaped
    :sym-char-esc (sequence "\\" (set " \"#'(),;?[]`"))
    })
# Usage examples.  Each expression is followed by `# =>` and the value it
# is expected to produce.  Index 2 of a match result is the first node,
# because :main captures the starting line and column before it.
(comment
(get (peg/match loc-grammar " ") 2)
# =>
'(:whitespace @{:bc 1 :bl 1 :ec 2 :el 1} " ")
(get (peg/match loc-grammar "8.3") 2)
# =>
'(:float @{:bc 1 :bl 1 :ec 4 :el 1} "8.3")
(get (peg/match loc-grammar "printf") 2)
# =>
'(:symbol @{:bc 1 :bl 1 :ec 7 :el 1} "printf")
(get (peg/match loc-grammar ":smile") 2)
# =>
'(:symbol @{:bc 1 :bl 1 :ec 7 :el 1} ":smile")
(get (peg/match loc-grammar "[8]") 2)
# =>
'(:vector @{:bc 1 :bl 1
:ec 4 :el 1}
(:integer @{:bc 2 :bl 1
:ec 3 :el 1} "8"))
(get (peg/match loc-grammar "`x") 2)
# =>
'(:backquote @{:bc 1 :bl 1
:ec 3 :el 1}
(:symbol @{:bc 2 :bl 1
:ec 3 :el 1} "x"))
)

@ -30,6 +30,7 @@ pub enum Language {
EmacsLisp,
Go,
Haskell,
JanetSimple,
Java,
JavaScript,
Json,
@ -94,6 +95,7 @@ fn from_emacs_mode_header(src: &str) -> Option<Language> {
"emacs-lisp" => Some(EmacsLisp),
"go" => Some(Go),
"haskell" => Some(Haskell),
"janet" => Some(JanetSimple),
"java" => Some(Java),
"js" | "js2" => Some(JavaScript),
"lisp" => Some(CommonLisp),
@ -188,6 +190,7 @@ fn from_extension(extension: &OsStr, src: &str) -> Option<Language> {
"ex" | "exs" => Some(Elixir),
"go" => Some(Go),
"hs" => Some(Haskell),
"janet" | "jdn" => Some(JanetSimple),
"java" => Some(Java),
"cjs" | "js" | "mjs" => Some(JavaScript),
"jsx" => Some(Jsx),

@ -55,6 +55,7 @@ extern "C" {
fn tree_sitter_elixir() -> ts::Language;
fn tree_sitter_go() -> ts::Language;
fn tree_sitter_haskell() -> ts::Language;
fn tree_sitter_janet_simple() -> ts::Language;
fn tree_sitter_java() -> ts::Language;
fn tree_sitter_javascript() -> ts::Language;
fn tree_sitter_json() -> ts::Language;
@ -273,6 +274,29 @@ pub fn from_language(language: guess::Language) -> TreeSitterConfig {
.unwrap(),
}
}
JanetSimple => {
let language = unsafe { tree_sitter_janet_simple() };
TreeSitterConfig {
name: "Janet",
language,
atom_nodes: (vec![]).into_iter().collect(),
delimiter_tokens: (vec![
("@{", "}"),
("@(", ")"),
("@[", "]"),
("{", "}"),
("(", ")"),
("[", "]"),
])
.into_iter()
.collect(),
highlight_query: ts::Query::new(
language,
include_str!("../vendor/highlights/janet_simple.scm"),
)
.unwrap(),
}
}
Java => {
let language = unsafe { tree_sitter_java() };
TreeSitterConfig {

@ -0,0 +1 @@
../tree-sitter-janet-simple/queries/highlights.scm

@ -0,0 +1 @@
tree-sitter-janet-simple/src

@ -0,0 +1,4 @@
node_modules
bin
build
*.log

@ -0,0 +1,25 @@
[package]
name = "tree-sitter-janet-simple"
description = "janet grammar for the tree-sitter parsing library"
version = "0.0.3"
keywords = ["incremental", "parsing", "janet"]
categories = ["parsing", "text-editors"]
repository = "https://github.com/sogaiu/tree-sitter-janet-simple"
edition = "2018"
build = "bindings/rust/build.rs"
include = [
"bindings/rust/*",
"grammar.js",
"queries/*",
"src/*",
]
[lib]
path = "bindings/rust/lib.rs"
[dependencies]
tree-sitter = "0.19.3"
[build-dependencies]
cc = "1.0"

@ -0,0 +1,114 @@
# tree-sitter-janet-simple
## Status
Subject to change, grammar still evolving.
Coincidentally, it appears [another effort by GrayJack](https://github.com/GrayJack/tree-sitter-janet/) was started at about the same time.
The main difference between these two is that GrayJack's grammar supports higher level constructs (e.g. `def` is recognized by the grammar).
There might end up being different trade-offs in either approach and my belief is that there is room in the world for multiple attempts (especially for lisp-like languages).
## Prerequisites
* [emsdk](https://emscripten.org/docs/getting_started/downloads.html#installation-instructions) -- emscripten via homebrew seems to work for macos
* node >= 12 (nvm recommended) -- recently tested 12.9.1, 12.16.1
## Fine Print
* The instructions below assume emsdk has been installed, but `emcc` (tool that can be used to compile to wasm) is not necessarily on one's `PATH`. If an appropriate `emcc` is on one's `PATH` (e.g. emscripten installed via homebrew), the emsdk steps (e.g. `source ~/src/emsdk/emsdk_env.sh`) below may be ignored.
* `node-gyp` (tool for compiling native addon modules for Node.js) may fail on machines upgraded to macos Catalina. [This document](https://github.com/nodejs/node-gyp/blob/master/macOS_Catalina.md) may help cope with such a situation.
## Initial Setup
Suppose typical development sources are stored under `~/src`.
```
# clone repository
cd ~/src
git clone https://github.com/sogaiu/tree-sitter-janet-simple
cd tree-sitter-janet-simple
# create / populate
# `node_modules` with dependencies
# `src` with tree-sitter .c goodness
# `build`
# `build/Release` and build `tree_sitter_janet_simple_binding.node`
npm install
# included in previous command
#npx tree-sitter generate
#npx node-gyp configure
#npx node-gyp rebuild
```
## Grammar Development
Hack on grammar and interactively test.
```
# prepare emsdk (specifically emcc) for building .wasm
source ~/src/emsdk/emsdk_env.sh
# edit grammar.js using some editor
# rebuild tree-sitter stuff and invoke web-ui for interactive testing
npx tree-sitter generate && \
npx node-gyp rebuild && \
npx tree-sitter build-wasm && \
npx tree-sitter web-ui
# in appropriate browser window, paste code in left pane
# examine results in right pane -- can even click on nodes
# find errors and loop back to edit step above...
```
Parse individual files.
```
# create and populate sample code file for parsing named `sample.janet`
# parse sample file
npx tree-sitter parse sample.janet
# examine output similar to web-ui, but less convenient
```
## Measure Performance
```
# single measurement
npx tree-sitter parse --time sample.janet
# multiple measurements with `multitime`
multitime -n10 -s1 npx tree-sitter parse --time --quiet sample.janet
```
## Build .wasm
Assuming emsdk is installed appropriately under `~/src/emsdk`.
```
# prepare emsdk (specifically emcc) for use
source ~/src/emsdk/emsdk_env.sh
# create `tree-sitter-janet-simple.wasm`
npx tree-sitter build-wasm
```
## Resources
* [Guide to your first Tree-sitter grammar](https://gist.github.com/Aerijo/df27228d70c633e088b0591b8857eeef)
* [tree-sitter](http://tree-sitter.github.io/tree-sitter/)
## Acknowledgments
* 314eter - handling null characters
* Aerijo - Guide to your first Tree-sitter grammar
* bakpakin - janet
* GrayJack - tree-sitter-janet
* maxbrunsfeld - tree-sitter and related

@ -0,0 +1,19 @@
{
"targets": [
{
"target_name": "tree_sitter_janet_simple_binding",
"include_dirs": [
"<!(node -e \"require('nan')\")",
"src"
],
"sources": [
"src/parser.c",
"bindings/node/binding.cc",
"src/scanner.c"
],
"cflags_c": [
"-std=c99",
]
}
]
}

@ -0,0 +1,28 @@
#include "tree_sitter/parser.h"
#include <node.h>
#include "nan.h"
using namespace v8;
extern "C" TSLanguage * tree_sitter_janet_simple();
namespace {
// Constructor callback for the "Language" function template; intentionally
// empty, the instance only serves as a holder for the language pointer.
NAN_METHOD(New) {}
// Module initialiser: replaces `module.exports` with an object that carries
// the grammar's TSLanguage pointer in internal field 0 and a `name` property.
void Init(Local<Object> exports, Local<Object> module) {
// Template for a class named "Language" with one internal field slot.
Local<FunctionTemplate> tpl = Nan::New<FunctionTemplate>(New);
tpl->SetClassName(Nan::New("Language").ToLocalChecked());
tpl->InstanceTemplate()->SetInternalFieldCount(1);
Local<Function> constructor = Nan::GetFunction(tpl).ToLocalChecked();
Local<Object> instance = constructor->NewInstance(Nan::GetCurrentContext()).ToLocalChecked();
// Store the language pointer where native consumers can read it back.
Nan::SetInternalFieldPointer(instance, 0, tree_sitter_janet_simple());
Nan::Set(instance, Nan::New("name").ToLocalChecked(), Nan::New("janet_simple").ToLocalChecked());
// Export the instance itself rather than adding properties to `exports`.
Nan::Set(module, Nan::New("exports").ToLocalChecked(), instance);
}
NODE_MODULE(tree_sitter_janet_simple_binding, Init)
} // namespace

@ -0,0 +1,19 @@
// Load the compiled native binding, preferring the Release build and
// falling back to the Debug build when the Release one does not exist.
const isMissingModule = (err) => err.code === 'MODULE_NOT_FOUND';

try {
  module.exports = require("../../build/Release/tree_sitter_janet_simple_binding");
} catch (releaseError) {
  if (!isMissingModule(releaseError)) {
    throw releaseError;
  }
  try {
    module.exports = require("../../build/Debug/tree_sitter_janet_simple_binding");
  } catch (debugError) {
    // When neither build exists, report the Release failure; any other
    // Debug failure is reported as-is.
    throw isMissingModule(debugError) ? releaseError : debugError;
  }
}

// Attaching the node type information is best-effort: a failure to load
// node-types.json is deliberately ignored.
try {
  module.exports.nodeTypeInfo = require("../../src/node-types.json");
} catch (_) {}

@ -0,0 +1,38 @@
/// Build script: compiles `src/parser.c` and the C external scanner
/// `src/scanner.c` into a static library named "parser" and asks cargo to
/// re-run the script when either file changes.
fn main() {
    use std::path::Path;

    let src_dir = Path::new("src");
    let parser_path = src_dir.join("parser.c");
    // If your language uses an external scanner written in C,
    // then include this file in the build:
    let scanner_path = src_dir.join("scanner.c");

    let mut c_config = cc::Build::new();
    c_config
        .include(src_dir)
        .flag_if_supported("-Wno-unused-parameter")
        .flag_if_supported("-Wno-unused-but-set-variable")
        .flag_if_supported("-Wno-trigraphs")
        .file(&parser_path)
        .file(&scanner_path);

    println!("cargo:rerun-if-changed={}", scanner_path.to_str().unwrap());
    c_config.compile("parser");
    println!("cargo:rerun-if-changed={}", parser_path.to_str().unwrap());

    // If your language uses an external scanner written in C++,
    // then include this block of code:
    /*
    let mut cpp_config = cc::Build::new();
    cpp_config.cpp(true);
    cpp_config.include(&src_dir);
    cpp_config
        .flag_if_supported("-Wno-unused-parameter")
        .flag_if_supported("-Wno-unused-but-set-variable");
    let scanner_path = src_dir.join("scanner.cc");
    cpp_config.file(&scanner_path);
    cpp_config.compile("scanner");
    println!("cargo:rerun-if-changed={}", scanner_path.to_str().unwrap());
    */
}

@ -0,0 +1,52 @@
//! This crate provides janet_simple language support for the [tree-sitter][] parsing library.
//!
//! Typically, you will use the [language][language func] function to add this language to a
//! tree-sitter [Parser][], and then use the parser to parse some code:
//!
//! ```
//! let code = "";
//! let mut parser = tree_sitter::Parser::new();
//! parser.set_language(tree_sitter_janet_simple::language()).expect("Error loading janet_simple grammar");
//! let tree = parser.parse(code, None).unwrap();
//! ```
//!
//! [Language]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Language.html
//! [language func]: fn.language.html
//! [Parser]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Parser.html
//! [tree-sitter]: https://tree-sitter.github.io/
use tree_sitter::Language;
extern "C" {
// Symbol exported by the C parser that the build script compiles
// (`src/parser.c`).
fn tree_sitter_janet_simple() -> Language;
}
/// Get the tree-sitter [Language][] for this grammar.
///
/// This is a safe wrapper around the C function `tree_sitter_janet_simple`.
///
/// [Language]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Language.html
pub fn language() -> Language {
// The foreign function takes no arguments; it is only `unsafe` because it
// is declared in an `extern` block.
unsafe { tree_sitter_janet_simple() }
}
/// The content of the [`node-types.json`][] file for this grammar,
/// embedded into the crate at compile time.
///
/// [`node-types.json`]: https://tree-sitter.github.io/tree-sitter/using-parsers#static-node-types
pub const NODE_TYPES: &str = include_str!("../../src/node-types.json");
// Uncomment these to include any queries that this grammar contains
// pub const HIGHLIGHTS_QUERY: &'static str = include_str!("../../queries/highlights.scm");
// pub const INJECTIONS_QUERY: &'static str = include_str!("../../queries/injections.scm");
// pub const LOCALS_QUERY: &'static str = include_str!("../../queries/locals.scm");
// pub const TAGS_QUERY: &'static str = include_str!("../../queries/tags.scm");
#[cfg(test)]
mod tests {
    use super::language;

    /// The grammar must be accepted by a freshly created parser.
    #[test]
    fn test_can_load_grammar() {
        let mut parser = tree_sitter::Parser::new();
        let outcome = parser.set_language(language());
        outcome.expect("Error loading janet_simple language");
    }
}

@ -0,0 +1,228 @@
// numbers
// Shared building blocks for the token rules in the grammar.
// Changing any pattern here requires regenerating the parser sources.

// optional leading sign of a number
const SIGN =
choice('-', '+');
const DIGIT =
/[0-9]/;
const HEX_DIGIT =
/[0-9A-Fa-f]/;
// radix prefixes accepted before `r` / `R` in `_radix`: 2 through 36
const RADIX =
choice('2', '3', '4', '5', '6', '7', '8', '9', '10',
'11', '12', '13', '14', '15', '16', '17', '18', '19', '20',
'21', '22', '23', '24', '25', '26', '27', '28', '29', '30',
'31', '32', '33', '34', '35', '36');
// digits of a radix number
const ALPHA_NUM =
/[a-zA-Z0-9]/;
// symbols and keywords
// janet/tools/symcharsgen.c
// first character of a symbol: any symbol character except digits and ':'
const SYM_CHAR_NO_DIGIT_NO_COLON =
/[a-zA-Z!$%&*+\-./<?=>@^_]/;
// any symbol / keyword character
const SYM_CHAR =
/[0-9:a-zA-Z!$%&*+\-./<?=>@^_]/;
// strings
// body of a double-quoted string: anything but backslash or double quote,
// or a backslash followed by any character (newline included)
const STRING_DOUBLE_QUOTE_CONTENT =
repeat(choice(/[^\\"]/,
/\\(.|\n)/)); // thanks to tree-sitter-haskell
// Tree-sitter grammar for Janet.  It is deliberately "simple": it only
// describes literals, collections and reader macros -- no rule here gives
// special treatment to forms such as `def`.
module.exports = grammar({
name: 'janet_simple',
// mdn says \s is:
//
// [ \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff]
//
// but that doesn't seem to match what tree-sitter thinks as it appears that
// for example, leaving out \x0b, \x0c, or \x00 from the following yields
// different behavior (other stuff may also differ)
extras: $ => [
/\s|\x0b|\x0c|\x00/,
$.comment
],
// Tokens produced by the external scanner (src/scanner.c).  The order here
// must match the order of the scanner's TokenType enum.
externals: $ => [
$.long_buf_lit,
$.long_str_lit
],
rules: {
// THIS MUST BE FIRST -- even though this doesn't look like it matters
source: $ =>
repeat($._lit),
// line comment: `#` up to the end of the line
comment: $ =>
/#.*/,
// every kind of value; hidden (leading underscore) so it does not appear
// as a node of its own
_lit: $ =>
choice($.bool_lit,
$.buf_lit,
$.kwd_lit,
$.long_buf_lit,
$.long_str_lit,
$.nil_lit,
$.num_lit,
$.str_lit,
$.sym_lit,
//
$.par_arr_lit,
$.sqr_arr_lit,
$.struct_lit,
$.tbl_lit,
$.par_tup_lit,
$.sqr_tup_lit,
//
$.qq_lit,
$.quote_lit,
$.short_fn_lit,
$.splice_lit,
$.unquote_lit),
// simplest things
bool_lit: $ =>
// XXX: without the token here, false and true are exposed as
// anonymous nodes it seems...
// yet, the same does not happen for nil...strange
token(choice('false',
'true')),
kwd_lit: $ =>
prec(2, token(seq(':',
repeat(SYM_CHAR)))),
nil_lit: $ =>
'nil',
num_lit: $ =>
prec(5, choice($._dec,
$._hex,
$._radix)),
// decimal number: optional sign, digits with an optional '.', optional
// exponent.
// NOTE(review): each alternative allows at most two digit groups, so a
// literal with several underscore-separated groups (e.g. 1_000_000) does
// not match as one token -- confirm against Janet's number syntax.
_dec: $ =>
token(seq(optional(SIGN),
choice(seq(repeat1(DIGIT),
repeat('_'),
optional('.'),
repeat('_'),
repeat(DIGIT),
repeat('_')),
seq(repeat(DIGIT),
repeat('_'),
optional('.'),
repeat('_'),
repeat1(DIGIT),
repeat('_'))),
optional(seq(choice('e', 'E'),
optional(SIGN),
repeat1(DIGIT))))),
// hexadecimal number: optional sign, `0x`, hex digits with an optional '.'
_hex: $ =>
token(seq(optional(SIGN),
'0',
'x',
choice(seq(repeat1(HEX_DIGIT),
repeat('_'),
optional('.'),
repeat('_'),
repeat(HEX_DIGIT),
repeat('_')),
seq(repeat(HEX_DIGIT),
repeat('_'),
optional('.'),
repeat('_'),
repeat1(HEX_DIGIT),
repeat('_'))))),
// radix number: optional sign, radix, `r` or `R`, digits, optional
// `&`-introduced exponent.
// NOTE(review): the nested repeats below accept the same strings as
// repeat(choice(ALPHA_NUM, '_')).
_radix: $ =>
token(seq(optional(SIGN),
seq(RADIX,
choice('r', 'R'),
ALPHA_NUM,
repeat(choice(repeat(ALPHA_NUM),
repeat('_'))),
optional(seq('&',
optional(SIGN),
repeat1(DIGIT)))))),
str_lit: $ =>
token(seq('"',
STRING_DOUBLE_QUOTE_CONTENT,
'"')),
// buffer literal: a string literal prefixed with `@`
buf_lit: $ =>
token(seq('@"',
STRING_DOUBLE_QUOTE_CONTENT,
'"')),
sym_lit: $ =>
token(seq(SYM_CHAR_NO_DIGIT_NO_COLON,
repeat(SYM_CHAR))),
// collection-ish things
par_arr_lit: $ =>
seq('@(',
repeat($._lit),
')'),
sqr_arr_lit: $ =>
seq('@[',
repeat($._lit),
']'),
struct_lit: $ =>
seq('{',
repeat($._lit),
'}'),
tbl_lit: $ =>
seq('@{',
repeat($._lit),
'}'),
par_tup_lit: $ =>
seq('(',
repeat($._lit),
')'),
sqr_tup_lit: $ =>
seq('[',
repeat($._lit),
']'),
// macro-related
qq_lit: $ =>
seq('~',
$._lit),
quote_lit: $ =>
seq("'",
$._lit),
// following all work at the repl..
// |8, ||8, |||8, etc.
// |~(:x)
// |{:a 1}
// |[1 2]
// |"a"
// |:w
// |a-sym
// |@[8 9]
// |(= $ 1)
// XXX: |() doesn't work...but don't bother disallowing
short_fn_lit: $ =>
seq('|',
$._lit),
// XXX: ?
splice_lit: $ =>
seq(';',
$._lit),
unquote_lit: $ =>
seq(',',
$._lit),
}
});

@ -0,0 +1,19 @@
{
"name": "tree-sitter-janet-simple",
"version": "0.0.3",
"lockfileVersion": 1,
"requires": true,
"dependencies": {
"nan": {
"version": "2.14.2",
"resolved": "https://registry.npmjs.org/nan/-/nan-2.14.2.tgz",
"integrity": "sha512-M2ufzIiINKCuDfBSAUr1vWQ+vuVcA9kqx8JJUsbQi6yf1uGRyb7HfpdfUr5qLXf3B/t8dPvcjhKMmlfnP47EzQ=="
},
"tree-sitter-cli": {
"version": "0.19.3",
"resolved": "https://registry.npmjs.org/tree-sitter-cli/-/tree-sitter-cli-0.19.3.tgz",
"integrity": "sha512-UlntGxLrlkQCKVrhm7guzfi+ovM4wDLVCCu3z5jmfDgFNoUoKa/23ddaQON5afD5jB9a02xv4N5MXJfCx+/mpw==",
"dev": true
}
}
}

@ -0,0 +1,27 @@
{
"name": "tree-sitter-janet-simple",
"version": "0.0.3",
"description": "Janet grammar for tree-sitter",
"main": "bindings/node",
"scripts": {
"build": "npx tree-sitter generate && npx node-gyp build",
"fresh-build": "npx tree-sitter generate && npx node-gyp configure && npx node-gyp rebuild",
"install": "npx tree-sitter generate && npx node-gyp configure && npx node-gyp rebuild"
},
"author": "",
"license": "",
"dependencies": {
"nan": "2.14.2"
},
"devDependencies": {
"tree-sitter-cli": "0.19.3"
},
"tree-sitter": [
{
"scope": "source.janet",
"file-types": [
"janet"
]
}
]
}

@ -0,0 +1,25 @@
;; Highlight captures for the janet_simple grammar.

;; Numeric literals.
(num_lit) @number
;; Strings and buffers, in both their quoted and long (backtick) forms.
[
(buf_lit)
(long_buf_lit)
(long_str_lit)
(str_lit)
] @string
;; Literal constants.
[
(bool_lit)
(nil_lit)
] @constant.builtin
;; Keywords.
(kwd_lit) @constant
(comment) @comment
;; Treat quasiquotation as operators for the purpose of highlighting.
[
"'"
"~"
","
] @operator

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -0,0 +1,111 @@
#include <tree_sitter/parser.h>
#include <wctype.h>
/*
 * Tokens recognised by this external scanner.  The values index the
 * `valid_symbols` array passed to the scan function, so the order has to
 * match the `externals` list in grammar.js (long_buf_lit, long_str_lit).
 */
enum TokenType {
  LONG_BUF_LIT = 0, /* long buffer literal, introduced by '@' */
  LONG_STR_LIT = 1  /* long string literal */
};
/* The scanner keeps no state between calls, so there is no payload. */
void *tree_sitter_janet_simple_external_scanner_create(void)
{
  return NULL;
}
/* Nothing to release: the payload is ignored. */
void tree_sitter_janet_simple_external_scanner_destroy(void *payload)
{
  (void)payload;
}
/* Nothing to reset: the payload is ignored. */
void tree_sitter_janet_simple_external_scanner_reset(void *payload)
{
  (void)payload;
}
/*
 * Nothing is written to `buffer`; the returned length is always 0.
 */
unsigned tree_sitter_janet_simple_external_scanner_serialize(void *payload,
                                                             char *buffer)
{
  (void)payload;
  (void)buffer;
  return 0;
}
/* Counterpart of serialize: all arguments are ignored. */
void tree_sitter_janet_simple_external_scanner_deserialize(void *payload,
                                                           const char *buffer,
                                                           unsigned length)
{
  (void)payload;
  (void)buffer;
  (void)length;
}
/*
 * Scan a long string or long buffer literal.
 *
 * A long string starts with a run of one or more backticks and ends at
 * the next run of the same number of consecutive backticks; a long buffer
 * is the same thing prefixed with '@'.
 *
 * payload       - unused, the scanner is stateless
 * lexer         - tree-sitter lexer to read from; `result_symbol` is set
 *                 to LONG_BUF_LIT or LONG_STR_LIT
 * valid_symbols - indexed by enum TokenType; scanning is attempted only if
 *                 at least one of the two tokens is acceptable
 *
 * Returns true when a complete literal was consumed, false otherwise
 * (including when the input ends before the closing backticks).
 *
 * End of input is detected with `lexer->eof` rather than by comparing the
 * lookahead with 0, so a NUL character inside the literal no longer aborts
 * the scan.
 */
bool tree_sitter_janet_simple_external_scanner_scan(
  void *payload,
  TSLexer *lexer,
  const bool *valid_symbols
)
{
  (void)payload;

  // skip leading whitespace; it is not part of the token
  while (iswspace(lexer->lookahead)) {
    lexer->advance(lexer, true);
  }

  if (!valid_symbols[LONG_BUF_LIT] && !valid_symbols[LONG_STR_LIT]) {
    return false;
  }

  // a leading '@' distinguishes a long buffer from a long string
  if (lexer->lookahead == '@') {
    lexer->result_symbol = LONG_BUF_LIT;
    lexer->advance(lexer, false);
  } else {
    lexer->result_symbol = LONG_STR_LIT;
  }

  // the opening delimiter needs at least one backtick
  if (lexer->lookahead != '`') {
    return false;
  }
  lexer->advance(lexer, false);
  uint32_t n_backticks = 1;

  // count the remaining opening backticks; the first character that is
  // not a backtick is consumed as content
  for (;;) {
    if (lexer->eof(lexer)) {
      return false;
    }
    bool is_backtick = lexer->lookahead == '`';
    lexer->advance(lexer, false);
    if (!is_backtick) {
      break;
    }
    n_backticks++;
  }

  // consume content until n_backticks consecutive backticks are seen
  uint32_t cbt = 0; // consecutive backticks
  for (;;) {
    if (lexer->eof(lexer)) {
      return false;
    }
    if (lexer->lookahead == '`') {
      cbt++;
      if (cbt == n_backticks) {
        // include the final closing backtick in the token
        lexer->advance(lexer, false);
        return true;
      }
    } else {
      cbt = 0;
    }
    lexer->advance(lexer, false);
  }
}

@ -0,0 +1,223 @@
#ifndef TREE_SITTER_PARSER_H_
#define TREE_SITTER_PARSER_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#define ts_builtin_sym_error ((TSSymbol)-1)
#define ts_builtin_sym_end 0
#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024
typedef uint16_t TSStateId;
#ifndef TREE_SITTER_API_H_
typedef uint16_t TSSymbol;
typedef uint16_t TSFieldId;
typedef struct TSLanguage TSLanguage;
#endif
typedef struct {
TSFieldId field_id;
uint8_t child_index;
bool inherited;
} TSFieldMapEntry;
typedef struct {
uint16_t index;
uint16_t length;
} TSFieldMapSlice;
typedef struct {
bool visible;
bool named;
bool supertype;
} TSSymbolMetadata;
/*
 * Interface through which generated lexers and external scanners read
 * input.  All operations are function pointers that take the lexer itself
 * as their first argument.
 */
typedef struct TSLexer TSLexer;
struct TSLexer {
/* Current input character; the lexer macros below copy it into a local. */
int32_t lookahead;
/* Symbol of the recognised token; set by ACCEPT_TOKEN below. */
TSSymbol result_symbol;
/* Move to the next character.  The bool is the `skip` flag of the lexer
 * macros -- presumably true means the character is not part of the token
 * (TODO confirm against the tree-sitter documentation). */
void (*advance)(TSLexer *, bool);
/* Called by ACCEPT_TOKEN when a token is accepted. */
void (*mark_end)(TSLexer *);
uint32_t (*get_column)(TSLexer *);
bool (*is_at_included_range_start)(const TSLexer *);
/* NOTE(review): presumably reports whether the end of input was reached. */
bool (*eof)(const TSLexer *);
};
typedef enum {
TSParseActionTypeShift,
TSParseActionTypeReduce,
TSParseActionTypeAccept,
TSParseActionTypeRecover,
} TSParseActionType;
typedef union {
struct {
uint8_t type;
TSStateId state;
bool extra;
bool repetition;
} shift;
struct {
uint8_t type;
uint8_t child_count;
TSSymbol symbol;
int16_t dynamic_precedence;
uint16_t production_id;
} reduce;
uint8_t type;
} TSParseAction;
typedef struct {
uint16_t lex_state;
uint16_t external_lex_state;
} TSLexMode;
typedef union {
TSParseAction action;
struct {
uint8_t count;
bool reusable;
} entry;
} TSParseActionEntry;
struct TSLanguage {
uint32_t version;
uint32_t symbol_count;
uint32_t alias_count;
uint32_t token_count;
uint32_t external_token_count;
uint32_t state_count;
uint32_t large_state_count;
uint32_t production_id_count;
uint32_t field_count;
uint16_t max_alias_sequence_length;
const uint16_t *parse_table;
const uint16_t *small_parse_table;
const uint32_t *small_parse_table_map;
const TSParseActionEntry *parse_actions;
const char **symbol_names;
const char **field_names;
const TSFieldMapSlice *field_map_slices;
const TSFieldMapEntry *field_map_entries;
const TSSymbolMetadata *symbol_metadata;
const TSSymbol *public_symbol_map;
const uint16_t *alias_map;
const TSSymbol *alias_sequences;
const TSLexMode *lex_modes;
bool (*lex_fn)(TSLexer *, TSStateId);
bool (*keyword_lex_fn)(TSLexer *, TSStateId);
TSSymbol keyword_capture_token;
struct {
const bool *states;
const TSSymbol *symbol_map;
void *(*create)(void);
void (*destroy)(void *);
bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist);
unsigned (*serialize)(void *, char *);
void (*deserialize)(void *, const char *, unsigned);
} external_scanner;
};
/*
* Lexer Macros
*/
/*
 * Prologue of a lex function.  Declares the locals used by the other
 * macros and defines two labels: `next_state`, which advances the lexer
 * (honouring `skip`) and falls through to `start`, and `start`, which
 * clears `skip` and reloads `lookahead`.  Expects `lexer` to be in scope.
 */
#define START_LEXER() \
bool result = false; \
bool skip = false; \
bool eof = false; \
int32_t lookahead; \
goto start; \
next_state: \
lexer->advance(lexer, skip); \
start: \
skip = false; \
lookahead = lexer->lookahead;
/* Consume the current character and continue in `state_value`.
 * Expects a variable named `state` to be in scope. */
#define ADVANCE(state_value) \
{ \
state = state_value; \
goto next_state; \
}
/* Like ADVANCE, but passes skip = true to `lexer->advance`. */
#define SKIP(state_value) \
{ \
skip = true; \
state = state_value; \
goto next_state; \
}
/* Record `symbol_value` as the recognised token and mark its end at the
 * current position.  Expands to several statements. */
#define ACCEPT_TOKEN(symbol_value) \
result = true; \
lexer->result_symbol = symbol_value; \
lexer->mark_end(lexer);
/* Leave the lex function, reporting whether a token was accepted. */
#define END_STATE() return result;
/*
* Parse Table Macros
*/
/*
 * Offset of state `id` past the large states.  LARGE_STATE_COUNT has to
 * be defined by the file that uses this macro.  The argument and the whole
 * expansion are parenthesised so the macro can be used inside larger
 * expressions without operator-precedence surprises.
 */
#define SMALL_STATE(id) ((id) - LARGE_STATE_COUNT)
#define STATE(id) id
#define ACTIONS(id) id

/* Parse action entry: shift to `state_value`. */
#define SHIFT(state_value) \
  {{ \
    .shift = { \
      .type = TSParseActionTypeShift, \
      .state = state_value \
    } \
  }}

/* Parse action entry: shift to `state_value` with `repetition` set. */
#define SHIFT_REPEAT(state_value) \
  {{ \
    .shift = { \
      .type = TSParseActionTypeShift, \
      .state = state_value, \
      .repetition = true \
    } \
  }}

/* Parse action entry: shift with `extra` set. */
#define SHIFT_EXTRA() \
  {{ \
    .shift = { \
      .type = TSParseActionTypeShift, \
      .extra = true \
    } \
  }}

/* Parse action entry: reduce `child_count_val` children to `symbol_val`;
 * further designated initialisers may be passed as trailing arguments. */
#define REDUCE(symbol_val, child_count_val, ...) \
  {{ \
    .reduce = { \
      .type = TSParseActionTypeReduce, \
      .symbol = symbol_val, \
      .child_count = child_count_val, \
      __VA_ARGS__ \
    }, \
  }}

/* Parse action entry of type TSParseActionTypeRecover. */
#define RECOVER() \
  {{ \
    .type = TSParseActionTypeRecover \
  }}

/* Parse action entry of type TSParseActionTypeAccept. */
#define ACCEPT_INPUT() \
  {{ \
    .type = TSParseActionTypeAccept \
  }}
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_PARSER_H_