Merge commit '7dc4fb60390218b09bc351062eeede7dcdbb4d9f'

syntax_id
Wilfred Hughes 2023-08-08 23:33:05 +07:00
commit 4e77f83dd9
19 changed files with 15740 additions and 14105 deletions

@@ -2,7 +2,7 @@
### Parsing
Updated Elixir, Erlang and Go parsers.
Updated Elixir, Erlang, Go and Racket parsers.
### Display

@@ -298,7 +298,7 @@ fn main() {
TreeSitterParser {
name: "tree-sitter-racket",
src_dir: "vendored_parsers/tree-sitter-racket-src",
extra_files: vec!["scanner.cc"],
extra_files: vec!["scanner.c"],
},
TreeSitterParser {
name: "tree-sitter-ruby",

@@ -1,7 +1,7 @@
[package]
name = "tree-sitter-racket"
description = "racket grammar for the tree-sitter parsing library"
version = "0.0.1"
version = "0.3.0"
keywords = ["incremental", "parsing", "racket"]
categories = ["parsing", "text-editors"]
repository = "https://github.com/tree-sitter/tree-sitter-racket"

@@ -8,7 +8,13 @@ This grammar only implements the Racket language with the default readtable.
## Status
It should recognize most grammar with the default readtable.
It should be complete and compatible with Racket 8.9.
There are currently no plans to add support for new languages.
## News
Starting from June 24, 2023 ([commit](https://github.com/6cdh/tree-sitter-racket/commit/989c3e631a7f2d87bb6a66a5394870aaeb6c56e7)), i.e. release 0.3.0, the external scanner is written in C.
## Build and Try
@@ -16,7 +22,6 @@ You need
* nodejs
* a C compiler
* a C++11 compiler
then run

@@ -9,8 +9,7 @@
"sources": [
"bindings/node/binding.cc",
"src/parser.c",
"src/scanner.cc",
# If your language uses an external scanner, add it here.
"src/scanner.c",
],
"cflags_c": [
"-std=c99",

@@ -2,24 +2,18 @@ fn main() {
let src_dir = std::path::Path::new("src");
let mut c_config = cc::Build::new();
c_config.include(&src_dir);
c_config.include(src_dir);
c_config
.flag_if_supported("-Wno-unused-parameter")
.flag_if_supported("-Wno-unused-but-set-variable")
.flag_if_supported("-Wno-trigraphs");
let parser_path = src_dir.join("parser.c");
c_config.file(&parser_path);
c_config.compile("parser");
println!("cargo:rerun-if-changed={}", parser_path.to_str().unwrap());
let mut cpp_config = cc::Build::new();
cpp_config.cpp(true);
cpp_config.include(&src_dir);
cpp_config
.flag_if_supported("-Wno-unused-parameter")
.flag_if_supported("-Wno-unused-but-set-variable");
let scanner_path = src_dir.join("scanner.cc");
cpp_config.file(&scanner_path);
cpp_config.compile("scanner");
let scanner_path = src_dir.join("scanner.c");
c_config.file(&scanner_path);
println!("cargo:rerun-if-changed={}", scanner_path.to_str().unwrap());
println!("cargo:rerun-if-changed={}", parser_path.to_str().unwrap());
c_config.compile("parser");
}

@@ -1,2 +1,2 @@
win_* text eol=crlf
win_* eol=crlf
* eol=lf

@@ -31,6 +31,8 @@ number
#b101
+inf.t
-nan.t
+i
0##+i
---
@@ -47,6 +49,8 @@ number
(number)
(number)
(number)
(number)
(number)
(number))
===
@@ -79,6 +83,8 @@ Ap\ ple
app123app123
123app123
中文
a
fec
---
(program
@@ -92,6 +98,8 @@ app123app123
(symbol)
(symbol)
(symbol)
(symbol)
(symbol)
(symbol))
===

@@ -0,0 +1,34 @@
# fuzz test
This directory contains scripts that test the implementation, guarding against a symbol being parsed as a number or vice versa.
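The oracle is the Racket reader itself: `gen_cases.rkt` enumerates candidate strings, classifies each by `read`ing it back, and the tree-sitter parse of the same strings is compared against that classification. A minimal sketch of the classification step (the `classify` helper is illustrative and not part of the scripts):
```racket
#lang racket

;; Classify a candidate string the way gen_cases.rkt does:
;; read it back with the Racket reader and check what value it produced.
;; `classify` is an illustrative helper, not part of the actual scripts.
(define (classify str)
  (with-input-from-string str
    (lambda ()
      (define v (read))
      (cond [(symbol? v) 'symbol]
            [(number? v) 'number]
            [else 'other]))))

(classify "+inf.0") ; => 'number
(classify "1/2")    ; => 'number
(classify "+i2")    ; => 'symbol
```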
## Resources
* ~6 minutes to run
* ~300 MB of generated files
## Run
```shell
$ cd fuzztest
# 30s
$ racket gen_cases.rkt
cpu time: 26531 real time: 26835 gc time: 1069
5114173 cases generated
$ cd ..
$ tree-sitter generate
# 3 minutes
$ tree-sitter parse fuzztest/case.txt > fuzztest/res1.txt
# 2 minutes
$ cd fuzztest && racket postprocess.rkt
# should show nothing
$ sdiff -s <(cat -n expect.txt) <(cat -n res.txt)
# If there are errors, run
$ sdiff -s <(cat -n expect.txt) <(cat -n res.txt) | less
# then look up the first failing case at line `N` with
$ cat case.txt | sed -n 'Np'
```
You can edit `gen_cases.rkt` to generate fewer cases during development.
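For example, a quick development run might shrink the search space near the top of `gen_cases.rkt`; the values below are only an illustration (the shipped script uses `max-len` 5 and larger character sets):
```racket
;; Example tweaks for a faster development run (illustrative values only).
(define alphabet-char "aefinst") ; fewer letters to combine
(define numeric-char "01")       ; fewer digits
(define max-len 4)               ; shorter candidate strings
```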

@@ -0,0 +1,57 @@
#lang racket
(require racket/extflonum)
;; all characters that can appear in a valid number/extflonum
;; remove some insignificant parts to improve performance
;; (define alphabet-char "abdefilnost")
(define alphabet-char "abdefilnostx")
(define special-char "#./@+-")
;; (define numeric-char "0123456789")
(define numeric-char "0179")
(define all-char
(string-append alphabet-char
special-char
numeric-char))
(define cnt 0)
(define max-len 5)
(define case-port (open-output-file "case.txt" #:exists 'replace))
(define expect-port (open-output-file "expect.txt" #:exists 'replace))
(define (gen i case)
(with-handlers ([exn:fail? (lambda _ (void))])
(when (> i 0)
(define case-str (list->string case))
;; ".0@.0" should be a number according to the documentation,
;; but it's actually read as a symbol.
;; It's a bug in the Racket reader and will be fixed in a future Racket release.
;; We skip these cases.
(when (not (string-contains? case-str "@."))
(with-handlers ([exn:fail? void])
(with-input-from-string case-str
(lambda ()
(define fst (read))
(define snd (read))
(when (eof-object? snd)
(cond [(symbol? fst)
(set! cnt (add1 cnt))
(displayln case-str case-port)
(displayln "symbol" expect-port)]
[(number? fst)
(set! cnt (add1 cnt))
(displayln case-str case-port)
(displayln "number" expect-port)]
;; this branch exists for a possible future change that
;; splits extflonum from number
[(extflonum? fst)
(set! cnt (add1 cnt))
(displayln case-str case-port)
(displayln "number" expect-port)]))))))))
(when (< i max-len)
(for ([c all-char])
(gen (add1 i) (cons c case)))))
(time (gen 0 '()))
(displayln (format "~a cases generated" cnt))

@@ -0,0 +1,26 @@
#lang racket
(define port (open-input-file "res1.txt"))
(define all-result (drop (read port) 4))
(define all-result-line
(for/list ([r all-result])
(cons (car r) (caadr r))))
(with-output-to-file "res.txt"
#:exists 'replace
(lambda ()
(let loop ([line 0]
[firstline? #t]
[lst all-result-line])
(match lst
['() (void)]
[(cons fst rem)
#:when (= (cdr fst) line)
(when (not firstline?)
(display " "))
(display (car fst))
(loop line (if firstline? #t #f) rem)]
[(cons fst rem)
(newline)
(display (car fst))
(loop (add1 line) #f rem)]))
(newline)))

@@ -1,13 +1,11 @@
const PREC = {
first: $ => prec(100, $),
last: $ => prec(-1, $),
left: prec.left,
right: prec.right,
};
const LEAF = {
// https://en.wikipedia.org/wiki/Unicode_character_property#Whitespace
whitespace: /[ \t\n\v\f\r\u{0085}\u{00A0}\u{1680}\u{2000}-\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}]/u,
whitespace: /[ \t\n\v\f\r\u{0085}\u{00A0}\u{1680}\u{2000}-\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}]+/u,
newline: /[\r\n\u{85}\u{2028}\u{2029}]/,
delimiter: /[ \t\n\v\f\r\u{0085}\u{00A0}\u{1680}\u{2000}-\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{FEFF}(){}",'`;\[\]]/u,
non_delimiter: /[^ \t\n\v\f\r\u{0085}\u{00A0}\u{1680}\u{2000}-\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{FEFF}(){}",'`;\[\]]/u,
@@ -49,23 +47,21 @@ module.exports = grammar({
_token: $ =>
choice(
token(repeat1(LEAF.whitespace)),
$._all_comment,
$._skip,
$.extension,
$._datum),
_skip: $ => choice(token(repeat1(LEAF.whitespace)), $._all_comment),
dot: _ => ".",
// comment {{{
_all_comment: $ =>
_skip: $ =>
choice(
LEAF.whitespace,
$.comment,
$.sexp_comment,
$.block_comment),
dot: _ => ".",
// comment {{{
comment: $ =>
choice(
token(
@@ -101,6 +97,10 @@ module.exports = grammar({
$.here_string,
$.byte_string,
$.character,
// number/symbol precedence:
// for tokens of the same length, prefer number;
// otherwise prefer symbol, which is then the longer match
$.number,
$.symbol,
@@ -110,7 +110,15 @@ module.exports = grammar({
$.graph,
$.structure,
$.hash,
$._abbrev,
$.quote,
$.quasiquote,
$.syntax,
$.quasisyntax,
$.unquote,
$.unquote_splicing,
$.unsyntax,
$.unsyntax_splicing,
$.list,
$.vector),
@@ -158,10 +166,14 @@ module.exports = grammar({
number: _ =>
token(
choice(
_number_base(2),
_number_base(8),
_number_base(10),
_number_base(16))),
extflonum(2),
extflonum(8),
extflonum(10),
extflonum(16),
number_base(2),
number_base(8),
number_base(10),
number_base(16))),
decimal: _ => /[0-9]+/,
@@ -178,14 +190,12 @@ module.exports = grammar({
/./))),
symbol: _ =>
PREC.last(
PREC.right(
token(
choice(
/#[cC][iIsS]/, // read-case-sensitive parameter
seq(
LEAF.symbol_start,
repeat(LEAF.symbol_remain)))))),
token(
choice(
/#[cC][iIsS]/, // read-case-sensitive parameter
seq(
LEAF.symbol_start,
repeat(LEAF.symbol_remain)))),
keyword: _ =>
token(
@@ -233,17 +243,6 @@ module.exports = grammar({
repeat($._skip),
$._datum))),
_abbrev: $ =>
choice(
$.quote,
$.quasiquote,
$.syntax,
$.quasisyntax,
$.unquote,
$.unquote_splicing,
$.unsyntax,
$.unsyntax_splicing),
quote: $ =>
seq(
"'",
@@ -309,115 +308,216 @@ module.exports = grammar({
// number {{{
function _number_base(n) {
const number = _ =>
function number_base(n) {
const digit = {
2: /[01]/,
8: /[0-7]/,
10: /[0-9]/,
16: /[0-9a-fA-F]/,
}[n];
const exp_mark = {
2: /[sldefSLDEF]/,
8: /[sldefSLDEF]/,
10: /[sldefSLDEF]/,
16: /[slSL]/,
}[n];
const prefix = {
2: /#[bB]/,
8: /#[oO]/,
10: optional(/#[dD]/),
16: /#[xX]/,
}[n];
const exactness =
/#[eiEI]/;
const sign = /[+-]/;
const digits_hash =
seq(
choice(
seq(radix(), optional(exactness())),
seq(optional(exactness()), radix()),
),
choice(
// Inexact number pattern already contains exact pattern.
// So we don't need to parse exact number explicitly
inexact()));
const sign = _ => /[+-]/;
const digit = _ => {
return {
2: /[01]/,
8: /[0-7]/,
10: /[0-9]/,
16: /[0-9a-fA-F]/,
}[n];
};
const radix = _ => {
return {
2: /#[bB]/,
8: /#[oO]/,
10: optional(/#[dD]/),
16: /#[xX]/,
}[n];
};
const exactness = _ =>
choice("#e", "#E", "#i", "#I");
const exp_mark = _ => /[sldeftSLDEFT]/;
const unsigned_integer = _ =>
repeat1(digit());
const inexact = _ =>
repeat1(digit),
repeat("#"));
const unsigned_integer =
repeat1(digit);
// exact
const exact_integer =
seq(
optional(sign),
unsigned_integer);
const unsigned_rational =
choice(
unsigned_integer,
seq(unsigned_integer, "/", unsigned_integer));
const exact_rational =
seq(
optional(sign),
unsigned_rational);
const exact_complex =
seq(
optional(exact_rational),
sign,
optional(unsigned_rational),
/[iI]/);
const exact =
choice(exact_rational, exact_complex);
// inexact
const inexact_special =
choice(
inexact_real(),
inexact_complex());
/[iI][nN][fF]\.[0fF]/,
/[nN][aA][nN]\.[0fF]/);
const inexact_simple =
choice(
seq(
digits_hash,
optional("."),
repeat("#")),
seq(
optional(unsigned_integer),
".",
digits_hash),
seq(
digits_hash,
"/",
digits_hash));
const inexact_normal =
seq(
inexact_simple,
optional(
seq(
exp_mark,
exact_integer)));
const inexact_unsigned =
choice(inexact_normal, inexact_special);
const inexact_real = _ =>
const inexact_real =
choice(
seq(
optional(sign()),
inexact_normal()),
optional(sign),
inexact_normal),
seq(
sign(),
inexact_special()));
sign,
inexact_special));
const inexact_complex = _ =>
const inexact_complex =
choice(
seq(
optional(inexact_real()),
sign(),
inexact_unsigned(),
optional(inexact_real),
sign,
optional(inexact_unsigned),
/[iI]/),
seq(
inexact_real(),
inexact_real,
"@",
inexact_real()));
inexact_real));
const inexact_unsigned = _ =>
choice(
inexact_normal(),
inexact_special());
const inexact =
choice(inexact_real, inexact_complex);
const number =
choice(exact, inexact);
const inexact_normal = _ =>
const general_number =
seq(
inexact_simple(),
optional(
choice(
seq(
optional(exactness),
prefix),
seq(
exp_mark(),
optional(sign()),
unsigned_integer())));
prefix,
optional(exactness))),
number);
const inexact_special = _ =>
return general_number;
}
function extflonum(n) {
const digit = {
2: /[01]/,
8: /[0-7]/,
10: /[0-9]/,
16: /[0-9a-fA-F]/,
}[n];
const exp_mark = /[tT]/;
const prefix = {
2: /#[bB]/,
8: /#[oO]/,
10: optional(/#[dD]/),
16: /#[xX]/,
}[n];
const sign = /[+-]/;
const digits_hash =
seq(
repeat1(digit),
repeat("#"));
const unsigned_integer =
repeat1(digit);
// exact
const exact_integer =
seq(
optional(sign),
unsigned_integer);
// inexact
const inexact_special =
choice(
/[iI][nN][fF]\.0/,
/[nN][aA][nN]\.0/,
/[iI][nN][fF]\.[fFtT]/,
/[nN][aA][nN]\.[fFtT]/,
);
/[iI][nN][fF]\.[0fFtT]/,
/[nN][aA][nN]\.[0fFtT]/);
const inexact_simple = _ =>
const inexact_simple =
choice(
seq(
digits(),
digits_hash,
optional("."),
repeat("#")),
seq(
optional(unsigned_integer()),
optional(unsigned_integer),
".",
digits()),
digits_hash),
seq(
digits(),
digits_hash,
"/",
digits()));
digits_hash));
const digits = _ =>
const inexact_normal =
seq(
unsigned_integer(),
repeat("#"));
inexact_simple,
optional(
seq(
exp_mark,
exact_integer)));
const inexact_real =
choice(
seq(
optional(sign),
inexact_normal),
seq(
sign,
inexact_special));
return token(number());
return seq(prefix, inexact_real);
}
// number }}}

@@ -1,6 +1,6 @@
{
"name": "tree-sitter-racket",
"version": "0.1.0",
"version": "0.3.0",
"description": "Tree-sitter grammar for Racket",
"main": "bindings/node",
"scripts": {

File diff suppressed because it is too large

@@ -1154,11 +1154,6 @@
]
}
},
{
"type": "symbol",
"named": true,
"fields": {}
},
{
"type": "syntax",
"named": true,
@@ -1897,6 +1892,10 @@
"type": "number",
"named": true
},
{
"type": "symbol",
"named": true
},
{
"type": "{",
"named": false

File diff suppressed because it is too large

@@ -0,0 +1,153 @@
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <tree_sitter/parser.h>
enum TokenType {
HERE_STRING_BODY,
};
// a hand-written string implementation
// data[0], data[1], ..., data[len-1] is the content of the string
// data[len] is `\0` for compatibility with typical `char*` strings
// so 0 <= len < cap always holds
typedef struct {
size_t len;
size_t cap;
char *data;
} String;
static void check_alloc(void *ptr) {
if (ptr == NULL) {
fprintf(stderr, "Scanner: Failed to allocate memory\n");
exit(EXIT_FAILURE);
}
}
static String string_new(void) {
size_t init_len = 16;
// (init_len + 1) for null terminator
size_t cap = init_len + 1;
void *tmp = calloc(1, sizeof(char) * cap);
check_alloc(tmp);
return (String){.cap = cap, .len = 0, .data = tmp};
}
static void string_resize(String *str, size_t new_cap) {
void *block = realloc(str->data, new_cap * sizeof(char));
check_alloc(block);
str->data = block;
str->cap = new_cap;
memset(str->data + str->len, 0, (new_cap - str->len) * sizeof(char));
}
static void string_push(String *str, int32_t elem) {
if (str->len + sizeof(elem) >= str->cap) {
// str->cap * 2 + 1 > str->len + sizeof(elem) always holds
// as str->cap > 16
string_resize(str, str->cap * 2 + 1);
}
// NOTE: we don't consider little-endian/big-endian here;
// the characters in the string are only used for comparison,
// so they only need to be stored in a consistent way
memcpy(str->data + str->len, &elem, sizeof(elem));
str->len += sizeof(elem);
}
static void string_free(String *str) {
if (str->data != NULL) {
free(str->data);
str->data = NULL;
str->len = 0;
str->cap = 0;
}
}
static void string_clear(String *str) {
memset(str->data, 0, str->len * sizeof(char));
str->len = 0;
}
static void advance(TSLexer *lexer) {
lexer->advance(lexer, false);
}
static void skip(TSLexer *lexer) {
lexer->advance(lexer, true);
}
// NOTE: only "\n" is treated as a newline here,
// which implies that "\r" can also be part of the terminator.
static bool isnewline(int32_t chr) {
return chr == '\n';
}
// `read_line` reads characters until a newline or EOF
static void read_line(String *str, TSLexer *lexer) {
while (!isnewline(lexer->lookahead) && !lexer->eof(lexer)) {
string_push(str, lexer->lookahead);
advance(lexer);
}
}
// Suppose the terminator is `T` and newline (\n) is `$`.
// It should accept "#<<T$T" or "#<<T$...$T", where `...`
// is the string content.
static bool scan(TSLexer *lexer, const bool *valid_symbols) {
if (!valid_symbols[HERE_STRING_BODY]) {
return false;
}
String terminator = string_new();
read_line(&terminator, lexer);
if (lexer->eof(lexer)) {
string_free(&terminator);
return false;
}
// skip `\n`
skip(lexer);
String current_line = string_new();
while (true) {
read_line(&current_line, lexer);
if (strcmp(terminator.data, current_line.data) == 0) {
lexer->result_symbol = HERE_STRING_BODY;
string_free(&terminator);
string_free(&current_line);
return true;
}
if (lexer->eof(lexer)) {
string_free(&terminator);
string_free(&current_line);
return false;
}
string_clear(&current_line);
// skip `\n`
skip(lexer);
}
}
void *tree_sitter_racket_external_scanner_create() {
return NULL;
}
unsigned tree_sitter_racket_external_scanner_serialize(void *payload,
char *buffer) {
return 0;
}
void tree_sitter_racket_external_scanner_deserialize(void *payload,
const char *buffer,
unsigned length) {
}
bool tree_sitter_racket_external_scanner_scan(void *payload,
TSLexer *lexer,
const bool *valid_symbols) {
return scan(lexer, valid_symbols);
}
void tree_sitter_racket_external_scanner_destroy(void *payload) {
}

@@ -1,119 +0,0 @@
#include <string>
#include "tree_sitter/parser.h"
namespace {
using std::u32string;
enum TokenType {
HERE_STRING_BODY,
};
class optional_str {
bool valid;
u32string str;
public:
optional_str() : valid(true) {}
static optional_str none() {
optional_str emp;
emp.valid = false;
return emp;
}
bool is_none() const { return !this->valid; }
const u32string &content() const { return this->str; }
u32string &content() { return this->str; }
};
// NOTE: only "\n" is allowed as newline here,
// It implies that "\r" can also be terminator.
inline bool isnewline(int32_t c) {
return c == '\n';
}
inline optional_str read_terminator(TSLexer *lexer) {
optional_str line;
while (true) {
if (isnewline(lexer->lookahead)) {
return line;
} else if (lexer->eof(lexer)) {
return optional_str::none();
} else {
line.content().push_back(lexer->lookahead);
lexer->advance(lexer, false);
}
}
}
// `read_line` read strings until a newline or EOF
inline u32string read_line(TSLexer *lexer) {
u32string line;
while (!isnewline(lexer->lookahead) && !lexer->eof(lexer)) {
line.push_back(lexer->lookahead);
lexer->advance(lexer, false);
}
return line;
}
// Suppose terminator is `T`, newline (\n) is `$`,
// It should accept "#<<T$T" or "#<<T$...$T", where `...`
// is the string content.
bool scan(TSLexer *lexer, const bool *valid_symbols) {
if (!valid_symbols[HERE_STRING_BODY]) {
return false;
}
const optional_str terminator = read_terminator(lexer);
if (terminator.is_none()) {
return false;
}
// skip `\n`
lexer->advance(lexer, false);
while (true) {
const u32string line = read_line(lexer);
if (line == terminator.content()) {
lexer->result_symbol = HERE_STRING_BODY;
return true;
}
if (lexer->eof(lexer)) {
return false;
}
// skip `\n`
lexer->advance(lexer, false);
}
}
} // namespace
extern "C" {
void *tree_sitter_racket_external_scanner_create(void) {
return NULL;
}
void tree_sitter_racket_external_scanner_destroy(void *payload) {
}
unsigned tree_sitter_racket_external_scanner_serialize(void *payload,
char *buffer) {
return 0;
}
void tree_sitter_racket_external_scanner_deserialize(void *payload,
const char *buffer,
unsigned length) {
}
bool tree_sitter_racket_external_scanner_scan(void *payload,
TSLexer *lexer,
const bool *valid_symbols) {
return scan(lexer, valid_symbols);
}
}