Treat tag and metadata as extras

unified_patch
sogaiu 2020-03-05 02:04:59 +07:00
parent 976736e251
commit 319eae813b
3 changed files with 223 additions and 11 deletions

@ -0,0 +1,19 @@
{
"targets": [
{
"target_name": "tree_sitter_clojure_binding",
"include_dirs": [
"<!(node -e \"require('nan')\")",
"src"
],
"sources": [
"src/parser.c",
"src/binding.cc",
"src/scanner.cc"
],
"cflags_c": [
"-std=c99",
]
}
]
}

@ -293,7 +293,14 @@ module.exports = grammar({
name: 'clojure',
extras: $ =>
[/[\s,]/, $.comment],
[/\s/, ',', $.comment, $.tag, $.metadata_non_map, $.metadata_map],
conflicts: $ => [
],
externals: $ => [
$.metadata_map,
],
rules: {
source_file: $ =>
@ -512,7 +519,6 @@ module.exports = grammar({
_reader_macro: $ =>
choice($.anonymous_function,
$.deref,
$.metadata,
$.quote,
$.reader_conditional,
$.regular_expression,
@ -520,7 +526,6 @@ module.exports = grammar({
$.splicing_reader_conditional,
$.symbolic_value,
$.syntax_quote,
$.tag,
$.unquote,
$.unquote_splicing,
$.var_quote),
@ -537,12 +542,30 @@ module.exports = grammar({
$._form),
// older code uses #^
metadata: $ =>
seq(choice('^', '#^'),
choice($.keyword,
$.map,
$.symbol,
$.string)),
// Metadata whose payload is a keyword, a symbol, or a string.
// The whole thing is wrapped in token(), so the marker ('^' or the older
// '#^'), the optional whitespace after it, and the payload are lexed as a
// single token.
// Map payloads are deliberately not matched here: the regex attempt below is
// commented out, and metadata_map is listed under `externals` instead.
metadata_non_map: $ =>
token(seq(choice('^', '#^'),
optional(/\s+/), // ^String (.toString 1) == ^ String (.toString 1)
choice(// definition of keyword
choice(SIMPLE_KEYWORD,
seq(/(:|::)/,
NON_SLASH_SIMPLE_KEYWORD,
'/',
// at repl: :user/8 => Invalid token
NON_SLASH_SIMPLE_SYMBOL)),
// definition of map
// XXX: map handled as member of externals because regex not up to task?
///{[^}]+}/,
// definition of symbol
choice(SIMPLE_SYMBOL,
// $._qualified_symbol
seq(NON_SLASH_SIMPLE_SYMBOL,
'/',
// because clojure.core// is allowed
SIMPLE_SYMBOL)),
// definition of string
seq('"',
STRING_CONTENT,
'"')))),
// at repl: 'a == ' a
quote: $ =>
@ -590,9 +613,15 @@ module.exports = grammar({
// at repl:
// #uuid "40fff7cc-2e57-42dd-b737-533820ed53e9" ==
// # uuid "40fff7cc-2e57-42dd-b737-533820ed53e9"
// XXX: repeated because don't know how to reuse certain things within a token
tag: $ =>
seq('#',
$.symbol),
token(seq('#',
choice(SIMPLE_SYMBOL,
// $._qualified_symbol
seq(NON_SLASH_SIMPLE_SYMBOL,
'/',
// because clojure.core// is allowed
SIMPLE_SYMBOL)))),
// at repl: `~a == `~ a == ` ~ a
unquote: $ =>

@ -0,0 +1,164 @@
#include <tree_sitter/parser.h>
#include <string>
#include <cwctype>
namespace {
using std::iswspace;
// Token kinds produced by this external scanner. The enumerator value is
// used to index `valid_symbols` and is stored in `lexer->result_symbol`.
// NOTE(review): presumably the order must mirror the `externals` list in
// grammar.js -- confirm when adding more entries.
enum TokenType {
  METADATA_MAP = 0,
};
/**
 * External scanner that recognizes a complete metadata map as one
 * METADATA_MAP token. It holds no data members, so there is no state to
 * carry between calls.
 *
 * Inputs it is meant to accept:
 *   ^{:a 1}
 *   #^{:fun "woa"}
 *   ^{:nesting #{}}
 *   ^{:escaped "\}"}
 *   ^{:who-does-this \{}
 */
struct Scanner {
  /**
   * Advances past whitespace and commas in front of a token, marking them
   * as skipped so they are not part of the token that follows.
   */
  void skipWsOrCommas(TSLexer *lexer) {
    while (iswspace(lexer->lookahead) || (lexer->lookahead == ',')) {
      lexer->advance(lexer, true);
    }
  }

  /**
   * Advances past whitespace and commas that sit INSIDE a token.
   *
   * Uses skip=false on purpose: tree-sitter moves the token start forward
   * whenever a character is skipped, so skipping here would drop everything
   * consumed so far (the '^' / '#^' marker) from the token's range.
   */
  void consumeWsOrCommas(TSLexer *lexer) {
    while (iswspace(lexer->lookahead) || (lexer->lookahead == ',')) {
      lexer->advance(lexer, false);
    }
  }

  /**
   * Tries to lex `^{...}` or `#^{...}` starting at the current position.
   *
   * @param lexer          tree-sitter lexer; on success `result_symbol` is
   *                       set to METADATA_MAP and the closing '}' has been
   *                       consumed.
   * @param valid_symbols  indexed by TokenType; nothing is attempted unless
   *                       METADATA_MAP is marked valid.
   * @return true when a balanced metadata map was consumed, false otherwise
   *         (including when end of input is reached first).
   */
  bool scan(TSLexer *lexer, const bool *valid_symbols) {
    if (!valid_symbols[METADATA_MAP]) {
      return false;
    }

    skipWsOrCommas(lexer);

    // Marker: '^', or the old-style '#^'. End of input (lookahead == 0)
    // fails the '^' comparison, so it needs no separate check.
    if (lexer->lookahead == '#') {
      lexer->advance(lexer, false);
    }
    if (lexer->lookahead != '^') {
      return false;
    }
    lexer->advance(lexer, false);

    // `^ {:a 1}` is the same as `^{:a 1}`; keep the gap inside the token.
    consumeWsOrCommas(lexer);

    // Opening brace of the map.
    if (lexer->lookahead != '{') {
      return false;
    }
    lexer->advance(lexer, false);

    // 1. count curly braces that are delimiters
    //    - \{ and \} in non-string context => not delimiters
    //    - { and } within string delimiters (") => not delimiters
    //    - { and } within a ; line comment => not delimiters
    //    - all other occurrences => count as delimiters
    // 2. ascertain whether the current character is within a literal string
    //    - any non-escaped double quote toggles in_string
    // 3. count consecutive backslashes to support 1 and 2
    //    - even number of backslashes before char => char not escaped
    //    - odd number of backslashes before char => char escaped
    uint32_t brace_depth = 1;
    bool in_string = false;
    uint32_t cbs = 0;  // length of the current run of consecutive backslashes

    for (;;) {
      const bool escaped = (cbs % 2 != 0);

      switch (lexer->lookahead) {
        // end of input before the map was closed
        case '\0':
          return false;

        // extend the backslash run
        case '\\':
          cbs++;
          break;

        // possibly an opening brace to count
        case '{':
          if (!escaped && !in_string) {
            brace_depth++;
          }
          cbs = 0;
          break;

        // possibly a closing brace to count
        case '}':
          if (!escaped && !in_string) {
            if (brace_depth == 1) {
              lexer->result_symbol = METADATA_MAP;
              lexer->advance(lexer, false);
              return true;
            }
            brace_depth--;
          }
          cbs = 0;
          break;

        // possibly a string delimiter
        case '"':
          if (!escaped) {
            in_string = !in_string;
          }
          cbs = 0;
          break;

        // possibly the start of a line comment: everything up to the end of
        // the line is ignored so braces and quotes in it are not counted
        case ';':
          cbs = 0;
          if (!escaped && !in_string) {
            while (lexer->lookahead != '\n' && lexer->lookahead != '\r' &&
                   lexer->lookahead != '\0') {
              lexer->advance(lexer, false);
            }
            // re-dispatch on the line terminator / end of input
            continue;
          }
          break;

        // any other char means the chain of backslashes is broken
        default:
          cbs = 0;
          break;
      }

      lexer->advance(lexer, false);
    }
  }
};
}
extern "C" {
void *tree_sitter_clojure_external_scanner_create() {
return new Scanner();
}
bool tree_sitter_clojure_external_scanner_scan(void *payload, TSLexer *lexer,
const bool *valid_symbols) {
Scanner *scanner = static_cast<Scanner *>(payload);
return scanner->scan(lexer, valid_symbols);
}
// Always returns 0 and leaves `buffer` untouched; neither argument is read.
unsigned tree_sitter_clojure_external_scanner_serialize(void *payload, char *buffer) {
  (void)payload;
  (void)buffer;
  const unsigned result = 0;
  return result;
}
// Intentionally a no-op: none of the arguments are read or modified.
void tree_sitter_clojure_external_scanner_deserialize(void *payload, const char *buffer,
                                                      unsigned length) {
  (void)payload;
  (void)buffer;
  (void)length;
}
// Deletes the Scanner that `payload` points to.
void tree_sitter_clojure_external_scanner_destroy(void *payload) {
  delete static_cast<Scanner *>(payload);
}
}