diff --git a/binding.gyp b/binding.gyp new file mode 100644 index 000000000..5ed360c27 --- /dev/null +++ b/binding.gyp @@ -0,0 +1,19 @@ +{ + "targets": [ + { + "target_name": "tree_sitter_clojure_binding", + "include_dirs": [ + " - [/[\s,]/, $.comment], + [/\s/, ',', $.comment, $.tag, $.metadata_non_map, $.metadata_map], + + conflicts: $ => [ + ], + + externals: $ => [ + $.metadata_map, + ], rules: { source_file: $ => @@ -512,7 +519,6 @@ module.exports = grammar({ _reader_macro: $ => choice($.anonymous_function, $.deref, - $.metadata, $.quote, $.reader_conditional, $.regular_expression, @@ -520,7 +526,6 @@ module.exports = grammar({ $.splicing_reader_conditional, $.symbolic_value, $.syntax_quote, - $.tag, $.unquote, $.unquote_splicing, $.var_quote), @@ -537,12 +542,30 @@ module.exports = grammar({ $._form), // older code uses #^ - metadata: $ => - seq(choice('^', '#^'), - choice($.keyword, - $.map, - $.symbol, - $.string)), + metadata_non_map: $ => + token(seq(choice('^', '#^'), + optional(/\s+/), // ^String (.toString 1) == ^ String (.toString 1) + choice(// definition of keyword + choice(SIMPLE_KEYWORD, + seq(/(:|::)/, + NON_SLASH_SIMPLE_KEYWORD, + '/', + // at repl: :user/8 => Invalid token + NON_SLASH_SIMPLE_SYMBOL)), + // definition of map + // XXX: map handled as member of externals because regex not up to task? 
+ ///{[^}]+}/, + // definition of symbol + choice(SIMPLE_SYMBOL, + // $._qualified_symbol + seq(NON_SLASH_SIMPLE_SYMBOL, + '/', + // because clojure.core// is allowed + SIMPLE_SYMBOL)), + // definition of string + seq('"', + STRING_CONTENT, + '"')))), // at repl: 'a == ' a quote: $ => @@ -590,9 +613,15 @@ module.exports = grammar({ // at repl: // #uuid "40fff7cc-2e57-42dd-b737-533820ed53e9" == // # uuid "40fff7cc-2e57-42dd-b737-533820ed53e9" + // XXX: repeated because don't know how to reuse certain things within a token tag: $ => - seq('#', - $.symbol), + token(seq('#', + choice(SIMPLE_SYMBOL, + // $._qualified_symbol + seq(NON_SLASH_SIMPLE_SYMBOL, + '/', + // because clojure.core// is allowed + SIMPLE_SYMBOL)))), // at repl: `~a == `~ a == ` ~ a unquote: $ => diff --git a/src/scanner.cc b/src/scanner.cc new file mode 100644 index 000000000..7bb4681fe --- /dev/null +++ b/src/scanner.cc @@ -0,0 +1,164 @@ +#include +#include +#include + +namespace { + +using std::iswspace; + +enum TokenType { + METADATA_MAP, +}; + +struct Scanner { + void skipWsOrCommas(TSLexer *lexer) { + while (iswspace(lexer->lookahead) || (lexer->lookahead == ',')) { + lexer->advance(lexer, true); + } + } + bool scan(TSLexer *lexer, const bool *valid_symbols) { + // looking for things like: + // ^{:a 1} + // #^{:fun "woa"} + // ^{:nesting #{}} + // ^{:escaped "\}"} + // ^{:who-does-this \{} + if (valid_symbols[METADATA_MAP]) { + skipWsOrCommas(lexer); + if (lexer->lookahead == 0) { + return false; + } + + // look for ^ or #^ + if (lexer->lookahead == '#') { // old-style metadata? + lexer->advance(lexer, false); + // better have a ^ next + if ((lexer->lookahead == 0) || (lexer->lookahead != '^')) { + return false; + } + } else if (lexer->lookahead != '^') { // contemporary metadata? 
+ return false; + } + // at this point either ^ or #^ should have been detected + lexer->advance(lexer, false); + + skipWsOrCommas(lexer); + if (lexer->lookahead == 0) { + return false; + } + + // Consume '{' + if (lexer->lookahead != '{') { + return false; + } + lexer->advance(lexer, false); + // XXX: would check if EOF here, but this happens at top of loop below + + // 1. count curly braces that are delimiters + // - \{ and \} in non-string context => not delimiters + // - { and } within string delimiters (") => not delimiters + // - all other occurrences => count as delimiters + // 2. ascertain whether current character is within a literal string + // - track whether current context is in a string or not + // - any non-escaped double quote => toggle in_string state + // 3. count consecutive backslashes to support 1 and 2 + // - even consecutive backslashes before char => char not escaped + // - odd consecutive backslashes before char => char escaped + uint32_t brace_depth = 1; + bool in_string = false; + // trying to count CONSECUTIVE backslashes, but 1 means just saw a + // backslash that wasn't preceded by a backslash. 
+ uint32_t cbs = 0; + + // Consume content, '}' + for (;;) { + switch (lexer->lookahead) { + // EOF + case '\0': + return false; + // track escaping + case '\\': + if (cbs > 0) { + cbs++; + } else { + cbs = 1; + } + break; + // possibly an opening brace to count + case '{': + if (cbs % 2 == 0) { + if (!in_string) { + brace_depth++; + } + } + cbs = 0; + break; + // possibly a closing brace to count + case '}': + if (cbs % 2 == 0) { + if (!in_string) { + if (brace_depth == 1) { + lexer->result_symbol = METADATA_MAP; + lexer->advance(lexer, false); + //lexer->mark_end(lexer); + return true; + } else { + brace_depth--; + } + } + } + cbs = 0; + break; + // possibly a string delimiter + case '"': + if (cbs % 2 == 0) { + if (in_string) { + in_string = false; + } else { + in_string = true; + } + } + // XXX: not sure if above logic is complete + cbs = 0; + break; + // any other char means chain of backslashes broken + default: + cbs = 0; + break; + } + + lexer->advance(lexer, false); + } + } + return false; + } +}; + +} + +extern "C" { + +void *tree_sitter_clojure_external_scanner_create() { + return new Scanner(); +} + +bool tree_sitter_clojure_external_scanner_scan(void *payload, TSLexer *lexer, + const bool *valid_symbols) { + Scanner *scanner = static_cast<Scanner *>(payload); + return scanner->scan(lexer, valid_symbols); +} + +unsigned tree_sitter_clojure_external_scanner_serialize(void *payload, char *buffer) { + return 0; +} + +void tree_sitter_clojure_external_scanner_deserialize(void *payload, const char *buffer, + unsigned length) { +} + +void tree_sitter_clojure_external_scanner_destroy(void *payload) { + Scanner *scanner = static_cast<Scanner *>(payload); + delete scanner; +} + +}