Treat tag and metadata as extras

2020-03-05 02:04:59 +07:00 · 2020-03-05 02:04:59 +07:00 · 319eae813b
parent 976736e251
commit 319eae813b
3 changed files with 223 additions and 11 deletions
--- a/binding.gyp
+++ b/binding.gyp
@ -0,0 +1,19 @@
+{
+  "targets": [
+    {
+      "target_name": "tree_sitter_clojure_binding",
+      "include_dirs": [
+        "<!(node -e \"require('nan')\")",
+        "src"
+      ],
+      "sources": [
+        "src/parser.c",
+        "src/binding.cc",
+        "src/scanner.cc"
+      ],
+      "cflags_c": [
+        "-std=c99",
+      ]
+    }
+  ]
+}
--- a/grammar.js
+++ b/grammar.js
@ -293,7 +293,14 @@ module.exports = grammar({
  name: 'clojure',

  extras: $ =>
-    [/[\s,]/, $.comment],
+    [/\s/, ',', $.comment, $.tag, $.metadata_non_map, $.metadata_map],
+
+  conflicts: $ => [
+  ],
+
+  externals: $ => [
+    $.metadata_map,
+  ],

  rules: {
    source_file: $ =>
@ -512,7 +519,6 @@ module.exports = grammar({
    _reader_macro: $ =>
      choice($.anonymous_function,
             $.deref,
-             $.metadata,
             $.quote,
             $.reader_conditional,
             $.regular_expression,
@ -520,7 +526,6 @@ module.exports = grammar({
             $.splicing_reader_conditional,
             $.symbolic_value,
             $.syntax_quote,
-             $.tag,
             $.unquote,
             $.unquote_splicing,
             $.var_quote),
@ -537,12 +542,30 @@ module.exports = grammar({
          $._form),

    // older code uses #^
-    metadata: $ =>
-      seq(choice('^', '#^'),
-          choice($.keyword,
-                 $.map,
-                 $.symbol,
-                 $.string)),
+    metadata_non_map: $ => // ^ or #^ plus a keyword, symbol or string, lexed as a single token
+      token(seq(choice('^', '#^'),
+                optional(/\s+/), // ^String (.toString 1) == ^ String (.toString 1)
+                choice(// definition of keyword
+                       choice(SIMPLE_KEYWORD,
+                              seq(/(:|::)/,
+                                  NON_SLASH_SIMPLE_KEYWORD,
+                                  '/',
+                                  // at repl: :user/8 => Invalid token
+                                  NON_SLASH_SIMPLE_SYMBOL)),
+                       // definition of map
+                       // XXX: maps are left to the external scanner (metadata_map in externals), presumably because a regex cannot balance nested braces
+                       ///{[^}]+}/,
+                       // definition of symbol
+                       choice(SIMPLE_SYMBOL,
+                              // $._qualified_symbol
+                              seq(NON_SLASH_SIMPLE_SYMBOL,
+                                  '/',
+                                  // because clojure.core// is allowed
+                                  SIMPLE_SYMBOL)),
+                       // definition of string
+                       seq('"',
+                           STRING_CONTENT,
+                           '"')))),

    // at repl: 'a == ' a
    quote: $ =>
@ -590,9 +613,15 @@ module.exports = grammar({
    // at repl:
    //   #uuid "40fff7cc-2e57-42dd-b737-533820ed53e9" ==
    //   # uuid "40fff7cc-2e57-42dd-b737-533820ed53e9"
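+    // XXX: the symbol definition is repeated (same as in metadata_non_map) because it is unclear how to reuse rules such as $.symbol inside token(). NOTE(review): unlike metadata_non_map there is no optional(/\s+/) after '#', so the "# uuid" form shown above presumably no longer lexes as a tag -- confirm.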
    tag: $ =>
-      seq('#',
-          $.symbol),
+      token(seq('#',
+                choice(SIMPLE_SYMBOL,
+                       // $._qualified_symbol
+                       seq(NON_SLASH_SIMPLE_SYMBOL,
+                           '/',
+                           // because clojure.core// is allowed
+                           SIMPLE_SYMBOL)))),

    // at repl: `~a == `~ a == ` ~ a
    unquote: $ =>
--- a/src/scanner.cc
+++ b/src/scanner.cc
@ -0,0 +1,164 @@
+#include <tree_sitter/parser.h>
+#include <string>
+#include <cwctype>
+
+namespace {
+
+using std::iswspace;
+
+enum TokenType { // external token ids; Scanner::scan uses them to index valid_symbols
+  METADATA_MAP, // value 0; metadata_map is the only entry in grammar.js `externals`
+};
+
+// External scanner for the clojure grammar. It holds no state and emits one
+// token, METADATA_MAP: a whole metadata map such as ^{:a 1} or #^{:a 1},
+// including nested and escaped braces.
+struct Scanner {
+  // Advances past whitespace and commas. With skip=true (the default) the
+  // characters are skipped and stay outside the token; with skip=false they
+  // are consumed as part of the token being scanned.
+  void skipWsOrCommas(TSLexer *lexer, bool skip = true) {
+    while (iswspace(lexer->lookahead) || (lexer->lookahead == ',')) {
+      lexer->advance(lexer, skip);
+    }
+  }
+
+  // Tries to lex one metadata map at the current position, e.g.:
+  //   ^{:a 1}
+  //   #^{:fun "woa"}
+  //   ^{:nesting #{}}
+  //   ^{:escaped "\}"}
+  //   ^{:who-does-this \{}
+  //
+  // lexer:         lexer to read lookahead from and advance
+  // valid_symbols: indexed by TokenType; nothing is scanned unless the
+  //                METADATA_MAP entry is true
+  //
+  // Returns true with result_symbol = METADATA_MAP once the closing brace that
+  // balances the opening one has been consumed; returns false when the token
+  // is not valid here, the input does not match, or EOF is reached first.
+  bool scan(TSLexer *lexer, const bool *valid_symbols) {
+    if (!valid_symbols[METADATA_MAP]) {
+      return false;
+    }
+
+    // leading whitespace is not part of the token
+    skipWsOrCommas(lexer);
+
+    // look for ^ or #^ (old-style metadata); EOF (lookahead 0) fails too
+    if (lexer->lookahead == '#') {
+      lexer->advance(lexer, false);
+    }
+    if (lexer->lookahead != '^') {
+      return false;
+    }
+    lexer->advance(lexer, false);
+
+    // Whitespace between the caret and the map belongs to the token, so it is
+    // consumed rather than skipped: a skipping advance here would move the
+    // token start past the ^ / #^ that was just consumed.
+    skipWsOrCommas(lexer, false);
+
+    // Consume '{' (EOF fails this test as well)
+    if (lexer->lookahead != '{') {
+      return false;
+    }
+    lexer->advance(lexer, false);
+
+    // 1. count curly braces that are delimiters
+    //    - \{ and \} in non-string context => not delimiters
+    //    - { and } within string delimiters (") => not delimiters
+    //    - all other occurrences => count as delimiters
+    // 2. ascertain whether current character is within a literal string
+    //    - track whether current context is in a string or not
+    //    - any non-escaped double quote => toggle in_string state
+    // 3. count consecutive backslashes to support 1 and 2
+    //    - even consecutive backslashes before char => char not escaped
+    //    - odd consecutive backslashes before char => char escaped
+    // NOTE: `;` comments inside the map are not recognised, so braces and
+    // double quotes inside a comment are counted like any others.
+    uint32_t brace_depth = 1;
+    bool in_string = false;
+    // number of consecutive backslashes immediately before the current char
+    uint32_t cbs = 0;
+
+    // Consume content, '}'
+    for (;;) {
+      // an odd run of backslashes escapes the current char
+      const bool escaped = (cbs % 2 != 0);
+      switch (lexer->lookahead) {
+        // EOF
+        case '\0':
+          return false;
+        // track escaping
+        case '\\':
+          cbs++;
+          break;
+        // possibly an opening brace to count
+        case '{':
+          if (!escaped && !in_string) {
+            brace_depth++;
+          }
+          cbs = 0;
+          break;
+        // possibly a closing brace to count
+        case '}':
+          if (!escaped && !in_string) {
+            if (brace_depth == 1) {
+              // outermost brace: the token ends right after it
+              lexer->result_symbol = METADATA_MAP;
+              lexer->advance(lexer, false);
+              return true;
+            }
+            // closes a nested map or set
+            brace_depth--;
+          }
+          cbs = 0;
+          break;
+        // possibly a string delimiter
+        case '"':
+          if (!escaped) {
+            in_string = !in_string;
+          }
+          // XXX: not sure if above logic is complete
+          cbs = 0;
+          break;
+        // any other char means chain of backslashes broken
+        default:
+          cbs = 0;
+          break;
+      }
+
+      lexer->advance(lexer, false);
+    }
+  }
+};
+
+}
+
+extern "C" {
+
+// Heap-allocates a Scanner and returns it as an opaque pointer; the matching
+// delete is in tree_sitter_clojure_external_scanner_destroy.
+void *tree_sitter_clojure_external_scanner_create() { return new Scanner(); }
+
+// C entry point: forwards to Scanner::scan on the object behind payload.
+bool tree_sitter_clojure_external_scanner_scan(void *payload, TSLexer *lexer,
+                                               const bool *valid_symbols) {
+  return static_cast<Scanner *>(payload)->scan(lexer, valid_symbols);
+}
+
+unsigned tree_sitter_clojure_external_scanner_serialize(void *payload, char *buffer) {
+  return 0; // reports zero bytes; neither payload nor buffer is touched
+}
+
+void tree_sitter_clojure_external_scanner_deserialize(void *payload, const char *buffer,
+                                                      unsigned length) {
+} // empty body: all three arguments are ignored (serialize reports 0 bytes)
+
+// Deletes the Scanner behind payload (allocated with new in ..._create).
+void tree_sitter_clojure_external_scanner_destroy(void *payload) {
+  delete static_cast<Scanner *>(payload);
+}
+
+}