Merge commit '7dc4fb60390218b09bc351062eeede7dcdbb4d9f'

syntax_id
Wilfred Hughes 2023-08-08 23:33:05 +07:00
commit 4e77f83dd9
19 changed files with 15740 additions and 14105 deletions

@@ -2,7 +2,7 @@
### Parsing
Updated Elixir, Erlang and Go parsers.
Updated Elixir, Erlang, Go and Racket parsers.
### Display

@@ -298,7 +298,7 @@ fn main() {
TreeSitterParser {
name: "tree-sitter-racket",
src_dir: "vendored_parsers/tree-sitter-racket-src",
extra_files: vec!["scanner.cc"],
extra_files: vec!["scanner.c"],
},
TreeSitterParser {
name: "tree-sitter-ruby",

@@ -1,7 +1,7 @@
[package]
name = "tree-sitter-racket"
description = "racket grammar for the tree-sitter parsing library"
version = "0.0.1"
version = "0.3.0"
keywords = ["incremental", "parsing", "racket"]
categories = ["parsing", "text-editors"]
repository = "https://github.com/tree-sitter/tree-sitter-racket"

@@ -8,7 +8,13 @@ This grammar only implements the Racket language with the default readtable.
## Status
It should recognize most grammar with the default readtable.
It should be complete and compatible with Racket 8.9.
There are currently no plans to add support for new languages.
## News
Starting from June 24, 2023 ([commit](https://github.com/6cdh/tree-sitter-racket/commit/989c3e631a7f2d87bb6a66a5394870aaeb6c56e7)), i.e. release 0.3.0, the external scanner is written in C.
## Build and Try
@@ -16,7 +22,6 @@ You need
* nodejs
* a C compiler
* a C++11 compiler
then run

@@ -9,8 +9,7 @@
"sources": [
"bindings/node/binding.cc",
"src/parser.c",
"src/scanner.cc",
# If your language uses an external scanner, add it here.
"src/scanner.c",
],
"cflags_c": [
"-std=c99",

@@ -2,24 +2,18 @@ fn main() {
let src_dir = std::path::Path::new("src");
let mut c_config = cc::Build::new();
c_config.include(&src_dir);
c_config.include(src_dir);
c_config
.flag_if_supported("-Wno-unused-parameter")
.flag_if_supported("-Wno-unused-but-set-variable")
.flag_if_supported("-Wno-trigraphs");
let parser_path = src_dir.join("parser.c");
c_config.file(&parser_path);
c_config.compile("parser");
println!("cargo:rerun-if-changed={}", parser_path.to_str().unwrap());
let mut cpp_config = cc::Build::new();
cpp_config.cpp(true);
cpp_config.include(&src_dir);
cpp_config
.flag_if_supported("-Wno-unused-parameter")
.flag_if_supported("-Wno-unused-but-set-variable");
let scanner_path = src_dir.join("scanner.cc");
cpp_config.file(&scanner_path);
cpp_config.compile("scanner");
let scanner_path = src_dir.join("scanner.c");
c_config.file(&scanner_path);
println!("cargo:rerun-if-changed={}", scanner_path.to_str().unwrap());
println!("cargo:rerun-if-changed={}", parser_path.to_str().unwrap());
c_config.compile("parser");
}

@@ -1,2 +1,2 @@
win_* text eol=crlf
win_* eol=crlf
* eol=lf

@@ -31,6 +31,8 @@ number
#b101
+inf.t
-nan.t
+i
0##+i
---
@@ -47,6 +49,8 @@ number
(number)
(number)
(number)
(number)
(number)
(number))
===
@@ -79,6 +83,8 @@ Ap\ ple
app123app123
123app123
中文
a
fec
---
(program
@@ -92,6 +98,8 @@ app123app123
(symbol)
(symbol)
(symbol)
(symbol)
(symbol)
(symbol))
===

@@ -0,0 +1,34 @@
# fuzz test
This directory contains scripts that test the implementation, guarding against a symbol being parsed as a number or vice versa.
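The oracle is the Racket reader itself: `gen_cases.rkt` enumerates candidate strings, classifies each by `read`ing it back, and the tree-sitter parse of the same strings is compared against that classification. A minimal sketch of the classification step (the `classify` helper is illustrative and not part of the scripts):
```racket
#lang racket

;; Classify a candidate string the way gen_cases.rkt does:
;; read it back with the Racket reader and check what value it produced.
;; `classify` is an illustrative helper, not part of the actual scripts.
(define (classify str)
  (with-input-from-string str
    (lambda ()
      (define v (read))
      (cond [(symbol? v) 'symbol]
            [(number? v) 'number]
            [else 'other]))))

(classify "+inf.0") ; => 'number
(classify "1/2")    ; => 'number
(classify "+i2")    ; => 'symbol
```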
## Resources
* ~6 minutes to run
* ~300 MB of generated files
## Run
```shell
$ cd fuzztest
# 30s
$ racket gen_cases.rkt
cpu time: 26531 real time: 26835 gc time: 1069
5114173 cases generated
$ cd ..
$ tree-sitter generate
# 3 minutes
$ tree-sitter parse fuzztest/case.txt > fuzztest/res1.txt
# 2 minutes
$ cd fuzztest && racket postprocess.rkt
# should show nothing
$ sdiff -s <(cat -n expect.txt) <(cat -n res.txt)
# If there are errors, run
$ sdiff -s <(cat -n expect.txt) <(cat -n res.txt) | less
# then look up the first failing case at line `N` with
$ cat case.txt | sed -n 'Np'
```
You can edit `gen_cases.rkt` to generate fewer cases during development.
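For example, a quick development run might shrink the search space near the top of `gen_cases.rkt`; the values below are only an illustration (the shipped script uses `max-len` 5 and larger character sets):
```racket
;; Example tweaks for a faster development run (illustrative values only).
(define alphabet-char "aefinst") ; fewer letters to combine
(define numeric-char "01")       ; fewer digits
(define max-len 4)               ; shorter candidate strings
```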

@@ -0,0 +1,57 @@
#lang racket
(require racket/extflonum)
;; all characters that can appear in a valid number/extflonum
;; remove some insignificant parts to improve performance
;; (define alphabet-char "abdefilnost")
(define alphabet-char "abdefilnostx")
(define special-char "#./@+-")
;; (define numeric-char "0123456789")
(define numeric-char "0179")
(define all-char
(string-append alphabet-char
special-char
numeric-char))
(define cnt 0)
(define max-len 5)
(define case-port (open-output-file "case.txt" #:exists 'replace))
(define expect-port (open-output-file "expect.txt" #:exists 'replace))
(define (gen i case)
(with-handlers ([exn:fail? (lambda _ (void))])
(when (> i 0)
(define case-str (list->string case))
;; ".0@.0" should be a number according to the documentation,
;; but it's actually read as a symbol.
;; It's a bug in the Racket reader and will be fixed in a future Racket release.
;; We skip these cases.
(when (not (string-contains? case-str "@."))
(with-handlers ([exn:fail? void])
(with-input-from-string case-str
(lambda ()
(define fst (read))
(define snd (read))
(when (eof-object? snd)
(cond [(symbol? fst)
(set! cnt (add1 cnt))
(displayln case-str case-port)
(displayln "symbol" expect-port)]
[(number? fst)
(set! cnt (add1 cnt))
(displayln case-str case-port)
(displayln "number" expect-port)]
;; this branch exists for a possible future change that
;; splits extflonum from number
[(extflonum? fst)
(set! cnt (add1 cnt))
(displayln case-str case-port)
(displayln "number" expect-port)]))))))))
(when (< i max-len)
(for ([c all-char])
(gen (add1 i) (cons c case)))))
(time (gen 0 '()))
(displayln (format "~a cases generated" cnt))

@@ -0,0 +1,26 @@
#lang racket
(define port (open-input-file "res1.txt"))
(define all-result (drop (read port) 4))
(define all-result-line
(for/list ([r all-result])
(cons (car r) (caadr r))))
(with-output-to-file "res.txt"
#:exists 'replace
(lambda ()
(let loop ([line 0]
[firstline? #t]
[lst all-result-line])
(match lst
['() (void)]
[(cons fst rem)
#:when (= (cdr fst) line)
(when (not firstline?)
(display " "))
(display (car fst))
(loop line (if firstline? #t #f) rem)]
[(cons fst rem)
(newline)
(display (car fst))
(loop (add1 line) #f rem)]))
(newline)))

@@ -1,13 +1,11 @@
const PREC = {
first: $ => prec(100, $),
last: $ => prec(-1, $),
left: prec.left,
right: prec.right,
};
const LEAF = {
// https://en.wikipedia.org/wiki/Unicode_character_property#Whitespace
whitespace: /[ \t\n\v\f\r\u{0085}\u{00A0}\u{1680}\u{2000}-\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}]/u,
whitespace: /[ \t\n\v\f\r\u{0085}\u{00A0}\u{1680}\u{2000}-\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}]+/u,
newline: /[\r\n\u{85}\u{2028}\u{2029}]/,
delimiter: /[ \t\n\v\f\r\u{0085}\u{00A0}\u{1680}\u{2000}-\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{FEFF}(){}",'`;\[\]]/u,
non_delimiter: /[^ \t\n\v\f\r\u{0085}\u{00A0}\u{1680}\u{2000}-\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{FEFF}(){}",'`;\[\]]/u,
@@ -49,23 +47,21 @@ module.exports = grammar({
_token: $ =>
choice(
token(repeat1(LEAF.whitespace)),
$._all_comment,
$._skip,
$.extension,
$._datum),
_skip: $ => choice(token(repeat1(LEAF.whitespace)), $._all_comment),
dot: _ => ".",
// comment {{{
_all_comment: $ =>
_skip: $ =>
choice(
LEAF.whitespace,
$.comment,
$.sexp_comment,
$.block_comment),
dot: _ => ".",
// comment {{{
comment: $ =>
choice(
token(
@@ -101,6 +97,10 @@ module.exports = grammar({
$.here_string,
$.byte_string,
$.character,
// number/symbol precedence:
// for tokens of the same length, prefer number;
// otherwise prefer symbol, which is then the longer match
$.number,
$.symbol,
@@ -110,7 +110,15 @@ module.exports = grammar({
$.graph,
$.structure,
$.hash,
$._abbrev,
$.quote,
$.quasiquote,
$.syntax,
$.quasisyntax,
$.unquote,
$.unquote_splicing,
$.unsyntax,
$.unsyntax_splicing,
$.list,
$.vector),
@@ -158,10 +166,14 @@ module.exports = grammar({
number: _ =>
token(
choice(
_number_base(2),
_number_base(8),
_number_base(10),
_number_base(16))),
extflonum(2),
extflonum(8),
extflonum(10),
extflonum(16),
number_base(2),
number_base(8),
number_base(10),
number_base(16))),
decimal: _ => /[0-9]+/,
@@ -178,14 +190,12 @@ module.exports = grammar({
/./))),
symbol: _ =>
PREC.last(
PREC.right(
token(
choice(
/#[cC][iIsS]/, // read-case-sensitive parameter
seq(
LEAF.symbol_start,
repeat(LEAF.symbol_remain)))))),
token(
choice(
/#[cC][iIsS]/, // read-case-sensitive parameter
seq(
LEAF.symbol_start,
repeat(LEAF.symbol_remain)))),
keyword: _ =>
token(
@@ -233,17 +243,6 @@ module.exports = grammar({
repeat($._skip),
$._datum))),
_abbrev: $ =>
choice(
$.quote,
$.quasiquote,
$.syntax,
$.quasisyntax,
$.unquote,
$.unquote_splicing,
$.unsyntax,
$.unsyntax_splicing),
quote: $ =>
seq(
"'",
@@ -309,115 +308,216 @@ module.exports = grammar({
// number {{{
function _number_base(n) {
const number = _ =>
function number_base(n) {
const digit = {
2: /[01]/,
8: /[0-7]/,
10: /[0-9]/,
16: /[0-9a-fA-F]/,
}[n];
const exp_mark = {
2: /[sldefSLDEF]/,
8: /[sldefSLDEF]/,
10: /[sldefSLDEF]/,
16: /[slSL]/,
}[n];
const prefix = {
2: /#[bB]/,
8: /#[oO]/,
10: optional(/#[dD]/),
16: /#[xX]/,
}[n];
const exactness =
/#[eiEI]/;
const sign = /[+-]/;
const digits_hash =
seq(
choice(
seq(radix(), optional(exactness())),
seq(optional(exactness()), radix()),
),
choice(
// Inexact number pattern already contains exact pattern.
// So we don't need to parse exact number explicitly
inexact()));
const sign = _ => /[+-]/;
const digit = _ => {
return {
2: /[01]/,
8: /[0-7]/,
10: /[0-9]/,
16: /[0-9a-fA-F]/,
}[n];
};
const radix = _ => {
return {
2: /#[bB]/,
8: /#[oO]/,
10: optional(/#[dD]/),
16: /#[xX]/,
}[n];
};
const exactness = _ =>
choice("#e", "#E", "#i", "#I");
const exp_mark = _ => /[sldeftSLDEFT]/;
const unsigned_integer = _ =>
repeat1(digit());
const inexact = _ =>
repeat1(digit),
repeat("#"));
const unsigned_integer =
repeat1(digit);
// exact
const exact_integer =
seq(
optional(sign),
unsigned_integer);
const unsigned_rational =
choice(
unsigned_integer,
seq(unsigned_integer, "/", unsigned_integer));
const exact_rational =
seq(
optional(sign),
unsigned_rational);
const exact_complex =
seq(
optional(exact_rational),
sign,
optional(unsigned_rational),
/[iI]/);
const exact =
choice(exact_rational, exact_complex);
// inexact
const inexact_special =
choice(
inexact_real(),
inexact_complex());
/[iI][nN][fF]\.[0fF]/,
/[nN][aA][nN]\.[0fF]/);
const inexact_simple =
choice(
seq(
digits_hash,
optional("."),
repeat("#")),
seq(
optional(unsigned_integer),
".",
digits_hash),
seq(
digits_hash,
"/",
digits_hash));
const inexact_normal =
seq(
inexact_simple,
optional(
seq(
exp_mark,
exact_integer)));
const inexact_unsigned =
choice(inexact_normal, inexact_special);
const inexact_real = _ =>
const inexact_real =
choice(
seq(
optional(sign()),
inexact_normal()),
optional(sign),
inexact_normal),
seq(
sign(),
inexact_special()));
sign,
inexact_special));
const inexact_complex = _ =>
const inexact_complex =
choice(
seq(
optional(inexact_real()),
sign(),
inexact_unsigned(),
optional(inexact_real),
sign,
optional(inexact_unsigned),
/[iI]/),
seq(
inexact_real(),
inexact_real,
"@",
inexact_real()));
inexact_real));
const inexact_unsigned = _ =>
choice(
inexact_normal(),
inexact_special());
const inexact =
choice(inexact_real, inexact_complex);
const number =
choice(exact, inexact);
const inexact_normal = _ =>
const general_number =
seq(
inexact_simple(),
optional(
choice(
seq(
optional(exactness),
prefix),
seq(
exp_mark(),
optional(sign()),
unsigned_integer())));
prefix,
optional(exactness))),
number);
const inexact_special = _ =>
return general_number;
}
function extflonum(n) {
const digit = {
2: /[01]/,
8: /[0-7]/,
10: /[0-9]/,
16: /[0-9a-fA-F]/,
}[n];
const exp_mark = /[tT]/;
const prefix = {
2: /#[bB]/,
8: /#[oO]/,
10: optional(/#[dD]/),
16: /#[xX]/,
}[n];
const sign = /[+-]/;
const digits_hash =
seq(
repeat1(digit),
repeat("#"));
const unsigned_integer =
repeat1(digit);
// exact
const exact_integer =
seq(
optional(sign),
unsigned_integer);
// inexact
const inexact_special =
choice(
/[iI][nN][fF]\.0/,
/[nN][aA][nN]\.0/,
/[iI][nN][fF]\.[fFtT]/,
/[nN][aA][nN]\.[fFtT]/,
);
/[iI][nN][fF]\.[0fFtT]/,
/[nN][aA][nN]\.[0fFtT]/);
const inexact_simple = _ =>
const inexact_simple =
choice(
seq(
digits(),
digits_hash,
optional("."),
repeat("#")),
seq(
optional(unsigned_integer()),
optional(unsigned_integer),
".",
digits()),
digits_hash),
seq(
digits(),
digits_hash,
"/",
digits()));
digits_hash));
const digits = _ =>
const inexact_normal =
seq(
unsigned_integer(),
repeat("#"));
inexact_simple,
optional(
seq(
exp_mark,
exact_integer)));
const inexact_real =
choice(
seq(
optional(sign),
inexact_normal),
seq(
sign,
inexact_special));
return token(number());
return seq(prefix, inexact_real);
}
// number }}}

@@ -1,6 +1,6 @@
{
"name": "tree-sitter-racket",
"version": "0.1.0",
"version": "0.3.0",
"description": "Tree-sitter grammar for Racket",
"main": "bindings/node",
"scripts": {

File diff suppressed because it is too large

@@ -1154,11 +1154,6 @@
]
}
},
{
"type": "symbol",
"named": true,
"fields": {}
},
{
"type": "syntax",
"named": true,
@@ -1897,6 +1892,10 @@
"type": "number",
"named": true
},
{
"type": "symbol",
"named": true
},
{
"type": "{",
"named": false

File diff suppressed because it is too large

@@ -0,0 +1,153 @@
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <tree_sitter/parser.h>
enum TokenType {
HERE_STRING_BODY,
};
// a hand-written string implementation
// data[0], data[1], ..., data[len-1] is the content of the string
// data[len] is `\0` for compatibility with typical `char*` strings
// so 0 <= len < cap always holds
typedef struct {
size_t len;
size_t cap;
char *data;
} String;
static void check_alloc(void *ptr) {
if (ptr == NULL) {
fprintf(stderr, "Scanner: Failed to allocate memory\n");
exit(EXIT_FAILURE);
}
}
static String string_new(void) {
size_t init_len = 16;
// (init_len + 1) for null terminator
size_t cap = init_len + 1;
void *tmp = calloc(1, sizeof(char) * cap);
check_alloc(tmp);
return (String){.cap = cap, .len = 0, .data = tmp};
}
static void string_resize(String *str, size_t new_cap) {
void *block = realloc(str->data, new_cap * sizeof(char));
check_alloc(block);
str->data = block;
str->cap = new_cap;
memset(str->data + str->len, 0, (new_cap - str->len) * sizeof(char));
}
static void string_push(String *str, int32_t elem) {
if (str->len + sizeof(elem) >= str->cap) {
// str->cap * 2 + 1 > str->len + sizeof(elem) always holds
// as str->cap > 16
string_resize(str, str->cap * 2 + 1);
}
// NOTE: we don't consider little-endian/big-endian here;
// the characters in the string are only used for comparison,
// so they only need to be stored in a consistent way
memcpy(str->data + str->len, &elem, sizeof(elem));
str->len += sizeof(elem);
}
static void string_free(String *str) {
if (str->data != NULL) {
free(str->data);
str->data = NULL;
str->len = 0;
str->cap = 0;
}
}
static void string_clear(String *str) {
memset(str->data, 0, str->len * sizeof(char));
str->len = 0;
}
static void advance(TSLexer *lexer) {
lexer->advance(lexer, false);
}
static void skip(TSLexer *lexer) {
lexer->advance(lexer, true);
}
// NOTE: only "\n" is treated as a newline here,
// which implies that "\r" can also be part of the terminator.
static bool isnewline(int32_t chr) {
return chr == '\n';
}
// `read_line` reads characters until a newline or EOF
static void read_line(String *str, TSLexer *lexer) {
while (!isnewline(lexer->lookahead) && !lexer->eof(lexer)) {
string_push(str, lexer->lookahead);
advance(lexer);
}
}
// Suppose the terminator is `T` and newline (\n) is `$`.
// It should accept "#<<T$T" or "#<<T$...$T", where `...`
// is the string content.
static bool scan(TSLexer *lexer, const bool *valid_symbols) {
if (!valid_symbols[HERE_STRING_BODY]) {
return false;
}
String terminator = string_new();
read_line(&terminator, lexer);
if (lexer->eof(lexer)) {
string_free(&terminator);
return false;
}
// skip `\n`
skip(lexer);
String current_line = string_new();
while (true) {
read_line(&current_line, lexer);
if (strcmp(terminator.data, current_line.data) == 0) {
lexer->result_symbol = HERE_STRING_BODY;
string_free(&terminator);
string_free(&current_line);
return true;
}
if (lexer->eof(lexer)) {
string_free(&terminator);
string_free(&current_line);
return false;
}
string_clear(&current_line);
// skip `\n`
skip(lexer);
}
}
void *tree_sitter_racket_external_scanner_create() {
return NULL;
}
unsigned tree_sitter_racket_external_scanner_serialize(void *payload,
char *buffer) {
return 0;
}
void tree_sitter_racket_external_scanner_deserialize(void *payload,
const char *buffer,
unsigned length) {
}
bool tree_sitter_racket_external_scanner_scan(void *payload,
TSLexer *lexer,
const bool *valid_symbols) {
return scan(lexer, valid_symbols);
}
void tree_sitter_racket_external_scanner_destroy(void *payload) {
}

@@ -1,119 +0,0 @@
#include <string>
#include "tree_sitter/parser.h"
namespace {
using std::u32string;
enum TokenType {
HERE_STRING_BODY,
};
class optional_str {
bool valid;
u32string str;
public:
optional_str() : valid(true) {}
static optional_str none() {
optional_str emp;
emp.valid = false;
return emp;
}
bool is_none() const { return !this->valid; }
const u32string &content() const { return this->str; }
u32string &content() { return this->str; }
};
// NOTE: only "\n" is allowed as newline here,
// It implies that "\r" can also be terminator.
inline bool isnewline(int32_t c) {
return c == '\n';
}
inline optional_str read_terminator(TSLexer *lexer) {
optional_str line;
while (true) {
if (isnewline(lexer->lookahead)) {
return line;
} else if (lexer->eof(lexer)) {
return optional_str::none();
} else {
line.content().push_back(lexer->lookahead);
lexer->advance(lexer, false);
}
}
}
// `read_line` read strings until a newline or EOF
inline u32string read_line(TSLexer *lexer) {
u32string line;
while (!isnewline(lexer->lookahead) && !lexer->eof(lexer)) {
line.push_back(lexer->lookahead);
lexer->advance(lexer, false);
}
return line;
}
// Suppose terminator is `T`, newline (\n) is `$`,
// It should accept "#<<T$T" or "#<<T$...$T", where `...`
// is the string content.
bool scan(TSLexer *lexer, const bool *valid_symbols) {
if (!valid_symbols[HERE_STRING_BODY]) {
return false;
}
const optional_str terminator = read_terminator(lexer);
if (terminator.is_none()) {
return false;
}
// skip `\n`
lexer->advance(lexer, false);
while (true) {
const u32string line = read_line(lexer);
if (line == terminator.content()) {
lexer->result_symbol = HERE_STRING_BODY;
return true;
}
if (lexer->eof(lexer)) {
return false;
}
// skip `\n`
lexer->advance(lexer, false);
}
}
} // namespace
extern "C" {
void *tree_sitter_racket_external_scanner_create(void) {
return NULL;
}
void tree_sitter_racket_external_scanner_destroy(void *payload) {
}
unsigned tree_sitter_racket_external_scanner_serialize(void *payload,
char *buffer) {
return 0;
}
void tree_sitter_racket_external_scanner_deserialize(void *payload,
const char *buffer,
unsigned length) {
}
bool tree_sitter_racket_external_scanner_scan(void *payload,
TSLexer *lexer,
const bool *valid_symbols) {
return scan(lexer, valid_symbols);
}
}