mirror of https://github.com/Wilfred/difftastic/
170 lines
5.1 KiB
JavaScript
170 lines
5.1 KiB
JavaScript
#!/usr/bin/env node
|
|
|
|
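// This script prints C functions (e.g. `is_unicode_id_start`) that test whether a code point has a
// given Unicode property, generated from the Unicode Character Database files downloaded below.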
|
|
|
|
// Output file names.
// NOTE(review): nothing visible in this file reads these two paths; the script
// only prints to stdout. They are kept in case other tooling expects them.
const CATEGORY_OUTPUT_PATH = 'unicode-categories.json';
const PROPERTY_OUTPUT_PATH = 'unicode-properties.json';

// Unicode Character Database (version 13.0.0) source files.
// General category of every listed code point.
const CATEGORY_URL = 'https://unicode.org/Public/13.0.0/ucd/UnicodeData.txt';
// Binary properties, listed per code point or code point range.
const PROPERTY_URL = 'https://unicode.org/Public/13.0.0/ucd/PropList.txt';
// Derived properties; this is where the script later looks up ID_Start / ID_Continue.
const DERIVED_PROPERTY_URL = 'https://unicode.org/Public/13.0.0/ucd/DerivedCoreProperties.txt';
|
|
|
|
const fs = require('fs');
|
|
const path = require('path');
|
|
const {spawnSync} = require('child_process');
|
|
|
|
// Download the unicode data files, caching them inside the 'target' directory.
// `cachedDownload` is a function declaration further down, so it is hoisted and
// callable here.
const categoryData = cachedDownload(CATEGORY_URL);
const propertyData = cachedDownload(PROPERTY_URL);
// The identifier below is misspelled ("Poperty"); it is kept unchanged because
// the property-parsing code refers to it by exactly this name.
const derivedPopertyData = cachedDownload(DERIVED_PROPERTY_URL);
|
|
/**
 * Returns the text of the file at `url`, downloading it with `curl` at most once.
 *
 * The file is cached at `./target/<basename of url>` (relative to the current
 * working directory). When that file already exists it is returned as-is and
 * no download happens.
 *
 * @param {string} url - URL of the file to fetch.
 * @returns {string} The file contents decoded as UTF-8.
 * @throws {Error} If `curl` cannot be run or exits with a non-zero status.
 *   Nothing is written to the cache in that case.
 */
function cachedDownload(url) {
  const downloadPath = path.join('.', 'target', path.basename(url));
  if (fs.existsSync(downloadPath)) {
    return fs.readFileSync(downloadPath, 'utf8');
  }

  // --fail: exit non-zero on HTTP errors rather than printing the error page,
  //         which would otherwise be cached as if it were data.
  // --location: follow redirects.
  const curlArgs = ['--fail', '--silent', '--show-error', '--location', url];
  const result = spawnSync('curl', curlArgs, {
    encoding: 'utf8',
    // spawnSync terminates the child and truncates its output once maxBuffer
    // is exceeded; the Unicode data files are megabytes in size.
    maxBuffer: 64 * 1024 * 1024,
  });

  if (result.error) {
    throw new Error(`Failed to run curl for ${url}: ${result.error.message}`);
  }
  if (result.status !== 0) {
    const details = String(result.stderr || '').trim();
    throw new Error(`curl exited with status ${result.status} while downloading ${url}: ${details}`);
  }

  // Only populate the cache after a successful download.
  fs.mkdirSync(path.dirname(downloadPath), {recursive: true});
  fs.writeFileSync(downloadPath, result.stdout, 'utf8');
  return result.stdout;
}
|
|
|
|
// Code points grouped by category name and by property name. Both map a name
// to an array of numeric code points.
const categories = {};
const properties = {};

// Scratch state shared by the two parsing loops in this file.
let data;
let row;
let lineStart;
let lineEnd;

// Parse the properties.
// Data lines have the form `<code point>[..<code point>] ; <property> # <comment>`.
data = propertyData + derivedPopertyData;
row = 0;
lineEnd = -1;
const CODE_POINT = /[0-9A-Fa-f]/;
for (lineStart = 0; lineStart < data.length; lineStart = lineEnd + 1) {
  row++;
  lineEnd = data.indexOf('\n', lineStart);
  // A final line without a trailing newline is still a line.
  if (lineEnd === -1) lineEnd = data.length;

  // Skip over blank and comment lines: data lines start with a hex digit.
  if (!CODE_POINT.test(data[lineStart])) continue;

  // Work on the current line only, so a malformed line can never pick up a
  // ';' or '#' that belongs to a later line.
  const line = data.slice(lineStart, lineEnd);

  // Parse the first two semicolon fields:
  // * code point or code point range
  // * property
  const codePointEnd = line.indexOf(';');
  if (codePointEnd === -1) {
    throw new Error(`Unexpected format on line ${row}`);
  }
  // The property runs up to the trailing comment, or to the end of the line
  // when there is no comment.
  const commentStart = line.indexOf('#', codePointEnd + 1);
  const propertyEnd = commentStart === -1 ? line.length : commentStart;
  const property = line.slice(codePointEnd + 1, propertyEnd).trim();

  // Process ranges (separated by '..'). A single code point is treated as a
  // range whose end equals its start.
  const [first, last = first] = line
    .slice(0, codePointEnd)
    .trim()
    .split('..')
    .map((p) => parseInt(p, 16));

  if (property === '' || Number.isNaN(first) || Number.isNaN(last)) {
    throw new Error(`Unexpected format on line ${row}`);
  }

  // Look the list up once per line rather than once per code point.
  if (!properties[property]) {
    properties[property] = [];
  }
  const members = properties[property];
  for (let c = first; c <= last; c++) {
    members.push(c);
  }
}
|
|
|
|
// Parse the categories.
// Each line represents a code point, as semicolon-separated fields.
// NOTE(review): UnicodeData.txt is understood to describe some large ranges
// only by a `<..., First>` / `<..., Last>` pair of lines; if so, only those two
// endpoints are recorded here. Confirm whether the interior code points matter
// for the categories this script uses.
data = categoryData;
row = 0;
lineEnd = -1;
for (lineStart = 0; lineStart < data.length; lineStart = lineEnd + 1) {
  row++;
  lineEnd = data.indexOf('\n', lineStart);
  // A final line without a trailing newline is still a line.
  if (lineEnd === -1) lineEnd = data.length;

  // Skip blank lines.
  if (lineEnd === lineStart) continue;

  // Parse the first three semicolon-separated fields:
  // * code point (hexadecimal)
  // * name (not needed here)
  // * category
  // Splitting the current line only keeps a malformed line from borrowing
  // fields from the lines after it.
  const fields = data.slice(lineStart, lineEnd).split(';');
  // The category field must itself be terminated by a ';', which means at
  // least four pieces after splitting.
  if (fields.length < 4) {
    throw new Error(`Unexpected format on line ${row}`);
  }

  const codePoint = parseInt(fields[0], 16);
  const category = fields[2];
  if (Number.isNaN(codePoint) || category === '') {
    throw new Error(`Unexpected format on line ${row}`);
  }

  // Group the code points by their category.
  if (!categories[category]) {
    categories[category] = [];
  }
  categories[category].push(codePoint);
}
|
|
|
|
/**
 * Collapses a list of integers into inclusive ranges of consecutive values.
 *
 * The input is left untouched; a sorted copy is used internally. Duplicate
 * values are folded into the range that already covers them.
 *
 * @param {number[]} array - Integers in any order.
 * @returns {Array<[number, number]>} `[start, end]` pairs in ascending order;
 *   an empty array when `array` is empty.
 */
function deriveRanges(array) {
  const sorted = [...array].sort((a, b) => a - b);
  const ranges = [];
  for (const current of sorted) {
    const open = ranges[ranges.length - 1];
    // Because the values are sorted, `current` is never below the end of the
    // open range: it either repeats it, extends it by one, or starts a new range.
    if (open !== undefined && current <= open[1] + 1) {
      open[1] = current;
    } else {
      ranges.push([current, current]);
    }
  }
  return ranges;
}
|
|
|
|
/**
 * Debugging helper: logs every character that has the given property, one per
 * line.
 *
 * @param {string} property - Key into the `properties` table.
 */
function print(property) {
  for (const codePoint of properties[property]) {
    // fromCodePoint, not fromCharCode: the latter keeps only the low 16 bits,
    // so anything above U+FFFF would be printed as the wrong character.
    console.log(String.fromCodePoint(codePoint));
  }
}
|
|
|
|
/**
 * Returns the elements of `a` that do not occur in `b`, in their original
 * order. Neither input is modified.
 *
 * @param {number[]} a - Values to keep from.
 * @param {number[]} b - Values to exclude.
 * @returns {number[]} A new array.
 */
function remove(a, b) {
  // Build the lookup once; scanning `b` for every element of `a` is quadratic,
  // and these lists can hold every code point of a property.
  const excluded = new Set(b);
  return a.filter((x) => !excluded.has(x));
}
|
|
|
|
/**
 * Builds the C source of a predicate `bool is_unicode_<name>(int32_t c)` that
 * returns true exactly for the code points in `list`.
 *
 * @param {string} name - Property name; lower-cased to form the function name.
 * @param {number[]} list - Code points the predicate should accept.
 * @returns {string} C source text for the function, ending in a newline.
 */
function generateFunction(name, list) {
  const ranges = deriveRanges(list);
  // With no ranges the joined expression would be empty and the output would
  // read `return ;`, which is not valid for a bool function.
  const fnBody = ranges.length === 0
    ? 'false'
    : ranges.map(([low, high]) => `(c >= ${low} && c <= ${high})`).join(' || ');
  return `bool is_unicode_${name.toLowerCase()}(int32_t c) {\n return ${fnBody};\n}\n`;
}
|
|
|
|
// Emit the generated C predicates on stdout.
console.log(generateFunction('ID_Start', properties['ID_Start']));
console.log(generateFunction('ID_Continue', properties['ID_Continue']));

// ID_Start minus the code points in categories Lu and Lt.
const idStartWithoutLu = remove(properties['ID_Start'], categories['Lu']);
const idStartWithoutLuLt = remove(idStartWithoutLu, categories['Lt']);
console.log(generateFunction('ID_Start_Without_Lu_Lt', idStartWithoutLuLt));
|
|
|