speced · sidvishnoi · Jul 7, 2024 · Jul 7, 2024 · Jul 7, 2024 · Jul 7, 2024
diff --git a/profiles/w3c.js b/profiles/w3c.js
@@ -34,6 +34,7 @@ const modules = [
   import("../src/core/data-cite.js"),
   import("../src/core/render-biblio.js"),
   import("../src/core/dfn-index.js"),
+  import("../src/core/unicode.js"),
   import("../src/core/contrib.js"),
   import("../src/core/sections.js"),
   import("../src/core/fix-headers.js"),

diff --git a/src/core/unicode.js b/src/core/unicode.js
@@ -0,0 +1,221 @@
+// @ts-check
+/**
+ * @module core/unicode
+ *
+ * Expand char markup (.hx, .ch) to .codepoint spans
+ * https://github.com/speced/respec/issues/4462
+ * Based on https://github.com/r12a/scripts/blob/gh-pages/common29/functions.js
+ */
+
+import { lang as docLang } from "./l10n.js";
+import { html } from "./import-maps.js";
+import { showError } from "./utils.js";
+
+// Module identifier; passed to showError() so that reported problems are
+// attributed to this module.
+export const name = "core/unicode";
+
+// Endpoint used by run() when `conf.unicode.apiUrl` is not set.
+// NOTE(review): this points at localhost, which looks like a development
+// placeholder — confirm the intended public endpoint before release.
+const DEFAULT_API_URL = "http://localhost:8001/api/unicode/names";
+
+/**
+ * Converts char markup (`.ch`, `.hx`) into `.codepoint` spans. This has to be
+ * done before the indexing.
+ *
+ * `.ch` elements are parsed with `parseCh()` and `.hx` elements with
+ * `parseHx()`. The names of all referenced code points are requested with a
+ * single call to `conf.unicode.apiUrl` (or `DEFAULT_API_URL` when unset) and
+ * each element is then rewritten by `expandCharMarkup()`.
+ *
+ * The `.ch` and `.hx` classes should only be used for characters in the
+ * spreadsheet. For other characters, generate the markup in a picker.
+ *
+ * Modifier classes defined by the markup convention:
+ * - `svg`: use an svg image to display the char
+ * - `split`: separate the characters of a sequence
+ * - `init`, `medi`, `fina`: produce positional forms of cursive text using zwj
+ * - `skip`: put a circle before a mark, and zwj between it and the following
+ *   consonant
+ * - `circle`: put a dotted circle before the item (used for combining marks)
+ * - `coda`: put a dotted circle after the item (used for closed syllables)
+ * - `noname`: prevent the production of the Unicode name
+ * (`svg`, `skip` and `noname` are not handled by this module yet.)
+ *
+ * If the name lookup fails, a single error is reported through `showError()`
+ * and the document is left untouched instead of rejecting.
+ *
+ * @param {Conf} conf
+ */
+export async function run(conf) {
+  /** @type {NodeListOf<HTMLElement>} */
+  const elements = document.querySelectorAll(".ch, .hx");
+  if (!elements.length) {
+    return;
+  }
+
+  /** @type {Set<string>} */
+  const queryHex = new Set();
+  /** @type {Map<HTMLElement, Array<ParsedData>>} */
+  const elementMap = new Map();
+
+  for (const elem of elements) {
+    const parsed = elem.classList.contains("ch")
+      ? parseCh(elem)
+      : parseHx(elem);
+
+    for (const { hex } of parsed) {
+      queryHex.add(hex);
+    }
+    elementMap.set(elem, parsed);
+  }
+
+  if (!queryHex.size) {
+    return;
+  }
+
+  const apiUrl = conf.unicode?.apiUrl || DEFAULT_API_URL;
+  /** @type {Map<string, Result | null>} */
+  let dataByHex;
+  try {
+    const result = await getData(
+      [...queryHex].map(hex => ({ hex })),
+      apiUrl
+    );
+    dataByHex = new Map(
+      Object.values(result.data).map(d => [d.query.hex, d.result])
+    );
+  } catch (err) {
+    // A network failure or malformed response must not abort processing of
+    // the whole document; report it once and leave the markup as authored.
+    const reason = err?.message ?? err;
+    const msg = `Failed to look up Unicode character names from ${apiUrl}: ${reason}`;
+    showError(msg, name, { elements: [...elements] });
+    return;
+  }
+
+  // elementMap preserves insertion order, i.e. document order of the elements.
+  for (const [elem, parsedData] of elementMap) {
+    const hexMap = new Map(parsedData.map(e => [e.hex, dataByHex.get(e.hex)]));
+    expandCharMarkup(elem, parsedData, hexMap);
+  }
+}
+
+/**
+ * Replaces a `.ch` / `.hx` element with a `span.codepoint[translate=no]`
+ * holding the character(s) in `<bdi>` elements, followed by one
+ * `span.uname` ("U+XXXX NAME") per code point, separated by " + ".
+ *
+ * Honors the `split`, `init`, `medi`, `fina`, `circle` and `coda` classes of
+ * `elem`. An entry without a known name is reported through `showError()` and
+ * omitted from the list of names.
+ *
+ * @param {HTMLElement} elem element to replace
+ * @param {ParsedData[]} parsedData code points parsed from `elem`
+ * @param {Map<string, Result | null>} hexMap lookup result keyed by `hex`
+ */
+function expandCharMarkup(elem, parsedData, hexMap) {
+  const { classList } = elem;
+  const split = classList.contains("split");
+  const initial = classList.contains("init");
+  const medial = classList.contains("medi");
+  const final = classList.contains("fina");
+  const circle = classList.contains("circle");
+  const coda = classList.contains("coda") ? "◌" : "";
+  const lang = elem.lang || docLang;
+  const isHex = classList.contains("hx");
+  const isCh = classList.contains("ch");
+  // todo: support images and the svg, img, skip and noname classes
+
+  // Text placed before the first character: an optional dotted circle, then
+  // a ZWJ for final/medial forms.
+  let prefix = circle ? "\u25CC" : "";
+  if (final || medial) {
+    prefix += isCh ? " \u200D" : "\u200D"; // the space is needed for Safari to work
+  }
+
+  // Text placed after the last character: a ZWJ for initial/medial forms,
+  // then the optional coda circle.
+  // NOTE(review): the trailing ZWJ is only produced for `.hx` markup, as in
+  // the code this replaces — confirm whether `.ch` should get it too.
+  let suffix = isHex && (initial || medial) ? "\u200D " : "";
+  suffix += coda;
+
+  // Resolve every entry to the actual character so it can be inserted as
+  // text; building markup would mis-parse characters such as "<" or "&".
+  const glyphs = parsedData.map(entry => {
+    if (!isHex) {
+      return entry.ch;
+    }
+    const { dec } = entry;
+    const isCodePoint = Number.isInteger(dec) && dec >= 0 && dec <= 0x10ffff;
+    // Fall back to the raw token when it is not a valid code point.
+    return isCodePoint ? String.fromCodePoint(dec) : entry.ch;
+  });
+
+  // With `split` every character gets its own <bdi>; otherwise one for all.
+  const pieces = split ? glyphs : [glyphs.join("")];
+  if (!pieces.length) {
+    pieces.push("");
+  }
+  // The prefix/suffix must be applied before the pieces are rendered,
+  // otherwise the positional-form and circle decorations are lost.
+  pieces[0] = `${prefix}${pieces[0]}`;
+  pieces[pieces.length - 1] += suffix;
+  const unicodeChars = pieces.filter(Boolean);
+
+  /** @type {HTMLElement[]} */
+  const unicodeNames = [];
+  for (const entry of parsedData) {
+    const res = hexMap.get(entry.hex);
+    if (!res?.name) {
+      showError(`No name found for ${entry.hex}`, name, { elements: [elem] });
+      continue;
+    }
+    const text = `U+${entry.hex} ${res.name}`;
+    unicodeNames.push(html`<span class="uname">${text}</span>`);
+  }
+
+  const expanded = document.createElement("span");
+  expanded.classList.add("codepoint");
+  expanded.setAttribute("translate", "no");
+  for (const text of unicodeChars) {
+    const bdi = document.createElement("bdi");
+    bdi.lang = lang;
+    bdi.textContent = text;
+    expanded.append(bdi);
+  }
+  // Separators go only between names that were actually produced, so a
+  // failed lookup never leaves a dangling " + ".
+  unicodeNames.forEach((uname, i) => {
+    if (i > 0) {
+      expanded.append(new Text(" + "));
+    }
+    expanded.append(uname);
+  });
+
+  elem.replaceWith(expanded);
+}
+
+/**
+ * @typedef {{ ch: string; hex: string; dec: number }} ParsedData
+ * @typedef {{ hex: string }} Query
+ * @typedef {{ name: string }} Result
+ * @typedef {{
+ *  data: Array<{ query: Query; result: Result | null }>;
+ *  metadata: { lastParsedAt: string; dataSource: string };
+ * }} ResponseData
+ *
+ * POSTs the queries as JSON (`{ queries }`) to `apiUrl` and resolves with the
+ * parsed JSON response body.
+ *
+ * @param {Query[]} queries
+ * @param {string} apiUrl
+ * @return {Promise<ResponseData>}
+ * @throws {Error} if the request fails, the server answers with a non-2xx
+ *  status, or the body is not valid JSON.
+ */
+async function getData(queries, apiUrl) {
+  const res = await fetch(apiUrl, {
+    method: "POST",
+    headers: {
+      "Content-Type": "application/json",
+    },
+    body: JSON.stringify({ queries }),
+  });
+  // fetch() only rejects on network errors; surface HTTP errors explicitly
+  // instead of trying to parse an error page as JSON.
+  if (!res.ok) {
+    throw new Error(
+      `Request to ${apiUrl} failed with status ${res.status} ${res.statusText}`
+    );
+  }
+  return await res.json();
+}
+
+/**
+ * Reads the text of a `.ch` element and describes each of its code points.
+ *
+ * The text is iterated by code point (not by UTF-16 code unit). For every
+ * character the result holds the character itself (`ch`), its numeric code
+ * point (`dec`) and the upper-case hex form padded to at least four digits
+ * (`hex`).
+ *
+ * @param {HTMLElement} elem
+ * @returns {ParsedData[]}
+ */
+function parseCh(elem) {
+  return Array.from(elem.textContent, ch => {
+    const dec = ch.codePointAt(0);
+    const hex = dec.toString(16).toUpperCase().padStart(4, "0");
+    return { ch, hex, dec };
+  });
+}
+
+/**
+ * Reads the text of a `.hx` element as a whitespace-separated list of hex
+ * code points.
+ *
+ * Tokens may be separated by any run of whitespace (spaces, tabs, newlines).
+ * For every token the result holds the token as authored (`ch`), its numeric
+ * value (`dec`, `NaN` if the token is not hexadecimal) and `hex`: valid
+ * tokens are upper-cased and padded to at least four digits so they match
+ * the format produced by `parseCh()`; invalid tokens are kept unchanged.
+ *
+ * @param {HTMLElement} elem
+ * @returns {ParsedData[]} empty if the element has no text
+ */
+function parseHx(elem) {
+  const tokens = elem.textContent.trim().split(/\s+/);
+  if (tokens[0] === "") {
+    return [];
+  }
+  return tokens.map(token => {
+    const isHexNumber = /^[0-9a-f]+$/i.test(token);
+    const hex = isHexNumber ? token.toUpperCase().padStart(4, "0") : token;
+    const dec = parseInt(hex, 16);
+    return { ch: token, hex, dec };
+  });
+}
diff --git a/src/type-helper.d.ts b/src/type-helper.d.ts
@@ -120,6 +120,9 @@ interface Conf {
   biblio: Record<string, BiblioData>;
   editors?: Person[];
   formerEditors?: Person[];
+  unicode?: {
+    apiUrl?: string;
+  };
   /** Set of informative references */
   informativeReferences: Set<string>;
   localBiblio?: Record<string, BiblioData>;

diff --git a/tests/spec/core/unicode-spec.js b/tests/spec/core/unicode-spec.js
@@ -0,0 +1,83 @@
+"use strict";
+
+import {
+  flushIframes,
+  html,
+  makeRSDoc,
+  makeStandardOps,
+} from "../SpecHelper.js";
+
+// Verifies that `.hx` (hex code points) and `.ch` (literal characters) markup
+// is expanded into identical `.codepoint` structures. Every assertion carries
+// the markup kind as context so a failure identifies which variant broke.
+describe("Core — unicode", () => {
+  afterAll(flushIframes);
+
+  it("expands single .hx, .ch to codepoint markup", async () => {
+    const body = html`<div id="test">
+      <span class="hx" lang="fr">00E9</span>
+      <span class="ch" lang="fr">é</span>
+    </div>`;
+    const doc = await makeRSDoc(makeStandardOps(null, body));
+
+    const expanded = doc.querySelectorAll("#test .codepoint");
+    expect(expanded).toHaveSize(2);
+
+    for (let i = 0; i < expanded.length; i++) {
+      // The first expanded element comes from .hx, the second from .ch.
+      const context = i === 0 ? "hx" : "ch";
+      const elem = expanded[i];
+
+      expect(elem.localName).withContext(context).toBe("span");
+      expect(elem.getAttribute("translate")).withContext(context).toBe("no");
+      expect(elem.children).withContext(context).toHaveSize(2);
+      const [bdi, uname] = elem.children;
+
+      expect(bdi.localName).withContext(context).toBe("bdi");
+      expect(bdi.textContent).withContext(context).toBe("é");
+      expect(bdi.getAttribute("lang")).withContext(context).toBe("fr");
+
+      expect(uname.localName).withContext(context).toBe("span");
+      expect(uname.getAttribute("class")).withContext(context).toBe("uname");
+      expect(uname.textContent)
+        .withContext(context)
+        .toBe("U+00E9 LATIN SMALL LETTER E WITH ACUTE");
+    }
+  });
+
+  it("expands .hx, .ch sequence to codepoint markup", async () => {
+    const body = html`<div id="test">
+      <span class="hx" lang="hi">0928 093F</span>
+      <span class="ch" lang="hi">नि</span>
+    </div>`;
+    const doc = await makeRSDoc(makeStandardOps(null, body));
+
+    const expanded = doc.querySelectorAll("#test .codepoint");
+    expect(expanded).toHaveSize(2);
+    for (let i = 0; i < expanded.length; i++) {
+      // The first expanded element comes from .hx, the second from .ch.
+      const context = i === 0 ? "hx" : "ch";
+      const elem = expanded[i];
+
+      expect(elem.getAttribute("translate")).withContext(context).toBe("no");
+
+      // Three elements (bdi + two names) plus the " + " text separator.
+      expect(elem.children).withContext(context).toHaveSize(3);
+      expect(elem.childNodes.length).withContext(context).toBe(4);
+      const [bdi, uname1, plusSign, uname2] = elem.childNodes;
+
+      expect(bdi.localName).withContext(context).toBe("bdi");
+      expect(bdi.textContent).withContext(context).toBe("नि");
+      expect(bdi.getAttribute("lang")).withContext(context).toBe("hi");
+
+      expect(uname1.localName).withContext(context).toBe("span");
+      expect(uname1.getAttribute("class")).withContext(context).toBe("uname");
+      expect(uname1.textContent)
+        .withContext(context)
+        .toBe("U+0928 DEVANAGARI LETTER NA");
+
+      expect(plusSign.nodeName).withContext(context).toBe("#text");
+      expect(plusSign.textContent).withContext(context).toBe(" + ");
+
+      expect(uname2.localName).withContext(context).toBe("span");
+      expect(uname2.getAttribute("class")).withContext(context).toBe("uname");
+      expect(uname2.textContent)
+        .withContext(context)
+        .toBe("U+093F DEVANAGARI VOWEL SIGN I");
+    }
+  });
+});