Skip to content

Commit

Permalink
Fixed 咖啡 transliteration for Singapore dialect, fixed error with browser import of the library, minor edits to words and vars datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
andreihar committed Aug 2, 2024
1 parent 672e99a commit c7d4e3f
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 31 deletions.
28 changes: 10 additions & 18 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "taibun",
"version": "1.1.1",
"version": "1.1.2",
"description": "Taiwanese Hokkien Transliterator and Tokeniser",
"main": "taibun/index.js",
"types": "taibun/index.d.ts",
Expand Down
3 changes: 2 additions & 1 deletion taibun/data/vars.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,6 @@
"譁": "",
"廕": "",
"鍾": "",
"歎": ""
"歎": "",
"儂": ""
}
2 changes: 1 addition & 1 deletion taibun/data/words.json
Original file line number Diff line number Diff line change
Expand Up @@ -20810,7 +20810,7 @@
"群英": "kûn-ing",
"裙裾": "kûn-ki",
"拳擊": "kûn-kik",
"群墘儂": "kûn-kînn-lâng",
"群墘人": "kûn-kînn-lâng",
"拳路": "kûn-lōo",
"群山": "kûn-suann",
"拳賽": "kûn-sài",
Expand Down
23 changes: 13 additions & 10 deletions taibun/index.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
let wordDict, tradDict, simpDict, varsDict;
let wordDict, tradDict, simpDict, varsDict, pronsDict;

if (typeof window === 'undefined') {
// Node.js
Expand Down Expand Up @@ -125,7 +125,7 @@ class Converter {

constructor({ system = 'Tailo', dialect = 'south', format = 'mark', delimiter = Converter.defaultDelimiter, sandhi = Converter.defaultSandhi, punctuation = 'format', convertNonCjk = false } = {}) {
this.system = system.toLowerCase();
this.dialect = dialect;
this.dialect = dialect.toLowerCase();
this.format = format;
this.delimiter = delimiter !== Converter.defaultDelimiter ? delimiter : this.setDefaultDelimiter();
this.sandhi = sandhi !== Converter.defaultSandhi ? sandhi : this.setDefaultSandhi();
Expand Down Expand Up @@ -175,12 +175,18 @@ class Converter {
if (!value || dialect === 'south') return value;
const parts = value.toLowerCase().split(/(--|-)/).filter(s => s);
const variations = Object.fromEntries(Array.from(property).map(char => [char, Object.fromEntries((pronsDictProxy[char] || []).map(v => v.split('/').length > 1 ? v.split('/') : [v, v]))]));
if (property in Converter.singaporeWords && dialect === 'singapore') {
Object.keys(Converter.singaporeWords[property]).forEach(char => {
if (char in variations) {
Object.assign(variations[char], Converter.singaporeWords[property][char]);
if (dialect === 'singapore') {
const substrings = new Set(
[...property].flatMap((_, i) => [...property.slice(i)].map((_, j) => property.slice(i, i + j + 1)))
);
substrings.forEach(substring => {
if (substring in Converter.singaporeWords) {
Object.entries(Converter.singaporeWords[substring]).forEach(([char, mappings]) => {
if (char in variations) Object.assign(variations[char], mappings);
});
}
});
value = value.split('').map(char => variations[char]?.[char] || char).join('');
}
let newParts = [];
let charIndex = 0;
Expand Down Expand Up @@ -404,10 +410,7 @@ class Converter {

// Helper to convert Taiwanese pronunciation to Singaporean.
// When this.dialect is 'singapore', returns `input` with 'ing' rewritten to 'eng';
// for any other dialect, `input` is returned unchanged.
// String.prototype.replace with a string pattern only rewrites the FIRST 'ing' found.
// NOTE(review): presumably callers pass one syllable at a time, so a single replacement is enough — confirm.
// NOTE(review): this span appears to be a rendered diff: the if/return form and the
// one-line ternary below are equivalent, and only one of them belongs in the real file.
convertVariant(input) {
if (this.dialect === 'singapore') {
return input.replace('ing', 'eng');
}
return input;
return this.dialect === 'singapore' ? input.replace('ing', 'eng') : input;
}


Expand Down
57 changes: 57 additions & 0 deletions tests/singapore.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
const { Converter } = require('taibun');

// Checks the IPA output of a Singapore-dialect converter for a set of single characters.
test('tones', () => {
  const converter = new Converter({ system: 'IPA', dialect: "Singapore", punctuation: 'none' });
  // Input character -> expected IPA transliteration.
  const expectedByHanji = {
    '衫': 'sã⁴⁴',
    '短': 'te⁴²',
    '褲': 'kʰɔ²¹',
    '闊': 'kʰuaʔ³²',
    '人': 'laŋ²⁴',
    '鼻': 'pʰĩ²²',
    '直': 'tit̚⁴'
  };
  Object.entries(expectedByHanji).forEach(([hanji, ipa]) => {
    expect(converter.get(hanji)).toBe(ipa);
  });
});

// Checks the IPA vowel output of a Singapore-dialect converter for four sample characters.
test('o conversion', () => {
  const converter = new Converter({ system: 'IPA', dialect: "Singapore", punctuation: 'none' });
  // [input character, expected IPA transliteration]
  const cases = [
    ['高', 'ko⁴⁴'],
    ['唔', 'ɔ̃⁴⁴'],
    ['烏', 'ɔ⁴⁴'],
    ['王', 'ɔŋ²⁴']
  ];
  for (const [hanji, ipa] of cases) {
    expect(converter.get(hanji)).toBe(ipa);
  }
});

// For every listed transliteration system, the Singapore-dialect converter must
// produce the expected 'eng' spelling for each input in hanjiData.
describe('eng conversion', () => {
  const hanjiData = ['用', '冰', '兵', '幸啊', '無閒', '無政府'];
  // Each row: [expected outputs aligned by index with hanjiData, system name].
  const expectedBySystem = [
    [['ēng', 'peng', 'peng', 'hēng--ah', 'bô-êng', 'bô tsèng-hú'], "Tailo"],
    [['ēng', 'peng', 'peng', 'hēng--ah', 'bô-êng', 'bô chèng-hú'], "POJ"],
    [['ㆤㄥ˫', 'ㄅㆤㄥ', 'ㄅㆤㄥ', 'ㄏㆤㄥ˫ ㄚ', 'ㆠㄜˊ ㆤㄥˊ', 'ㆠㄜˊ ㄗㆤㄥ˪ ㄏㄨˋ'], "Zhuyin"],
    [['eng7', 'peng1', 'peng1', 'heng7 ah0', 'bo5 eng5', 'bo5 ceng3 hu2'], "TLPA"],
    [['êng', 'bēng', 'bēng', 'hêng ah', 'bbóéng', 'bbó zènghǔ'], "Pingyim"],
    [['ēng', 'beng', 'beng', 'hēng--åh', 'bhôr-ĕng', 'bhôr zèng-hù'], "Tongiong"],
    [['eŋ²²', 'peŋ⁴⁴', 'peŋ⁴⁴', 'heŋ²² a', 'bo²⁴ eŋ²⁴', 'bo²⁴ tseŋ²¹ hu⁴²'], "IPA"]
  ];
  for (const [transl, system] of expectedBySystem) {
    test(`testing: ${system}`, () => {
      const converter = new Converter({ system, dialect: "Singapore", punctuation: 'none' });
      hanjiData.forEach((hanji, i) => {
        expect(converter.get(hanji)).toBe(transl[i]);
      });
    });
  }
});

// Checks the output for '台灣' under each sandhi option with the Singapore dialect.
test('sandhi', () => {
  // [sandhi option, expected transliteration of '台灣']
  const cases = [
    ['auto', 'Tài-uân'],
    ['none', 'Tâi-uân'],
    ['excLast', 'Tài-uân'],
    ['inclLast', 'Tài-uàn']
  ];
  for (const [sandhi, expected] of cases) {
    const converter = new Converter({ dialect: "Singapore", punctuation: "none", sandhi });
    expect(converter.get('台灣')).toBe(expected);
  }
});

// Checks the Singapore-dialect output for words containing 咖: the 咖啡 words are
// expected to start with 'ko', while 咖哩 and 咖咖仔 are expected to keep 'ka'.
test('kopi', () => {
  const converter = new Converter({ dialect: "Singapore", punctuation: 'none' });
  // Input word -> expected transliteration.
  const expectedByWord = {
    '咖啡': 'ko-pi',
    '烏咖啡': 'oo-ko-pi',
    '咖啡杯': 'ko-pi-pue',
    '咖哩': 'ka-lí',
    '咖咖仔': 'ka-ka-á'
  };
  Object.entries(expectedByWord).forEach(([word, expected]) => {
    expect(converter.get(word)).toBe(expected);
  });
});

0 comments on commit c7d4e3f

Please sign in to comment.