-
Notifications
You must be signed in to change notification settings - Fork 84
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #417 from puripuri2100/dev-0-1-0-string
Add split-grapheme-cluster and string normalization function
- Loading branch information
Showing
9 changed files
with
188 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,10 @@ | |
menhirLib | ||
otfed | ||
uutf | ||
uunf | ||
uunf.string | ||
uuseg | ||
uuseg.string | ||
yojson-with-position | ||
omd | ||
ocamlgraph | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
|
||
(* Splits [str] into its grapheme clusters, one string per cluster.
   Segmentation is done by [Uuseg_string.fold_utf_8], so [str] is read
   as UTF-8.  The fold conses every cluster onto the accumulator, which
   leaves them in reverse; the final [List.rev] restores the order in
   which the fold produced them. *)
let split_utf8 str =
  let push clusters cluster = cluster :: clusters in
  let reversed = Uuseg_string.fold_utf_8 `Grapheme_cluster push [] str in
  List.rev reversed
|
||
|
||
(* Splits [str] into its grapheme clusters, one string per cluster.
   Segmentation is done by [Uuseg_string.fold_utf_16be], so [str] is
   read as UTF-16BE.  Clusters are accumulated by consing and the list
   is reversed once at the end to undo the reversal caused by consing. *)
let split_utf16be str =
  List.rev
    (Uuseg_string.fold_utf_16be
       `Grapheme_cluster
       (fun acc cluster -> cluster :: acc)
       []
       str)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
|
||
(** [split_utf8 str] splits the UTF-8 string [str] into its grapheme
    clusters and returns them as a list, one string per cluster. *)
val split_utf8 : string -> string list | ||
|
||
(** [split_utf16be str] splits the UTF-16BE string [str] into its
    grapheme clusters and returns them as a list, one string per
    cluster. *)
val split_utf16be : string -> string list |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
(* Normalized text.  Every constructor in this file ends with
   [Uunf_string.normalize_utf_8], and [to_utf8] returns the value
   unchanged, so the underlying string is the UTF-8 output of that
   normalization. *)
type t = string | ||
|
||
(* | ||
<https://erratique.ch/software/uunf/doc/Uunf/index.html#type-form> | ||
*) | ||
(* Normalizes the UTF-8 string [utf8] to form NFD by passing it to
   [Uunf_string.normalize_utf_8]. *)
let of_utf8_nfd utf8 =
  utf8 |> Uunf_string.normalize_utf_8 `NFD
(* Normalizes the UTF-8 string [utf8] to form NFC by passing it to
   [Uunf_string.normalize_utf_8]. *)
let of_utf8_nfc utf8 =
  utf8 |> Uunf_string.normalize_utf_8 `NFC
|
||
(* Builds an NFD-normalized value from [str].  The input is first read
   with [InternalText.of_utf16be] (so it is presumably UTF-16BE encoded),
   re-encoded with [InternalText.to_utf8], and only then normalized,
   because the normalizer used here works on UTF-8 strings. *)
let of_utf16be_nfd str =
  str
  |> InternalText.of_utf16be
  |> InternalText.to_utf8
  |> Uunf_string.normalize_utf_8 `NFD
|
||
(* Builds an NFC-normalized value from [str].  The input is first read
   with [InternalText.of_utf16be] (so it is presumably UTF-16BE encoded),
   re-encoded with [InternalText.to_utf8], and only then normalized,
   because the normalizer used here works on UTF-8 strings. *)
let of_utf16be_nfc str =
  let text = InternalText.of_utf16be str in
  let utf8 = InternalText.to_utf8 text in
  Uunf_string.normalize_utf_8 `NFC utf8
|
||
(* Returns the normalized text as a string.  No conversion is needed:
   the argument is handed back unchanged. *)
let to_utf8 normalized = normalized
|
||
(* Converts the normalized text [t] by reading it with
   [InternalText.of_utf8] and writing it back out with
   [InternalText.to_utf16be]. *)
let to_utf16be t =
  let text = InternalText.of_utf8 t in
  InternalText.to_utf16be text
|
||
(* Converts the normalized text [t] by reading it with
   [InternalText.of_utf8] and rendering it with
   [InternalText.to_utf16be_hex] (presumably a hexadecimal dump of the
   UTF-16BE encoding; see that function for the exact format). *)
let to_utf16be_hex t =
  InternalText.to_utf16be_hex (InternalText.of_utf8 t)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
(** Unicode-normalized text.  The type is abstract: values can only be
    obtained through the [of_*] functions below. *)
type t | ||
|
||
(** [of_utf8_nfd s] normalizes the UTF-8 string [s] to form NFD. *)
val of_utf8_nfd : string -> t | ||
|
||
(** [of_utf8_nfc s] normalizes the UTF-8 string [s] to form NFC. *)
val of_utf8_nfc : string -> t | ||
|
||
(** [of_utf16be_nfd s] normalizes the UTF-16BE string [s] to form NFD. *)
val of_utf16be_nfd : string -> t | ||
|
||
(** [of_utf16be_nfc s] normalizes the UTF-16BE string [s] to form NFC. *)
val of_utf16be_nfc : string -> t | ||
|
||
(** [to_utf8 t] returns the normalized text as a UTF-8 string. *)
val to_utf8 : t -> string | ||
|
||
(** [to_utf16be_hex t] returns the normalized text as produced by
    [InternalText.to_utf16be_hex] (presumably a hexadecimal rendering
    of its UTF-16BE encoding; confirm the format there). *)
val to_utf16be_hex : t -> string | ||
|
||
(** [to_utf16be t] returns the normalized text re-encoded with
    [InternalText.to_utf16be]. *)
val to_utf16be : t -> string |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
% -*- coding: utf-8 -*- | ||
@import: head | ||
@import: ../lib-satysfi/dist/packages/color | ||
@import: ../lib-satysfi/dist/packages/list | ||
|
||
let open Pervasives in | ||
let open Head in | ||
|
||
% Self-check for normalize-string-to-nfc / normalize-string-to-nfd combined
% with split-grapheme-cluster.  Prints `ok` when both checks pass and aborts
% with a message naming the failing check(s) otherwise.
let () = | ||
% s1: the input string, assembled from code points.  It holds the same
% kana both decomposed (0x30AB 0x3099) and precomposed (0x30AC), so the two
% normalization forms give different results for it.
let s1 = | ||
string-unexplode [ | ||
0x1F1EF, 0x1F1F5, %%% 🇯🇵 | ||
0x30AB, 0x3099, %%% ガ | ||
0x30AC, %%% ガ | ||
0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F466, %%% 👨👩👦(family: man, woman, boy) | ||
0x1F469, 0x200D, 0x1F469, 0x200D, 0x1F467, 0x200D, 0x1F467, %%% 👩👩👧👧(family: woman, woman, girl, girl) | ||
0x1F3F4, 0xE0067, 0xE0062, 0xE0077, 0xE006C, 0xE0073, 0xE007F, %%% 🏴(Wales) | ||
0x1F469, 0x1F3FE, 0x200D, 0x1F393, %%% 👩🎓(woman student: medium-dark skin tone) | ||
|
||
] | ||
in | ||
% s2: expected grapheme clusters (as code-point lists) after NFC; both kana
% entries are the single precomposed code point.
let s2 = | ||
[ | ||
[0x1F1EF, 0x1F1F5], | ||
[0x30AC], | ||
[0x30AC], | ||
[0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F466], | ||
[0x1F469, 0x200D, 0x1F469, 0x200D, 0x1F467, 0x200D, 0x1F467], | ||
[0x1F3F4, 0xE0067, 0xE0062, 0xE0077, 0xE006C, 0xE0073, 0xE007F], | ||
[0x1F469, 0x1F3FE, 0x200D, 0x1F393] | ||
] | ||
in | ||
% s3: expected grapheme clusters (as code-point lists) after NFD; both kana
% entries are the base character followed by the combining mark.
let s3 = | ||
[ | ||
[0x1F1EF, 0x1F1F5], | ||
[0x30AB, 0x3099], | ||
[0x30AB, 0x3099], | ||
[0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F466], | ||
[0x1F469, 0x200D, 0x1F469, 0x200D, 0x1F467, 0x200D, 0x1F467], | ||
[0x1F3F4, 0xE0067, 0xE0062, 0xE0077, 0xE006C, 0xE0073, 0xE007F], | ||
[0x1F469, 0x1F3FE, 0x200D, 0x1F393] | ||
] | ||
in | ||
% Actual results: normalize s1 first, then split it into grapheme clusters.
let slst1 = s1 |> normalize-string-to-nfc |> split-grapheme-cluster in | ||
let slst2 = s1 |> normalize-string-to-nfd |> split-grapheme-cluster in | ||
% check-loop walks an actual cluster list and an expected code-point list in
% lockstep.  It is true only if every pair matches under string-same and
% both lists end at the same time; a length mismatch hits the last case.
let rec check-loop l1 l2 = | ||
match (l1, l2) with | ||
| (s1::xs1, s2::xs2)-> string-same s1 (string-unexplode s2) && (check-loop xs1 xs2) | ||
| ([], []) -> true | ||
| _ -> false | ||
end | ||
in | ||
% is-ok1 covers the NFC path, is-ok2 the NFD path.
let is-ok1 = check-loop slst1 s2 in | ||
let is-ok2 = check-loop slst2 s3 in | ||
% Report: `ok` if both pass; otherwise abort with `err1` (NFC failed),
% `err2` (NFD failed) or `err1, 2` (both failed).
if is-ok1 && is-ok2 then | ||
display-message `ok` | ||
else | ||
if not is-ok1 && not is-ok2 then | ||
abort-with-message `err1, 2` | ||
else | ||
if not is-ok1 then | ||
abort-with-message `err1` | ||
else | ||
abort-with-message `err2` | ||
in | ||
|
||
% The document itself carries only a title and an author and has an empty
% body ('<>); the outcome of the test is reported by the checks above.
document (| | ||
title = {Normalize string and grapheme cluster}, | ||
author = {\SATySFi; Contributors}, | ||
|) '<> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters