gfngfn · gfngfn · Apr 7, 2024 · Oct 2, 2023 · Oct 3, 2023 · Oct 3, 2023
diff --git a/satysfi.opam b/satysfi.opam
@@ -34,6 +34,8 @@ depends: [
   "ppx_deriving"
   "re" {build}
   "uutf"
+  "uunf"
+  "uuseg"
   "yojson-with-position" {= "1.4.2+satysfi"}
   "omd" {< "2.0.0~"}
   "ocamlgraph"

diff --git a/src/dune b/src/dune
@@ -13,6 +13,10 @@
             menhirLib
             otfed
             uutf
+            uunf
+            uunf.string
+            uuseg
+            uuseg.string
             yojson-with-position
             omd
             ocamlgraph

diff --git a/src/frontend/graphemeCluster.ml b/src/frontend/graphemeCluster.ml
@@ -0,0 +1,11 @@
+
+let split_utf8 str =
+  str
+  |> Uuseg_string.fold_utf_8 `Grapheme_cluster (fun lst s -> s::lst) []
+  |> List.rev
+
+
+let split_utf16be str =
+  str
+  |> Uuseg_string.fold_utf_16be `Grapheme_cluster (fun lst s -> s::lst) []
+  |> List.rev
diff --git a/src/frontend/graphemeCluster.mli b/src/frontend/graphemeCluster.mli
@@ -0,0 +1,4 @@
+
+val split_utf8 : string -> string list
+
+val split_utf16be : string -> string list
diff --git a/src/frontend/lexer.mll b/src/frontend/lexer.mll
@@ -575,8 +575,7 @@ and lex_inline stack = parse
         else
           report_error lexbuf "unexpected end of input while reading an inline text area"
       }
-  | str+
-      { let tok = Lexing.lexeme lexbuf in CHAR(get_pos lexbuf, tok) }
+  | str+ { CHAR(get_pos lexbuf, Lexing.lexeme lexbuf) }
 
   | _ as c
       { report_error lexbuf (Printf.sprintf "illegal token '%s' in an inline text area" (String.make 1 c)) }
@@ -643,8 +642,7 @@ and lex_math stack = parse
       { MATHCHARS(get_pos lexbuf, Lexing.lexeme lexbuf) }
   | mathascii
       { MATHCHARS(get_pos lexbuf, Lexing.lexeme lexbuf) }
-  | mathstr+
-      { MATHCHARS(get_pos lexbuf, Lexing.lexeme lexbuf) }
+  | mathstr+ { MATHCHARS(get_pos lexbuf, Lexing.lexeme lexbuf) }
   | ("#" (((upper ".")* (lower | upper)) as s))
       {
         let pos = get_pos lexbuf in
@@ -743,9 +741,8 @@ and literal quote_length buffer = parse
         end else if len > quote_length then
           report_error lexbuf "literal area was closed with too many '`'s"
         else
-          let s = Buffer.contents buffer in
           let pos_last = get_pos lexbuf in
-          (pos_last, s, true)
+          (pos_last, Buffer.contents buffer, true)
     }
   | (("`"+ as backticks) "#")
       {
@@ -757,9 +754,8 @@ and literal quote_length buffer = parse
         end else if len > quote_length then
           report_error lexbuf "literal area was closed with too many '`'s"
         else
-          let s = Buffer.contents buffer in
           let pos_last = get_pos lexbuf in
-          (pos_last, s, false)
+          (pos_last, Buffer.contents buffer, false)
     }
   | break
       {

diff --git a/src/frontend/normalizeString.ml b/src/frontend/normalizeString.ml
@@ -0,0 +1,24 @@
+type t = string
+
+(*
+  <https://erratique.ch/software/uunf/doc/Uunf/index.html#type-form>
+
+*)
+let of_utf8_nfd str = Uunf_string.normalize_utf_8 `NFD str
+let of_utf8_nfc str = Uunf_string.normalize_utf_8 `NFC str
+
+let of_utf16be_nfd str =
+  let str_utf8 = str |> InternalText.of_utf16be |> InternalText.to_utf8 in
+  Uunf_string.normalize_utf_8 `NFD str_utf8
+
+let of_utf16be_nfc str =
+  let str_utf8 = str |> InternalText.of_utf16be |> InternalText.to_utf8 in
+  Uunf_string.normalize_utf_8 `NFC str_utf8
+
+let to_utf8 t = t
+
+let to_utf16be t =
+  t |> InternalText.of_utf8 |> InternalText.to_utf16be
+
+let to_utf16be_hex t =
+  t |> InternalText.of_utf8 |> InternalText.to_utf16be_hex
diff --git a/src/frontend/normalizeString.mli b/src/frontend/normalizeString.mli
@@ -0,0 +1,15 @@
+type t
+
+val of_utf8_nfd : string -> t
+
+val of_utf8_nfc : string -> t
+
+val of_utf16be_nfd : string -> t
+
+val of_utf16be_nfc : string -> t
+
+val to_utf8 : t -> string
+
+val to_utf16be_hex : t -> string
+
+val to_utf16be : t -> string
diff --git a/tests/split-grapheme-cluster.saty b/tests/split-grapheme-cluster.saty
@@ -0,0 +1,70 @@
+% -*- coding: utf-8 -*-
+@import: head
+@import: ../lib-satysfi/dist/packages/color
+@import: ../lib-satysfi/dist/packages/list
+
+let open Pervasives in
+let open Head in
+
+let () =
+  let s1 =
+    string-unexplode [
+      0x1F1EF, 0x1F1F5, %%% 🇯🇵
+      0x30AB, 0x3099, %%% ガ
+      0x30AC, %%% ガ
+      0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F466, %%% 👨‍👩‍👦(family: man, woman, boy)
+      0x1F469, 0x200D, 0x1F469, 0x200D, 0x1F467, 0x200D, 0x1F467, %%% 👩‍👩‍👧‍👧(family: woman, woman, girl, girl)
+      0x1F3F4, 0xE0067, 0xE0062, 0xE0077, 0xE006C, 0xE0073, 0xE007F, %%% 🏴󠁧󠁢󠁷󠁬󠁳󠁿(Wales)
+      0x1F469, 0x1F3FE, 0x200D, 0x1F393, %%% 👩‍🎓(woman student: medium-dark skin tone)
+
+    ]
+  in
+  let s2 =
+    [
+      [0x1F1EF, 0x1F1F5],
+      [0x30AC],
+      [0x30AC],
+      [0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F466],
+      [0x1F469, 0x200D, 0x1F469, 0x200D, 0x1F467, 0x200D, 0x1F467],
+      [0x1F3F4, 0xE0067, 0xE0062, 0xE0077, 0xE006C, 0xE0073, 0xE007F],
+      [0x1F469, 0x1F3FE, 0x200D, 0x1F393]
+    ]
+  in
+  let s3 =
+    [
+      [0x1F1EF, 0x1F1F5],
+      [0x30AB, 0x3099],
+      [0x30AB, 0x3099],
+      [0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F466],
+      [0x1F469, 0x200D, 0x1F469, 0x200D, 0x1F467, 0x200D, 0x1F467],
+      [0x1F3F4, 0xE0067, 0xE0062, 0xE0077, 0xE006C, 0xE0073, 0xE007F],
+      [0x1F469, 0x1F3FE, 0x200D, 0x1F393]
+    ]
+  in
+  let slst1 = s1 |> normalize-string-to-nfc |> split-grapheme-cluster in
+  let slst2 = s1 |> normalize-string-to-nfd |> split-grapheme-cluster in
+  let rec check-loop l1 l2 =
+    match (l1, l2) with
+      | (s1::xs1, s2::xs2)-> string-same s1 (string-unexplode s2) && (check-loop xs1 xs2)
+      | ([], []) -> true
+      | _ -> false
+    end
+  in
+  let is-ok1 = check-loop slst1 s2 in
+  let is-ok2 = check-loop slst2 s3 in
+  if is-ok1 && is-ok2 then
+    display-message `ok`
+  else
+    if not is-ok1 && not is-ok2 then
+      abort-with-message `err1, 2`
+    else
+      if not is-ok1 then
+        abort-with-message `err1`
+      else
+        abort-with-message `err2`
+in
+
+document (|
+  title  = {Normalize string and grapheme cluster},
+  author = {\SATySFi; Contributors},
+|) '<>
diff --git a/tools/gencode/vminst.ml b/tools/gencode/vminst.ml
@@ -1618,7 +1618,9 @@ CompiledInlineTextClosure([CompiledImInlineTextText(str)], env)
         ~is_pdf_mode_primitive:true
         ~is_text_mode_primitive:true
         ~code:{|
-make_string (HorzBox.extract_string ibs)
+ibs
+|> HorzBox.extract_string
+|> make_string
 |}
     ; inst "PrimitiveInlineSkip"
         ~name:"inline-skip"
@@ -2019,7 +2021,11 @@ else
         ~is_text_mode_primitive:true
         ~code:{|
 let ilst = get_list get_int valueilst in
-let s = (List.map Uchar.of_int ilst) |> InternalText.of_uchar_list |> InternalText.to_utf8 in
+let s =
+  (List.map Uchar.of_int ilst)
+  |> InternalText.of_uchar_list
+  |> InternalText.to_utf8
+in
 make_string s
 |}
     ; inst "PrimitiveStringExplode"
@@ -2040,6 +2046,52 @@ let ilst =
   |> List.map Uchar.to_int
 in
 make_list make_int ilst
+|}
+    ; inst "PrimitiveNormlizeStringToNfc"
+        ~name:"normalize-string-to-nfc"
+        ~type_:Type.(tS @-> tS)
+        ~fields:[
+        ]
+        ~params:[
+          param "str" ~type_:"string";
+        ]
+        ~is_pdf_mode_primitive:true
+        ~is_text_mode_primitive:true
+        ~code:{|
+str
+|> NormalizeString.of_utf8_nfc
+|> NormalizeString.to_utf8
+|> make_string
+|}
+    ; inst "PrimitiveNormlizeStringToNfd"
+        ~name:"normalize-string-to-nfd"
+        ~type_:Type.(tS @-> tS)
+        ~fields:[
+        ]
+        ~params:[
+          param "str" ~type_:"string";
+        ]
+        ~is_pdf_mode_primitive:true
+        ~is_text_mode_primitive:true
+        ~code:{|
+str
+|> NormalizeString.of_utf8_nfd
+|> NormalizeString.to_utf8
+|> make_string
+|}
+    ; inst "PrimitiveSplitGraphemeCluster"
+        ~name:"split-grapheme-cluster"
+        ~type_:Type.(tS @-> (tL tS))
+        ~fields:[
+        ]
+        ~params:[
+          param "str" ~type_:"string";
+        ]
+        ~is_pdf_mode_primitive:true
+        ~is_text_mode_primitive:true
+        ~code:{|
+let slst = GraphemeCluster.split_utf8 str in
+make_list make_string slst
 |}
     ; inst "PrimitiveRegexpOfString"
         ~name:"regexp-of-string"