Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add split-grapheme-cluster and string normalization function #417

Merged
merged 9 commits into from
Apr 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions satysfi.opam
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ depends: [
"ppx_deriving"
"re" {build}
"uutf"
"uunf"
"uuseg"
"yojson-with-position" {= "1.4.2+satysfi"}
"omd" {< "2.0.0~"}
"ocamlgraph"
Expand Down
4 changes: 4 additions & 0 deletions src/dune
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
menhirLib
otfed
uutf
uunf
uunf.string
uuseg
uuseg.string
yojson-with-position
omd
ocamlgraph
Expand Down
11 changes: 11 additions & 0 deletions src/frontend/graphemeCluster.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@

let split_utf8 str =
str
|> Uuseg_string.fold_utf_8 `Grapheme_cluster (fun lst s -> s::lst) []
|> List.rev


let split_utf16be str =
str
|> Uuseg_string.fold_utf_16be `Grapheme_cluster (fun lst s -> s::lst) []
|> List.rev
4 changes: 4 additions & 0 deletions src/frontend/graphemeCluster.mli
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@

val split_utf8 : string -> string list

val split_utf16be : string -> string list
12 changes: 4 additions & 8 deletions src/frontend/lexer.mll
Original file line number Diff line number Diff line change
Expand Up @@ -575,8 +575,7 @@ and lex_inline stack = parse
else
report_error lexbuf "unexpected end of input while reading an inline text area"
}
| str+
{ let tok = Lexing.lexeme lexbuf in CHAR(get_pos lexbuf, tok) }
| str+ { CHAR(get_pos lexbuf, Lexing.lexeme lexbuf) }

| _ as c
{ report_error lexbuf (Printf.sprintf "illegal token '%s' in an inline text area" (String.make 1 c)) }
Expand Down Expand Up @@ -643,8 +642,7 @@ and lex_math stack = parse
{ MATHCHARS(get_pos lexbuf, Lexing.lexeme lexbuf) }
| mathascii
{ MATHCHARS(get_pos lexbuf, Lexing.lexeme lexbuf) }
| mathstr+
{ MATHCHARS(get_pos lexbuf, Lexing.lexeme lexbuf) }
| mathstr+ { MATHCHARS(get_pos lexbuf, Lexing.lexeme lexbuf) }
| ("#" (((upper ".")* (lower | upper)) as s))
{
let pos = get_pos lexbuf in
Expand Down Expand Up @@ -743,9 +741,8 @@ and literal quote_length buffer = parse
end else if len > quote_length then
report_error lexbuf "literal area was closed with too many '`'s"
else
let s = Buffer.contents buffer in
let pos_last = get_pos lexbuf in
(pos_last, s, true)
(pos_last, Buffer.contents buffer, true)
}
| (("`"+ as backticks) "#")
{
Expand All @@ -757,9 +754,8 @@ and literal quote_length buffer = parse
end else if len > quote_length then
report_error lexbuf "literal area was closed with too many '`'s"
else
let s = Buffer.contents buffer in
let pos_last = get_pos lexbuf in
(pos_last, s, false)
(pos_last, Buffer.contents buffer, false)
}
| break
{
Expand Down
24 changes: 24 additions & 0 deletions src/frontend/normalizeString.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
type t = string

(*
<https://erratique.ch/software/uunf/doc/Uunf/index.html#type-form>

*)
let of_utf8_nfd str = Uunf_string.normalize_utf_8 `NFD str
let of_utf8_nfc str = Uunf_string.normalize_utf_8 `NFC str

let of_utf16be_nfd str =
let str_utf8 = str |> InternalText.of_utf16be |> InternalText.to_utf8 in
Uunf_string.normalize_utf_8 `NFD str_utf8

let of_utf16be_nfc str =
let str_utf8 = str |> InternalText.of_utf16be |> InternalText.to_utf8 in
Uunf_string.normalize_utf_8 `NFC str_utf8

let to_utf8 t = t

let to_utf16be t =
t |> InternalText.of_utf8 |> InternalText.to_utf16be

let to_utf16be_hex t =
t |> InternalText.of_utf8 |> InternalText.to_utf16be_hex
15 changes: 15 additions & 0 deletions src/frontend/normalizeString.mli
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
type t

val of_utf8_nfd : string -> t

val of_utf8_nfc : string -> t

val of_utf16be_nfd : string -> t

val of_utf16be_nfc : string -> t

val to_utf8 : t -> string

val to_utf16be_hex : t -> string

val to_utf16be : t -> string
70 changes: 70 additions & 0 deletions tests/split-grapheme-cluster.saty
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
% -*- coding: utf-8 -*-
@import: head
@import: ../lib-satysfi/dist/packages/color
@import: ../lib-satysfi/dist/packages/list

let open Pervasives in
let open Head in

let () =
let s1 =
string-unexplode [
0x1F1EF, 0x1F1F5, %%% 🇯🇵
0x30AB, 0x3099, %%% ガ
0x30AC, %%% ガ
0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F466, %%% 👨‍👩‍👦(family: man, woman, boy)
0x1F469, 0x200D, 0x1F469, 0x200D, 0x1F467, 0x200D, 0x1F467, %%% 👩‍👩‍👧‍👧(family: woman, woman, girl, girl)
0x1F3F4, 0xE0067, 0xE0062, 0xE0077, 0xE006C, 0xE0073, 0xE007F, %%% 🏴󠁧󠁢󠁷󠁬󠁳󠁿(Wales)
0x1F469, 0x1F3FE, 0x200D, 0x1F393, %%% 👩‍🎓(woman student: medium-dark skin tone)

]
in
let s2 =
[
[0x1F1EF, 0x1F1F5],
[0x30AC],
[0x30AC],
[0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F466],
[0x1F469, 0x200D, 0x1F469, 0x200D, 0x1F467, 0x200D, 0x1F467],
[0x1F3F4, 0xE0067, 0xE0062, 0xE0077, 0xE006C, 0xE0073, 0xE007F],
[0x1F469, 0x1F3FE, 0x200D, 0x1F393]
]
in
let s3 =
[
[0x1F1EF, 0x1F1F5],
[0x30AB, 0x3099],
[0x30AB, 0x3099],
[0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F466],
[0x1F469, 0x200D, 0x1F469, 0x200D, 0x1F467, 0x200D, 0x1F467],
[0x1F3F4, 0xE0067, 0xE0062, 0xE0077, 0xE006C, 0xE0073, 0xE007F],
[0x1F469, 0x1F3FE, 0x200D, 0x1F393]
]
in
let slst1 = s1 |> normalize-string-to-nfc |> split-grapheme-cluster in
let slst2 = s1 |> normalize-string-to-nfd |> split-grapheme-cluster in
let rec check-loop l1 l2 =
match (l1, l2) with
| (s1::xs1, s2::xs2)-> string-same s1 (string-unexplode s2) && (check-loop xs1 xs2)
| ([], []) -> true
| _ -> false
end
in
let is-ok1 = check-loop slst1 s2 in
let is-ok2 = check-loop slst2 s3 in
if is-ok1 && is-ok2 then
display-message `ok`
else
if not is-ok1 && not is-ok2 then
abort-with-message `err1, 2`
else
if not is-ok1 then
abort-with-message `err1`
else
abort-with-message `err2`
in

document (|
title = {Normalize string and grapheme cluster},
author = {\SATySFi; Contributors},
|) '<>
56 changes: 54 additions & 2 deletions tools/gencode/vminst.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1618,7 +1618,9 @@ CompiledInlineTextClosure([CompiledImInlineTextText(str)], env)
~is_pdf_mode_primitive:true
~is_text_mode_primitive:true
~code:{|
make_string (HorzBox.extract_string ibs)
ibs
|> HorzBox.extract_string
|> make_string
|}
; inst "PrimitiveInlineSkip"
~name:"inline-skip"
Expand Down Expand Up @@ -2019,7 +2021,11 @@ else
~is_text_mode_primitive:true
~code:{|
let ilst = get_list get_int valueilst in
let s = (List.map Uchar.of_int ilst) |> InternalText.of_uchar_list |> InternalText.to_utf8 in
let s =
(List.map Uchar.of_int ilst)
|> InternalText.of_uchar_list
|> InternalText.to_utf8
in
make_string s
|}
; inst "PrimitiveStringExplode"
Expand All @@ -2040,6 +2046,52 @@ let ilst =
|> List.map Uchar.to_int
in
make_list make_int ilst
|}
; inst "PrimitiveNormlizeStringToNfc"
~name:"normalize-string-to-nfc"
~type_:Type.(tS @-> tS)
~fields:[
]
~params:[
param "str" ~type_:"string";
]
~is_pdf_mode_primitive:true
~is_text_mode_primitive:true
~code:{|
str
|> NormalizeString.of_utf8_nfc
|> NormalizeString.to_utf8
|> make_string
|}
; inst "PrimitiveNormlizeStringToNfd"
~name:"normalize-string-to-nfd"
~type_:Type.(tS @-> tS)
~fields:[
]
~params:[
param "str" ~type_:"string";
]
~is_pdf_mode_primitive:true
~is_text_mode_primitive:true
~code:{|
str
|> NormalizeString.of_utf8_nfd
|> NormalizeString.to_utf8
|> make_string
|}
; inst "PrimitiveSplitGraphemeCluster"
~name:"split-grapheme-cluster"
~type_:Type.(tS @-> (tL tS))
~fields:[
]
~params:[
param "str" ~type_:"string";
]
~is_pdf_mode_primitive:true
~is_text_mode_primitive:true
~code:{|
let slst = GraphemeCluster.split_utf8 str in
make_list make_string slst
|}
; inst "PrimitiveRegexpOfString"
~name:"regexp-of-string"
Expand Down
Loading