Skip to content

Commit

Permalink
Merge pull request #417 from puripuri2100/dev-0-1-0-string
Browse files Browse the repository at this point in the history
Add split-grapheme-cluster and string normalization function
  • Loading branch information
gfngfn authored Apr 7, 2024
2 parents 7fe53a5 + 3a45418 commit e9bf9bb
Show file tree
Hide file tree
Showing 9 changed files with 188 additions and 10 deletions.
2 changes: 2 additions & 0 deletions satysfi.opam
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ depends: [
"ppx_deriving"
"re" {build}
"uutf"
"uunf"
"uuseg"
"yojson-with-position" {= "1.4.2+satysfi"}
"omd" {< "2.0.0~"}
"ocamlgraph"
Expand Down
4 changes: 4 additions & 0 deletions src/dune
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
menhirLib
otfed
uutf
uunf
uunf.string
uuseg
uuseg.string
yojson-with-position
omd
ocamlgraph
Expand Down
11 changes: 11 additions & 0 deletions src/frontend/graphemeCluster.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@

let split_utf8 str =
str
|> Uuseg_string.fold_utf_8 `Grapheme_cluster (fun lst s -> s::lst) []
|> List.rev


let split_utf16be str =
str
|> Uuseg_string.fold_utf_16be `Grapheme_cluster (fun lst s -> s::lst) []
|> List.rev
4 changes: 4 additions & 0 deletions src/frontend/graphemeCluster.mli
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@

val split_utf8 : string -> string list

val split_utf16be : string -> string list
12 changes: 4 additions & 8 deletions src/frontend/lexer.mll
Original file line number Diff line number Diff line change
Expand Up @@ -575,8 +575,7 @@ and lex_inline stack = parse
else
report_error lexbuf "unexpected end of input while reading an inline text area"
}
| str+
{ let tok = Lexing.lexeme lexbuf in CHAR(get_pos lexbuf, tok) }
| str+ { CHAR(get_pos lexbuf, Lexing.lexeme lexbuf) }

| _ as c
{ report_error lexbuf (Printf.sprintf "illegal token '%s' in an inline text area" (String.make 1 c)) }
Expand Down Expand Up @@ -643,8 +642,7 @@ and lex_math stack = parse
{ MATHCHARS(get_pos lexbuf, Lexing.lexeme lexbuf) }
| mathascii
{ MATHCHARS(get_pos lexbuf, Lexing.lexeme lexbuf) }
| mathstr+
{ MATHCHARS(get_pos lexbuf, Lexing.lexeme lexbuf) }
| mathstr+ { MATHCHARS(get_pos lexbuf, Lexing.lexeme lexbuf) }
| ("#" (((upper ".")* (lower | upper)) as s))
{
let pos = get_pos lexbuf in
Expand Down Expand Up @@ -743,9 +741,8 @@ and literal quote_length buffer = parse
end else if len > quote_length then
report_error lexbuf "literal area was closed with too many '`'s"
else
let s = Buffer.contents buffer in
let pos_last = get_pos lexbuf in
(pos_last, s, true)
(pos_last, Buffer.contents buffer, true)
}
| (("`"+ as backticks) "#")
{
Expand All @@ -757,9 +754,8 @@ and literal quote_length buffer = parse
end else if len > quote_length then
report_error lexbuf "literal area was closed with too many '`'s"
else
let s = Buffer.contents buffer in
let pos_last = get_pos lexbuf in
(pos_last, s, false)
(pos_last, Buffer.contents buffer, false)
}
| break
{
Expand Down
24 changes: 24 additions & 0 deletions src/frontend/normalizeString.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
type t = string

(*
<https://erratique.ch/software/uunf/doc/Uunf/index.html#type-form>
*)
let of_utf8_nfd str = Uunf_string.normalize_utf_8 `NFD str
let of_utf8_nfc str = Uunf_string.normalize_utf_8 `NFC str

let of_utf16be_nfd str =
let str_utf8 = str |> InternalText.of_utf16be |> InternalText.to_utf8 in
Uunf_string.normalize_utf_8 `NFD str_utf8

let of_utf16be_nfc str =
let str_utf8 = str |> InternalText.of_utf16be |> InternalText.to_utf8 in
Uunf_string.normalize_utf_8 `NFC str_utf8

let to_utf8 t = t

let to_utf16be t =
t |> InternalText.of_utf8 |> InternalText.to_utf16be

let to_utf16be_hex t =
t |> InternalText.of_utf8 |> InternalText.to_utf16be_hex
15 changes: 15 additions & 0 deletions src/frontend/normalizeString.mli
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
type t

val of_utf8_nfd : string -> t

val of_utf8_nfc : string -> t

val of_utf16be_nfd : string -> t

val of_utf16be_nfc : string -> t

val to_utf8 : t -> string

val to_utf16be_hex : t -> string

val to_utf16be : t -> string
70 changes: 70 additions & 0 deletions tests/split-grapheme-cluster.saty
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
% -*- coding: utf-8 -*-
@import: head
@import: ../lib-satysfi/dist/packages/color
@import: ../lib-satysfi/dist/packages/list

let open Pervasives in
let open Head in

let () =
let s1 =
string-unexplode [
0x1F1EF, 0x1F1F5, %%% 🇯🇵
0x30AB, 0x3099, %%% ガ
0x30AC, %%% ガ
0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F466, %%% 👨‍👩‍👦(family: man, woman, boy)
0x1F469, 0x200D, 0x1F469, 0x200D, 0x1F467, 0x200D, 0x1F467, %%% 👩‍👩‍👧‍👧(family: woman, woman, girl, girl)
0x1F3F4, 0xE0067, 0xE0062, 0xE0077, 0xE006C, 0xE0073, 0xE007F, %%% 🏴󠁧󠁢󠁷󠁬󠁳󠁿(Wales)
0x1F469, 0x1F3FE, 0x200D, 0x1F393, %%% 👩‍🎓(woman student: medium-dark skin tone)

]
in
let s2 =
[
[0x1F1EF, 0x1F1F5],
[0x30AC],
[0x30AC],
[0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F466],
[0x1F469, 0x200D, 0x1F469, 0x200D, 0x1F467, 0x200D, 0x1F467],
[0x1F3F4, 0xE0067, 0xE0062, 0xE0077, 0xE006C, 0xE0073, 0xE007F],
[0x1F469, 0x1F3FE, 0x200D, 0x1F393]
]
in
let s3 =
[
[0x1F1EF, 0x1F1F5],
[0x30AB, 0x3099],
[0x30AB, 0x3099],
[0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F466],
[0x1F469, 0x200D, 0x1F469, 0x200D, 0x1F467, 0x200D, 0x1F467],
[0x1F3F4, 0xE0067, 0xE0062, 0xE0077, 0xE006C, 0xE0073, 0xE007F],
[0x1F469, 0x1F3FE, 0x200D, 0x1F393]
]
in
let slst1 = s1 |> normalize-string-to-nfc |> split-grapheme-cluster in
let slst2 = s1 |> normalize-string-to-nfd |> split-grapheme-cluster in
let rec check-loop l1 l2 =
match (l1, l2) with
| (s1::xs1, s2::xs2)-> string-same s1 (string-unexplode s2) && (check-loop xs1 xs2)
| ([], []) -> true
| _ -> false
end
in
let is-ok1 = check-loop slst1 s2 in
let is-ok2 = check-loop slst2 s3 in
if is-ok1 && is-ok2 then
display-message `ok`
else
if not is-ok1 && not is-ok2 then
abort-with-message `err1, 2`
else
if not is-ok1 then
abort-with-message `err1`
else
abort-with-message `err2`
in

document (|
title = {Normalize string and grapheme cluster},
author = {\SATySFi; Contributors},
|) '<>
56 changes: 54 additions & 2 deletions tools/gencode/vminst.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1618,7 +1618,9 @@ CompiledInlineTextClosure([CompiledImInlineTextText(str)], env)
~is_pdf_mode_primitive:true
~is_text_mode_primitive:true
~code:{|
make_string (HorzBox.extract_string ibs)
ibs
|> HorzBox.extract_string
|> make_string
|}
; inst "PrimitiveInlineSkip"
~name:"inline-skip"
Expand Down Expand Up @@ -2019,7 +2021,11 @@ else
~is_text_mode_primitive:true
~code:{|
let ilst = get_list get_int valueilst in
let s = (List.map Uchar.of_int ilst) |> InternalText.of_uchar_list |> InternalText.to_utf8 in
let s =
(List.map Uchar.of_int ilst)
|> InternalText.of_uchar_list
|> InternalText.to_utf8
in
make_string s
|}
; inst "PrimitiveStringExplode"
Expand All @@ -2040,6 +2046,52 @@ let ilst =
|> List.map Uchar.to_int
in
make_list make_int ilst
|}
; inst "PrimitiveNormlizeStringToNfc"
~name:"normalize-string-to-nfc"
~type_:Type.(tS @-> tS)
~fields:[
]
~params:[
param "str" ~type_:"string";
]
~is_pdf_mode_primitive:true
~is_text_mode_primitive:true
~code:{|
str
|> NormalizeString.of_utf8_nfc
|> NormalizeString.to_utf8
|> make_string
|}
; inst "PrimitiveNormlizeStringToNfd"
~name:"normalize-string-to-nfd"
~type_:Type.(tS @-> tS)
~fields:[
]
~params:[
param "str" ~type_:"string";
]
~is_pdf_mode_primitive:true
~is_text_mode_primitive:true
~code:{|
str
|> NormalizeString.of_utf8_nfd
|> NormalizeString.to_utf8
|> make_string
|}
; inst "PrimitiveSplitGraphemeCluster"
~name:"split-grapheme-cluster"
~type_:Type.(tS @-> (tL tS))
~fields:[
]
~params:[
param "str" ~type_:"string";
]
~is_pdf_mode_primitive:true
~is_text_mode_primitive:true
~code:{|
let slst = GraphemeCluster.split_utf8 str in
make_list make_string slst
|}
; inst "PrimitiveRegexpOfString"
~name:"regexp-of-string"
Expand Down

0 comments on commit e9bf9bb

Please sign in to comment.